# One pass over the training corpus to collect vocabulary statistics:
#   maxlen     -- token length of the longest sentence (for padding later)
#   word_freqs -- Counter of token frequencies across all sentences
#   num_recs   -- total number of training records
maxlen = 0
word_freqs = collections.Counter()
num_recs = 0
# Each line is "<label>\t<sentence>". Open in binary and decode per line,
# dropping non-ASCII bytes explicitly.
with open(os.path.join(DATA_DIR, "umich-sentiment-train.txt"), "rb") as ftrain:
    for line in ftrain:
        # BUG FIX: original split on the literal letter "t" (a mangled "\t"
        # escape); the fields are tab-separated. The separator must also be
        # bytes (b"\t") because the file is opened in binary mode.
        label, sentence = line.strip().split(b"\t")
        words = nltk.word_tokenize(sentence.decode("ascii", "ignore").lower())
        maxlen = max(maxlen, len(words))
        word_freqs.update(words)
        num_recs += 1