from cppy.cp_util import *


###########################################
# Generators
###########################################
def characters(filename):
    """Yield every character of the UTF-8 text file *filename*, in order.

    The file is read line by line, so the whole file is never held in
    memory. The handle is closed when the generator is exhausted or closed.
    """
    with open(filename, encoding='utf-8') as source:
        for line in source:
            yield from line


def all_words(filename):
    """Yield each lower-cased word found in *filename*.

    A word is a maximal run of alphanumeric characters (``str.isalnum``);
    any other character acts as a separator and is discarded.
    """
    pending = []  # lower-cased characters of the word currently being read
    for c in characters(filename):
        if c.isalnum():
            pending.append(c.lower())
        elif pending:
            # A separator ends the current word: emit it and start afresh.
            yield "".join(pending)
            pending = []
    # The file may end in the middle of a word (no trailing newline or
    # punctuation); emit that last word instead of silently dropping it.
    if pending:
        yield "".join(pending)


def non_stop_words(filename, stopwords):
    """Yield the words of *filename* that are not contained in *stopwords*.

    *stopwords* may be any container supporting the ``in`` operator.
    """
    for w in all_words(filename):
        if w not in stopwords:
            yield w  # a word that passed the stop-word filter


if __name__ == "__main__":
    # get_stopwords, testfilepath, sort_dict and print_word_freqs are not
    # defined in this file; they are expected to come from the star import
    # of cppy.cp_util above.
    stopwords = get_stopwords()

    # Count how often each non-stop word occurs.
    freqs = {}
    for word in non_stop_words(testfilepath, stopwords):
        freqs[word] = freqs.get(word, 0) + 1

    data = sort_dict(freqs)
    print_word_freqs(data)