"""Print the ten most frequent words of a text file, ignoring stop words.

The stop-word list is read from ``stopwordfilepath`` (comma-separated) and
extended with every single ASCII lowercase letter; the text is read from
``testfilepath``.  Both paths are provided by ``cppy.cp_util``.  A word is a
maximal run of characters for which ``str.isalnum()`` is true, lower-cased.
"""
import string

from cppy.cp_util import stopwordfilepath, testfilepath

# Prepare the result list and the stop-word table.
word_freqs = []
with open(stopwordfilepath, encoding='utf-8') as f:
    stop_words = f.read().split(',')
stop_words.extend(string.ascii_lowercase)

# Membership is tested once per word of the text, so look words up in a
# hashed copy instead of scanning the list every time.
_stop_word_lookup = frozenset(stop_words)

# word -> number of occurrences.  A word is removed and re-inserted on every
# occurrence, so the dict's insertion order is "least recently seen first";
# the stable sort below relies on that to break ties between equal counts.
_counts = {}
with open(testfilepath, encoding='utf-8') as text_file:
    for line in text_file:
        # Blank out every non-alphanumeric character so that split() yields
        # the words -- including a word that runs up to the very end of a
        # line that has no trailing newline (e.g. the last line of the file).
        cleaned = ''.join(c if c.isalnum() else ' ' for c in line)
        for raw_word in cleaned.split():
            word = raw_word.lower()
            # Skip stop words.
            if word in _stop_word_lookup:
                continue
            _counts[word] = _counts.pop(word, 0) + 1

# Order by descending count.  sorted() is stable even with reverse=True, so
# words with equal counts stay in the order in which they last occurred.
word_freqs.extend(
    [word, count]
    for word, count in sorted(
        _counts.items(), key=lambda item: item[1], reverse=True
    )
)

for tf in word_freqs[0:10]:
    print(tf[0], '-', tf[1])