# Print the ten most frequent words of the test file, ignoring stop words.
#
# A "word" is a maximal run of characters for which str.isalnum() is true,
# lower-cased before it is compared and counted.
from collections import Counter
from itertools import groupby

# Paths of the stop-word list and of the text file to analyse.
from cppy.cp_util import stopwordfilepath, testfilepath

# Load the comma-separated stop words. A set gives O(1) membership tests,
# which matters because every word of the text is checked against it.
with open(stopwordfilepath, encoding='utf-8') as f:
    stop_words = set(f.read().split(','))
# Single letters are treated as stop words as well.
stop_words.update('abcdefghijklmnopqrstuvwxyz')

# Scan the text line by line and count every word that is not a stop word.
word_counts = Counter()
with open(testfilepath, encoding='utf-8') as text_file:
    for line in text_file:
        # groupby splits the line into alternating runs of alphanumeric and
        # non-alphanumeric characters. Unlike a scan that waits for a
        # terminating separator, this also yields a word that ends exactly
        # at the end of the line (e.g. the last line without a newline).
        for is_word, chars in groupby(line, key=str.isalnum):
            if not is_word:
                continue
            word = ''.join(chars).lower()
            if word not in stop_words:
                word_counts[word] += 1

# Sort by descending frequency; words with equal counts keep the order in
# which they were first seen (the sort is stable).
word_freqs = word_counts.most_common()

# Print the ten most frequent words.
for word, count in word_freqs[:10]:
    print(word, '-', count)