from cppy.cp_util import stopwordfilepath, testfilepath
import string
from collections import Counter

# Prepare the stop word list
with open(stopwordfilepath) as f:
    stop_words = set(f.read().split(','))
stop_words.update(string.ascii_lowercase)

# Read the file and count word frequencies
word_freqs = Counter()
with open(testfilepath, encoding='utf8') as f:
    for line_num, line in enumerate(f, 1):
        start_char = None
        for i, c in enumerate(line):
            if start_char is None and c.isalnum():
                # Start of a new word
                start_char = i
            elif start_char is not None and not c.isalnum():
                # End of the current word: normalize and count it
                word = line[start_char:i].lower()
                if word not in stop_words:
                    word_freqs[word] += 1
                start_char = None

# Print the 10 most common words
for word, freq in word_freqs.most_common(10):
    print(f"{word}-{freq}")

'''
Compared with A01:
Using collections.Counter to count word frequencies simplifies the code and improves efficiency.
Using enumerate to get the line number and line content, and using a set to store the stop words, both improve the performance and readability of the code.
Using the most_common method to retrieve the most common words makes the output more concise.
'''
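
# A minimal alternative sketch (not part of the original exercise): the manual
# character scan above can be replaced by a regular-expression tokenizer plus
# Counter.update, which is the simplification the note about Counter points at.
# The helper name count_words_regex is hypothetical; it is defined here purely
# for illustration and is never called by this script.
import re

def count_words_regex(path, stop_words):
    # Tokenize each line into lowercase alphanumeric runs and count them,
    # skipping stop words; Counter.update condenses the whole inner loop.
    freqs = Counter()
    with open(path, encoding='utf8') as f:
        for line in f:
            tokens = re.findall(r'[a-z0-9]+', line.lower())
            freqs.update(t for t in tokens if t not in stop_words)
    return freqs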