You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

32 lines
1.2 KiB

8 months ago
from cppy.cp_util import stopwordfilepath,testfilepath
import string
9 months ago
from collections import Counter
# Build the stop-word set: the comma-separated entries of the stop-word file
# plus every single ASCII lowercase letter (so one-letter tokens are ignored).
# The file is opened in a `with` block so the handle is closed deterministically,
# and with the same explicit encoding as the text file below so the result does
# not depend on the platform's default encoding.
with open(stopwordfilepath, encoding='utf8') as stop_file:
    # Strip surrounding whitespace from each entry: tokens produced below are
    # purely alphanumeric, so an entry such as "word\n" (trailing newline at
    # the end of the file) could otherwise never match anything.
    stop_words = {entry.strip() for entry in stop_file.read().split(',')}
stop_words.update(string.ascii_lowercase)

# Read the text file line by line and count word frequencies.
# A "word" is a maximal run of characters for which str.isalnum() is true,
# lower-cased before it is compared against the stop words and counted.
word_freqs = Counter()
with open(testfilepath, encoding='utf8') as f:
    for line in f:
        start_char = None  # index where the current word began, or None
        # Scan the line plus one trailing space. The space is a sentinel that
        # terminates a word ending exactly at the end of the line; without it
        # the final word of a file that has no trailing newline is never
        # counted, because a word is only emitted when a non-alphanumeric
        # character follows it.
        for i, c in enumerate(line + ' '):
            if start_char is None and c.isalnum():
                # First character of a new word.
                start_char = i
            elif start_char is not None and not c.isalnum():
                # First character after a word: emit the word that just ended.
                word = line[start_char:i].lower()
                if word not in stop_words:
                    word_freqs[word] += 1
                start_char = None

# Print the 10 most common words as "word-count", most frequent first.
for word, freq in word_freqs.most_common(10):
    print(f"{word}-{freq}")
'''
相比 A01
使用collections.Counter来计数单词频率从而简化了代码并提高了效率
使用enumerate来获取行号和行内容使用set来存储停用词都有助于提高代码的性能和可读性
使用most_common方法来获取最常见的单词使输出更为简洁
'''