You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

32 lines
1.2 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

from cppy.cp_util import stopwordfilepath, testfilepath
import string
from collections import Counter
# Build the stop-word set from the comma-separated stop-word file, plus every
# single ASCII lowercase letter so one-letter tokens are never counted.
# The file is opened in a `with` block so the handle is closed promptly.
# NOTE(review): the stop-word file is still read with the platform default
# encoding, as before; confirm whether it should be utf8 like the input text.
with open(stopwordfilepath) as stop_file:
    stop_words = set(stop_file.read().split(','))
stop_words.update(string.ascii_lowercase)

# Read the input text line by line and count how often each word occurs.
# A word is a maximal run of alphanumeric characters, lower-cased.
word_freqs = Counter()
with open(testfilepath, encoding='utf8') as f:
    for line in f:
        # Blank out every non-alphanumeric character, then split on the
        # blanks. Unlike scanning for a terminating character, this also
        # yields a word that runs to the very end of the line (for example
        # the last word of a file that has no trailing newline).
        tokens = ''.join(c if c.isalnum() else ' ' for c in line).split()
        for token in tokens:
            word = token.lower()
            if word not in stop_words:
                word_freqs[word] += 1

# Print the 10 most common words, one per line, as "word-count".
for word, freq in word_freqs.most_common(10):
    print(f"{word}-{freq}")

# Notes (compared with A01):
# - collections.Counter does the frequency counting, which keeps the code
#   short.
# - The stop words are kept in a set, so each membership test is a hash
#   lookup.
# - Counter.most_common() returns the most frequent words directly, which
#   keeps the output step concise.