You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
|
from cppy.cp_util import stopwordfilepath, testfilepath
|
|
|
|
|
import string
|
|
|
|
|
from collections import Counter
|
|
|
|
|
|
|
|
|
|
def iter_words(line: str):
    """Yield each maximal run of alphanumeric characters in *line*, lowercased.

    Any character for which ``str.isalnum()`` is false acts as a separator.
    A word that runs up to the very end of *line* (for example the last line
    of a file that has no trailing newline) is yielded as well.
    """
    start = None
    for i, c in enumerate(line):
        if c.isalnum():
            if start is None:
                start = i
        elif start is not None:
            yield line[start:i].lower()
            start = None
    # Flush a word that is still open when the line ends; without this the
    # final word of an unterminated line would be silently dropped.
    if start is not None:
        yield line[start:].lower()


def count_word_freqs(lines, stop_words) -> Counter:
    """Count the words found in *lines*, skipping those in *stop_words*.

    Args:
        lines: Iterable of text lines (an open text file works).
        stop_words: Container of lowercase words to ignore.

    Returns:
        A ``Counter`` mapping each remaining lowercase word to its count.
    """
    word_freqs = Counter()
    for line in lines:
        word_freqs.update(
            word for word in iter_words(line) if word not in stop_words
        )
    return word_freqs


def load_stop_words(path) -> set:
    """Read a comma-separated stop-word file and return the words as a set.

    Surrounding whitespace (such as a trailing newline on the last entry) is
    stripped, and every single ASCII lowercase letter is added as a stop word.
    """
    with open(path, encoding='utf8') as f:
        stop_words = {word.strip() for word in f.read().split(',')}
    stop_words.update(string.ascii_lowercase)
    return stop_words


def main() -> None:
    """Print the 10 most common non-stop words of the test file."""
    # Prepare the stop-word set.
    stop_words = load_stop_words(stopwordfilepath)

    # Read the file and compute the word frequencies.
    with open(testfilepath, encoding='utf8') as f:
        word_freqs = count_word_freqs(f, stop_words)

    # Print the 10 most common words.
    for word, freq in word_freqs.most_common(10):
        print(f"{word}-{freq}")


if __name__ == "__main__":
    main()

# Compared with A01:
# - collections.Counter counts the word frequencies, which simplifies the code.
# - enumerate tracks character positions while scanning a line, and a set
#   stores the stop words for fast membership tests.
# - Counter.most_common retrieves the most frequent words, keeping the output
#   code concise.
|