You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
|
import re
import string
from collections import Counter

from cppy.cp_util import stopwordfilepath, testfilepath

# Word-frequency script: print the ten most frequent words of the test
# file, ignoring stop words and single-character tokens.

# Load the stop words into a set for fast membership tests. Every line of
# the stop-word file is treated as a comma-separated list, and surrounding
# whitespace is stripped from each entry.
with open(stopwordfilepath, encoding='utf-8') as stop_file:
    stop_words = {
        entry.strip()
        for row in stop_file
        for entry in row.split(',')
    }

# Every single lowercase ASCII letter is also treated as a stop word.
stop_words |= set(string.ascii_lowercase)

# Read the whole test file in one go and lowercase it.
with open(testfilepath, encoding='utf-8') as text_file:
    text = text_file.read().lower()

# The pattern only matches runs of at least two word characters, so
# punctuation and single-character tokens are dropped here.
words = re.findall(r'\b\w{2,}\b', text)

# Discard stop words, then count what is left. The length check repeats
# what the pattern above already guarantees.
kept_words = [
    token for token in words
    if len(token) > 1 and token not in stop_words
]
word_freqs = Counter(kept_words)

# The ten most frequent words with their counts.
most_common_words = word_freqs.most_common(10)

# Report the result, one "word - count" pair per line.
for word, freq in most_common_words:
    print(f'{word} - {freq}')
|
|
|
|
|
|
|
|
|
|
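# Rationale for the change: A01 did not account for commas, and it sorted while extracting words, which was resource-intensive.
# Solution: use `re` to strip out the commas, and use `Counter` to do the counting.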
|
|
|
|
|
|