You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.
import re
from collections import Counter
import string
from cppy . cp_util import stopwordfilepath , testfilepath
# Word-frequency script: print the 10 most frequent words of the test file,
# ignoring stop words and single-character tokens.

# Load the stop words into a set so membership tests are cheap.
# Each line of the stop-word file is split on commas.
stop_words = set()
with open(stopwordfilepath, encoding='utf-8') as f:
    for line in f:
        stop_words.update(word.strip() for word in line.split(','))

# Also treat every single lowercase ASCII letter as a stop word.
# (The regex below already rejects one-character tokens; the letters are
# kept in the set so `stop_words` has the same contents as before.)
stop_words.update(string.ascii_lowercase)

# Read the whole test file, lower-case it and extract the words.
with open(testfilepath, encoding='utf-8') as f:
    # \b\w{2,}\b keeps runs of at least two word characters, which drops
    # punctuation (commas included) and single-character tokens in one pass.
    words = re.findall(r'\b\w{2,}\b', f.read().lower())

# Count every extracted word that is not a stop word. No extra length
# check is needed: the pattern above never yields a token shorter than 2.
word_freqs = Counter(word for word in words if word not in stop_words)

# The 10 most frequent words as (word, count) pairs, most frequent first.
most_common_words = word_freqs.most_common(10)

# Print one "word - count" line per entry.
for word, freq in most_common_words:
    print(f'{word} - {freq}')
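# Change rationale: A01 did not strip commas from the tokens, and it sorted while it was still extracting words, which used a lot of resources.
# Solution: use re to tokenize the text so commas are removed, and use Counter to do the counting.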