# Source: patch a66617dcce53e610a9411fb7f7b8e09be9b51168 ("ADD file via upload"),
# which adds the new file 一盘大棋/A01修改.py.
#
# Term-frequency script: prints the ten most frequent words of the test text,
# ignoring stop words and single-character tokens.
#
# Rationale for this revision (translated from the original notes):
#   - Problem: A01 did not account for commas, and it extracted and sorted at
#     the same time, which used a lot of resources.
#   - Fix: use `re` to get rid of the commas, and use `Counter` for counting.

import re
import string
from collections import Counter

from cppy.cp_util import stopwordfilepath, testfilepath

# Load the stop words into a set for fast membership tests. Each line of the
# stop-word file is split on commas and every entry is whitespace-stripped.
with open(stopwordfilepath, encoding='utf-8') as stop_file:
    stop_words = {
        entry.strip()
        for row in stop_file
        for entry in row.split(',')
    }

# Every lowercase ASCII letter is a stop word too. The regex below already
# yields only tokens of two or more characters, so this is a second line of
# defence rather than the main mechanism.
stop_words |= set(string.ascii_lowercase)

# Read the whole test file, lowercased, so that counting is case-insensitive.
with open(testfilepath, encoding='utf-8') as text_file:
    text = text_file.read().lower()

# Tokenise: the pattern drops punctuation and matches only runs of at least
# two word characters.
words = re.findall(r'\b\w{2,}\b', text)

# Drop stop words (the length check is redundant with the regex but is kept
# as an explicit guard), then count what is left.
kept = [token for token in words if len(token) > 1 and token not in stop_words]
word_freqs = Counter(kept)

# The ten most frequent words, most frequent first.
most_common_words = word_freqs.most_common(10)

# Print the result, one "word - count" pair per line.
for term, count in most_common_words:
    print(f'{term} - {count}')