From a66617dcce53e610a9411fb7f7b8e09be9b51168 Mon Sep 17 00:00:00 2001
From: p26zockiw <1285381170@qq.com>
Date: Thu, 21 Mar 2024 17:25:18 +0800
Subject: [PATCH] ADD file via upload

---
 一盘大棋/A01修改.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 一盘大棋/A01修改.py

diff --git a/一盘大棋/A01修改.py b/一盘大棋/A01修改.py
new file mode 100644
index 0000000..d19a369
--- /dev/null
+++ b/一盘大棋/A01修改.py
@@ -0,0 +1,33 @@
+import re
+from collections import Counter
+import string
+from cppy.cp_util import stopwordfilepath,testfilepath
+
+# Load the stop words into a set for fast lookups (entries are comma-separated on each line).
+stop_words = set()
+with open(stopwordfilepath, encoding='utf-8') as f:
+    for line in f:
+        stop_words.update(word.strip() for word in line.split(','))
+
+# Add every lowercase ASCII letter to the stop-word set.
+# NOTE(review): this looks redundant, because the tokenising regex below only
+# yields tokens of two or more characters, so a single letter never reaches the filter.
+stop_words.update(set(string.ascii_lowercase))
+
+# Read the test file and compute word frequencies.
+with open(testfilepath, encoding='utf-8') as f:
+    # Lower-case the whole text, then pull out runs of word characters; punctuation is never matched.
+    words = re.findall(r'\b\w{2,}\b', f.read().lower())  # only tokens of at least two characters; \w also matches digits and '_'
+    # Drop stop words and count the rest. NOTE(review): len(word) > 1 is already guaranteed by the regex.
+    word_freqs = Counter(word for word in words if word not in stop_words and len(word) > 1)
+
+# Take the 10 most frequent words.
+most_common_words = word_freqs.most_common(10)
+
+# Print the result, one "word - count" pair per line.
+for word, freq in most_common_words:
+    print(f'{word} - {freq}')
+
+# Rationale (author's note): the earlier A01 script did not account for commas and sorted while extracting, which was resource-heavy.
+# Fix: use re to tokenise (so commas are dropped) and collections.Counter to do the counting.
+