From c22c921cf83f2b33eb2b93c33606a0eebc0699be Mon Sep 17 00:00:00 2001
From: p26zockiw <1285381170@qq.com>
Date: Sun, 17 Mar 2024 20:53:58 +0800
Subject: [PATCH 1/2] =?UTF-8?q?=E8=A7=82=E5=AF=9F=E8=80=85=E6=A8=A1?=
 =?UTF-8?q?=E5=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 基本结构/观察者模式/Observer.py | 113 ++++++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 基本结构/观察者模式/Observer.py

diff --git a/基本结构/观察者模式/Observer.py b/基本结构/观察者模式/Observer.py
new file mode 100644
index 0000000..7d73002
--- /dev/null
+++ b/基本结构/观察者模式/Observer.py
@@ -0,0 +1,113 @@
+"""Observer-pattern word-frequency demo.
+
+TextProcessor reads a text file, counts its words and hands the counts to a
+Subject, which forwards them to every attached Observer.
+"""
+import operator
+import os
+import re
+import string
+import sys
+from collections import Counter
+
+
+class Subject:
+    """Keep the list of subscribed observers and broadcast updates to them."""
+
+    def __init__(self):
+        # Private by convention: callers go through attach()/detach().
+        self._observers = []
+
+    def attach(self, observer):
+        """Subscribe observer to future notifications."""
+        self._observers.append(observer)
+
+    def detach(self, observer):
+        """Unsubscribe observer; list.remove raises ValueError if absent."""
+        self._observers.remove(observer)
+
+    def notify(self, word_freqs):
+        """Call update(word_freqs) on every attached observer."""
+        # Iterate over a snapshot so an observer that detaches itself inside
+        # update() does not make the loop skip the next observer.
+        for observer in list(self._observers):
+            observer.update(word_freqs)
+
+
+class Observer:
+    """Base observer; concrete observers override update()."""
+
+    def update(self, word_freqs):
+        """Receive the latest word frequencies (no-op in the base class)."""
+
+
+class WordFrequencyObserver(Observer):
+    """Concrete observer that prints the ten most frequent words."""
+
+    def update(self, word_freqs):
+        """Announce the update, then print the top of word_freqs."""
+        print("词频已经被更新:")
+        self.print_word_freqs(word_freqs)
+
+    def print_word_freqs(self, word_freqs):
+        """Print the ten highest counts in word_freqs as 'word: count' lines."""
+        sorted_freqs = sorted(word_freqs.items(), key=operator.itemgetter(1), reverse=True)
+        for w, c in sorted_freqs[:10]:
+            print(f"{w}: {c}")
+
+
+class TextProcessor:
+    """Tokenise a text file, drop stop words and publish the word counts."""
+
+    # Runs of non-word characters and underscores separate words.  The
+    # pattern is a raw string because "\W" in a plain literal is an invalid
+    # escape sequence that newer Python versions warn about.
+    _SEPARATORS = re.compile(r'[\W_]+')
+
+    def __init__(self, subject: Subject):
+        self._subject = subject
+        # A set gives constant-time membership tests in filter_words().
+        self._stop_words: set[str] = set()
+
+    def load_stop_words(self, path_to_file):
+        """Load stop words from path_to_file, one lower-cased word per line."""
+        with open(path_to_file, encoding='utf-8') as f:
+            self._stop_words = {line.strip().lower() for line in f}
+
+    def process_text(self, path_to_file):
+        """Count the words of path_to_file and notify the subject."""
+        with open(path_to_file, encoding='utf-8') as f:
+            data = f.read()
+        word_list = self.re_split(data)
+        filtered_words = self.filter_words(word_list)
+        word_freqs = self.count_frequencies(filtered_words)
+        self._subject.notify(word_freqs)
+
+    def re_split(self, data):
+        """Return the lower-cased words of data as a list."""
+        return self._SEPARATORS.sub(' ', data).lower().split()
+
+    def filter_words(self, word_list):
+        """Keep words of at least three characters that are not stop words."""
+        return [w for w in word_list if w not in self._stop_words and len(w) >= 3]
+
+    def count_frequencies(self, word_list):
+        """Return a Counter mapping each word to its number of occurrences."""
+        return Counter(word_list)
+
+
+if __name__ == "__main__":
+    # Defaults are the original author's local files; pass
+    # "<stop-words file> <text file>" on the command line to override them.
+    stopwordfilepath = r'C:\Users\asus\Desktop\cppy余悦批注\cppy\data\stop_words.txt'
+    testfilepath = r'C:\Users\asus\Desktop\cppy余悦批注\cppy\data\pride-and-prejudice.txt'
+    if len(sys.argv) == 3:
+        stopwordfilepath, testfilepath = sys.argv[1:]
+
+    subject = Subject()
+    observer = WordFrequencyObserver()
+    subject.attach(observer)
+
+    text_processor = TextProcessor(subject)
+    text_processor.load_stop_words(stopwordfilepath)
+    text_processor.process_text(testfilepath)
From a66617dcce53e610a9411fb7f7b8e09be9b51168 Mon Sep 17 00:00:00 2001
From: p26zockiw <1285381170@qq.com>
Date: Thu, 21 Mar 2024 17:25:18 +0800
Subject: [PATCH 2/2] ADD file via upload

---
 一盘大棋/A01修改.py | 35 +++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 一盘大棋/A01修改.py

diff --git a/一盘大棋/A01修改.py b/一盘大棋/A01修改.py
new file mode 100644
index 0000000..d19a369
--- /dev/null
+++ b/一盘大棋/A01修改.py
@@ -0,0 +1,35 @@
+"""Print the ten most frequent words of the test text, ignoring stop words."""
+import re
+from collections import Counter
+import string
+from cppy.cp_util import stopwordfilepath,testfilepath
+
+# Load the comma-separated stop words into a set for fast membership tests.
+stop_words = set()
+with open(stopwordfilepath, encoding='utf-8') as f:
+    for line in f:
+        stop_words.update(word.strip().lower() for word in line.split(','))
+
+# Single lower-case letters are stop words too.  The regex below already
+# drops one-character tokens, so this only keeps stop_words itself complete.
+stop_words.update(string.ascii_lowercase)
+
+# Read the text once, then tokenise after the file has been closed.
+with open(testfilepath, encoding='utf-8') as f:
+    text = f.read().lower()
+
+# \w{2,} keeps only tokens of two or more characters and discards
+# punctuation, so no separate length check is needed when counting.
+words = re.findall(r'\b\w{2,}\b', text)
+word_freqs = Counter(word for word in words if word not in stop_words)
+
+# Take the ten most frequent words.
+most_common_words = word_freqs.most_common(10)
+
+# Print the result.
+for word, freq in most_common_words:
+    print(f'{word} - {freq}')
+
+# What changed relative to A01: A01 did not strip commas from the input and
+# sorted while it was still extracting words, which was wasteful.  This
+# version tokenises with re (which drops the commas) and counts with Counter.