You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

77 lines
3.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import os,re,string,operator
from collections import Counter
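# TextProcessor processes a text file and computes word frequencies. When processing is finished, it notifies every registered observer through the subject's notify method.
# WordFrequencyObserver is a concrete observer: its update method receives the new word frequencies and prints the most frequent words (top 10 by default).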
class Subject:
    """Maintain a list of observers and broadcast word-frequency updates to them."""

    def __init__(self):
        # Non-public on purpose: the list should only change through
        # attach() / detach().
        self._observers = []

    def attach(self, observer):
        """Register *observer* so that later notify() calls reach it."""
        self._observers.append(observer)

    def detach(self, observer):
        """Unregister *observer*.

        Raises:
            ValueError: if *observer* is not currently attached.
        """
        self._observers.remove(observer)

    def notify(self, word_freqs):
        """Call ``update(word_freqs)`` on every attached observer, in attach order."""
        # Iterate over a snapshot: an observer may attach or detach (including
        # detaching itself) from inside update(), and mutating the live list
        # while iterating it would silently skip the next observer.
        for observer in tuple(self._observers):
            observer.update(word_freqs)
# Subscribe, unsubscribe, and notify on updates: the Subject class maintains the list of subscribers (i.e. observers).
class Observer:
    """Base observer: receives word-frequency updates and ignores them."""

    def update(self, word_freqs):
        """Handle a new *word_freqs* mapping; the base implementation does nothing."""
        return None
# Observer (above) defines the abstract observer interface.
# The class below is a concrete Observer.
class WordFrequencyObserver(Observer):
    """Concrete observer that prints the most frequent words on every update."""

    # Class-level default so instances created without __init__ still work.
    top_n = 10

    def __init__(self, top_n=10):
        """Create the observer.

        Args:
            top_n: how many of the most frequent words to print per update.

        Raises:
            ValueError: if *top_n* is negative.
        """
        super().__init__()
        if top_n < 0:
            raise ValueError(f"top_n must be non-negative, got {top_n}")
        self.top_n = top_n

    def update(self, word_freqs):
        """Print a header line followed by the top entries of *word_freqs*."""
        print("词频已经被更新:")
        self.print_word_freqs(word_freqs)

    def print_word_freqs(self, word_freqs):
        """Print up to ``self.top_n`` ``word: count`` lines, highest count first.

        Args:
            word_freqs: mapping of word -> count.
        """
        # sorted() is stable even with reverse=True, so words with equal
        # counts keep the iteration order of the mapping.
        ranked = sorted(word_freqs.items(), key=operator.itemgetter(1), reverse=True)
        for word, count in ranked[:self.top_n]:
            print(f"{word}: {count}")
# Text analysis.
class TextProcessor:
    """Read a text file, count its word frequencies and publish them to a subject."""

    # Runs of non-word characters and underscores act as word separators.
    # Raw string: "\W" in a normal string literal is an invalid escape sequence.
    # Compiled once and shared by every instance.
    _SEPARATORS = re.compile(r'[\W_]+')

    def __init__(self, subject: "Subject"):
        """Create a processor that reports its results through *subject*.

        Args:
            subject: object whose ``notify(word_freqs)`` is called after each
                processed text.
        """
        self._subject = subject
        # Lower-cased stop words; empty until load_stop_words() is called.
        self._stop_words: set[str] = set()

    def load_stop_words(self, path_to_file):
        """Replace the stop-word set with the words found in *path_to_file*.

        The file is read as UTF-8. Words may be listed one per line and/or
        separated by commas; entries are stripped and lower-cased, and empty
        entries are ignored.
        """
        stop_words = set()
        with open(path_to_file, encoding='utf-8') as f:
            for line in f:
                # Splitting on commas also supports single-line, comma-separated
                # stop-word files; a line without commas yields one entry.
                for entry in line.split(','):
                    word = entry.strip().lower()
                    if word:
                        stop_words.add(word)
        self._stop_words = stop_words

    def process_text(self, path_to_file):
        """Count the words of the UTF-8 text file and notify the subject.

        The subject's ``notify`` receives a ``Counter`` mapping each kept word
        to its number of occurrences.
        """
        with open(path_to_file, encoding='utf-8') as f:
            data = f.read()
        word_list = self.re_split(data)
        filtered_words = self.filter_words(word_list)
        word_freqs = self.count_frequencies(filtered_words)
        self._subject.notify(word_freqs)

    def re_split(self, data):
        """Return the lower-cased words of *data*, split on non-word characters and ``_``."""
        return self._SEPARATORS.sub(' ', data).lower().split()

    def filter_words(self, word_list):
        """Return the words that are not stop words and have at least 3 characters."""
        return [w for w in word_list if w not in self._stop_words and len(w) >= 3]

    def count_frequencies(self, word_list):
        """Return a ``Counter`` of the occurrences of each word in *word_list*."""
        return Counter(word_list)
# Demo / manual test run.
if __name__ == "__main__":
    import sys

    # Usage: script.py [stop_words_file [text_file]]
    # The defaults below are the original author's local files; pass paths on
    # the command line to run against other files.
    stopwordfilepath = r'C:\Users\asus\Desktop\cppy余悦批注\cppy\data\stop_words.txt'
    testfilepath = r'C:\Users\asus\Desktop\cppy余悦批注\cppy\data\pride-and-prejudice.txt'
    if len(sys.argv) > 1:
        stopwordfilepath = sys.argv[1]
    if len(sys.argv) > 2:
        testfilepath = sys.argv[2]

    # Wire one printing observer to the subject, then run the processor;
    # process_text() triggers the notification that prints the result.
    subject = Subject()
    observer = WordFrequencyObserver()
    subject.attach(observer)
    text_processor = TextProcessor(subject)
    text_processor.load_stop_words(stopwordfilepath)
    text_processor.process_text(testfilepath)