You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

77 lines
3.0 KiB

9 months ago
import os,re,string,operator
from collections import Counter
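# TextProcessor processes a text file and counts word frequencies. When processing finishes, it notifies all registered observers through the subject's notify method.
# WordFrequencyObserver is a concrete observer: it implements update to receive word-frequency updates and prints the 10 most common words.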
class Subject:
    """Keep a list of observers and broadcast word-frequency updates to them."""

    def __init__(self):
        # Private on purpose: the list should only change through
        # attach() / detach().
        self._observers = []

    def attach(self, observer):
        """Register *observer* so that notify() calls its ``update`` method.

        Attaching the same observer twice registers it twice, so it is then
        notified twice per update.
        """
        self._observers.append(observer)

    def detach(self, observer):
        """Remove one registration of *observer*.

        Raises:
            ValueError: if *observer* is not currently attached.
        """
        self._observers.remove(observer)

    def notify(self, word_freqs):
        """Call ``update(word_freqs)`` on every attached observer, in order.

        The loop runs over a snapshot of the observer list. Without the
        snapshot, an observer that detaches itself (or attaches another
        observer) from inside ``update`` would mutate the list being iterated
        and cause a neighbouring observer to be skipped. Observers that are
        detached while a notification is in progress still receive that
        notification; they are left out from the next one onwards.
        """
        for observer in tuple(self._observers):
            observer.update(word_freqs)
# Follow, unfollow, announce updates: the Subject class maintains the list of subscribers (i.e. observers).
class Observer:
    """Base observer interface; concrete observers override update()."""

    def update(self, word_freqs):
        """Receive a word-frequency update.

        The base implementation ignores *word_freqs* and returns None.
        """
        return None
# Observer above defines the abstract observer.
# The class below is a concrete Observer.
class WordFrequencyObserver(Observer):
    """Concrete observer that prints the most frequent words on each update."""

    # Number of entries printed when the caller does not pass ``top_n``.
    DEFAULT_TOP_N = 10

    def update(self, word_freqs):
        """Print a header line followed by the most frequent words."""
        print("词频已经被更新:")
        self.print_word_freqs(word_freqs)

    def print_word_freqs(self, word_freqs, top_n=None):
        """Print ``word: count`` lines, highest count first.

        Args:
            word_freqs: mapping of word -> count (anything with ``.items()``).
            top_n: maximum number of lines to print; ``None`` (the default)
                uses ``DEFAULT_TOP_N``.

        Raises:
            ValueError: if *top_n* is negative. A negative slice bound would
                otherwise silently drop entries from the end instead of
                limiting the output.
        """
        limit = self.DEFAULT_TOP_N if top_n is None else top_n
        if limit < 0:
            raise ValueError(f"top_n must be non-negative, got {limit}")
        # sorted() is stable, so words with equal counts keep the order in
        # which they appear in word_freqs.
        sorted_freqs = sorted(word_freqs.items(), key=operator.itemgetter(1), reverse=True)
        for w, c in sorted_freqs[:limit]:
            print(f"{w}: {c}")
# Text analysis
class TextProcessor:
    """Read a text file, count word frequencies and publish them via a subject."""

    # Runs of non-word characters and underscores separate words. Written as a
    # raw string so that ``\W`` reaches the regex engine instead of being an
    # invalid string escape, and compiled once rather than on every call.
    _WORD_SEPARATORS = re.compile(r'[\W_]+')

    # Words shorter than this are discarded by filter_words().
    _MIN_WORD_LENGTH = 3

    def __init__(self, subject: "Subject"):
        # The annotation is a forward reference: it documents the expected
        # type without requiring Subject to be defined before this class.
        # Only ``subject.notify(word_freqs)`` is ever called on it.
        self._subject = subject
        # Lower-cased stop words; empty until load_stop_words() is called.
        self._stop_words: set[str] = set()

    def load_stop_words(self, path_to_file):
        """Load stop words from a UTF-8 text file, replacing any loaded before.

        Entries may be separated by newlines and/or commas. Each entry is
        stripped of surrounding whitespace and lower-cased; empty entries
        (blank lines, trailing commas) are ignored.
        """
        stop_words = set()
        with open(path_to_file, encoding='utf-8') as f:
            for line in f:
                # Splitting on commas lets a single comma-separated line act
                # as a list of words instead of one entry that never matches.
                for entry in line.split(','):
                    word = entry.strip().lower()
                    if word:
                        stop_words.add(word)
        self._stop_words = stop_words

    def process_text(self, path_to_file):
        """Count the words of a UTF-8 text file and notify the subject.

        The whole file is read into memory, tokenised, filtered against the
        stop words, counted, and the resulting Counter is passed to
        ``self._subject.notify``.
        """
        with open(path_to_file, encoding='utf-8') as f:
            data = f.read()
        word_list = self.re_split(data)
        filtered_words = self.filter_words(word_list)
        word_freqs = self.count_frequencies(filtered_words)
        self._subject.notify(word_freqs)

    def re_split(self, data):
        """Return the lower-cased words of *data* as a list.

        Every run of non-word characters or underscores is treated as a
        single separator.
        """
        return self._WORD_SEPARATORS.sub(' ', data).lower().split()

    def filter_words(self, word_list):
        """Return the words that are not stop words and have at least 3 characters."""
        return [
            w for w in word_list
            if w not in self._stop_words and len(w) >= self._MIN_WORD_LENGTH
        ]

    def count_frequencies(self, word_list):
        """Return a Counter mapping each word in *word_list* to its count."""
        return Counter(word_list)
# Test / demo run
if __name__ == "__main__":
    import sys

    # Usage: script.py [text_file [stop_words_file]]
    # The defaults below are the original hard-coded paths; they only exist on
    # the original author's machine, so both can be overridden from the
    # command line.
    default_stop_words_path = r'C:\Users\asus\Desktop\cppy余悦批注\cppy\data\stop_words.txt'
    default_text_path = r'C:\Users\asus\Desktop\cppy余悦批注\cppy\data\pride-and-prejudice.txt'
    cli_args = sys.argv[1:]
    testfilepath = cli_args[0] if len(cli_args) >= 1 else default_text_path
    stopwordfilepath = cli_args[1] if len(cli_args) >= 2 else default_stop_words_path

    # Wire one printing observer to a subject, then run the processor, which
    # publishes its word counts through that subject.
    subject = Subject()
    observer = WordFrequencyObserver()
    subject.attach(observer)
    text_processor = TextProcessor(subject)
    text_processor.load_stop_words(stopwordfilepath)
    text_processor.process_text(testfilepath)