|
|
@ -1,61 +1,105 @@
|
|
|
|
'''
|
|
|
|
'''
|
|
|
|
入门级示例,是用来帮助理解其他例子
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
把观察者挂到自己的处理队列上
|
|
|
|
把观察者挂到自己的处理队列上
|
|
|
|
WordSubject 调用所有观察者的 update 方法
|
|
|
|
|
|
|
|
'''
|
|
|
|
'''
|
|
|
|
|
|
|
|
|
|
|
|
import collections
|
|
|
|
import os
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
import threading
|
|
|
|
|
|
|
|
from queue import Queue
|
|
|
|
|
|
|
|
from collections import Counter
|
|
|
|
from abc import ABC, abstractmethod
|
|
|
|
from abc import ABC, abstractmethod
|
|
|
|
from cppy.cp_util import *
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 定义观察者接口
|
|
|
|
# 观察者接口
|
|
|
|
class Observer(ABC):
|
|
|
|
class Observer(ABC):
|
|
|
|
@abstractmethod
|
|
|
|
@abstractmethod
|
|
|
|
def update(self, word):
|
|
|
|
def update(self, word_counts: Counter):
|
|
|
|
pass
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
# 定义具体观察者类,用于统计词频
|
|
|
|
# 具体观察者:打印前 10 高频词
|
|
|
|
class WordFrequencyObserver(Observer):
|
|
|
|
class PrintTopWordsObserver(Observer):
|
|
|
|
def __init__(self):
|
|
|
|
def update(self, word_counts: Counter):
|
|
|
|
self.word_count = collections.Counter()
|
|
|
|
print("Top 10 高频词:")
|
|
|
|
|
|
|
|
for word, count in word_counts.most_common(10):
|
|
|
|
def update(self, word):
|
|
|
|
print(f"{word}: {count}")
|
|
|
|
self.word_count[word] += 1
|
|
|
|
|
|
|
|
|
|
|
|
# 具体观察者:保存词频到文件
|
|
|
|
|
|
|
|
class SaveToFileObserver(Observer):
|
|
|
|
# 定义主题类
|
|
|
|
def __init__(self, output_file):
|
|
|
|
class WordSubject:
|
|
|
|
self.output_file = output_file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def update(self, word_counts: Counter):
    """Save the ten most common words to ``self.output_file``.

    The file is opened in write mode (UTF-8), so any previous content is
    replaced. One ``word: count`` line is written per entry, most frequent
    first. A confirmation message is printed on success.

    Saving is best-effort: any exception raised while writing is caught
    and reported on stdout instead of being propagated to the caller.

    Args:
        word_counts: Counter mapping each word to its number of occurrences.
    """
    try:
        with open(self.output_file, 'w', encoding='utf-8') as out:
            # The generator is consumed lazily, so lines are produced
            # only after the file has been opened.
            out.writelines(
                f"{word}: {count}\n"
                for word, count in word_counts.most_common(10)
            )
        print(f"词频已保存到 {self.output_file}")
    except Exception as err:
        print(f"保存失败: {err}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 词频统计器(主题)
|
|
|
|
|
|
|
|
class WordFrequencyCounter:
|
|
|
|
def __init__(self):
|
|
|
|
def __init__(self):
|
|
|
|
self.observers = []
|
|
|
|
self.observers = []
|
|
|
|
|
|
|
|
self.counter = Counter()
|
|
|
|
|
|
|
|
self.queue = Queue()
|
|
|
|
|
|
|
|
self.lock = threading.Lock()
|
|
|
|
|
|
|
|
|
|
|
|
def attach(self, observer):
|
|
|
|
def add_observer(self, observer: Observer):
|
|
|
|
self.observers.append(observer)
|
|
|
|
self.observers.append(observer)
|
|
|
|
|
|
|
|
|
|
|
|
def notify(self, word):
|
|
|
|
def remove_observer(self, observer: Observer):
|
|
|
|
for observer in self.observers:
|
|
|
|
self.observers.remove(observer)
|
|
|
|
observer.update(word)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 主函数
|
|
|
|
def notify_observers(self):
|
|
|
|
def main(testfilepath, top_n = 10 ):
|
|
|
|
for observer in self.observers:
|
|
|
|
stopwords = get_stopwords()
|
|
|
|
observer.update(self.counter)
|
|
|
|
subject = WordSubject()
|
|
|
|
|
|
|
|
|
|
|
|
def process_file(self):
|
|
|
|
# 创建一个观察者并附加到主题
|
|
|
|
while True:
|
|
|
|
observer = WordFrequencyObserver()
|
|
|
|
try:
|
|
|
|
subject.attach(observer)
|
|
|
|
file_path = self.queue.get_nowait()
|
|
|
|
|
|
|
|
except:
|
|
|
|
# 处理文件
|
|
|
|
break
|
|
|
|
wordlist = re_split( read_file(testfilepath) )
|
|
|
|
try:
|
|
|
|
for word in wordlist:
|
|
|
|
with open(file_path, 'r', encoding='utf-8') as f:
|
|
|
|
if word not in stopwords:
|
|
|
|
text = f.read().lower()
|
|
|
|
subject.notify(word) # 通知
|
|
|
|
words = re.findall(r'\b\w+\b', text)
|
|
|
|
|
|
|
|
with self.lock:
|
|
|
|
# 打印最高的N个词频
|
|
|
|
self.counter.update(words)
|
|
|
|
top_words = observer.word_count.most_common(top_n)
|
|
|
|
except Exception as e:
|
|
|
|
print_word_freqs(top_words)
|
|
|
|
print(f"Error processing {file_path}: {e}")
|
|
|
|
|
|
|
|
finally:
|
|
|
|
|
|
|
|
self.queue.task_done()
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
|
|
|
main( testfilepath )
|
|
|
|
def count_words(self, files, num_threads=4):
    """Process ``files`` with worker threads, then notify the observers.

    Every path in ``files`` is put on ``self.queue``. Worker threads
    running ``self.process_file`` are started and joined, and finally
    ``self.notify_observers()`` is called once.

    Args:
        files: Iterable of file paths to enqueue.
        num_threads: Requested number of worker threads. Values below 1
            are treated as 1 so that queued files are never silently
            skipped; no more workers are started than there are queued
            items.
    """
    # Put the file paths on the work queue.
    for file_path in files:
        self.queue.put(file_path)

    # Size the pool from what is actually waiting in the queue: at least
    # one worker whenever there is work, never more workers than items,
    # and no threads at all when the queue is empty.
    # NOTE(review): qsize() is only exact while nothing else touches the
    # queue concurrently - assumed to hold here since the workers have not
    # been started yet.
    pending = self.queue.qsize()
    worker_count = min(max(num_threads, 1), pending)

    # Create and start the workers, then wait for all of them to finish.
    workers = [
        threading.Thread(target=self.process_file)
        for _ in range(worker_count)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    # Notify all observers once the workers are done.
    self.notify_observers()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main(data_dir='data', output_file="word_frequency.txt"):
    """Count word frequencies over the ``.txt`` files in ``data_dir``.

    Builds a ``WordFrequencyCounter``, registers a ``PrintTopWordsObserver``
    and a ``SaveToFileObserver`` writing to ``output_file``, then calls
    ``count_words`` on the collected file list.

    Args:
        data_dir: Directory whose entries ending in ``.txt`` are processed.
            Defaults to ``'data'``.
        output_file: Path handed to ``SaveToFileObserver``. Defaults to
            ``"word_frequency.txt"``.

    Raises:
        OSError: If ``data_dir`` cannot be listed (raised by ``os.listdir``).
    """
    # Collect the input files. os.listdir returns names in arbitrary
    # order, so sort them to get a reproducible file list.
    files = sorted(
        os.path.join(data_dir, name)
        for name in os.listdir(data_dir)
        if name.endswith('.txt')
    )

    # Create the word-frequency counter (the subject).
    counter = WordFrequencyCounter()

    # Register the observers.
    counter.add_observer(PrintTopWordsObserver())
    counter.add_observer(SaveToFileObserver(output_file))

    # Count the words and notify the observers.
    counter.count_words(files)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
|
|
|
|
main()
|