|
|
|
|
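'''
Each downstream component is attached to the successor chain of the component before it.
Only the abstract relationship expressed by self.next_observer is provided.
A downstream component receives instructions and data and decides for itself what to do.

In theory, each component can take part in several production queues.

This example wraps messages in classes, which in theory allows richer extension than plain strings.
This is an illustrative prototype; it needs adjusting for any concrete environment.
'''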
|
|
|
|
|
|
|
|
|
|
from collections import Counter
|
|
|
|
|
from typing import List, Dict
|
|
|
|
|
from cppy.cp_util import *
|
|
|
|
|
|
|
|
|
|
# Message types
|
|
|
|
|
class Message:
    """Base envelope used to hand a payload from one pipeline stage to the next.

    Attributes:
        data: The payload exactly as it was passed to the constructor.
    """

    def __init__(self, data):
        """Store ``data`` unchanged on the instance."""
        self.data = data
|
|
|
|
|
|
|
|
|
|
class TokenizedText(Message):
    """Message subtype marking a payload as tokenizer output.

    Adds no behaviour of its own; receivers use the type to decide whether
    the message is meant for them.
    """
|
|
|
|
|
|
|
|
|
|
class FilteredText(Message):
    """Message subtype marking a payload as stop-word-filtered output.

    Adds no behaviour of its own; receivers use the type to decide whether
    the message is meant for them.
    """
|
|
|
|
|
|
|
|
|
|
class WordFrequency(Message):
    """Message subtype marking a payload as word-frequency counts.

    Adds no behaviour of its own; receivers use the type to decide whether
    the message is meant for them.
    """
|
|
|
|
|
|
|
|
|
|
# Observer interface
|
|
|
|
|
class Observer:
    """Interface for pipeline stages that can be handed a message."""

    def notify(self, message: Message):
        """Receive ``message``.

        The base implementation does nothing and returns ``None``;
        subclasses override it to act on the messages they care about.
        """
|
|
|
|
|
|
|
|
|
|
# Tokenizer subscriber
|
|
|
|
|
class TokenizerSubscriber(Observer):
    """Pipeline stage that tokenizes string payloads and forwards the result.

    Attributes:
        next_observer: The stage that receives the ``TokenizedText`` message.
    """

    def __init__(self, next_observer: Observer):
        """Remember the stage that should receive the tokenized output."""
        self.next_observer = next_observer

    def notify(self, message: Message):
        """Tokenize ``message.data`` and pass it on as ``TokenizedText``.

        Messages whose payload is not a ``str`` are ignored. Note that the
        check is on the payload, not on the message class.
        """
        payload = message.data
        if isinstance(payload, str):
            # re_split is not defined in this file; presumably supplied by
            # the cppy.cp_util star import -- its splitting rules are not
            # visible here.
            tokens = re_split(payload)
            self.next_observer.notify(TokenizedText(tokens))
|
|
|
|
|
|
|
|
|
|
# Stop-word removal subscriber
|
|
|
|
|
class StopWordsRemoverSubscriber(Observer):
    """Pipeline stage that drops stop words and very short words.

    Attributes:
        next_observer: The stage that receives the ``FilteredText`` message.
        stop_words: Set built from the stop-word list given at construction.
    """

    def __init__(self, next_observer: Observer, stop_words: List[str]):
        """Store the successor stage and build the stop-word lookup set."""
        self.next_observer = next_observer
        self.stop_words = set(stop_words)

    def notify(self, message: Message):
        """Filter a ``TokenizedText`` payload and forward it as ``FilteredText``.

        Any other message type is ignored. A word is kept only when it is
        not in ``self.stop_words`` and is longer than two characters.
        """
        if isinstance(message, TokenizedText):
            kept = []
            for word in message.data:
                # Membership is tested before length, as in the original.
                if word in self.stop_words:
                    continue
                if len(word) > 2:
                    kept.append(word)
            self.next_observer.notify(FilteredText(kept))
|
|
|
|
|
|
|
|
|
|
# Word-frequency counting subscriber
|
|
|
|
|
class WordFrequencyCalculatorSubscriber(Observer):
    """Pipeline stage that counts word occurrences and forwards the counts.

    Attributes:
        next_observer: The stage that receives the ``WordFrequency`` message.
    """

    def __init__(self, next_observer: Observer):
        """Remember the stage that should receive the frequency counts."""
        self.next_observer = next_observer

    def notify(self, message: Message):
        """Count the items of a ``FilteredText`` payload with ``Counter``.

        Any other message type is ignored. The resulting ``Counter`` is
        wrapped in a ``WordFrequency`` message and sent to the next stage.
        """
        if isinstance(message, FilteredText):
            counts = Counter(message.data)
            self.next_observer.notify(WordFrequency(counts))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Subscriber that outputs the top N words
|
|
|
|
|
class TopNWordsDisplaySubscriber(Observer):
    """Terminal pipeline stage that outputs the most common words.

    Attributes:
        n: How many entries to request from ``most_common``.
    """

    def __init__(self, n: int):
        """Store the number of top entries to display."""
        self.n = n

    def notify(self, message: Message):
        """Hand the top ``self.n`` entries of a ``WordFrequency`` payload to the printer.

        Any other message type is ignored.
        """
        if isinstance(message, WordFrequency):
            top_entries = message.data.most_common(self.n)
            # print_word_freqs is not defined in this file; presumably it
            # comes from the cppy.cp_util star import and prints the pairs.
            print_word_freqs(top_entries)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Simulated publisher
|
|
|
|
|
def publish_text(text: str, observers: List[Observer]):
    """Send ``text`` to every observer, each wrapped in its own ``Message``.

    Args:
        text: The raw text to publish.
        observers: Entry-point stages to notify, in iteration order.
    """
    for subscriber in observers:
        # A fresh envelope per subscriber, so no Message instance is shared.
        envelope = Message(text)
        subscriber.notify(envelope)
|
|
|
|
|
|
|
|
|
|
# Main function
|
|
|
|
|
def main():
    """Load the inputs, wire up the subscriber chain and publish the text.

    ``read_file`` and ``get_stopwords`` are not defined in this file;
    presumably they come from the cppy.cp_util star import.
    """
    text = read_file()
    stop_words = get_stopwords()

    # Build the subscriber chain. The innermost call is constructed first,
    # so the display stage exists before the stages that forward to it.
    pipeline_head = TokenizerSubscriber(
        StopWordsRemoverSubscriber(
            WordFrequencyCalculatorSubscriber(
                TopNWordsDisplaySubscriber(n=10)
            ),
            stop_words,
        )
    )

    # Publish the text to the head of the chain.
    publish_text(text, [pipeline_head])
|
|
|
|
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|