zj3D 4 weeks ago
parent 068c0f9bb8
commit 7d03c9e5f0

@@ -1,34 +0,0 @@
from collections import Counter
import cppy.cp_util as util

def word_frequency(top_n=10):
    def decorator(func):
        def wrapper(*args, **kwargs):
            # Run the decorated function
            result = func(*args, **kwargs)
            # Initialize the word-frequency counter
            word_counts = Counter()
            # Tokenize and count
            for word in util.extract_str_words(result):
                word_counts[word] += 1
            # Print the top_n most frequent words
            most_common = word_counts.most_common(top_n)
            util.print_word_freqs(most_common)
            return result
        return wrapper
    return decorator

# Apply the decorator
@word_frequency(top_n=10)
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

read_file(util.testfilepath)

@@ -0,0 +1,64 @@
'''
Template Method Pattern

Use case: define the overall skeleton of the word-frequency workflow while
letting subclasses customize individual steps, such as tokenization or the
output format.

How it is applied:
- Define an abstract class WordFrequencyAnalyzer with a fixed pipeline:
  read files -> tokenize and count -> print results.
- Concrete steps (tokenization, output, ...) are implemented by subclasses.

Advantage: the pipeline stays consistent; subclasses only care about the
concrete implementation (a sketch of a second subclass follows the code below).

Key points:
- Using inheritance and abstract classes.
- Fixing the overall workflow while letting subclasses implement the steps
  flexibly, i.e. separating the process from the implementation.
'''
from abc import ABC, abstractmethod
from typing import List, Dict
from collections import Counter

class WordFrequencyAnalyzer(ABC):
    # Template method: the fixed pipeline
    def analyze(self, directory: str, top_n: int = 10):
        texts = self.read_texts(directory)
        word_counts = self.count_words(texts)
        self.print_results(word_counts, top_n)

    @abstractmethod
    def read_texts(self, directory: str) -> List[str]:
        pass

    @abstractmethod
    def count_words(self, texts: List[str]) -> Dict[str, int]:
        pass

    @abstractmethod
    def print_results(self, word_counts: Dict[str, int], top_n: int):
        pass

class SimpleAnalyzer(WordFrequencyAnalyzer):
    def read_texts(self, directory: str) -> List[str]:
        import os
        texts = []
        for filename in os.listdir(directory):
            if filename.endswith(".txt"):
                with open(os.path.join(directory, filename), "r", encoding="utf-8") as f:
                    texts.append(f.read())
        return texts

    def count_words(self, texts: List[str]) -> Dict[str, int]:
        import jieba
        word_counts = Counter()
        for text in texts:
            words = jieba.cut(text)
            word_counts.update([word for word in words if word.strip()])
        return dict(word_counts)

    def print_results(self, word_counts: Dict[str, int], top_n: int):
        top_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:top_n]
        for word, count in top_words:
            print(f"{word}: {count}")

# Usage
analyzer = SimpleAnalyzer()
analyzer.analyze("data", 10)
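
# A minimal sketch (not part of the original file) of how a second subclass could
# customize individual steps while reusing the same analyze() skeleton.
# EnglishAnalyzer and its whitespace tokenization are hypothetical examples,
# assuming the WordFrequencyAnalyzer / SimpleAnalyzer classes defined above.
class EnglishAnalyzer(SimpleAnalyzer):
    def count_words(self, texts: List[str]) -> Dict[str, int]:
        # Split on whitespace instead of using jieba
        word_counts = Counter()
        for text in texts:
            word_counts.update(word for word in text.lower().split() if word.strip())
        return dict(word_counts)

    def print_results(self, word_counts: Dict[str, int], top_n: int):
        # Report each count together with its share of the total
        total = sum(word_counts.values()) or 1
        top_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:top_n]
        for word, count in top_words:
            print(f"{word}: {count} ({count / total:.1%})")

# Hypothetical usage: EnglishAnalyzer().analyze("data", 10)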

@@ -0,0 +1,107 @@
# -*- coding: utf-8 -*-
'''
Design notes

Strategy pattern: the Tokenizer interface allows different tokenization
strategies (e.g. Chinese vs. English tokenization).
Factory pattern: TokenizerFactory creates the tokenizer and hides the concrete
implementation.
Decoupling: data reading (DataReader), word counting (WordCounter) and result
output (ResultPrinter) are separated and independent of each other.
Extensibility: new tokenization strategies or output formats can be added.

Decoupling
- DataReader only reads files; it knows nothing about tokenization or counting.
- WordCounter depends on the abstract Tokenizer, not on a concrete tokenizer.
- ResultPrinter only handles output formatting and can be extended
  (e.g. writing to a file).

Strategy pattern
- The Tokenizer interface makes it easy to switch strategies
  (e.g. adding an English tokenizer).
- ChineseTokenizer is one concrete strategy and is easy to replace.

Factory pattern
- TokenizerFactory encapsulates the creation logic; callers do not need to know
  the concrete class.

Simplicity: the structure is clear and every module has a single responsibility.
Extensibility:
- New tokenizers can be added (e.g. an EnglishTokenizer).
- ResultPrinter can be extended to support other output formats.

Exercise: try adding an English tokenizer or a new output format such as CSV
(a sketch of one possible extension follows the code below).
'''
import os
import jieba
from collections import Counter
from abc import ABC, abstractmethod
from typing import List, Dict

# Strategy interface: tokenizer
class Tokenizer(ABC):
    @abstractmethod
    def tokenize(self, text: str) -> List[str]:
        pass

# Concrete strategy: Chinese tokenization (using jieba)
class ChineseTokenizer(Tokenizer):
    def tokenize(self, text: str) -> List[str]:
        return [word for word in jieba.cut(text) if word.strip()]

# Factory: creates the tokenizer
class TokenizerFactory:
    @staticmethod
    def create_tokenizer(language: str = "chinese") -> Tokenizer:
        if language == "chinese":
            return ChineseTokenizer()
        raise ValueError(f"Unsupported language: {language}")

# Data reading module
class DataReader:
    def __init__(self, directory: str):
        self.directory = directory

    def read_texts(self) -> List[str]:
        texts = []
        for filename in os.listdir(self.directory):
            if filename.endswith(".txt"):
                with open(os.path.join(self.directory, filename), "r", encoding="utf-8") as f:
                    texts.append(f.read())
        return texts

# Word counting module
class WordCounter:
    def __init__(self, tokenizer: Tokenizer):
        self.tokenizer = tokenizer

    def count_words(self, texts: List[str]) -> Dict[str, int]:
        word_counts = Counter()
        for text in texts:
            words = self.tokenizer.tokenize(text)
            word_counts.update(words)
        return dict(word_counts)

# Result output module
class ResultPrinter:
    @staticmethod
    def print_top_words(word_counts: Dict[str, int], top_n: int = 10):
        top_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:top_n]
        for word, count in top_words:
            print(f"{word}: {count}")

# Main program
def main():
    # Configuration
    data_dir = "data"
    top_n = 10

    # Create the components
    reader = DataReader(data_dir)
    tokenizer = TokenizerFactory.create_tokenizer("chinese")
    counter = WordCounter(tokenizer)
    printer = ResultPrinter()

    # Run the pipeline
    texts = reader.read_texts()
    word_counts = counter.count_words(texts)
    printer.print_top_words(word_counts, top_n)

if __name__ == "__main__":
    main()
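
# A possible sketch of the exercise mentioned in the design notes above: adding an
# English tokenizer as a new strategy. EnglishTokenizer is hypothetical and not part
# of the original file; in a real module it would sit above the __main__ block, and
# TokenizerFactory would need the extra branch (or a registration mechanism) shown
# in the comment below to hand it out.
class EnglishTokenizer(Tokenizer):
    def tokenize(self, text: str) -> List[str]:
        # Naive whitespace tokenization, enough to illustrate swapping strategies
        return [word for word in text.lower().split() if word.strip()]

# In TokenizerFactory.create_tokenizer one would add, for example:
#     if language == "english":
#         return EnglishTokenizer()
# and the rest of the pipeline stays unchanged:
#     counter = WordCounter(TokenizerFactory.create_tokenizer("english"))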

@@ -1,78 +0,0 @@
'''
Main idea: expose the key actions as services that can be registered.
Each component registers its handlers with an event manager, and the
event manager executes the registered handlers in order
(a sketch of adding another component follows the code below).
'''
from collections import defaultdict
from cppy.cp_util import *

#
# event_manager
#
class EventManager:
    def __init__(self):
        self.load_handlers = []     # handlers that load the file
        self.process_handlers = []  # handlers that process the data
        self.end_handlers = []      # handlers that finish the run

    def register_load_event(self, handler):
        self.load_handlers.append(handler)

    def register_process_event(self, handler):
        self.process_handlers.append(handler)

    def register_end_event(self, handler):
        self.end_handlers.append(handler)

    # Run the framework: execute the registered handlers in order
    def run(self, file_path):
        for handler in self.load_handlers: handler(file_path)
        for handler in self.process_handlers: handler()
        for handler in self.end_handlers: handler()

#
# Components
#
# Data storage class that loads the file content and feeds it to word handlers
class TextData:
    _word_event_handlers = []

    def __init__(self, event_manager):
        self._stop_words = get_stopwords()
        event_manager.register_load_event(self.__load)
        event_manager.register_process_event(self.__process_words)

    def __load(self, path_to_file):
        self._data = re_split(read_file(path_to_file))

    def __process_words(self):
        for word in self._data:
            if word not in self._stop_words:
                for handler in self._word_event_handlers:
                    handler(word)

    def register_word_event(self, handler):
        self._word_event_handlers.append(handler)

class WordFrequencyCounter:
    def __init__(self, event_manager, data_storage):
        self._word_freqs = defaultdict(int)                       # word -> frequency
        data_storage.register_word_event(self.__increment_count)  # subscribe to word events
        event_manager.register_end_event(self.__print_freqs)      # subscribe to the end event

    def __increment_count(self, word):
        self._word_freqs[word] += 1

    def __print_freqs(self):
        print_word_freqs(sort_dict(self._word_freqs))

if __name__ == '__main__':
    em = EventManager()
    data_storage = TextData(em)
    word_freq_counter = WordFrequencyCounter(em, data_storage)
    em.run(testfilepath)
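
# A minimal sketch (not in the original file) of how another component could plug
# into the same EventManager without touching the run() loop: it just registers
# its own handlers. TotalWordCount is a hypothetical example; in a real file it
# would be defined above the __main__ block and wired up before em.run().
class TotalWordCount:
    def __init__(self, event_manager, data_storage):
        self._total = 0
        data_storage.register_word_event(self.__count)   # count every non-stop word
        event_manager.register_end_event(self.__report)  # report when the run ends

    def __count(self, word):
        self._total += 1

    def __report(self):
        print(f"total words (excluding stop words): {self._total}")

# Wiring it up, e.g.:
#     total = TotalWordCount(em, data_storage)
#     em.run(testfilepath)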

@@ -1,105 +0,0 @@
from cppy.cp_util import *

'''
Subscriber = registrant = observer.
A variant of registering callbacks.

The key point is centralization: to tame the complexity that message
subscription can create, a central message manager takes over subscribing
and dispatching. Each component just implements its own functionality and
subscribes on the central manager, attaching its handler to the messages
it responds to.

Summary of the changes compared with the previous version:
- Registration carries a type field that identifies the different messages.
- The other entities no longer register callbacks on each other; both
  registration and dispatch are unified in one central unit.

This is an illustrative prototype; a real distributed setting would need
further adjustments (a sketch of adding one more subscriber follows the
code below).
'''
from collections import defaultdict

#################################################
# Event Manager
#################################################
class EventManager:
    def __init__(self):
        self._subs = defaultdict(list)

    def subscribe(self, event_type, handler):
        self._subs[event_type].append(handler)

    def publish(self, event):
        event_type = event[0]
        for handler in self._subs.get(event_type, []):
            handler(event)

#################################################
# Application Entities
#################################################
class DataStorage:
    def __init__(self, event_manager):
        self._event_manager = event_manager
        self._event_manager.subscribe('load', self._load)
        self._event_manager.subscribe('start', self.produce_words)

    def _load(self, event):
        self._data = extract_file_words(event[1])

    def produce_words(self, _):
        for word in self._data:
            self._event_manager.publish(('word', word))
        self._event_manager.publish(('eof', None))

class StopWordFilter:
    def __init__(self, event_manager):
        self._event_manager = event_manager
        self._event_manager.subscribe('load', self.load_stop_words)
        self._event_manager.subscribe('word', self.filter_word)
        self._stop_words = set()

    def load_stop_words(self, _):
        self._stop_words = set(get_stopwords())

    def filter_word(self, event):
        word = event[1]
        if word not in self._stop_words:
            self._event_manager.publish(('valid_word', word))

class WordFrequencyCounter:
    def __init__(self, event_manager):
        self._event_manager = event_manager
        self._event_manager.subscribe('valid_word', self.count_word)
        self._event_manager.subscribe('print', self.print_freqs)
        self._word_freqs = {}

    def count_word(self, event):
        word = event[1]
        self._word_freqs[word] = self._word_freqs.get(word, 0) + 1

    def print_freqs(self, _):
        print_word_freqs(sort_dict(self._word_freqs))

class WordFrequencyApp:
    def __init__(self, event_manager):
        self._event_manager = event_manager
        self._event_manager.subscribe('run', self.start_application)
        self._event_manager.subscribe('eof', self.stop_application)

    def start_application(self, event):
        path_to_file = event[1]
        self._event_manager.publish(('load', path_to_file))
        self._event_manager.publish(('start',))

    def stop_application(self, _):
        self._event_manager.publish(('print',))

def main():
    event_manager = EventManager()
    DataStorage(event_manager)
    StopWordFilter(event_manager)
    WordFrequencyCounter(event_manager)
    WordFrequencyApp(event_manager)
    event_manager.publish(('run', testfilepath))

if __name__ == "__main__":
    main()
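
# A sketch (not part of the original file) of how a new subscriber could be added
# without touching the existing entities: it only subscribes to the message types
# it cares about on the central EventManager. WordLogger is a hypothetical
# component; it would be wired up in main() with one extra line before 'run'
# is published, e.g. WordLogger(event_manager).
class WordLogger:
    def __init__(self, event_manager):
        self._words = []
        event_manager.subscribe('valid_word', self.record_word)
        event_manager.subscribe('print', self.report)

    def record_word(self, event):
        self._words.append(event[1])

    def report(self, _):
        print(f"valid words seen: {len(self._words)}")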

@@ -1,68 +0,0 @@
'''
The base class Subject provides registration and a mechanism for notifying the
registered object. Each derived component does its own work and then calls the
base class's notify method, which gives a simple observer (or subscription)
pattern.

Because functions and parameters are passed around mixed together, the handling
inside each module ends up being essentially case by case (a sketch of adding
one more component to the chain follows the code below).
'''
from collections import Counter
from typing import List
from cppy.cp_util import *

class Subject:
    def register_handler(self, handler: callable, *args, **kwargs):
        self.handler = handler
        self.args = args
        self.kwargs = kwargs

    def notify(self, *args, **kwargs):
        self.handler(self.data, *self.args, **self.kwargs)

# Component 1: TextLoader reads the text and filters stop words
class TextLoader(Subject):
    def load_text(self, filename: str) -> List[str]:
        return extract_file_words(filename)

    def notify(self, *args, **kwargs):
        filename = args[0]
        self.data = self.load_text(filename)
        super().notify(self.data, *args, **kwargs)

# Component 2: WordCounter counts word frequencies
class WordCounter(Subject):
    def count_words(self, words: List[str]) -> dict:
        return Counter(words)

    def notify(self, *args, **kwargs):
        words = args[0]
        self.data = self.count_words(words)
        super().notify(self.data, *args, **kwargs)

# Component 3: TopWordsPresenter sorts and prints the top 10 words
class TopWordsPresenter(Subject):
    def notify(self, words, *args, **kwargs):
        n = args[0]
        top_words = words.most_common(n)
        print_word_freqs(top_words)

# Main program
def main():
    loader = TextLoader()
    counter = WordCounter()
    presenter = TopWordsPresenter()

    # Register the handlers
    loader.register_handler(counter.notify)
    counter.register_handler(presenter.notify, 10)

    # Load the text and kick off the pipeline
    loader.notify(testfilepath)

if __name__ == "__main__":
    main()
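
# A sketch (not in the original file) of inserting one more component into the
# notification chain, e.g. a hypothetical LengthFilter that drops one-character
# words between TextLoader and WordCounter. It follows the same convention as the
# other components: process args[0], store the result in self.data, then call the
# base notify so the next registered handler receives it. In a real file it would
# be defined above main().
class LengthFilter(Subject):
    def notify(self, *args, **kwargs):
        words = args[0]
        self.data = [word for word in words if len(word) > 1]
        super().notify(self.data, *args, **kwargs)

# The chain in main() would then be rewired as:
#     loader.register_handler(length_filter.notify)
#     length_filter.register_handler(counter.notify)
#     counter.register_handler(presenter.notify, 10)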

@@ -0,0 +1,52 @@
'''
Decorator Pattern

Use case: dynamically add features to the word counting, such as filtering stop
words, ignoring punctuation, or adding part-of-speech tagging.

How it is applied:
- Define a core WordCounter interface responsible for the basic word counting.
- Use decorator classes to add features dynamically, e.g. StopWordFilterDecorator
  to filter stop words.

Advantage: features are added without modifying the core logic, in line with
the open-closed principle.

Key points:
- How to extend functionality dynamically while keeping the core code unchanged.
- An illustration of "composition over inheritance"
  (a sketch of stacking a second decorator follows the code below).
'''
from abc import ABC, abstractmethod
from typing import List, Dict
from collections import Counter

class WordCounter(ABC):
    @abstractmethod
    def count_words(self, texts: List[str]) -> Dict[str, int]:
        pass

class BasicWordCounter(WordCounter):
    def count_words(self, texts: List[str]) -> Dict[str, int]:
        import jieba
        word_counts = Counter()
        for text in texts:
            words = jieba.cut(text)
            word_counts.update([word for word in words if word.strip()])
        return dict(word_counts)

class WordCounterDecorator(WordCounter, ABC):
    def __init__(self, counter: WordCounter):
        self.counter = counter

class StopWordFilterDecorator(WordCounterDecorator):
    def __init__(self, counter: WordCounter, stop_words: List[str]):
        super().__init__(counter)
        self.stop_words = stop_words

    def count_words(self, texts: List[str]) -> Dict[str, int]:
        word_counts = self.counter.count_words(texts)
        return {word: count for word, count in word_counts.items() if word not in self.stop_words}

# Usage
counter = StopWordFilterDecorator(BasicWordCounter(), stop_words=["", "", ""])
word_counts = counter.count_words(["这是一段测试文本。"])
print(word_counts)
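
# A sketch (not in the original file) of why decorators compose: a second,
# hypothetical decorator can be stacked on top of the stop-word filter without
# touching BasicWordCounter or StopWordFilterDecorator.
class MinCountFilterDecorator(WordCounterDecorator):
    def __init__(self, counter: WordCounter, min_count: int = 2):
        super().__init__(counter)
        self.min_count = min_count

    def count_words(self, texts: List[str]) -> Dict[str, int]:
        word_counts = self.counter.count_words(texts)
        return {word: count for word, count in word_counts.items() if count >= self.min_count}

# Decorators nest freely, e.g.:
#     counter = MinCountFilterDecorator(
#         StopWordFilterDecorator(BasicWordCounter(), stop_words=[]), min_count=2)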

@@ -1,6 +1,9 @@
import time
import cppy.cp_util as util
'''
Use reflection to achieve the effect of a decorator.
'''
# Utility functions
def extract_words(path_to_file):