## 策略模式(动态选择分词策略)

策略模式允许动态切换算法(如分词器),比元编程简单。

In [None]:
from abc import ABC, abstractmethod

class Tokenizer(ABC):
    """Strategy interface that every tokenizer implementation must satisfy."""

    @abstractmethod
    def tokenize(self, text: str, stop_words: set) -> List[str]:
        """Split ``text`` into words; implementations leave out ``stop_words``."""

class JiebaTokenizer(Tokenizer):
    """Tokenizer strategy backed by ``jieba.lcut``."""

    def tokenize(self, text: str, stop_words: set) -> List[str]:
        """Segment ``text`` with jieba and drop every word found in ``stop_words``."""
        kept = []
        for word in jieba.lcut(text):
            if word in stop_words:
                continue
            kept.append(word)
        return kept

class SimpleTokenizer(Tokenizer):
    """Tokenizer strategy that splits on whitespace."""

    def tokenize(self, text: str, stop_words: set) -> List[str]:
        """Split ``text`` with ``str.split()`` and drop words found in ``stop_words``."""
        candidates = text.split()
        return list(filter(lambda word: word not in stop_words, candidates))

class TextAnalyzer:
    """Word-frequency analyzer whose tokenizer strategy is chosen by the config file."""

    def __init__(self, config_path='config.yaml'):
        """Load settings from the YAML file at ``config_path`` and pick a tokenizer.

        Raises:
            KeyError: if a required config key is missing, or the configured
                ``tokenizer`` name is neither ``'jieba'`` nor ``'simple'``.
        """
        with open(config_path, 'r', encoding='utf-8') as config_file:
            settings = yaml.safe_load(config_file)
        self.data_dir = settings['data_dir']
        self.top_n = settings['top_n']
        self.stop_words_file = settings['stop_words_file']
        self.output_file = settings['output_file']
        self.stop_words = self.load_stop_words()
        self.word_count = Counter()
        # Select the tokenizer strategy by name; 'jieba' is used when the key is absent.
        strategies = {'jieba': JiebaTokenizer(), 'simple': SimpleTokenizer()}
        chosen = settings.get('tokenizer', 'jieba')
        self.tokenizer = strategies[chosen]

    def tokenize(self, text: str) -> List[str]:
        """Tokenize ``text`` with the selected strategy and this analyzer's stop words."""
        strategy = self.tokenizer
        return strategy.tokenize(text, self.stop_words)

    # The remaining methods are the same as in the previous version.

工程质量提升:
- 可扩展性:添加新分词器只需实现 Tokenizer 接口。
- 可维护性:分词逻辑与主类分离,修改更独立。

适用场景:适合需要动态切换算法的场景。

## 观察者模式(结果输出解耦)

观察者模式可用于解耦结果输出逻辑(如打印、保存文件、发送通知)。

In [None]:
class OutputObserver(ABC):
    """Observer interface for anything that consumes the top-words result."""

    @abstractmethod
    def update(self, top_words: List[Tuple[str, int]]):
        """Receive the ``(word, count)`` pairs produced by the analyzer."""

class ConsoleOutput(OutputObserver):
    """Observer that prints each result to standard output."""

    def update(self, top_words: List[Tuple[str, int]]):
        """Print one ``word: count`` line per entry of ``top_words``."""
        for term, frequency in top_words:
            print(f"{term}: {frequency}")

class FileOutput(OutputObserver):
    """Observer that writes the results to a text file."""

    def __init__(self, output_file: str):
        """Remember ``output_file``, the path written on every ``update``."""
        self.output_file = output_file

    def update(self, top_words: List[Tuple[str, int]]):
        """Overwrite ``self.output_file`` with one ``word: count`` line per entry."""
        with open(self.output_file, 'w', encoding='utf-8') as sink:
            sink.writelines(f"{term}: {frequency}\n" for term, frequency in top_words)

class TextAnalyzer:
    """Word-frequency analyzer that hands its results to a list of output observers."""

    def __init__(self, config_path='config.yaml'):
        """Read the YAML config at ``config_path`` and attach the default observers."""
        with open(config_path, 'r', encoding='utf-8') as config_file:
            settings = yaml.safe_load(config_file)
        self.data_dir = settings['data_dir']
        self.top_n = settings['top_n']
        self.stop_words_file = settings['stop_words_file']
        self.output_file = settings['output_file']
        self.stop_words = self.load_stop_words()
        self.word_count = Counter()
        # Default observers: print to the console and write to the configured file.
        console_sink = ConsoleOutput()
        file_sink = FileOutput(self.output_file)
        self.observers = [console_sink, file_sink]

    def add_observer(self, observer: OutputObserver):
        """Append ``observer`` to the list used by ``notify_observers``."""
        self.observers.append(observer)

    def notify_observers(self, top_words: List[Tuple[str, int]]):
        """Call ``update(top_words)`` on every registered observer, in order."""
        for subscriber in self.observers:
            subscriber.update(top_words)

    def run(self):
        """Process the data directory, then pass the top words to the observers."""
        self.process_directory()
        self.notify_observers(self.get_top_words())

    # The remaining methods are the same as in the previous version.

### 分析

工程质量提升:
 - 可扩展性:添加新输出方式只需实现 OutputObserver 接口。
 - 解耦性:输出逻辑与统计逻辑分离,修改输出不影响核心功能。

适用场景:适合需要多种输出或通知的场景。

局限性:观察者模式会增加代码复杂性,只有在输出需求较为复杂(多种输出方式或通知)时才值得引入。

## 工厂模式(动态创建分词器)

工厂模式可用于动态创建分词器,简化策略模式中的初始化逻辑。

In [None]:
class TokenizerFactory:
    """Factory that turns a configured tokenizer name into a tokenizer instance."""

    @staticmethod
    def create_tokenizer(name: str) -> Tokenizer:
        """Build the tokenizer registered under ``name``.

        Args:
            name: Tokenizer identifier, ``'jieba'`` or ``'simple'``.

        Returns:
            A new tokenizer instance; any other name falls back to
            ``JiebaTokenizer``.
        """
        # Map names to classes rather than instances so that only the tokenizer
        # actually requested is constructed.
        tokenizer_classes = {
            'jieba': JiebaTokenizer,
            'simple': SimpleTokenizer,
        }
        return tokenizer_classes.get(name, JiebaTokenizer)()

class TextAnalyzer:
    """Word-frequency analyzer that obtains its tokenizer from ``TokenizerFactory``."""

    def __init__(self, config_path='config.yaml'):
        """Read the YAML config at ``config_path`` and build the configured tokenizer."""
        with open(config_path, 'r', encoding='utf-8') as config_file:
            settings = yaml.safe_load(config_file)
        self.data_dir = settings['data_dir']
        self.top_n = settings['top_n']
        self.stop_words_file = settings['stop_words_file']
        self.output_file = settings['output_file']
        self.stop_words = self.load_stop_words()
        self.word_count = Counter()
        # 'jieba' is requested when the config has no 'tokenizer' key.
        requested = settings.get('tokenizer', 'jieba')
        self.tokenizer = TokenizerFactory.create_tokenizer(requested)

    # The remaining methods are the same as in the previous version.

### 分析

工程质量提升:
 - 可维护性:分词器创建逻辑集中于工厂,易于修改。
 - 可扩展性:添加新分词器只需更新工厂方法。

适用场景:适合需要动态创建对象的场景。

局限性:对于简单场景,工厂模式可能略显冗余。

## 综合实现(整合特性与模式)

整合上下文管理器、生成器、策略模式、工厂模式和观察者模式的最终实现(部分代码展示)。

In [None]:
# text_analyzer.py

import os
from abc import ABC, abstractmethod
from collections import Counter
from contextlib import contextmanager
from typing import Iterator, List, Tuple

import jieba
import yaml

@contextmanager
def file_reader(file_path: str) -> Iterator[str]:
    """Context manager that yields the full UTF-8 text of ``file_path``.

    If the file cannot be opened or decoded, the error is printed and an empty
    string is yielded instead, so callers can treat an unreadable file as empty.
    Exceptions raised inside the caller's ``with`` body propagate unchanged.

    Args:
        file_path: Path of the text file to read.

    Yields:
        The file content, or ``""`` when reading failed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, ValueError) as e:
        # ValueError covers UnicodeDecodeError for files that are not valid UTF-8.
        print(f"Error reading {file_path}: {e}")
        content = ""
    # Yield outside the try block: a generator-based context manager must yield
    # exactly once, and an error from the caller's body must not be reported
    # as a read failure.
    yield content

class Tokenizer(ABC):
    """Strategy interface for tokenizers; results are produced lazily."""

    @abstractmethod
    def tokenize(self, text: str, stop_words: set) -> Iterator[str]:
        """Return an iterator over the words of ``text`` not in ``stop_words``.

        The implementations in this module are generator functions, so the
        result can be consumed only once.
        """

class JiebaTokenizer(Tokenizer):
    """Tokenizer strategy backed by ``jieba.lcut``."""

    def tokenize(self, text: str, stop_words: set) -> Iterator[str]:
        """Lazily yield the jieba segments of ``text`` that are not in ``stop_words``.

        This is a generator function, hence the ``Iterator[str]`` return type
        rather than a list.
        """
        for word in jieba.lcut(text):
            if word not in stop_words:
                yield word

class SimpleTokenizer(Tokenizer):
    """Tokenizer strategy that splits on whitespace."""

    def tokenize(self, text: str, stop_words: set) -> Iterator[str]:
        """Lazily yield the whitespace-separated words of ``text`` not in ``stop_words``.

        This is a generator function, hence the ``Iterator[str]`` return type
        rather than a list.
        """
        for word in text.split():
            if word not in stop_words:
                yield word

class TokenizerFactory:
    """Factory that turns a configured tokenizer name into a tokenizer instance."""

    @staticmethod
    def create_tokenizer(name: str) -> Tokenizer:
        """Return a new tokenizer for ``name``.

        ``'jieba'`` and ``'simple'`` are recognised; any other name falls back
        to ``JiebaTokenizer``.
        """
        # Look up the class first and instantiate afterwards, so the tokenizers
        # that were not requested are never constructed.
        tokenizer_classes = {'jieba': JiebaTokenizer, 'simple': SimpleTokenizer}
        return tokenizer_classes.get(name, JiebaTokenizer)()

class OutputObserver(ABC):
    """Interface implemented by every consumer of the analysis result."""

    @abstractmethod
    def update(self, top_words: List[Tuple[str, int]]):
        """Handle the list of ``(word, count)`` pairs passed by the analyzer."""

class ConsoleOutput(OutputObserver):
    """Observer that reports results on standard output."""

    def update(self, top_words: List[Tuple[str, int]]):
        """Print each entry of ``top_words`` as ``word: count`` on its own line."""
        for term, frequency in top_words:
            print(f"{term}: {frequency}")

class FileOutput(OutputObserver):
    """Observer that saves results to a UTF-8 text file."""

    def __init__(self, output_file: str):
        """Store ``output_file``, the destination path used by ``update``."""
        self.output_file = output_file

    def update(self, top_words: List[Tuple[str, int]]):
        """Replace the file's content with one ``word: count`` line per entry."""
        with open(self.output_file, 'w', encoding='utf-8') as sink:
            sink.writelines(f"{term}: {frequency}\n" for term, frequency in top_words)

class TextAnalyzer:
    """Counts word frequencies over the ``.txt`` files of a directory and reports the top words."""

    def __init__(self, config_path='config.yaml'):
        """Read the YAML config at ``config_path``, then set up stop words, tokenizer and observers."""
        with open(config_path, 'r', encoding='utf-8') as config_file:
            settings = yaml.safe_load(config_file)
        self.data_dir = settings['data_dir']
        self.top_n = settings['top_n']
        self.stop_words_file = settings['stop_words_file']
        self.output_file = settings['output_file']
        self.stop_words = self.load_stop_words()
        self.word_count = Counter()
        # 'jieba' is requested when the config has no 'tokenizer' key.
        requested = settings.get('tokenizer', 'jieba')
        self.tokenizer = TokenizerFactory.create_tokenizer(requested)
        self.observers = [ConsoleOutput(), FileOutput(self.output_file)]

    def load_stop_words(self) -> set:
        """Return the non-blank, stripped lines of ``self.stop_words_file`` as a set."""
        with file_reader(self.stop_words_file) as content:
            stripped = (line.strip() for line in content.splitlines())
            return {entry for entry in stripped if entry}

    def process_file(self, file_path: str):
        """Add the words of ``file_path`` to ``self.word_count``; non-``.txt`` paths are skipped."""
        if not file_path.endswith('.txt'):
            return
        with file_reader(file_path) as text:
            self.word_count.update(self.tokenizer.tokenize(text, self.stop_words))

    def process_directory(self):
        """Run ``process_file`` on every entry listed in ``self.data_dir``."""
        for entry in os.listdir(self.data_dir):
            self.process_file(os.path.join(self.data_dir, entry))

    def get_top_words(self) -> List[Tuple[str, int]]:
        """Return the ``self.top_n`` most common ``(word, count)`` pairs."""
        limit = self.top_n
        return self.word_count.most_common(limit)

    def notify_observers(self, top_words: List[Tuple[str, int]]):
        """Call ``update(top_words)`` on each observer in ``self.observers``."""
        for subscriber in self.observers:
            subscriber.update(top_words)

    def run(self):
        """Process the data directory and pass the top words to the observers."""
        self.process_directory()
        self.notify_observers(self.get_top_words())

In [None]:
# main.py

from text_analyzer import TextAnalyzer

def main():
    """Create a ``TextAnalyzer`` with its default config path and run it."""
    TextAnalyzer().run()


if __name__ == '__main__':
    main()

## 结论

通过引入上下文管理器、生成器、元编程、策略模式、观察者模式和工厂模式,词频统计代码在可扩展性、可维护性和复用性上进一步提升。
这些特性和模式使代码更模块化、灵活,适合大型项目,同时保持清晰的工程结构。结合之前的装饰器和函数式编程,代码已达到工程化水平。

若需深入,可以进一步考虑其他与性能相关的特性。

## 进一步练习

实践练习:
- 实现新分词器(如 thulac)并通过策略模式或工厂模式集成。
- 添加新观察者(如 JSON 输出)。

使用生成器实现流式词频统计,比较内存占用。
实现缓存机制,缓存已处理文件的分词结果。

添加命令行接口(argparse),动态配置 top_n 和 tokenizer。

In [None]:
## 附:元编程

元编程允许动态修改类或函数行为,可用于动态配置分词器或输出格式。案例中,可通过元编程动态注册分词器。

In [None]:
class TokenizerRegistry(type):
    """Metaclass that gives each class a ``tokenizers`` registry (name -> function).

    Functions tagged with ``TokenizerRegistry.tokenizer(name)`` inside a class
    body are collected when the class is created; ``register_tokenizer`` adds
    further entries once the class exists.
    """

    # Empty default that the first class created by this metaclass copies from.
    tokenizers = {}

    def __init__(cls, name, bases, namespace):
        super().__init__(name, bases, namespace)
        # Copy instead of sharing, so registering on one class never changes
        # the registry of a parent or an unrelated class.
        cls.tokenizers = dict(cls.tokenizers)
        for attr in namespace.values():
            tokenizer_name = getattr(attr, '_tokenizer_name', None)
            if tokenizer_name is not None:
                cls.tokenizers[tokenizer_name] = attr

    @staticmethod
    def tokenizer(name):
        """Return a decorator that tags a class-body function as tokenizer ``name``.

        The class object does not exist yet while its body runs, so the
        function is only tagged here and registered later in ``__init__``.
        """
        def decorator(func):
            func._tokenizer_name = name
            return func
        return decorator

    def register_tokenizer(cls, name):
        """Return a decorator that registers a function on ``cls`` under ``name``."""
        def decorator(func):
            cls.tokenizers[name] = func
            return func
        return decorator


class TextAnalyzer(metaclass=TokenizerRegistry):
    """Word-frequency analyzer whose tokenizer is looked up by name in a registry."""

    def __init__(self, config_path='config.yaml'):
        """Read the YAML config at ``config_path`` and remember the tokenizer name."""
        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
        self.data_dir = config['data_dir']
        self.top_n = config['top_n']
        self.stop_words_file = config['stop_words_file']
        self.output_file = config['output_file']
        self.stop_words = self.load_stop_words()
        self.word_count = Counter()
        # Tokenizer name from the config; 'jieba' when the key is absent.
        self.tokenizer_name = config.get('tokenizer', 'jieba')

    @classmethod
    def register_tokenizer(cls, name):
        """Return a decorator registering a tokenizer function on this class.

        Kept as a classmethod so it can also be reached from instances;
        metaclass methods are visible on the class only.
        """
        return type(cls).register_tokenizer(cls, name)

    def tokenize(self, text: str) -> List[str]:
        """Tokenize ``text`` with the configured tokenizer; unknown names use jieba."""
        owner = type(self)
        # Registry entries are plain functions, so the fallback must be the
        # unbound function too: ``self`` is passed explicitly below.
        tokenizer = owner.tokenizers.get(self.tokenizer_name, owner.jieba_tokenizer)
        return tokenizer(self, text)

    @TokenizerRegistry.tokenizer('jieba')
    def jieba_tokenizer(self, text: str) -> List[str]:
        """Segment ``text`` with jieba and drop the stop words."""
        return [w for w in jieba.lcut(text) if w not in self.stop_words]

    @TokenizerRegistry.tokenizer('simple')
    def simple_tokenizer(self, text: str) -> List[str]:
        """Split ``text`` on whitespace and drop the stop words."""
        return [w for w in text.split() if w not in self.stop_words]

    # The remaining methods (load_stop_words, process_file, etc.) are the same as before.

In [None]:
### 分析

功能:通过元类和装饰器动态注册分词器,支持配置切换(如 jieba 或 simple)。

工程质量提升:
 - 可扩展性:新分词器只需添加新方法并注册,无需修改核心逻辑。
 - 灵活性:通过配置文件动态选择分词器。

适用场景:适合需要动态配置或插件化系统的场景。

局限性:元编程增加代码复杂性,可能降低可读性,需谨慎使用。