{
"cells": [
{
"cell_type": "markdown",
"id": "eccfe49f-de35-4241-90e3-a7095940b61a",
"metadata": {},
"source": [
"Design patterns capture proven solutions to requirements that recur frequently. The following sections introduce the patterns that fit the word-frequency case study: the Strategy pattern, the Observer pattern, and the Factory pattern."
]
},
{
"cell_type": "markdown",
"id": "c186171f-d1f2-433e-a3eb-b266e2909a2c",
"metadata": {},
"source": [
"## Strategy Pattern (selecting the tokenization strategy dynamically)\n",
"\n",
"The Strategy pattern lets an algorithm (here, the tokenizer) be swapped at runtime, and it is simpler than the metaprogramming approach shown in the appendix."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "97c865cb-0b5a-4fa1-aa74-5ba2e65e7436",
"metadata": {},
"outputs": [],
"source": [
"from abc import ABC, abstractmethod\n",
"from collections import Counter\n",
"from typing import List, Tuple\n",
"\n",
"import jieba\n",
"import yaml\n",
"\n",
"class Tokenizer(ABC):\n",
"    \"\"\"Tokenizer interface\"\"\"\n",
"    @abstractmethod\n",
"    def tokenize(self, text: str, stop_words: set) -> List[str]:\n",
"        pass\n",
"\n",
"class JiebaTokenizer(Tokenizer):\n",
"    \"\"\"jieba-based tokenizer\"\"\"\n",
"    def tokenize(self, text: str, stop_words: set) -> List[str]:\n",
"        return [w for w in jieba.lcut(text) if w not in stop_words]\n",
"\n",
"class SimpleTokenizer(Tokenizer):\n",
"    \"\"\"Simple whitespace tokenizer\"\"\"\n",
"    def tokenize(self, text: str, stop_words: set) -> List[str]:\n",
"        return [w for w in text.split() if w not in stop_words]\n",
"\n",
"class TextAnalyzer:\n",
"    def __init__(self, config_path='config.yaml'):\n",
"        with open(config_path, 'r', encoding='utf-8') as f:\n",
"            config = yaml.safe_load(f)\n",
"        self.data_dir = config['data_dir']\n",
"        self.top_n = config['top_n']\n",
"        self.stop_words_file = config['stop_words_file']\n",
"        self.output_file = config['output_file']\n",
"        self.stop_words = self.load_stop_words()\n",
"        self.word_count = Counter()\n",
"        # Select the tokenization strategy named in the configuration\n",
"        tokenizer_name = config.get('tokenizer', 'jieba')\n",
"        self.tokenizer = {'jieba': JiebaTokenizer(), 'simple': SimpleTokenizer()}[tokenizer_name]\n",
"\n",
"    def tokenize(self, text: str) -> List[str]:\n",
"        \"\"\"Tokenize using the configured strategy\"\"\"\n",
"        return self.tokenizer.tokenize(text, self.stop_words)\n",
"\n",
"    # remaining methods are unchanged from the earlier version"
]
},
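{
"cell_type": "markdown",
"id": "added-config-example-note",
"metadata": {},
"source": [
"The `TextAnalyzer` above reads its settings from `config.yaml` using the keys `data_dir`, `top_n`, `stop_words_file`, `output_file`, and `tokenizer`. To make the notebook cells easy to experiment with, the following sketch writes a minimal config file; the concrete paths and values are illustrative assumptions, not settings from the original case study."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-config-example-code",
"metadata": {},
"outputs": [],
"source": [
"# A minimal, assumed config.yaml so the classes in this notebook can be instantiated.\n",
"# The paths and values below are placeholders, not canonical project settings.\n",
"sample_config = {\n",
"    'data_dir': 'data',                   # directory containing the .txt files to analyze\n",
"    'top_n': 10,                          # how many of the most frequent words to report\n",
"    'stop_words_file': 'stop_words.txt',  # one stop word per line\n",
"    'output_file': 'output.txt',          # where FileOutput writes the result\n",
"    'tokenizer': 'jieba'                  # 'jieba' or 'simple'\n",
"}\n",
"\n",
"with open('config.yaml', 'w', encoding='utf-8') as f:\n",
"    yaml.safe_dump(sample_config, f, allow_unicode=True)"
]
},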
{
"cell_type": "markdown",
"id": "5435ebc3-d3b0-4475-8bd5-cb45fb51638c",
"metadata": {},
"source": [
"### Analysis\n",
"\n",
"Engineering quality improvements:\n",
"- Extensibility: adding a new tokenizer only requires implementing the Tokenizer interface; see the sketch after this cell.\n",
"- Maintainability: tokenization logic is separated from the main class, so changes stay isolated.\n",
"\n",
"When to use: scenarios that need to switch algorithms at runtime."
]
},
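{
"cell_type": "markdown",
"id": "added-strategy-extension-note",
"metadata": {},
"source": [
"To make the extensibility claim concrete, here is a sketch of a hypothetical third strategy. The name `CharTokenizer` and its behavior are assumptions for demonstration only: a new strategy merely implements the `Tokenizer` interface, and `TextAnalyzer` itself does not change."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-strategy-extension-code",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical additional strategy: treat every non-whitespace character as a token.\n",
"# TextAnalyzer does not change; only the strategy table gains one entry.\n",
"class CharTokenizer(Tokenizer):\n",
"    \"\"\"Character-level tokenizer (demonstration only)\"\"\"\n",
"    def tokenize(self, text: str, stop_words: set) -> List[str]:\n",
"        return [ch for ch in text if not ch.isspace() and ch not in stop_words]\n",
"\n",
"# Usage sketch: extend the strategy table used in TextAnalyzer.__init__\n",
"# {'jieba': JiebaTokenizer(), 'simple': SimpleTokenizer(), 'char': CharTokenizer()}"
]
},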
{
"cell_type": "markdown",
"id": "fbf53455-558c-40fb-8718-446dec989b5d",
"metadata": {},
"source": [
"## Observer Pattern (decoupling result output)\n",
"\n",
"The Observer pattern decouples the output logic (printing, writing to a file, sending notifications) from the counting logic."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7a2bd4c-df73-4800-b45b-9b6c73d28d7b",
"metadata": {},
"outputs": [],
"source": [
"class OutputObserver(ABC):\n",
"    \"\"\"Output observer interface\"\"\"\n",
"    @abstractmethod\n",
"    def update(self, top_words: List[Tuple[str, int]]):\n",
"        pass\n",
"\n",
"class ConsoleOutput(OutputObserver):\n",
"    \"\"\"Console output\"\"\"\n",
"    def update(self, top_words: List[Tuple[str, int]]):\n",
"        for word, count in top_words:\n",
"            print(f\"{word}: {count}\")\n",
"\n",
"class FileOutput(OutputObserver):\n",
"    \"\"\"File output\"\"\"\n",
"    def __init__(self, output_file: str):\n",
"        self.output_file = output_file\n",
"\n",
"    def update(self, top_words: List[Tuple[str, int]]):\n",
"        with open(self.output_file, 'w', encoding='utf-8') as f:\n",
"            for word, count in top_words:\n",
"                f.write(f\"{word}: {count}\\n\")\n",
"\n",
"class TextAnalyzer:\n",
"    def __init__(self, config_path='config.yaml'):\n",
"        with open(config_path, 'r', encoding='utf-8') as f:\n",
"            config = yaml.safe_load(f)\n",
"        self.data_dir = config['data_dir']\n",
"        self.top_n = config['top_n']\n",
"        self.stop_words_file = config['stop_words_file']\n",
"        self.output_file = config['output_file']\n",
"        self.stop_words = self.load_stop_words()\n",
"        self.word_count = Counter()\n",
"        self.observers = [ConsoleOutput(), FileOutput(self.output_file)]\n",
"\n",
"    def add_observer(self, observer: OutputObserver):\n",
"        \"\"\"Register an observer\"\"\"\n",
"        self.observers.append(observer)\n",
"\n",
"    def notify_observers(self, top_words: List[Tuple[str, int]]):\n",
"        \"\"\"Notify all observers\"\"\"\n",
"        for observer in self.observers:\n",
"            observer.update(top_words)\n",
"\n",
"    def run(self):\n",
"        \"\"\"Run the word-frequency analysis and notify the observers\"\"\"\n",
"        self.process_directory()\n",
"        top_words = self.get_top_words()\n",
"        self.notify_observers(top_words)\n",
"\n",
"    # remaining methods are unchanged from the earlier version"
]
},
{
"cell_type": "markdown",
"id": "02b5cfba-431c-4a01-a454-099e4f41922c",
"metadata": {},
"source": [
"### Analysis\n",
"\n",
"Engineering quality improvements:\n",
"- Extensibility: adding a new output channel only requires implementing the OutputObserver interface; see the sketch after this cell.\n",
"- Decoupling: output logic is separated from the counting logic, so changing the output does not touch the core functionality.\n",
"\n",
"When to use: scenarios that need several output channels or notifications.\n",
"\n",
"Limitations: the Observer pattern adds some complexity, so it pays off mainly when the output requirements are genuinely involved."
]
},
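{
"cell_type": "markdown",
"id": "added-json-observer-note",
"metadata": {},
"source": [
"As an illustration (and a head start on the JSON exercise later in the notebook), here is a sketch of a hypothetical `JsonOutput` observer. It is an assumption for demonstration: a new output channel is added without modifying `TextAnalyzer`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-json-observer-code",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical observer that writes the result as JSON; TextAnalyzer is not modified.\n",
"import json\n",
"\n",
"class JsonOutput(OutputObserver):\n",
"    \"\"\"Write the top words to a JSON file (demonstration only)\"\"\"\n",
"    def __init__(self, output_file: str):\n",
"        self.output_file = output_file\n",
"\n",
"    def update(self, top_words: List[Tuple[str, int]]):\n",
"        with open(self.output_file, 'w', encoding='utf-8') as f:\n",
"            json.dump(dict(top_words), f, ensure_ascii=False, indent=2)\n",
"\n",
"# Usage sketch:\n",
"# analyzer.add_observer(JsonOutput('top_words.json'))"
]
},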
{
"cell_type": "markdown",
"id": "11669305-8cd5-4317-afd5-e85c3f0a5a81",
"metadata": {},
"source": [
"## Factory Pattern (creating tokenizers dynamically)\n",
"\n",
"The Factory pattern creates tokenizers dynamically and simplifies the initialization logic of the Strategy pattern example."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2fa50633-de22-40c8-912d-3ded5ebcedfc",
"metadata": {},
"outputs": [],
"source": [
"class TokenizerFactory:\n",
"    \"\"\"Tokenizer factory\"\"\"\n",
"    @staticmethod\n",
"    def create_tokenizer(name: str) -> Tokenizer:\n",
"        tokenizers = {\n",
"            'jieba': JiebaTokenizer(),\n",
"            'simple': SimpleTokenizer()\n",
"        }\n",
"        return tokenizers.get(name, JiebaTokenizer())\n",
"\n",
"class TextAnalyzer:\n",
"    def __init__(self, config_path='config.yaml'):\n",
"        with open(config_path, 'r', encoding='utf-8') as f:\n",
"            config = yaml.safe_load(f)\n",
"        self.data_dir = config['data_dir']\n",
"        self.top_n = config['top_n']\n",
"        self.stop_words_file = config['stop_words_file']\n",
"        self.output_file = config['output_file']\n",
"        self.stop_words = self.load_stop_words()\n",
"        self.word_count = Counter()\n",
"        self.tokenizer = TokenizerFactory.create_tokenizer(config.get('tokenizer', 'jieba'))\n",
"\n",
"    # remaining methods are unchanged from the earlier version"
]
},
{
"cell_type": "markdown",
"id": "a4db7046-dfe2-4bd8-81d1-49a42e2eeb5c",
"metadata": {},
"source": [
"### Analysis\n",
"\n",
"Engineering quality improvements:\n",
"- Maintainability: tokenizer creation logic is centralized in the factory and is easy to modify.\n",
"- Extensibility: adding a new tokenizer only requires updating the factory method; a registry-based variant is sketched after this cell.\n",
"\n",
"When to use: scenarios that need to create objects dynamically.\n",
"\n",
"Limitations: for simple cases the Factory pattern can feel like overhead."
]
},
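{
"cell_type": "markdown",
"id": "added-factory-registry-note",
"metadata": {},
"source": [
"A common refinement, shown here only as a sketch and not as part of the original design, is to give the factory a registry so that new tokenizers can be registered instead of requiring an edit to `create_tokenizer`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-factory-registry-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: a factory variant with a class-level registry (an assumption, not the original design).\n",
"class RegisteringTokenizerFactory:\n",
"    \"\"\"Factory that looks tokenizer classes up in a registry\"\"\"\n",
"    _registry = {}\n",
"\n",
"    @classmethod\n",
"    def register(cls, name: str, tokenizer_cls):\n",
"        cls._registry[name] = tokenizer_cls\n",
"\n",
"    @classmethod\n",
"    def create_tokenizer(cls, name: str) -> Tokenizer:\n",
"        tokenizer_cls = cls._registry.get(name, JiebaTokenizer)\n",
"        return tokenizer_cls()\n",
"\n",
"# Usage sketch:\n",
"RegisteringTokenizerFactory.register('jieba', JiebaTokenizer)\n",
"RegisteringTokenizerFactory.register('simple', SimpleTokenizer)"
]
},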
{
"cell_type": "markdown",
"id": "e5f2aef4-a055-43a9-917c-fa183de6db2d",
"metadata": {},
"source": [
"## Combined Implementation (integrating the features and patterns)\n",
"\n",
"A final implementation that brings together the context manager, generators, the Strategy pattern, and the Observer pattern (partial code shown)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa7f34e2-d355-4a22-8572-729c49b18605",
"metadata": {},
"outputs": [],
"source": [
"# text_analyzer.py\n",
"\n",
"import os\n",
"import jieba\n",
"from collections import Counter\n",
"import yaml\n",
"from contextlib import contextmanager\n",
"from typing import Iterator, List, Tuple\n",
"from abc import ABC, abstractmethod\n",
"\n",
"@contextmanager\n",
"def file_reader(file_path: str):\n",
"    try:\n",
"        with open(file_path, 'r', encoding='utf-8') as f:\n",
"            yield f.read()\n",
"    except Exception as e:\n",
"        print(f\"Error reading {file_path}: {e}\")\n",
"        yield \"\"\n",
"\n",
"class Tokenizer(ABC):\n",
"    @abstractmethod\n",
"    def tokenize(self, text: str, stop_words: set) -> Iterator[str]:\n",
"        pass\n",
"\n",
"class JiebaTokenizer(Tokenizer):\n",
"    def tokenize(self, text: str, stop_words: set) -> Iterator[str]:\n",
"        for word in jieba.lcut(text):\n",
"            if word not in stop_words:\n",
"                yield word\n",
"\n",
"class SimpleTokenizer(Tokenizer):\n",
"    def tokenize(self, text: str, stop_words: set) -> Iterator[str]:\n",
"        for word in text.split():\n",
"            if word not in stop_words:\n",
"                yield word\n",
"\n",
"class TokenizerFactory:\n",
"    @staticmethod\n",
"    def create_tokenizer(name: str) -> Tokenizer:\n",
"        return {'jieba': JiebaTokenizer(), 'simple': SimpleTokenizer()}.get(name, JiebaTokenizer())\n",
"\n",
"class OutputObserver(ABC):\n",
"    @abstractmethod\n",
"    def update(self, top_words: List[Tuple[str, int]]):\n",
"        pass\n",
"\n",
"class ConsoleOutput(OutputObserver):\n",
"    def update(self, top_words: List[Tuple[str, int]]):\n",
"        for word, count in top_words:\n",
"            print(f\"{word}: {count}\")\n",
"\n",
"class FileOutput(OutputObserver):\n",
"    def __init__(self, output_file: str):\n",
"        self.output_file = output_file\n",
"    def update(self, top_words: List[Tuple[str, int]]):\n",
"        with open(self.output_file, 'w', encoding='utf-8') as f:\n",
"            for word, count in top_words:\n",
"                f.write(f\"{word}: {count}\\n\")\n",
"\n",
"class TextAnalyzer:\n",
"    def __init__(self, config_path='config.yaml'):\n",
"        with open(config_path, 'r', encoding='utf-8') as f:\n",
"            config = yaml.safe_load(f)\n",
"        self.data_dir = config['data_dir']\n",
"        self.top_n = config['top_n']\n",
"        self.stop_words_file = config['stop_words_file']\n",
"        self.output_file = config['output_file']\n",
"        self.stop_words = self.load_stop_words()\n",
"        self.word_count = Counter()\n",
"        self.tokenizer = TokenizerFactory.create_tokenizer(config.get('tokenizer', 'jieba'))\n",
"        self.observers = [ConsoleOutput(), FileOutput(self.output_file)]\n",
"\n",
"    def load_stop_words(self) -> set:\n",
"        with file_reader(self.stop_words_file) as content:\n",
"            return set(line.strip() for line in content.splitlines() if line.strip())\n",
"\n",
"    def process_file(self, file_path: str):\n",
"        if file_path.endswith('.txt'):\n",
"            with file_reader(file_path) as text:\n",
"                words = self.tokenizer.tokenize(text, self.stop_words)\n",
"                self.word_count.update(words)\n",
"\n",
"    def process_directory(self):\n",
"        for file in os.listdir(self.data_dir):\n",
"            file_path = os.path.join(self.data_dir, file)\n",
"            self.process_file(file_path)\n",
"\n",
"    def get_top_words(self) -> List[Tuple[str, int]]:\n",
"        return self.word_count.most_common(self.top_n)\n",
"\n",
"    def notify_observers(self, top_words: List[Tuple[str, int]]):\n",
"        for observer in self.observers:\n",
"            observer.update(top_words)\n",
"\n",
"    def run(self):\n",
"        self.process_directory()\n",
"        top_words = self.get_top_words()\n",
"        self.notify_observers(top_words)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d130312-b298-4c76-ae09-0fb4bd08b0c1",
"metadata": {},
"outputs": [],
"source": [
"# main.py\n",
"\n",
"from text_analyzer import TextAnalyzer\n",
"\n",
"def main():\n",
"    analyzer = TextAnalyzer()\n",
"    analyzer.run()\n",
"\n",
"if __name__ == '__main__':\n",
"    main()"
]
},
{
"cell_type": "markdown",
"id": "770618c9-428e-454a-97de-00e3b49c9d03",
"metadata": {},
"source": [
"## Conclusion\n",
"\n",
"By introducing context managers, generators, metaprogramming, the Strategy pattern, the Observer pattern, and the Factory pattern, the word-frequency code gains further extensibility, maintainability, and reusability.\n",
"These features and patterns make the code more modular and flexible and suitable for larger projects, while keeping a clear engineering structure. Combined with the decorators and functional programming covered earlier, the code now reaches a solid engineering standard.\n",
"\n",
"For further study, consider additional performance-oriented techniques."
]
},
{
"cell_type": "markdown",
"id": "cbeaa07d-272f-465b-a437-9c4b44827d23",
"metadata": {},
"source": [
"## Further Exercises\n",
"\n",
"Hands-on exercises:\n",
"- Implement a new tokenizer (e.g. thulac) and integrate it through the Strategy pattern or the Factory pattern.\n",
"- Add a new observer (e.g. JSON output).\n",
"- Use generators to implement streaming word-frequency counting and compare the memory footprint.\n",
"- Implement a caching mechanism that caches the tokenization results of already-processed files.\n",
"- Add a command-line interface (argparse) to configure top_n and tokenizer dynamically; a starting sketch follows this cell."
]
},
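{
"cell_type": "markdown",
"id": "added-argparse-sketch-note",
"metadata": {},
"source": [
"As a starting point for the argparse exercise, the following sketch defines a minimal command-line interface. The flag names and the override strategy are assumptions; adapt them to the actual project layout."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-argparse-sketch-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch for the argparse exercise (flag names are assumptions).\n",
"import argparse\n",
"\n",
"def parse_args():\n",
"    parser = argparse.ArgumentParser(description='Word-frequency analyzer')\n",
"    parser.add_argument('--top-n', type=int, default=None,\n",
"                        help='number of top words to report (overrides config.yaml)')\n",
"    parser.add_argument('--tokenizer', choices=['jieba', 'simple'], default=None,\n",
"                        help='tokenizer to use (overrides config.yaml)')\n",
"    return parser.parse_args()\n",
"\n",
"# Usage sketch inside main():\n",
"# args = parse_args()\n",
"# analyzer = TextAnalyzer()\n",
"# if args.top_n is not None:\n",
"#     analyzer.top_n = args.top_n\n",
"# if args.tokenizer is not None:\n",
"#     analyzer.tokenizer = TokenizerFactory.create_tokenizer(args.tokenizer)\n",
"# analyzer.run()"
]
},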
{
"cell_type": "markdown",
"id": "6a43b53d-1e07-4ebe-a6c8-104353fd5f7b",
"metadata": {},
"source": [
"## Appendix: Metaprogramming\n",
"\n",
"Metaprogramming allows class or function behavior to be modified dynamically and can be used to configure tokenizers or output formats at runtime. In this case study, tokenizers can be registered dynamically through metaprogramming."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4394008c-88da-44bd-aa0d-f1b7a6dbc7d6",
"metadata": {},
"outputs": [],
"source": [
"class TokenizerRegistry(type):\n",
"    \"\"\"Metaclass: collect methods marked as tokenizers into a registry\"\"\"\n",
"    tokenizers = {}\n",
"\n",
"    def __new__(mcls, name, bases, namespace):\n",
"        cls = super().__new__(mcls, name, bases, namespace)\n",
"        for attr in namespace.values():\n",
"            registered_name = getattr(attr, '_tokenizer_name', None)\n",
"            if registered_name is not None:\n",
"                mcls.tokenizers[registered_name] = attr\n",
"        return cls\n",
"\n",
"def register_tokenizer(name):\n",
"    \"\"\"Decorator: mark a method as a tokenizer under the given name\"\"\"\n",
"    def decorator(func):\n",
"        func._tokenizer_name = name\n",
"        return func\n",
"    return decorator\n",
"\n",
"class TextAnalyzer(metaclass=TokenizerRegistry):\n",
"    def __init__(self, config_path='config.yaml'):\n",
"        with open(config_path, 'r', encoding='utf-8') as f:\n",
"            config = yaml.safe_load(f)\n",
"        self.data_dir = config['data_dir']\n",
"        self.top_n = config['top_n']\n",
"        self.stop_words_file = config['stop_words_file']\n",
"        self.output_file = config['output_file']\n",
"        self.stop_words = self.load_stop_words()\n",
"        self.word_count = Counter()\n",
"        self.tokenizer_name = config.get('tokenizer', 'jieba')  # tokenizer choice read from the config\n",
"\n",
"    def tokenize(self, text: str) -> List[str]:\n",
"        \"\"\"Dispatch to the registered tokenizer\"\"\"\n",
"        tokenizer = TokenizerRegistry.tokenizers.get(self.tokenizer_name, TextAnalyzer.jieba_tokenizer)\n",
"        return tokenizer(self, text)\n",
"\n",
"    @register_tokenizer('jieba')\n",
"    def jieba_tokenizer(self, text: str) -> List[str]:\n",
"        \"\"\"jieba tokenization\"\"\"\n",
"        return [w for w in jieba.lcut(text) if w not in self.stop_words]\n",
"\n",
"    @register_tokenizer('simple')\n",
"    def simple_tokenizer(self, text: str) -> List[str]:\n",
"        \"\"\"Simple whitespace tokenization\"\"\"\n",
"        return [w for w in text.split() if w not in self.stop_words]\n",
"\n",
"    # remaining methods (load_stop_words, process_file, etc.) are unchanged"
]
},
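{
"cell_type": "markdown",
"id": "added-metaclass-usage-note",
"metadata": {},
"source": [
"A quick way to see what the metaclass collected, and how a further tokenizer could be slotted in after class creation. The `reverse_tokenizer` below is a hypothetical example, not part of the case study."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-metaclass-usage-code",
"metadata": {},
"outputs": [],
"source": [
"# Inspect the registry that the metaclass filled at class-creation time.\n",
"print(TokenizerRegistry.tokenizers.keys())  # expected to contain 'jieba' and 'simple'\n",
"\n",
"# A tokenizer can also be added to the registry after the fact, as a plain\n",
"# function with the same (self, text) signature (demonstration only).\n",
"def reverse_tokenizer(self, text: str) -> List[str]:\n",
"    \"\"\"Hypothetical tokenizer: whitespace tokens, each reversed\"\"\"\n",
"    return [w[::-1] for w in text.split() if w not in self.stop_words]\n",
"\n",
"TokenizerRegistry.tokenizers['reverse'] = reverse_tokenizer"
]
},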
{
"cell_type": "markdown",
"id": "2249f13a-7a3f-4376-ba2a-d92f11658d32",
"metadata": {},
"source": [
"### Analysis\n",
"\n",
"What it does: registers tokenizers dynamically through a metaclass and a decorator, so the choice (e.g. jieba or simple) can be switched via configuration.\n",
"\n",
"Engineering quality improvements:\n",
"- Extensibility: a new tokenizer only needs a new method plus registration; the core logic is untouched.\n",
"- Flexibility: the tokenizer is selected dynamically through the configuration file.\n",
"\n",
"When to use: scenarios that need dynamic configuration or a plugin-style system.\n",
"\n",
"Limitations: metaprogramming adds complexity and can reduce readability, so use it sparingly."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}