|
|
|
|
|
|
|
|
{
|
|
|
|
|
"cells": [
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "b6bc2a3c-3b15-4bc5-83a2-adeae3b7b4d0",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"## 项目结构\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"word_frequency_project/\n",
|
|
|
|
|
"│\n",
|
|
|
|
|
"├── data/ # 小说文本存放目录\n",
|
|
|
|
|
"│ ├── novel1.txt\n",
|
|
|
|
|
"│ ├── novel2.txt\n",
|
|
|
|
|
"│ └── ...\n",
|
|
|
|
|
"├── src/ # 源代码目录\n",
|
|
|
|
|
"│ ├── __init__.py\n",
|
|
|
|
|
"│ ├── config.py # 配置文件\n",
|
|
|
|
|
"│ ├── data_loader.py # 数据加载模块\n",
|
|
|
|
|
"│ ├── preprocessor.py # 文本预处理模块\n",
|
|
|
|
|
"│ ├── word_counter.py # 词频统计模块\n",
|
|
|
|
|
"│ ├── output_formatter.py # 输出格式化模块\n",
|
|
|
|
|
"│ └── main.py # 主程序入口\n",
|
|
|
|
|
"├── tests/ # 单元测试目录\n",
|
|
|
|
|
"│ ├── __init__.py\n",
|
|
|
|
|
"│ ├── test_data_loader.py\n",
|
|
|
|
|
"│ ├── test_preprocessor.py\n",
|
|
|
|
|
"│ ├── test_word_counter.py\n",
|
|
|
|
|
"│ └── test_output_formatter.py\n",
|
|
|
|
|
"├── requirements.txt # 依赖文件\n",
|
|
|
|
|
"└── README.md # 项目说明"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "d0b55f2e-24ba-49da-8d11-f0f5eea611b0",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"'''\n",
|
|
|
|
|
"src/config.py\n",
|
|
|
|
|
"定义配置项,便于扩展和修改。\n",
|
|
|
|
|
"'''\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"import os\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"class Config:\n",
|
|
|
|
|
" DATA_DIR = \"data\"\n",
|
|
|
|
|
" TOP_N_WORDS = 10\n",
|
|
|
|
|
" STOP_WORDS = {\"的\", \"了\", \"是\", \"在\", \"和\", \"我\", \"你\", \"他\", \"她\"} # 示例停用词\n",
|
|
|
|
|
" ENCODING = \"utf-8\"\n",
|
|
|
|
|
" LOG_LEVEL = \"INFO\"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" @classmethod\n",
|
|
|
|
|
" def get_data_dir(cls):\n",
|
|
|
|
|
" return os.path.join(os.path.dirname(__file__), \"..\", cls.DATA_DIR)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "e5bdcdf0-16a2-4dda-85f1-d018c6370aee",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"'''\n",
|
|
|
|
|
"src/data_loader.py\n",
|
|
|
|
|
"负责加载小说文本,支持目录扫描和文件读取,提供扩展点以支持不同格式。\n",
|
|
|
|
|
"'''\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"import os\n",
|
|
|
|
|
"import logging\n",
|
|
|
|
|
"from src.config import Config\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"class DataLoader:\n",
|
|
|
|
|
" def __init__(self):\n",
|
|
|
|
|
" self.data_dir = Config.get_data_dir()\n",
|
|
|
|
|
" logging.basicConfig(level=Config.LOG_LEVEL)\n",
|
|
|
|
|
" self.logger = logging.getLogger(__name__)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def load_texts(self):\n",
|
|
|
|
|
" \"\"\"加载 data 目录下的所有文本文件\"\"\"\n",
|
|
|
|
|
" texts = []\n",
|
|
|
|
|
" try:\n",
|
|
|
|
|
" for filename in os.listdir(self.data_dir):\n",
|
|
|
|
|
" if filename.endswith(\".txt\"):\n",
|
|
|
|
|
" file_path = os.path.join(self.data_dir, filename)\n",
|
|
|
|
|
" with open(file_path, \"r\", encoding=Config.ENCODING) as f:\n",
|
|
|
|
|
" texts.append(f.read())\n",
|
|
|
|
|
" self.logger.info(f\"Loaded file: {filename}\")\n",
|
|
|
|
|
" if not texts:\n",
|
|
|
|
|
" self.logger.warning(\"No text files found in data directory\")\n",
|
|
|
|
|
" return texts\n",
|
|
|
|
|
" except Exception as e:\n",
|
|
|
|
|
" self.logger.error(f\"Error loading files: {str(e)}\")\n",
|
|
|
|
|
" raise"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "786e7ffa-82bc-46b9-8ffc-444d6796b87b",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"'''\n",
|
|
|
|
|
"src/preprocessor.py\n",
|
|
|
|
|
"文本预处理模块,负责分词和清理,支持扩展以添加更多预处理逻辑。\n",
|
|
|
|
|
"'''\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"import jieba\n",
|
|
|
|
|
"import re\n",
|
|
|
|
|
"from src.config import Config\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def timing_decorator(func):\n",
|
|
|
|
|
" \"\"\"装饰器:记录方法执行时间\"\"\"\n",
|
|
|
|
|
" import time\n",
|
|
|
|
|
" def wrapper(*args, **kwargs):\n",
|
|
|
|
|
" start = time.time()\n",
|
|
|
|
|
" result = func(*args, **kwargs)\n",
|
|
|
|
|
" end = time.time()\n",
|
|
|
|
|
" print(f\"{func.__name__} took {end - start:.2f} seconds\")\n",
|
|
|
|
|
" return result\n",
|
|
|
|
|
" return wrapper\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"class TextPreprocessor:\n",
|
|
|
|
|
" def __init__(self):\n",
|
|
|
|
|
" self.stop_words = Config.STOP_WORDS\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" @timing_decorator\n",
|
|
|
|
|
" def preprocess(self, text):\n",
|
|
|
|
|
" \"\"\"预处理:分词、去除停用词和非中文字符\"\"\"\n",
|
|
|
|
|
" # 移除非中文字符\n",
|
|
|
|
|
" text = re.sub(r\"[^\\u4e00-\\u9fff]\", \" \", text)\n",
|
|
|
|
|
" # 分词\n",
|
|
|
|
|
" words = jieba.cut(text)\n",
|
|
|
|
|
" # 过滤停用词和空字符\n",
|
|
|
|
|
" return [word for word in words if word.strip() and word not in self.stop_words]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "4edd5ca7-4ba7-4446-b93e-2cfd83efca2e",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"'''\n",
|
|
|
|
|
"src/word_counter.py\n",
|
|
|
|
|
"词频统计模块,使用单例模式确保全局唯一计数器。\n",
|
|
|
|
|
"'''\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"from collections import Counter\n",
|
|
|
|
|
"from typing import List, Dict\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"class Singleton:  # why a singleton: ensures one global, shared word counter\n",
|
|
|
|
|
" \"\"\"单例模式装饰器\"\"\"\n",
|
|
|
|
|
" def __init__(self, cls):\n",
|
|
|
|
|
" self._cls = cls\n",
|
|
|
|
|
" self._instance = None\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def __call__(self, *args, **kwargs):\n",
|
|
|
|
|
" if self._instance is None:\n",
|
|
|
|
|
" self._instance = self._cls(*args, **kwargs)\n",
|
|
|
|
|
" return self._instance\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"@Singleton\n",
|
|
|
|
|
"class WordCounter:\n",
|
|
|
|
|
" def __init__(self):\n",
|
|
|
|
|
" self.counter = Counter()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def count_words(self, words: List[str]) -> None:\n",
|
|
|
|
|
" \"\"\"更新词频统计\"\"\"\n",
|
|
|
|
|
" self.counter.update(words)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def get_top_n(self, n: int = 10) -> Dict[str, int]:\n",
|
|
|
|
|
" \"\"\"获取前 N 个高频词\"\"\"\n",
|
|
|
|
|
" return dict(self.counter.most_common(n))\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def reset(self):\n",
|
|
|
|
|
" \"\"\"重置计数器\"\"\"\n",
|
|
|
|
|
" self.counter.clear()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "41af3e0e-3153-4d23-9a9f-65b566b384e8",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"'''\n",
|
|
|
|
|
"src/output_formatter.py\n",
|
|
|
|
|
"输出格式化模块,支持多种输出格式,便于扩展。\n",
|
|
|
|
|
"'''\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"from typing import Dict\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"class OutputFormatter:\n",
|
|
|
|
|
" @staticmethod\n",
|
|
|
|
|
" def format_json(data: Dict[str, int]) -> str:\n",
|
|
|
|
|
" import json\n",
|
|
|
|
|
" return json.dumps(data, ensure_ascii=False, indent=2)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" @staticmethod\n",
|
|
|
|
|
" def format_text(data: Dict[str, int]) -> str:\n",
|
|
|
|
|
" return \"\\n\".join(f\"{word}: {count}\" for word, count in data.items())"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "6596162c-fd42-4b32-b328-9987568b3846",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"'''\n",
|
|
|
|
|
"src/main.py\n",
|
|
|
|
|
"主程序入口,协调各模块工作。\n",
|
|
|
|
|
"'''\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"from src.data_loader import DataLoader\n",
|
|
|
|
|
"from src.preprocessor import TextPreprocessor\n",
|
|
|
|
|
"from src.word_counter import WordCounter\n",
|
|
|
|
|
"from src.output_formatter import OutputFormatter\n",
|
|
|
|
|
"from src.config import Config\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def main():\n",
|
|
|
|
|
" # 初始化模块\n",
|
|
|
|
|
" loader = DataLoader()\n",
|
|
|
|
|
" preprocessor = TextPreprocessor()\n",
|
|
|
|
|
" counter = WordCounter()\n",
|
|
|
|
|
" formatter = OutputFormatter()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # 加载文本\n",
|
|
|
|
|
" texts = loader.load_texts()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # 预处理并统计词频\n",
|
|
|
|
|
" for text in texts:\n",
|
|
|
|
|
" words = preprocessor.preprocess(text)\n",
|
|
|
|
|
" counter.count_words(words)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # 获取结果\n",
|
|
|
|
|
" top_words = counter.get_top_n(Config.TOP_N_WORDS)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # 输出结果\n",
|
|
|
|
|
" print(\"=== Top 10 Words (Text Format) ===\")\n",
|
|
|
|
|
" print(formatter.format_text(top_words))\n",
|
|
|
|
|
" print(\"\\n=== Top 10 Words (JSON Format) ===\")\n",
|
|
|
|
|
" print(formatter.format_json(top_words))\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"if __name__ == \"__main__\":\n",
|
|
|
|
|
" main()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "36a32f17-5ce3-46e2-a563-f151454f6342",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"'''\n",
|
|
|
|
|
"tests/test_data_loader.py\n",
|
|
|
|
|
"单元测试示例,确保数据加载模块的正确性。\n",
|
|
|
|
|
"'''\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"import unittest\n",
|
|
|
|
|
"import os\n",
|
|
|
|
|
"from src.data_loader import DataLoader\n",
|
|
|
|
|
"from src.config import Config\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"class TestDataLoader(unittest.TestCase):\n",
|
|
|
|
|
" def setUp(self):\n",
|
|
|
|
|
" self.loader = DataLoader()\n",
|
|
|
|
|
" # 创建临时测试文件\n",
|
|
|
|
|
" self.test_file = os.path.join(Config.get_data_dir(), \"test_novel.txt\")\n",
|
|
|
|
|
" with open(self.test_file, \"w\", encoding=Config.ENCODING) as f:\n",
|
|
|
|
|
" f.write(\"这是一个测试文本\")\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def test_load_texts(self):\n",
|
|
|
|
|
" texts = self.loader.load_texts()\n",
|
|
|
|
|
" self.assertGreater(len(texts), 0)\n",
|
|
|
|
|
" self.assertIn(\"这是一个测试文本\", texts)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def tearDown(self):\n",
|
|
|
|
|
" if os.path.exists(self.test_file):\n",
|
|
|
|
|
" os.remove(self.test_file)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"if __name__ == \"__main__\":\n",
|
|
|
|
|
" unittest.main()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "1f550544-f0f4-4f0c-bdb7-9928b6820bdf",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"'''\n",
|
|
|
|
|
"tests/test_preprocessor.py\n",
|
|
|
|
|
"测试文本预处理模块。\n",
|
|
|
|
|
"'''\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"import unittest\n",
|
|
|
|
|
"from src.preprocessor import TextPreprocessor\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"class TestTextPreprocessor(unittest.TestCase):\n",
|
|
|
|
|
" def setUp(self):\n",
|
|
|
|
|
" self.preprocessor = TextPreprocessor()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def test_preprocess(self):\n",
|
|
|
|
|
" text = \"这是一个测试文本,包含了123和一些符号!\"\n",
|
|
|
|
|
" words = self.preprocessor.preprocess(text)\n",
|
|
|
|
|
" expected = [\"测试\", \"文本\", \"包含\", \"一些\", \"符号\"]\n",
|
|
|
|
|
" self.assertEqual(words, expected)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"if __name__ == \"__main__\":\n",
|
|
|
|
|
" unittest.main()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "8fb8b4cd-0b27-426a-9556-8f21227c5374",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"'''\n",
|
|
|
|
|
"tests/test_word_counter.py\n",
|
|
|
|
|
"测试词频统计模块。\n",
|
|
|
|
|
"'''\n",
|
|
|
|
|
"import unittest\n",
|
|
|
|
|
"from src.word_counter import WordCounter\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"class TestWordCounter(unittest.TestCase):\n",
|
|
|
|
|
" def setUp(self):\n",
|
|
|
|
|
" self.counter = WordCounter()\n",
" self.counter.reset()  # WordCounter is a singleton; clear state leaked from other tests\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def test_count_words(self):\n",
|
|
|
|
|
" self.counter.count_words([\"测试\", \"文本\", \"测试\"])\n",
|
|
|
|
|
" result = self.counter.get_top_n(2)\n",
|
|
|
|
|
" expected = {\"测试\": 2, \"文本\": 1}\n",
|
|
|
|
|
" self.assertEqual(result, expected)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def test_reset(self):\n",
|
|
|
|
|
" self.counter.count_words([\"测试\"])\n",
|
|
|
|
|
" self.counter.reset()\n",
|
|
|
|
|
" self.assertEqual(self.counter.get_top_n(1), {})\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"if __name__ == \"__main__\":\n",
|
|
|
|
|
" unittest.main()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "4b7507dc-b693-4dbf-9a21-5f2833d13d0e",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"'''\n",
|
|
|
|
|
"requirements.txt\n",
|
|
|
|
|
"列出项目依赖。\n",
|
|
|
|
|
"'''\n",
|
|
|
|
|
"jieba==0.42.1"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "573c4ddd-800e-4b59-9e20-a87d6a2b14cd",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"'''\n",
|
|
|
|
|
"README.md\n",
|
|
|
|
|
"提供项目说明和使用方法。\n",
|
|
|
|
|
"'''\n",
|
|
|
|
|
"# Word Frequency Analysis Project\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"## Overview\n",
|
|
|
|
|
"This project processes 100 novels in the `data` directory, counts word frequencies, and outputs the top 10 words. It demonstrates software engineering principles like modularity, design patterns, and unit testing.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"## Setup\n",
|
|
|
|
|
"1. Install dependencies: `pip install -r requirements.txt`\n",
|
|
|
|
|
"2. Place novel files (.txt) in the `data` directory.\n",
|
|
|
|
|
"3. Run the program: `python -m src.main`\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"## Testing\n",
|
|
|
|
|
"Run tests: `python -m unittest discover tests`\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"## Extensibility\n",
|
|
|
|
|
"- Add new preprocessors in `preprocessor.py`.\n",
|
|
|
|
|
"- Support new output formats in `output_formatter.py`.\n",
|
|
|
|
|
"- Modify configurations in `config.py`."
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "4bd74972-f9c4-4ac9-a557-de4198889047",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## 使用方法\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"准备环境:\n",
|
|
|
|
|
"pip install -r requirements.txt\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"准备数据:\n",
|
|
|
|
|
"- 在 data 目录下放入 100 个 .txt 小说文件(需为 UTF-8 编码)。\n",
|
|
|
|
|
"- 确保安装 jieba 分词库。\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"运行程序:\n",
|
|
|
|
|
"python -m src.main\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"运行测试:\n",
|
|
|
|
|
"python -m unittest discover tests"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "16f7a973-7c49-4d11-ab3f-457d4622e5e6",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## 扩展建议\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"- 支持多语言:在 TextPreprocessor 中添加英文分词(如使用 nltk 或 spacy)。\n",
|
|
|
|
|
"- 数据库存储:将词频结果保存到数据库(如 SQLite),在 WordCounter 中添加存储方法。\n",
|
|
|
|
|
"- 并行处理:使用 multiprocessing 加速大文件处理。\n",
|
|
|
|
|
"- 可视化:在 OutputFormatter 中添加图表输出(如使用 matplotlib)。\n",
|
|
|
|
|
"- 配置文件:将 Config 改为从外部 JSON/YAML 文件加载。"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "b2ad0efb-4c7c-4f98-a809-ce6cdcefdb34",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"## 设计说明\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"模块化设计:\n",
|
|
|
|
|
"- 各模块(DataLoader, TextPreprocessor, WordCounter, OutputFormatter)职责单一,符合单一职责原则(SRP)。\n",
|
|
|
|
|
"- 模块间通过明确接口交互,易于替换或扩展。\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"设计模式:\n",
|
|
|
|
|
"- 单例模式:WordCounter 使用单例模式,确保全局唯一计数器。\n",
|
|
|
|
|
"- 策略模式:OutputFormatter 支持多种输出格式(JSON、Text),易于添加新格式。\n",
|
|
|
|
|
"- 装饰器模式:timing_decorator 用于性能监控,便于扩展其他功能(如日志记录)。\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"可扩展性:\n",
|
|
|
|
|
"- Config 类集中管理配置,便于调整参数(如停用词、输出数量)。\n",
|
|
|
|
|
"- DataLoader 支持动态扫描目录,新增文件无需改动代码。\n",
|
|
|
|
|
"- TextPreprocessor 可扩展以支持其他分词工具或预处理规则。\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"单元测试:\n",
|
|
|
|
|
"- 每个模块都有对应的测试用例,确保功能正确性。\n",
|
|
|
|
|
"- 使用 unittest 框架,支持持续集成。\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"语言特性利用:\n",
|
|
|
|
|
"- 使用 Python 的装饰器(timing_decorator)记录方法执行时间。\n",
|
|
|
|
|
"- 利用类型注解(typing 模块)提高代码可读性。\n",
|
|
|
|
|
"- 异常处理和日志记录(logging)增强鲁棒性。\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"教学用途:\n",
|
|
|
|
|
"- 包含常见工程化实践:模块化、测试驱动开发、配置管理。\n",
|
|
|
|
|
"- 提供扩展点(如支持英文分词、数据库存储),便于学生实践。"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "b1aac488-3a98-418c-8201-e7f77c392a1f",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# text_analyzer.py\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"import os\n",
|
|
|
|
|
"import jieba\n",
|
|
|
|
|
"from collections import Counter\n",
|
|
|
|
|
"import yaml\n",
|
|
|
|
|
"from contextlib import contextmanager\n",
|
|
|
|
|
"from typing import List, Tuple\n",
|
|
|
|
|
"from abc import ABC, abstractmethod\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"@contextmanager\n",
|
|
|
|
|
"def file_reader(file_path: str):\n",
|
|
|
|
|
" try:\n",
|
|
|
|
|
" with open(file_path, 'r', encoding='utf-8') as f:\n",
|
|
|
|
|
" yield f.read()\n",
|
|
|
|
|
" except Exception as e:\n",
|
|
|
|
|
" print(f\"Error reading {file_path}: {e}\")\n",
|
|
|
|
|
" yield \"\"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"class Tokenizer(ABC):\n",
|
|
|
|
|
" @abstractmethod\n",
|
|
|
|
|
" def tokenize(self, text: str, stop_words: set) -> List[str]:\n",
|
|
|
|
|
" pass\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"class JiebaTokenizer(Tokenizer):\n",
|
|
|
|
|
" def tokenize(self, text: str, stop_words: set) -> List[str]:\n",
|
|
|
|
|
" for word in jieba.lcut(text):\n",
|
|
|
|
|
" if word not in stop_words:\n",
|
|
|
|
|
" yield word\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"class SimpleTokenizer(Tokenizer):\n",
|
|
|
|
|
" def tokenize(self, text: str, stop_words: set) -> List[str]:\n",
|
|
|
|
|
" for word in text.split():\n",
|
|
|
|
|
" if word not in stop_words:\n",
|
|
|
|
|
" yield word\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"class TokenizerFactory:\n",
|
|
|
|
|
" @staticmethod\n",
|
|
|
|
|
" def create_tokenizer(name: str) -> Tokenizer:\n",
|
|
|
|
|
" return {'jieba': JiebaTokenizer(), 'simple': SimpleTokenizer()}.get(name, JiebaTokenizer())\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"class OutputObserver(ABC):\n",
|
|
|
|
|
" @abstractmethod\n",
|
|
|
|
|
" def update(self, top_words: List[Tuple[str, int]]):\n",
|
|
|
|
|
" pass\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"class ConsoleOutput(OutputObserver):\n",
|
|
|
|
|
" def update(self, top_words: List[Tuple[str, int]]):\n",
|
|
|
|
|
" for word, count in top_words:\n",
|
|
|
|
|
" print(f\"{word}: {count}\")\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"class FileOutput(OutputObserver):\n",
|
|
|
|
|
" def __init__(self, output_file: str):\n",
|
|
|
|
|
" self.output_file = output_file\n",
|
|
|
|
|
" def update(self, top_words: List[Tuple[str, int]]):\n",
|
|
|
|
|
" with open(self.output_file, 'w', encoding='utf-8') as f:\n",
|
|
|
|
|
" for word, count in top_words:\n",
|
|
|
|
|
" f.write(f\"{word}: {count}\\n\")\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"class TextAnalyzer:\n",
|
|
|
|
|
" def __init__(self, config_path='config.yaml'):\n",
|
|
|
|
|
" with open(config_path, 'r', encoding='utf-8') as f:\n",
|
|
|
|
|
" config = yaml.safe_load(f)\n",
|
|
|
|
|
" self.data_dir = config['data_dir']\n",
|
|
|
|
|
" self.top_n = config['top_n']\n",
|
|
|
|
|
" self.stop_words_file = config['stop_words_file']\n",
|
|
|
|
|
" self.output_file = config['output_file']\n",
|
|
|
|
|
" self.stop_words = self.load_stop_words()\n",
|
|
|
|
|
" self.word_count = Counter()\n",
|
|
|
|
|
" self.tokenizer = TokenizerFactory.create_tokenizer(config.get('tokenizer', 'jieba'))\n",
|
|
|
|
|
" self.observers = [ConsoleOutput(), FileOutput(self.output_file)]\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def load_stop_words(self) -> set:\n",
|
|
|
|
|
" with file_reader(self.stop_words_file) as content:\n",
|
|
|
|
|
" return set(line.strip() for line in content.splitlines() if line.strip())\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def process_file(self, file_path: str):\n",
|
|
|
|
|
" if file_path.endswith('.txt'):\n",
|
|
|
|
|
" with file_reader(file_path) as text:\n",
|
|
|
|
|
" words = self.tokenizer.tokenize(text, self.stop_words)\n",
|
|
|
|
|
" self.word_count.update(words)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def process_directory(self):\n",
|
|
|
|
|
" for file in os.listdir(self.data_dir):\n",
|
|
|
|
|
" file_path = os.path.join(self.data_dir, file)\n",
|
|
|
|
|
" self.process_file(file_path)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def get_top_words(self) -> List[Tuple[str, int]]:\n",
|
|
|
|
|
" return self.word_count.most_common(self.top_n)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def notify_observers(self, top_words: List[Tuple[str, int]]):\n",
|
|
|
|
|
" for observer in self.observers:\n",
|
|
|
|
|
" observer.update(top_words)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def run(self):\n",
|
|
|
|
|
" self.process_directory()\n",
|
|
|
|
|
" top_words = self.get_top_words()\n",
|
|
|
|
|
" self.notify_observers(top_words)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "d5c689f4-e363-4327-9dc4-15c7157d4288",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# main.py\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"from text_analyzer import TextAnalyzer\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def main():\n",
|
|
|
|
|
" analyzer = TextAnalyzer()\n",
|
|
|
|
|
" analyzer.run()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"if __name__ == '__main__':\n",
|
|
|
|
|
" main()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "cc1d9fb1-3bb5-4f71-aeb3-e304511f4785",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"## 结论\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"通过引入上下文管理器、生成器、元编程、策略模式、观察者模式和工厂模式,词频统计代码在可扩展性、可维护性和复用性上进一步提升。\n",
|
|
|
|
|
"这些特性和模式使代码更模块化、灵活,适合大型项目,同时保持清晰的工程结构。结合之前的装饰器和函数式编程,代码已达到工程化水平。\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"若需深入,可以进一步考虑其它性能特性."
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "7244afd0-4405-402a-b9be-75f5d7ff883c",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"## 进一步练习\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"实践练习:\n",
|
|
|
|
|
"- 实现新分词器(如 thulac)并通过策略模式或工厂模式集成。\n",
|
|
|
|
|
"- 添加新观察者(如 JSON 输出)。\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"使用生成器实现流式词频统计,比较内存占用。\n",
|
|
|
|
|
"实现缓存机制,缓存已处理文件的分词结果。\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"添加命令行接口(argparse),动态配置 top_n 和 tokenizer。"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "09c10307-f162-4b36-85b6-6bc01d0001e0",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## 综合实现(整合特性与模式)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"整合上下文管理器、生成器、策略模式和观察者模式的最终实现(部分代码展示)。"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
"display_name": "Python 3 (ipykernel)",
|
|
|
|
|
"language": "python",
|
|
|
|
|
"name": "python3"
|
|
|
|
|
},
|
|
|
|
|
"language_info": {
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
"name": "ipython",
|
|
|
|
|
"version": 3
|
|
|
|
|
},
|
|
|
|
|
"file_extension": ".py",
|
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
|
"name": "python",
|
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
|
"version": "3.12.7"
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
"nbformat_minor": 5
|
|
|
|
|
}
|