diff --git a/D Plus/.ipynb_checkpoints/00 封装-checkpoint.ipynb b/D Plus/.ipynb_checkpoints/00 封装-checkpoint.ipynb new file mode 100644 index 0000000..4d255e0 --- /dev/null +++ b/D Plus/.ipynb_checkpoints/00 封装-checkpoint.ipynb @@ -0,0 +1,478 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "69e76aa7-2c5d-4114-a302-85e17cc83e2c", + "metadata": {}, + "source": [ + "本文旨在通过一个案例(读取 data 目录下 100 篇小说文本,统计词频并输出前 10 高频词)来说明结构化编程和封装方法如何提升代码工程质量。\n", + "教案将逐步展示不同结构化方法和封装技术的应用,并分析其对代码可读性、可维护性、可扩展性和复用性的提升。" + ] + }, + { + "cell_type": "markdown", + "id": "b9a9a366-7fd3-422b-b3bc-b0bc00374da6", + "metadata": {}, + "source": [ + "# 教学目标\n", + "- 掌握封装方法(函数、类、模块)在代码组织中的作用。" + ] + }, + { + "cell_type": "markdown", + "id": "1387e026-c978-4217-9015-ab0e047c01a0", + "metadata": {}, + "source": [ + "## 第一部分:基础实现(无结构化、无封装)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33803186-d890-4cd7-9636-8920fcb86e14", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "files = os.listdir('data')\n", + "word_count = {}\n", + "for file in files:\n", + " with open('data/' + file, 'r', encoding='utf-8') as f:\n", + " text = f.read()\n", + " words = text.split() # 假设简单按空格分词\n", + " for word in words:\n", + " if word in word_count:\n", + " word_count[word] += 1\n", + " else:\n", + " word_count[word] = 1\n", + "\n", + "# 排序并输出前10\n", + "sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)\n", + "for i in range(10):\n", + " print(sorted_words[i])" + ] + }, + { + "cell_type": "markdown", + "id": "471351e7-8645-4690-973a-7d8de53bda5f", + "metadata": {}, + "source": [ + "### 问题分析\n", + "\n", + "- 可读性差:没有清晰的功能划分,代码逻辑混杂,难以阅读理解维护。\n", + "- 扩展性差:如果需要更改分词逻辑、文件路径或输出格式,需修改多处代码。\n", + "- 容错性差:未处理文件读取失败、空文件等问题。\n", + "- 复用性低:逻辑无法直接复用在其他类似任务中。" + ] + }, + { + "cell_type": "markdown", + "id": "a5881283-c295-4433-8edd-f915201a5f43", + "metadata": {}, + "source": [ + "## 第二部分:引入函数封装\n", + "\n", + "提炼出若干函数,减少代码的复杂性,提高可读性和可维护性。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7beadc81-f939-4ac5-b885-407c6810b7de", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "def read_file(file_path):\n", + " \"\"\"读取单个文件内容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + "def get_words(text):\n", + " \"\"\"简单分词(按空格)\"\"\"\n", + " return text.split()\n", + "\n", + "def count_words(words):\n", + " \"\"\"统计词频\"\"\"\n", + " word_count = {}\n", + " for word in words:\n", + " word_count[word] = word_count.get(word, 0) + 1\n", + " return word_count\n", + "\n", + "def get_top_n(word_count, n=10):\n", + " \"\"\"获取前 N 高频词\"\"\"\n", + " return sorted(word_count.items(), key=lambda x: x[1], reverse=True)[:n]\n", + "\n", + "def main():\n", + " \"\"\"主函数,控制流程\"\"\"\n", + " word_count = {}\n", + " data_dir = 'data'\n", + " \n", + " # 顺序结构:按步骤读取文件、处理文本\n", + " for file in os.listdir(data_dir):\n", + " file_path = os.path.join(data_dir, file)\n", + " # 选择结构:检查文件是否为 txt\n", + " if file_path.endswith('.txt'):\n", + " text = read_file(file_path)\n", + " # 循环结构:处理每个文件的词\n", + " words = get_words(text)\n", + " file_word_count = count_words(words)\n", + " # 合并词频\n", + " for word, count in file_word_count.items():\n", + " word_count[word] = word_count.get(word, 0) + count\n", + " \n", + " # 输出结果\n", + " top_words = get_top_n(word_count)\n", + " for word, count in top_words:\n", + " 
print(f\"{word}: {count}\")\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "id": "4f7218a3-43d2-4159-9854-9880020c42fc", + "metadata": {}, + "source": [ + "### 改进分析\n", + " - 逻辑分层:main() 函数清晰定义了程序执行步骤(读取文件 -> 分词 -> 统计 -> 输出)。\n", + " - 模块化:将功能拆分为函数(read_file、get_words、count_words、get_top_n),提高代码复用性和可读性。\n", + " - 错误处理:增加 try-except 处理文件读取异常。\n", + " - 工程质量提升:\n", + " - 可读性:函数命名本身就帮助理解代码,逻辑分块。\n", + " - 可维护性:修改某部分功能(如分词逻辑)只需改对应函数。\n", + " - 复用性:函数可复用在其他类似任务中。" + ] + }, + { + "cell_type": "markdown", + "id": "50737966-57c9-4daf-ac3b-6a1c73b18136", + "metadata": {}, + "source": [ + "## 第三部分:引入类封装\n", + "\n", + "通过类封装功能,进一步提高代码的模块化、可扩展性和复用性。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81aa7f9c-de28-4a7a-8ba1-130c3e5e4f7f", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "\n", + "class TextAnalyzer:\n", + " \"\"\"文本分析类,封装词频统计功能\"\"\"\n", + " def __init__(self, data_dir='data', top_n=10):\n", + " self.data_dir = data_dir\n", + " self.top_n = top_n\n", + " self.word_count = Counter()\n", + "\n", + " def read_file(self, file_path):\n", + " \"\"\"读取文件内容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + " def tokenize(self, text):\n", + " \"\"\"使用 jieba 进行中文分词\"\"\"\n", + " return jieba.lcut(text)\n", + "\n", + " def process_file(self, file_path):\n", + " \"\"\"处理单个文件\"\"\"\n", + " if file_path.endswith('.txt'):\n", + " text = self.read_file(file_path)\n", + " words = self.tokenize(text)\n", + " self.word_count.update(words)\n", + "\n", + " def process_directory(self):\n", + " \"\"\"处理目录下所有文件\"\"\"\n", + " for file in os.listdir(self.data_dir):\n", + " file_path = os.path.join(self.data_dir, file)\n", + " self.process_file(file_path)\n", + "\n", + " def get_top_words(self):\n", + " \"\"\"获取前 N 高频词\"\"\"\n", + " return self.word_count.most_common(self.top_n)\n", + "\n", + " def run(self):\n", + " \"\"\"执行词频统计\"\"\"\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")\n", + "\n", + "def main():\n", + " analyzer = TextAnalyzer(data_dir='data', top_n=10)\n", + " analyzer.run()\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "id": "62e780d4-94de-4830-89c2-ab2c96500fc5", + "metadata": {}, + "source": [ + "### 改进分析\n", + "- 面向对象封装:\n", + " - 使用 TextAnalyzer 类将所有功能封装为一个对象,数据(如 word_count)和方法(如 tokenize)绑定在一起。\n", + " - 通过 __init__ 提供配置(如 data_dir 和 top_n),提高灵活性。\n", + " \n", + "- 模块化:类方法分工明确(如 read_file、tokenize、process_file),便于扩展。\n", + "- 工程质量提升:\n", + " - 可扩展性:可通过继承 TextAnalyzer 添加新功能(如支持其他分词器或文件格式)。\n", + " - 复用性:类可实例化多次,用于不同目录或参数。\n", + " - 可维护性:逻辑集中在类中,修改相对安全。" + ] + }, + { + "cell_type": "markdown", + "id": "9b4e17c4-f47e-4245-b3d9-e40fde0a2e04", + "metadata": {}, + "source": [ + "# 第四部分:引入文件模块封装\n", + "将代码进一步模块化到不同文件,引入配置文件和停用词过滤。" + ] + }, + { + "cell_type": "raw", + "id": "aadb5aea-8cc5-4a0f-9f5b-7eab28e90f1a", + "metadata": {}, + "source": [ + "目录结构\n", + "\n", + "project/\n", + "├── data/ # 小说文本目录\n", + "├── config.yaml # 配置文件\n", + "├── stop_words.txt # 停用词文件\n", + "├── text_analyzer.py # 分析模块\n", + "├── main.py # 主程序" + ] + }, + { + "cell_type": "raw", + "id": "2de4767b-8928-4f3f-8c8b-3c3cba2bc98a", + "metadata": {}, + 
"source": [ + "# config.yaml\n", + "\n", + "data_dir: data\n", + "top_n: 10\n", + "stop_words_file: stop_words.txt\n", + "output_file: output.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b442d61-c937-4757-b7b4-b6fc047c3529", + "metadata": {}, + "outputs": [], + "source": [ + "# text_analyzer.py\n", + "\n", + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "import yaml\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.word_count = Counter()\n", + " self.stop_words = self.load_stop_words()\n", + "\n", + " def load_stop_words(self):\n", + " \"\"\"加载停用词\"\"\"\n", + " try:\n", + " with open(self.stop_words_file, 'r', encoding='utf-8') as f:\n", + " return set(line.strip() for line in f if line.strip())\n", + " except Exception as e:\n", + " print(f\"Error loading stop words: {e}\")\n", + " return set()\n", + "\n", + " def read_file(self, file_path):\n", + " \"\"\"读取文件内容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + " def tokenize(self, text):\n", + " \"\"\"中文分词并过滤停用词\"\"\"\n", + " words = jieba.lcut(text)\n", + " return [word for word in words if word not in self.stop_words]\n", + "\n", + " def process_file(self, file_path):\n", + " \"\"\"处理单个文件\"\"\"\n", + " if file_path.endswith('.txt'):\n", + " text = self.read_file(file_path)\n", + " words = self.tokenize(text)\n", + " self.word_count.update(words)\n", + "\n", + " def process_directory(self):\n", + " \"\"\"处理目录下所有文件\"\"\"\n", + " for file in os.listdir(self.data_dir):\n", + " file_path = os.path.join(self.data_dir, file)\n", + " self.process_file(file_path)\n", + "\n", + " def get_top_words(self):\n", + " \"\"\"获取前 N 高频词\"\"\"\n", + " return self.word_count.most_common(self.top_n)\n", + "\n", + " def save_results(self, top_words):\n", + " \"\"\"保存结果到文件\"\"\"\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + " def run(self):\n", + " \"\"\"执行词频统计并保存结果\"\"\"\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " self.save_results(top_words)\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22f58992-0108-4c90-894d-e756e7301a5a", + "metadata": {}, + "outputs": [], + "source": [ + "# main.py\n", + "\n", + "from text_analyzer import TextAnalyzer\n", + "\n", + "def main():\n", + " analyzer = TextAnalyzer()\n", + " analyzer.run()\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "id": "18d27410-8923-4662-a6b7-8e027609506e", + "metadata": {}, + "source": [ + "## 改进分析\n", + "\n", + "- 模块化:将分析逻辑放入 text_analyzer.py,主程序 main.py 仅负责调用,符合工程化项目结构。\n", + "- 配置文件:通过 config.yaml 配置参数,增强灵活性,无需修改代码即可更改目录、输出文件等。\n", + "- 输出到文件:增加 save_results 方法,支持结果持久化。\n", + "- 工程质量提升:\n", + " - 可维护性:配置文件和模块化分离了配置与逻辑,修改配置无需动代码。 \n", + " - 复用性:模块可导入到其他项目,类可重复实例化。" + ] + }, + { + "cell_type": "markdown", + "id": 
"10876929-69f9-43bf-ba2d-a5d7bb11f22b", + "metadata": {}, + "source": [ + "### 封装的总节\n", + "\n", + "封装方法:\n", + "- 模块化:函数划分逻辑,降低耦合。\n", + "- 函数封装:将重复逻辑封装为函数,提高复用性。\n", + "- 类封装:将数据和方法绑定,增强代码组织性和扩展性。\n", + "- 文件封装:通过文件模块化,符合工程化开发规范。\n", + "\n", + "工程质量提升:\n", + "- 分离配置与逻辑,降低维护成本。\n", + "- 模块化和面向对象设计支持功能扩展。\n", + "- 错误处理提高程序鲁棒性。" + ] + }, + { + "cell_type": "raw", + "id": "60ba30d8-d8c2-4183-996e-376ff71716bf", + "metadata": {}, + "source": [ + "## 另外一种文件模块化设计(分层架构)示例\n", + "\n", + "将代码拆分为独立模块,每个模块仅负责单一职责:\n", + " - 数据读取层:遍历目录、读取文件内容\n", + " - 数据处理层:文本清洗、分词、统计词频\n", + " - 结果输出层:排序并输出前10高频词\n", + "\n", + "目录结构:\n", + "project/\n", + "├── data_loader.py # 数据读取模块\n", + "├── text_processor.py # 数据处理模块\n", + "├── output_handler.py # 结果输出模块\n", + "└── main.py # 主程序入口" + ] + }, + { + "cell_type": "markdown", + "id": "517759ac-c4cf-402e-86f1-a9fae0d88bbb", + "metadata": {}, + "source": [ + "# 第七部分:运行说明\n", + "\n", + "环境准备:\n", + "- 安装 Python 3.8+。\n", + "- 安装依赖:pip install jieba pyyaml。\n", + "- 准备 data 目录,放入 100 个 txt 文件。\n", + "- 创建 stop_words.txt 和 config.yaml。" + ] + }, + { + "cell_type": "markdown", + "id": "a7e1836b-42a1-45f9-bf8c-2e04a38744e4", + "metadata": {}, + "source": [ + "通过从无结构到结构化,再到面向对象和模块化的逐步优化,展示了结构化编程和封装方法如何显著提升代码工程质量。最终实现不仅满足了词频统计需求,还具备高可读性、可维护性、可扩展性和复用性,适合实际工程应用。" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/D Plus/.ipynb_checkpoints/01 特殊执行方式的语言特性.ipynb-checkpoint.ipynb b/D Plus/.ipynb_checkpoints/01 特殊执行方式的语言特性.ipynb-checkpoint.ipynb new file mode 100644 index 0000000..65b84ab --- /dev/null +++ b/D Plus/.ipynb_checkpoints/01 特殊执行方式的语言特性.ipynb-checkpoint.ipynb @@ -0,0 +1,312 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "86405617-889a-40c2-a895-7b51fb14b65d", + "metadata": {}, + "source": [ + "# 教学目标\n", + "\n", + "- 在词频统计案例中引入装饰器和函数式编程 。\n", + "- 分析这些特性和模式如何进一步优化代码质量(可读性、可维护性、可扩展性、复用性)。\n", + "- 探讨高级特性在案例中的适用性与局限性。" + ] + }, + { + "cell_type": "markdown", + "id": "e6a6a633-d3af-4778-815c-4490dff5f624", + "metadata": {}, + "source": [ + "## 第一部分:引入装饰器\n", + "\n", + "装饰器可用于在不修改函数代码的情况下添加功能。适合日志记录、性能分析、错误处理等场景。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a5c7d69-d445-4a9c-bb48-7fde0a36c646", + "metadata": {}, + "outputs": [], + "source": [ + "# 为 TextAnalyzer 类添加一个装饰器,用于记录方法执行时间。\n", + "\n", + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "import yaml\n", + "import time\n", + "import functools\n", + "\n", + "def timing_decorator(func):\n", + " \"\"\"装饰器:记录函数执行时间\"\"\"\n", + " @functools.wraps(func)\n", + " def wrapper(*args, **kwargs):\n", + " start_time = time.time()\n", + " result = func(*args, **kwargs)\n", + " end_time = time.time()\n", + " print(f\"{func.__name__} took {end_time - start_time:.4f} seconds\")\n", + " return result\n", + " return wrapper\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = 
config['output_file']\n", + " self.word_count = Counter()\n", + " self.stop_words = self.load_stop_words()\n", + "\n", + " def load_stop_words(self):\n", + " \"\"\"加载停用词\"\"\"\n", + " try:\n", + " with open(self.stop_words_file, 'r', encoding='utf-8') as f:\n", + " return set(line.strip() for line in f if line.strip())\n", + " except Exception as e:\n", + " print(f\"Error loading stop words: {e}\")\n", + " return set()\n", + "\n", + " @timing_decorator\n", + " def read_file(self, file_path):\n", + " \"\"\"读取文件内容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + " @timing_decorator\n", + " def tokenize(self, text):\n", + " \"\"\"中文分词并过滤停用词\"\"\"\n", + " words = jieba.lcut(text)\n", + " return [word for word in words if word not in self.stop_words]\n", + "\n", + " def process_file(self, file_path):\n", + " \"\"\"处理单个文件\"\"\"\n", + " if file_path.endswith('.txt'):\n", + " text = self.read_file(file_path)\n", + " words = self.tokenize(text)\n", + " self.word_count.update(words)\n", + "\n", + " def process_directory(self):\n", + " \"\"\"处理目录下所有文件\"\"\"\n", + " for file in os.listdir(self.data_dir):\n", + " file_path = os.path.join(self.data_dir, file)\n", + " self.process_file(file_path)\n", + "\n", + " def get_top_words(self):\n", + " \"\"\"获取前 N 高频词\"\"\"\n", + " return self.word_count.most_common(self.top_n)\n", + "\n", + " def save_results(self, top_words):\n", + " \"\"\"保存结果到文件\"\"\"\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + " def run(self):\n", + " \"\"\"执行词频统计并保存结果\"\"\"\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " self.save_results(top_words)\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4dcabfd9-b8f9-4796-a060-9d9f6689e92f", + "metadata": {}, + "source": [ + "### 装饰器分析\n", + "\n", + "功能:timing_decorator 记录 read_file 和 tokenize 方法的执行时间,帮助分析性能瓶颈(如分词耗时较长)。\n", + "\n", + "工程质量提升:\n", + " - 可维护性:无需修改原方法代码即可添加性能监控,符合开闭原则,维护更方便。\n", + " - 可读性:装饰器将性能监控逻辑与业务逻辑分离,代码更清晰。\n", + " - 复用性:timing_decorator 可复用于其他方法或项目。\n", + "\n", + "局限性:装饰器增加少量性能开销,需谨慎用于高频调用的函数。" + ] + }, + { + "cell_type": "markdown", + "id": "8fcbe48d-de8f-4387-9be3-f05f88553029", + "metadata": {}, + "source": [ + "## 第二部分:引入函数式编程\n", + "\n", + "函数式编程(如高阶函数、lambda、map/reduce)强调无变量污染、数据转换简洁性。在词频统计案例中,函数式编程可用于:\n", + "- 数据处理:使用 map 和 filter 处理文件和单词。\n", + "- 词频统计:使用 reduce 合并词频。\n", + "- 管道式处理:通过函数组合实现数据流处理。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a6970b2-7488-43e3-ae9f-0174ff9b4b57", + "metadata": {}, + "outputs": [], + "source": [ + "# 函数式处理文件和词频\n", + "\n", + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "import yaml\n", + "from functools import reduce\n", + "from typing import List, Tuple\n", + "\n", + "def timing_decorator(func):\n", + " \"\"\"装饰器:记录函数执行时间\"\"\"\n", + " import time\n", + " import functools\n", + " @functools.wraps(func)\n", + " def wrapper(*args, **kwargs):\n", + " start_time = time.time()\n", + " result = func(*args, **kwargs)\n", + " end_time = time.time()\n", + " print(f\"{func.__name__} took {end_time - start_time:.4f} seconds\")\n", + " return result\n", + " return wrapper\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, 
config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + "\n", + " def load_stop_words(self) -> set:\n", + " \"\"\"加载停用词\"\"\"\n", + " try:\n", + " with open(self.stop_words_file, 'r', encoding='utf-8') as f:\n", + " return set(line.strip() for line in f if line.strip())\n", + " except Exception as e:\n", + " print(f\"Error loading stop words: {e}\")\n", + " return set()\n", + "\n", + " @timing_decorator\n", + " def read_file(self, file_path: str) -> str:\n", + " \"\"\"读取文件内容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + " def tokenize(self, text: str) -> List[str]:\n", + " \"\"\"中文分词并过滤停用词(函数式)\"\"\"\n", + " return list(filter(lambda w: w not in self.stop_words, jieba.lcut(text)))\n", + "\n", + " def process_file(self, file_path: str) -> Counter:\n", + " \"\"\"处理单个文件,返回词频 Counter\"\"\"\n", + " if file_path.endswith('.txt'):\n", + " text = self.read_file(file Couple(path)\n", + " words = self.tokenize(text)\n", + " return Counter(words)\n", + " return Counter()\n", + "\n", + " def process_directory(self) -> Counter:\n", + " \"\"\"处理目录下所有文件(函数式)\"\"\"\n", + " file_paths = (os.path.join(self.data_dir, f) for f in os.listdir(self.data_dir))\n", + " counters = map(self.process_file, file_paths)\n", + " return reduce(lambda c1, c2: c1 + c2, counters, Counter())\n", + "\n", + " def get_top_words(self, word_count: Counter) -> List[Tuple[str, int]]:\n", + " \"\"\"获取前 N 高频词\"\"\"\n", + " return word_count.most_common(self.top_n)\n", + "\n", + " def save_results(self, top_words: List[Tuple[str, int]]):\n", + " \"\"\"保存结果到文件\"\"\"\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + " def run(self):\n", + " \"\"\"执行词频统计并保存结果\"\"\"\n", + " word_count = self.process_directory()\n", + " top_words = self.get_top_words(word_count)\n", + " self.save_results(top_words)\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")" + ] + }, + { + "cell_type": "markdown", + "id": "6ce3b7c3-f099-4e2c-b415-18b0e3ab492a", + "metadata": {}, + "source": [ + "### 函数式编程分析\n", + "\n", + "改进:\n", + "- map:在 process_directory 中,使用 map(self.process_file, file_paths) 并行处理文件路径,生成词频 Counter 列表。\n", + "- reduce:使用 reduce(lambda c1, c2: c1 + c2, counters, Counter()) 合并所有文件的词频,简洁且无副作用。\n", + "- filter:在 tokenize 中,使用 filter(lambda w: w not in self.stop_words, ...) 
过滤停用词,替代列表推导式。\n", + "- 生成器:file_paths 使用生成器表达式,减少内存占用。\n", + "\n", + "工程质量提升:\n", + "- 可读性:函数式编程使数据处理逻辑更简洁,管道式处理清晰表达数据流(文件路径 -> 词频 -> 合并)。\n", + "- 性能:生成器和 map 优化内存使用,适合处理大量文件。\n", + "- 可维护性:函数式代码无副作用,易于测试和调试。\n", + "- 适用场景:适合数据转换和批量处理(如文件读取、词频合并)。\n", + "- 简洁性:map、reduce 等使数据处理逻辑更紧凑。\n", + "- 内存效率:生成器和惰性求值优化内存使用。\n", + "- 结合并发可显著提升效率。\n", + "\n", + "适用场景:数据流处理(如文件处理、词频合并)、无状态操作。\n", + "\n", + "局限性:\n", + "- 函数式代码对初学者可能不够直观,需熟悉 map、reduce 等概念。\n", + "- 对于复杂逻辑,函数式编程可能增加调试难度。" + ] + }, + { + "cell_type": "markdown", + "id": "458e18ec-b536-4860-9e12-d0bf5ed9d876", + "metadata": {}, + "source": [ + "# 练习\n", + "\n", + "实践练习:\n", + "- 添加日志装饰器,记录每次文件处理的详细信息。\n", + "- 使用 functools.reduce 重写 get_top_words,尝试不同排序逻辑。\n", + "\n", + "扩展任务:\n", + "- 添加缓存装饰器,避免重复分词相同文件。\n", + "- 实现函数式管道,将文件读取、分词、统计串联为单一流。" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/D Plus/.ipynb_checkpoints/02 设计模式-checkpoint.ipynb b/D Plus/.ipynb_checkpoints/02 设计模式-checkpoint.ipynb new file mode 100644 index 0000000..8f14f9e --- /dev/null +++ b/D Plus/.ipynb_checkpoints/02 设计模式-checkpoint.ipynb @@ -0,0 +1,493 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "eccfe49f-de35-4241-90e3-a7095940b61a", + "metadata": {}, + "source": [ + "设计模式提供高频重复出现需求的最佳解决方案。以下介绍适合词频统计案例的设计模式:策略模式、观察者模式、工厂模式。" + ] + }, + { + "cell_type": "markdown", + "id": "c186171f-d1f2-433e-a3eb-b266e2909a2c", + "metadata": {}, + "source": [ + "## 策略模式(动态选择分词策略)\n", + "\n", + "策略模式允许动态切换算法(如分词器),比元编程简单。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97c865cb-0b5a-4fa1-aa74-5ba2e65e7436", + "metadata": {}, + "outputs": [], + "source": [ + "from abc import ABC, abstractmethod\n", + "\n", + "class Tokenizer(ABC):\n", + " \"\"\"分词器接口\"\"\"\n", + " @abstractmethod\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " pass\n", + "\n", + "class JiebaTokenizer(Tokenizer):\n", + " \"\"\"jieba 分词器\"\"\"\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " return [w for w in jieba.lcut(text) if w not in stop_words]\n", + "\n", + "class SimpleTokenizer(Tokenizer):\n", + " \"\"\"简单分词器\"\"\"\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " return [w for w in text.split() if w not in stop_words]\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " # 动态选择分词器\n", + " tokenizer_name = config.get('tokenizer', 'jieba')\n", + " self.tokenizer = {'jieba': JiebaTokenizer(), 'simple': SimpleTokenizer()}[tokenizer_name]\n", + "\n", + " def tokenize(self, text: str) -> List[str]:\n", + " \"\"\"使用策略分词\"\"\"\n", + " return self.tokenizer.tokenize(text, self.stop_words)\n", + "\n", + " # 其余方法同上" + ] + }, + { + "cell_type": "markdown", + "id": "5435ebc3-d3b0-4475-8bd5-cb45fb51638c", + 
"metadata": {}, + "source": [ + "工程质量提升:\n", + "- 可扩展性:添加新分词器只需实现 Tokenizer 接口。\n", + "- 可维护性:分词逻辑与主类分离,修改更独立。\n", + "\n", + "适用场景:适合需要动态切换算法的场景。" + ] + }, + { + "cell_type": "markdown", + "id": "fbf53455-558c-40fb-8718-446dec989b5d", + "metadata": {}, + "source": [ + "## 观察者模式(结果输出解耦)\n", + "\n", + "观察者模式可用于解耦结果输出逻辑(如打印、保存文件、发送通知)。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7a2bd4c-df73-4800-b45b-9b6c73d28d7b", + "metadata": {}, + "outputs": [], + "source": [ + "class OutputObserver(ABC):\n", + " \"\"\"输出观察者接口\"\"\"\n", + " @abstractmethod\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " pass\n", + "\n", + "class ConsoleOutput(OutputObserver):\n", + " \"\"\"控制台输出\"\"\"\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")\n", + "\n", + "class FileOutput(OutputObserver):\n", + " \"\"\"文件输出\"\"\"\n", + " def __init__(self, output_file: str):\n", + " self.output_file = output_file\n", + "\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " self.observers = [ConsoleOutput(), FileOutput(self.output_file)]\n", + "\n", + " def add_observer(self, observer: OutputObserver):\n", + " \"\"\"添加观察者\"\"\"\n", + " self.observers.append(observer)\n", + "\n", + " def notify_observers(self, top_words: List[Tuple[str, int]]):\n", + " \"\"\"通知所有观察者\"\"\"\n", + " for observer in self.observers:\n", + " observer.update(top_words)\n", + "\n", + " def run(self):\n", + " \"\"\"执行词频统计并通知观察者\"\"\"\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " self.notify_observers(top_words)\n", + "\n", + " # 其余方法同上" + ] + }, + { + "cell_type": "markdown", + "id": "02b5cfba-431c-4a01-a454-099e4f41922c", + "metadata": {}, + "source": [ + "### 分析\n", + "\n", + "工程质量提升:\n", + " - 可扩展性:添加新输出方式只需实现 OutputObserver 接口。\n", + " - 解耦性:输出逻辑与统计逻辑分离,修改输出不影响核心功能。\n", + "\n", + "适用场景:适合需要多种输出或通知的场景。\n", + "\n", + "局限性:观察者模式增加代码复杂性,适合复杂输出需求。" + ] + }, + { + "cell_type": "markdown", + "id": "11669305-8cd5-4317-afd5-e85c3f0a5a81", + "metadata": {}, + "source": [ + "## 工厂模式(动态创建分词器)\n", + "\n", + "工厂模式可用于动态创建分词器,简化策略模式中的初始化逻辑。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fa50633-de22-40c8-912d-3ded5ebcedfc", + "metadata": {}, + "outputs": [], + "source": [ + "class TokenizerFactory:\n", + " \"\"\"分词器工厂\"\"\"\n", + " @staticmethod\n", + " def create_tokenizer(name: str) -> Tokenizer:\n", + " tokenizers = {\n", + " 'jieba': JiebaTokenizer(),\n", + " 'simple': SimpleTokenizer()\n", + " }\n", + " return tokenizers.get(name, JiebaTokenizer())\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = 
config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " self.tokenizer = TokenizerFactory.create_tokenizer(config.get('tokenizer', 'jieba'))\n", + "\n", + " # 其余方法同上" + ] + }, + { + "cell_type": "markdown", + "id": "a4db7046-dfe2-4bd8-81d1-49a42e2eeb5c", + "metadata": {}, + "source": [ + "### 分析\n", + "\n", + "工程质量提升:\n", + " - 可维护性:分词器创建逻辑集中于工厂,易于修改。\n", + " - 可扩展性:添加新分词器只需更新工厂方法。\n", + "\n", + "适用场景:适合需要动态创建对象的场景。\n", + "\n", + "局限性:对于简单场景,工厂模式可能略显冗余。" + ] + }, + { + "cell_type": "markdown", + "id": "e5f2aef4-a055-43a9-917c-fa183de6db2d", + "metadata": {}, + "source": [ + "## 综合实现(整合特性与模式)\n", + "\n", + "整合上下文管理器、生成器、策略模式和观察者模式的最终实现(部分代码展示)。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa7f34e2-d355-4a22-8572-729c49b18605", + "metadata": {}, + "outputs": [], + "source": [ + "# text_analyzer.py\n", + "\n", + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "import yaml\n", + "from contextlib import contextmanager\n", + "from typing import List, Tuple\n", + "from abc import ABC, abstractmethod\n", + "\n", + "@contextmanager\n", + "def file_reader(file_path: str):\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " yield f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " yield \"\"\n", + "\n", + "class Tokenizer(ABC):\n", + " @abstractmethod\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " pass\n", + "\n", + "class JiebaTokenizer(Tokenizer):\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " for word in jieba.lcut(text):\n", + " if word not in stop_words:\n", + " yield word\n", + "\n", + "class SimpleTokenizer(Tokenizer):\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " for word in text.split():\n", + " if word not in stop_words:\n", + " yield word\n", + "\n", + "class TokenizerFactory:\n", + " @staticmethod\n", + " def create_tokenizer(name: str) -> Tokenizer:\n", + " return {'jieba': JiebaTokenizer(), 'simple': SimpleTokenizer()}.get(name, JiebaTokenizer())\n", + "\n", + "class OutputObserver(ABC):\n", + " @abstractmethod\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " pass\n", + "\n", + "class ConsoleOutput(OutputObserver):\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")\n", + "\n", + "class FileOutput(OutputObserver):\n", + " def __init__(self, output_file: str):\n", + " self.output_file = output_file\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " self.tokenizer = TokenizerFactory.create_tokenizer(config.get('tokenizer', 'jieba'))\n", + " self.observers = [ConsoleOutput(), FileOutput(self.output_file)]\n", + "\n", + " def load_stop_words(self) -> set:\n", 
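+ "        \"\"\"通过 file_reader 上下文管理器加载停用词;读取失败时 file_reader 产出空串,这里相应返回空集合\"\"\"\n",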
+ " with file_reader(self.stop_words_file) as content:\n", + " return set(line.strip() for line in content.splitlines() if line.strip())\n", + "\n", + " def process_file(self, file_path: str):\n", + " if file_path.endswith('.txt'):\n", + " with file_reader(file_path) as text:\n", + " words = self.tokenizer.tokenize(text, self.stop_words)\n", + " self.word_count.update(words)\n", + "\n", + " def process_directory(self):\n", + " for file in os.listdir(self.data_dir):\n", + " file_path = os.path.join(self.data_dir, file)\n", + " self.process_file(file_path)\n", + "\n", + " def get_top_words(self) -> List[Tuple[str, int]]:\n", + " return self.word_count.most_common(self.top_n)\n", + "\n", + " def notify_observers(self, top_words: List[Tuple[str, int]]):\n", + " for observer in self.observers:\n", + " observer.update(top_words)\n", + "\n", + " def run(self):\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " self.notify_observers(top_words)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d130312-b298-4c76-ae09-0fb4bd08b0c1", + "metadata": {}, + "outputs": [], + "source": [ + "# main.py\n", + "\n", + "from text_analyzer import TextAnalyzer\n", + "\n", + "def main():\n", + " analyzer = TextAnalyzer()\n", + " analyzer.run()\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "id": "770618c9-428e-454a-97de-00e3b49c9d03", + "metadata": {}, + "source": [ + "## 结论\n", + "\n", + "通过引入上下文管理器、生成器、元编程、策略模式、观察者模式和工厂模式,词频统计代码在可扩展性、可维护性和复用性上进一步提升。\n", + "这些特性和模式使代码更模块化、灵活,适合大型项目,同时保持清晰的工程结构。结合之前的装饰器和函数式编程,代码已达到工程化水平。\n", + "\n", + "若需深入,可以进一步考虑其它性能特性." + ] + }, + { + "cell_type": "markdown", + "id": "cbeaa07d-272f-465b-a437-9c4b44827d23", + "metadata": {}, + "source": [ + "## 进一步练习\n", + "\n", + "实践练习:\n", + "- 实现新分词器(如 thulac)并通过策略模式或工厂模式集成。\n", + "- 添加新观察者(如 JSON 输出)。\n", + "\n", + "使用生成器实现流式词频统计,比较内存占用。\n", + "实现缓存机制,缓存已处理文件的分词结果。\n", + "\n", + "添加命令行接口(argparse),动态配置 top_n 和 tokenizer。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a43b53d-1e07-4ebe-a6c8-104353fd5f7b", + "metadata": {}, + "outputs": [], + "source": [ + "## 附:元编程\n", + "\n", + "元编程允许动态修改类或函数行为,可用于动态配置分词器或输出格式。案例中,可通过元编程动态注册分词器。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4394008c-88da-44bd-aa0d-f1b7a6dbc7d6", + "metadata": {}, + "outputs": [], + "source": [ + "class TokenizerRegistry(type):\n", + " \"\"\"元类:动态注册分词器\"\"\"\n", + " tokenizers = {}\n", + "\n", + " def register_tokenizer(cls, name):\n", + " def decorator(func):\n", + " cls.tokenizers[name] = func\n", + " return func\n", + " return decorator\n", + "\n", + "class TextAnalyzer(metaclass=TokenizerRegistry):\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " self.tokenizer_name = config.get('tokenizer', 'jieba') # 从配置读取分词器\n", + "\n", + " @classmethod\n", + " def register_tokenizer(cls, name):\n", + " return cls.__class__.register_tokenizer(name)\n", + "\n", + " def tokenize(self, text: str) -> List[str]:\n", + " \"\"\"动态调用分词器\"\"\"\n", + " tokenizer = self.__class__.tokenizers.get(self.tokenizer_name, self.jieba_tokenizer)\n", + " return 
tokenizer(self, text)\n", + "\n", + " @register_tokenizer('jieba')\n", + " def jieba_tokenizer(self, text: str) -> List[str]:\n", + " \"\"\"jieba 分词\"\"\"\n", + " return [w for w in jieba.lcut(text) if w not in self.stop_words]\n", + "\n", + " @register_tokenizer('simple')\n", + " def simple_tokenizer(self, text: str) -> List[str]:\n", + " \"\"\"简单分词(按空格)\"\"\"\n", + " return [w for w in text.split() if w not in self.stop_words]\n", + "\n", + " # 其余方法(load_stop_words, process_file, etc.)同上" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2249f13a-7a3f-4376-ba2a-d92f11658d32", + "metadata": {}, + "outputs": [], + "source": [ + "### 分析\n", + "\n", + "功能:通过元类和装饰器动态注册分词器,支持配置切换(如 jieba 或 simple)。\n", + "\n", + "工程质量提升:\n", + " 可扩展性:新分词器只需添加新方法并注册,无需修改核心逻辑。\n", + " 灵活性:通过配置文件动态选择分词器。\n", + "\n", + "适用场景:适合需要动态配置或插件化系统的场景。\n", + "\n", + "局限性:元编程增加代码复杂性,可能降低可读性,需谨慎使用。" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/D Plus/00 封装.ipynb b/D Plus/00 封装.ipynb new file mode 100644 index 0000000..4d255e0 --- /dev/null +++ b/D Plus/00 封装.ipynb @@ -0,0 +1,478 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "69e76aa7-2c5d-4114-a302-85e17cc83e2c", + "metadata": {}, + "source": [ + "本文旨在通过一个案例(读取 data 目录下 100 篇小说文本,统计词频并输出前 10 高频词)来说明结构化编程和封装方法如何提升代码工程质量。\n", + "教案将逐步展示不同结构化方法和封装技术的应用,并分析其对代码可读性、可维护性、可扩展性和复用性的提升。" + ] + }, + { + "cell_type": "markdown", + "id": "b9a9a366-7fd3-422b-b3bc-b0bc00374da6", + "metadata": {}, + "source": [ + "# 教学目标\n", + "- 掌握封装方法(函数、类、模块)在代码组织中的作用。" + ] + }, + { + "cell_type": "markdown", + "id": "1387e026-c978-4217-9015-ab0e047c01a0", + "metadata": {}, + "source": [ + "## 第一部分:基础实现(无结构化、无封装)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33803186-d890-4cd7-9636-8920fcb86e14", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "files = os.listdir('data')\n", + "word_count = {}\n", + "for file in files:\n", + " with open('data/' + file, 'r', encoding='utf-8') as f:\n", + " text = f.read()\n", + " words = text.split() # 假设简单按空格分词\n", + " for word in words:\n", + " if word in word_count:\n", + " word_count[word] += 1\n", + " else:\n", + " word_count[word] = 1\n", + "\n", + "# 排序并输出前10\n", + "sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)\n", + "for i in range(10):\n", + " print(sorted_words[i])" + ] + }, + { + "cell_type": "markdown", + "id": "471351e7-8645-4690-973a-7d8de53bda5f", + "metadata": {}, + "source": [ + "### 问题分析\n", + "\n", + "- 可读性差:没有清晰的功能划分,代码逻辑混杂,难以阅读理解维护。\n", + "- 扩展性差:如果需要更改分词逻辑、文件路径或输出格式,需修改多处代码。\n", + "- 容错性差:未处理文件读取失败、空文件等问题。\n", + "- 复用性低:逻辑无法直接复用在其他类似任务中。" + ] + }, + { + "cell_type": "markdown", + "id": "a5881283-c295-4433-8edd-f915201a5f43", + "metadata": {}, + "source": [ + "## 第二部分:引入函数封装\n", + "\n", + "提炼出若干函数,减少代码的复杂性,提高可读性和可维护性。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7beadc81-f939-4ac5-b885-407c6810b7de", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "def read_file(file_path):\n", + " \"\"\"读取单个文件内容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " 
return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + "def get_words(text):\n", + " \"\"\"简单分词(按空格)\"\"\"\n", + " return text.split()\n", + "\n", + "def count_words(words):\n", + " \"\"\"统计词频\"\"\"\n", + " word_count = {}\n", + " for word in words:\n", + " word_count[word] = word_count.get(word, 0) + 1\n", + " return word_count\n", + "\n", + "def get_top_n(word_count, n=10):\n", + " \"\"\"获取前 N 高频词\"\"\"\n", + " return sorted(word_count.items(), key=lambda x: x[1], reverse=True)[:n]\n", + "\n", + "def main():\n", + " \"\"\"主函数,控制流程\"\"\"\n", + " word_count = {}\n", + " data_dir = 'data'\n", + " \n", + " # 顺序结构:按步骤读取文件、处理文本\n", + " for file in os.listdir(data_dir):\n", + " file_path = os.path.join(data_dir, file)\n", + " # 选择结构:检查文件是否为 txt\n", + " if file_path.endswith('.txt'):\n", + " text = read_file(file_path)\n", + " # 循环结构:处理每个文件的词\n", + " words = get_words(text)\n", + " file_word_count = count_words(words)\n", + " # 合并词频\n", + " for word, count in file_word_count.items():\n", + " word_count[word] = word_count.get(word, 0) + count\n", + " \n", + " # 输出结果\n", + " top_words = get_top_n(word_count)\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "id": "4f7218a3-43d2-4159-9854-9880020c42fc", + "metadata": {}, + "source": [ + "### 改进分析\n", + " - 逻辑分层:main() 函数清晰定义了程序执行步骤(读取文件 -> 分词 -> 统计 -> 输出)。\n", + " - 模块化:将功能拆分为函数(read_file、get_words、count_words、get_top_n),提高代码复用性和可读性。\n", + " - 错误处理:增加 try-except 处理文件读取异常。\n", + " - 工程质量提升:\n", + " - 可读性:函数命名本身就帮助理解代码,逻辑分块。\n", + " - 可维护性:修改某部分功能(如分词逻辑)只需改对应函数。\n", + " - 复用性:函数可复用在其他类似任务中。" + ] + }, + { + "cell_type": "markdown", + "id": "50737966-57c9-4daf-ac3b-6a1c73b18136", + "metadata": {}, + "source": [ + "## 第三部分:引入类封装\n", + "\n", + "通过类封装功能,进一步提高代码的模块化、可扩展性和复用性。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81aa7f9c-de28-4a7a-8ba1-130c3e5e4f7f", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "\n", + "class TextAnalyzer:\n", + " \"\"\"文本分析类,封装词频统计功能\"\"\"\n", + " def __init__(self, data_dir='data', top_n=10):\n", + " self.data_dir = data_dir\n", + " self.top_n = top_n\n", + " self.word_count = Counter()\n", + "\n", + " def read_file(self, file_path):\n", + " \"\"\"读取文件内容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + " def tokenize(self, text):\n", + " \"\"\"使用 jieba 进行中文分词\"\"\"\n", + " return jieba.lcut(text)\n", + "\n", + " def process_file(self, file_path):\n", + " \"\"\"处理单个文件\"\"\"\n", + " if file_path.endswith('.txt'):\n", + " text = self.read_file(file_path)\n", + " words = self.tokenize(text)\n", + " self.word_count.update(words)\n", + "\n", + " def process_directory(self):\n", + " \"\"\"处理目录下所有文件\"\"\"\n", + " for file in os.listdir(self.data_dir):\n", + " file_path = os.path.join(self.data_dir, file)\n", + " self.process_file(file_path)\n", + "\n", + " def get_top_words(self):\n", + " \"\"\"获取前 N 高频词\"\"\"\n", + " return self.word_count.most_common(self.top_n)\n", + "\n", + " def run(self):\n", + " \"\"\"执行词频统计\"\"\"\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " for word, count in top_words:\n", + " print(f\"{word}: 
{count}\")\n", + "\n", + "def main():\n", + " analyzer = TextAnalyzer(data_dir='data', top_n=10)\n", + " analyzer.run()\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "id": "62e780d4-94de-4830-89c2-ab2c96500fc5", + "metadata": {}, + "source": [ + "### 改进分析\n", + "- 面向对象封装:\n", + " - 使用 TextAnalyzer 类将所有功能封装为一个对象,数据(如 word_count)和方法(如 tokenize)绑定在一起。\n", + " - 通过 __init__ 提供配置(如 data_dir 和 top_n),提高灵活性。\n", + " \n", + "- 模块化:类方法分工明确(如 read_file、tokenize、process_file),便于扩展。\n", + "- 工程质量提升:\n", + " - 可扩展性:可通过继承 TextAnalyzer 添加新功能(如支持其他分词器或文件格式)。\n", + " - 复用性:类可实例化多次,用于不同目录或参数。\n", + " - 可维护性:逻辑集中在类中,修改相对安全。" + ] + }, + { + "cell_type": "markdown", + "id": "9b4e17c4-f47e-4245-b3d9-e40fde0a2e04", + "metadata": {}, + "source": [ + "# 第四部分:引入文件模块封装\n", + "将代码进一步模块化到不同文件,引入配置文件和停用词过滤。" + ] + }, + { + "cell_type": "raw", + "id": "aadb5aea-8cc5-4a0f-9f5b-7eab28e90f1a", + "metadata": {}, + "source": [ + "目录结构\n", + "\n", + "project/\n", + "├── data/ # 小说文本目录\n", + "├── config.yaml # 配置文件\n", + "├── stop_words.txt # 停用词文件\n", + "├── text_analyzer.py # 分析模块\n", + "├── main.py # 主程序" + ] + }, + { + "cell_type": "raw", + "id": "2de4767b-8928-4f3f-8c8b-3c3cba2bc98a", + "metadata": {}, + "source": [ + "# config.yaml\n", + "\n", + "data_dir: data\n", + "top_n: 10\n", + "stop_words_file: stop_words.txt\n", + "output_file: output.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b442d61-c937-4757-b7b4-b6fc047c3529", + "metadata": {}, + "outputs": [], + "source": [ + "# text_analyzer.py\n", + "\n", + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "import yaml\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.word_count = Counter()\n", + " self.stop_words = self.load_stop_words()\n", + "\n", + " def load_stop_words(self):\n", + " \"\"\"加载停用词\"\"\"\n", + " try:\n", + " with open(self.stop_words_file, 'r', encoding='utf-8') as f:\n", + " return set(line.strip() for line in f if line.strip())\n", + " except Exception as e:\n", + " print(f\"Error loading stop words: {e}\")\n", + " return set()\n", + "\n", + " def read_file(self, file_path):\n", + " \"\"\"读取文件内容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + " def tokenize(self, text):\n", + " \"\"\"中文分词并过滤停用词\"\"\"\n", + " words = jieba.lcut(text)\n", + " return [word for word in words if word not in self.stop_words]\n", + "\n", + " def process_file(self, file_path):\n", + " \"\"\"处理单个文件\"\"\"\n", + " if file_path.endswith('.txt'):\n", + " text = self.read_file(file_path)\n", + " words = self.tokenize(text)\n", + " self.word_count.update(words)\n", + "\n", + " def process_directory(self):\n", + " \"\"\"处理目录下所有文件\"\"\"\n", + " for file in os.listdir(self.data_dir):\n", + " file_path = os.path.join(self.data_dir, file)\n", + " self.process_file(file_path)\n", + "\n", + " def get_top_words(self):\n", + " \"\"\"获取前 N 高频词\"\"\"\n", + " return self.word_count.most_common(self.top_n)\n", + "\n", + " def save_results(self, top_words):\n", + " \"\"\"保存结果到文件\"\"\"\n", + 
" with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + " def run(self):\n", + " \"\"\"执行词频统计并保存结果\"\"\"\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " self.save_results(top_words)\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22f58992-0108-4c90-894d-e756e7301a5a", + "metadata": {}, + "outputs": [], + "source": [ + "# main.py\n", + "\n", + "from text_analyzer import TextAnalyzer\n", + "\n", + "def main():\n", + " analyzer = TextAnalyzer()\n", + " analyzer.run()\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "id": "18d27410-8923-4662-a6b7-8e027609506e", + "metadata": {}, + "source": [ + "## 改进分析\n", + "\n", + "- 模块化:将分析逻辑放入 text_analyzer.py,主程序 main.py 仅负责调用,符合工程化项目结构。\n", + "- 配置文件:通过 config.yaml 配置参数,增强灵活性,无需修改代码即可更改目录、输出文件等。\n", + "- 输出到文件:增加 save_results 方法,支持结果持久化。\n", + "- 工程质量提升:\n", + " - 可维护性:配置文件和模块化分离了配置与逻辑,修改配置无需动代码。 \n", + " - 复用性:模块可导入到其他项目,类可重复实例化。" + ] + }, + { + "cell_type": "markdown", + "id": "10876929-69f9-43bf-ba2d-a5d7bb11f22b", + "metadata": {}, + "source": [ + "### 封装的总节\n", + "\n", + "封装方法:\n", + "- 模块化:函数划分逻辑,降低耦合。\n", + "- 函数封装:将重复逻辑封装为函数,提高复用性。\n", + "- 类封装:将数据和方法绑定,增强代码组织性和扩展性。\n", + "- 文件封装:通过文件模块化,符合工程化开发规范。\n", + "\n", + "工程质量提升:\n", + "- 分离配置与逻辑,降低维护成本。\n", + "- 模块化和面向对象设计支持功能扩展。\n", + "- 错误处理提高程序鲁棒性。" + ] + }, + { + "cell_type": "raw", + "id": "60ba30d8-d8c2-4183-996e-376ff71716bf", + "metadata": {}, + "source": [ + "## 另外一种文件模块化设计(分层架构)示例\n", + "\n", + "将代码拆分为独立模块,每个模块仅负责单一职责:\n", + " - 数据读取层:遍历目录、读取文件内容\n", + " - 数据处理层:文本清洗、分词、统计词频\n", + " - 结果输出层:排序并输出前10高频词\n", + "\n", + "目录结构:\n", + "project/\n", + "├── data_loader.py # 数据读取模块\n", + "├── text_processor.py # 数据处理模块\n", + "├── output_handler.py # 结果输出模块\n", + "└── main.py # 主程序入口" + ] + }, + { + "cell_type": "markdown", + "id": "517759ac-c4cf-402e-86f1-a9fae0d88bbb", + "metadata": {}, + "source": [ + "# 第七部分:运行说明\n", + "\n", + "环境准备:\n", + "- 安装 Python 3.8+。\n", + "- 安装依赖:pip install jieba pyyaml。\n", + "- 准备 data 目录,放入 100 个 txt 文件。\n", + "- 创建 stop_words.txt 和 config.yaml。" + ] + }, + { + "cell_type": "markdown", + "id": "a7e1836b-42a1-45f9-bf8c-2e04a38744e4", + "metadata": {}, + "source": [ + "通过从无结构到结构化,再到面向对象和模块化的逐步优化,展示了结构化编程和封装方法如何显著提升代码工程质量。最终实现不仅满足了词频统计需求,还具备高可读性、可维护性、可扩展性和复用性,适合实际工程应用。" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/D Plus/01 特殊执行方式的语言特性.ipynb.ipynb b/D Plus/01 特殊执行方式的语言特性.ipynb.ipynb new file mode 100644 index 0000000..65b84ab --- /dev/null +++ b/D Plus/01 特殊执行方式的语言特性.ipynb.ipynb @@ -0,0 +1,312 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "86405617-889a-40c2-a895-7b51fb14b65d", + "metadata": {}, + "source": [ + "# 教学目标\n", + "\n", + "- 在词频统计案例中引入装饰器和函数式编程 。\n", + "- 分析这些特性和模式如何进一步优化代码质量(可读性、可维护性、可扩展性、复用性)。\n", + "- 探讨高级特性在案例中的适用性与局限性。" + ] + }, + { + "cell_type": "markdown", + "id": "e6a6a633-d3af-4778-815c-4490dff5f624", + "metadata": {}, + "source": [ + "## 第一部分:引入装饰器\n", + "\n", + 
"装饰器可用于在不修改函数代码的情况下添加功能。适合日志记录、性能分析、错误处理等场景。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a5c7d69-d445-4a9c-bb48-7fde0a36c646", + "metadata": {}, + "outputs": [], + "source": [ + "# 为 TextAnalyzer 类添加一个装饰器,用于记录方法执行时间。\n", + "\n", + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "import yaml\n", + "import time\n", + "import functools\n", + "\n", + "def timing_decorator(func):\n", + " \"\"\"装饰器:记录函数执行时间\"\"\"\n", + " @functools.wraps(func)\n", + " def wrapper(*args, **kwargs):\n", + " start_time = time.time()\n", + " result = func(*args, **kwargs)\n", + " end_time = time.time()\n", + " print(f\"{func.__name__} took {end_time - start_time:.4f} seconds\")\n", + " return result\n", + " return wrapper\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.word_count = Counter()\n", + " self.stop_words = self.load_stop_words()\n", + "\n", + " def load_stop_words(self):\n", + " \"\"\"加载停用词\"\"\"\n", + " try:\n", + " with open(self.stop_words_file, 'r', encoding='utf-8') as f:\n", + " return set(line.strip() for line in f if line.strip())\n", + " except Exception as e:\n", + " print(f\"Error loading stop words: {e}\")\n", + " return set()\n", + "\n", + " @timing_decorator\n", + " def read_file(self, file_path):\n", + " \"\"\"读取文件内容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + " @timing_decorator\n", + " def tokenize(self, text):\n", + " \"\"\"中文分词并过滤停用词\"\"\"\n", + " words = jieba.lcut(text)\n", + " return [word for word in words if word not in self.stop_words]\n", + "\n", + " def process_file(self, file_path):\n", + " \"\"\"处理单个文件\"\"\"\n", + " if file_path.endswith('.txt'):\n", + " text = self.read_file(file_path)\n", + " words = self.tokenize(text)\n", + " self.word_count.update(words)\n", + "\n", + " def process_directory(self):\n", + " \"\"\"处理目录下所有文件\"\"\"\n", + " for file in os.listdir(self.data_dir):\n", + " file_path = os.path.join(self.data_dir, file)\n", + " self.process_file(file_path)\n", + "\n", + " def get_top_words(self):\n", + " \"\"\"获取前 N 高频词\"\"\"\n", + " return self.word_count.most_common(self.top_n)\n", + "\n", + " def save_results(self, top_words):\n", + " \"\"\"保存结果到文件\"\"\"\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + " def run(self):\n", + " \"\"\"执行词频统计并保存结果\"\"\"\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " self.save_results(top_words)\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4dcabfd9-b8f9-4796-a060-9d9f6689e92f", + "metadata": {}, + "source": [ + "### 装饰器分析\n", + "\n", + "功能:timing_decorator 记录 read_file 和 tokenize 方法的执行时间,帮助分析性能瓶颈(如分词耗时较长)。\n", + "\n", + "工程质量提升:\n", + " - 可维护性:无需修改原方法代码即可添加性能监控,符合开闭原则,维护更方便。\n", + " - 可读性:装饰器将性能监控逻辑与业务逻辑分离,代码更清晰。\n", + " - 复用性:timing_decorator 可复用于其他方法或项目。\n", + "\n", + "局限性:装饰器增加少量性能开销,需谨慎用于高频调用的函数。" + ] + }, + { + "cell_type": "markdown", + "id": 
"8fcbe48d-de8f-4387-9be3-f05f88553029", + "metadata": {}, + "source": [ + "## 第二部分:引入函数式编程\n", + "\n", + "函数式编程(如高阶函数、lambda、map/reduce)强调无变量污染、数据转换简洁性。在词频统计案例中,函数式编程可用于:\n", + "- 数据处理:使用 map 和 filter 处理文件和单词。\n", + "- 词频统计:使用 reduce 合并词频。\n", + "- 管道式处理:通过函数组合实现数据流处理。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a6970b2-7488-43e3-ae9f-0174ff9b4b57", + "metadata": {}, + "outputs": [], + "source": [ + "# 函数式处理文件和词频\n", + "\n", + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "import yaml\n", + "from functools import reduce\n", + "from typing import List, Tuple\n", + "\n", + "def timing_decorator(func):\n", + " \"\"\"装饰器:记录函数执行时间\"\"\"\n", + " import time\n", + " import functools\n", + " @functools.wraps(func)\n", + " def wrapper(*args, **kwargs):\n", + " start_time = time.time()\n", + " result = func(*args, **kwargs)\n", + " end_time = time.time()\n", + " print(f\"{func.__name__} took {end_time - start_time:.4f} seconds\")\n", + " return result\n", + " return wrapper\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + "\n", + " def load_stop_words(self) -> set:\n", + " \"\"\"加载停用词\"\"\"\n", + " try:\n", + " with open(self.stop_words_file, 'r', encoding='utf-8') as f:\n", + " return set(line.strip() for line in f if line.strip())\n", + " except Exception as e:\n", + " print(f\"Error loading stop words: {e}\")\n", + " return set()\n", + "\n", + " @timing_decorator\n", + " def read_file(self, file_path: str) -> str:\n", + " \"\"\"读取文件内容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + " def tokenize(self, text: str) -> List[str]:\n", + " \"\"\"中文分词并过滤停用词(函数式)\"\"\"\n", + " return list(filter(lambda w: w not in self.stop_words, jieba.lcut(text)))\n", + "\n", + " def process_file(self, file_path: str) -> Counter:\n", + " \"\"\"处理单个文件,返回词频 Counter\"\"\"\n", + " if file_path.endswith('.txt'):\n", + " text = self.read_file(file Couple(path)\n", + " words = self.tokenize(text)\n", + " return Counter(words)\n", + " return Counter()\n", + "\n", + " def process_directory(self) -> Counter:\n", + " \"\"\"处理目录下所有文件(函数式)\"\"\"\n", + " file_paths = (os.path.join(self.data_dir, f) for f in os.listdir(self.data_dir))\n", + " counters = map(self.process_file, file_paths)\n", + " return reduce(lambda c1, c2: c1 + c2, counters, Counter())\n", + "\n", + " def get_top_words(self, word_count: Counter) -> List[Tuple[str, int]]:\n", + " \"\"\"获取前 N 高频词\"\"\"\n", + " return word_count.most_common(self.top_n)\n", + "\n", + " def save_results(self, top_words: List[Tuple[str, int]]):\n", + " \"\"\"保存结果到文件\"\"\"\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + " def run(self):\n", + " \"\"\"执行词频统计并保存结果\"\"\"\n", + " word_count = self.process_directory()\n", + " top_words = self.get_top_words(word_count)\n", + " self.save_results(top_words)\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")" + ] + }, + { 
+ "cell_type": "markdown", + "id": "6ce3b7c3-f099-4e2c-b415-18b0e3ab492a", + "metadata": {}, + "source": [ + "### 函数式编程分析\n", + "\n", + "改进:\n", + "- map:在 process_directory 中,使用 map(self.process_file, file_paths) 并行处理文件路径,生成词频 Counter 列表。\n", + "- reduce:使用 reduce(lambda c1, c2: c1 + c2, counters, Counter()) 合并所有文件的词频,简洁且无副作用。\n", + "- filter:在 tokenize 中,使用 filter(lambda w: w not in self.stop_words, ...) 过滤停用词,替代列表推导式。\n", + "- 生成器:file_paths 使用生成器表达式,减少内存占用。\n", + "\n", + "工程质量提升:\n", + "- 可读性:函数式编程使数据处理逻辑更简洁,管道式处理清晰表达数据流(文件路径 -> 词频 -> 合并)。\n", + "- 性能:生成器和 map 优化内存使用,适合处理大量文件。\n", + "- 可维护性:函数式代码无副作用,易于测试和调试。\n", + "- 适用场景:适合数据转换和批量处理(如文件读取、词频合并)。\n", + "- 简洁性:map、reduce 等使数据处理逻辑更紧凑。\n", + "- 内存效率:生成器和惰性求值优化内存使用。\n", + "- 结合并发可显著提升效率。\n", + "\n", + "适用场景:数据流处理(如文件处理、词频合并)、无状态操作。\n", + "\n", + "局限性:\n", + "- 函数式代码对初学者可能不够直观,需熟悉 map、reduce 等概念。\n", + "- 对于复杂逻辑,函数式编程可能增加调试难度。" + ] + }, + { + "cell_type": "markdown", + "id": "458e18ec-b536-4860-9e12-d0bf5ed9d876", + "metadata": {}, + "source": [ + "# 练习\n", + "\n", + "实践练习:\n", + "- 添加日志装饰器,记录每次文件处理的详细信息。\n", + "- 使用 functools.reduce 重写 get_top_words,尝试不同排序逻辑。\n", + "\n", + "扩展任务:\n", + "- 添加缓存装饰器,避免重复分词相同文件。\n", + "- 实现函数式管道,将文件读取、分词、统计串联为单一流。" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/D Plus/02 设计模式.ipynb b/D Plus/02 设计模式.ipynb new file mode 100644 index 0000000..8f14f9e --- /dev/null +++ b/D Plus/02 设计模式.ipynb @@ -0,0 +1,493 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "eccfe49f-de35-4241-90e3-a7095940b61a", + "metadata": {}, + "source": [ + "设计模式提供高频重复出现需求的最佳解决方案。以下介绍适合词频统计案例的设计模式:策略模式、观察者模式、工厂模式。" + ] + }, + { + "cell_type": "markdown", + "id": "c186171f-d1f2-433e-a3eb-b266e2909a2c", + "metadata": {}, + "source": [ + "## 策略模式(动态选择分词策略)\n", + "\n", + "策略模式允许动态切换算法(如分词器),比元编程简单。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97c865cb-0b5a-4fa1-aa74-5ba2e65e7436", + "metadata": {}, + "outputs": [], + "source": [ + "from abc import ABC, abstractmethod\n", + "\n", + "class Tokenizer(ABC):\n", + " \"\"\"分词器接口\"\"\"\n", + " @abstractmethod\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " pass\n", + "\n", + "class JiebaTokenizer(Tokenizer):\n", + " \"\"\"jieba 分词器\"\"\"\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " return [w for w in jieba.lcut(text) if w not in stop_words]\n", + "\n", + "class SimpleTokenizer(Tokenizer):\n", + " \"\"\"简单分词器\"\"\"\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " return [w for w in text.split() if w not in stop_words]\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " # 动态选择分词器\n", + " tokenizer_name = config.get('tokenizer', 'jieba')\n", + " self.tokenizer = {'jieba': 
JiebaTokenizer(), 'simple': SimpleTokenizer()}[tokenizer_name]\n", + "\n", + " def tokenize(self, text: str) -> List[str]:\n", + " \"\"\"使用策略分词\"\"\"\n", + " return self.tokenizer.tokenize(text, self.stop_words)\n", + "\n", + " # 其余方法同上" + ] + }, + { + "cell_type": "markdown", + "id": "5435ebc3-d3b0-4475-8bd5-cb45fb51638c", + "metadata": {}, + "source": [ + "工程质量提升:\n", + "- 可扩展性:添加新分词器只需实现 Tokenizer 接口。\n", + "- 可维护性:分词逻辑与主类分离,修改更独立。\n", + "\n", + "适用场景:适合需要动态切换算法的场景。" + ] + }, + { + "cell_type": "markdown", + "id": "fbf53455-558c-40fb-8718-446dec989b5d", + "metadata": {}, + "source": [ + "## 观察者模式(结果输出解耦)\n", + "\n", + "观察者模式可用于解耦结果输出逻辑(如打印、保存文件、发送通知)。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7a2bd4c-df73-4800-b45b-9b6c73d28d7b", + "metadata": {}, + "outputs": [], + "source": [ + "class OutputObserver(ABC):\n", + " \"\"\"输出观察者接口\"\"\"\n", + " @abstractmethod\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " pass\n", + "\n", + "class ConsoleOutput(OutputObserver):\n", + " \"\"\"控制台输出\"\"\"\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")\n", + "\n", + "class FileOutput(OutputObserver):\n", + " \"\"\"文件输出\"\"\"\n", + " def __init__(self, output_file: str):\n", + " self.output_file = output_file\n", + "\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " self.observers = [ConsoleOutput(), FileOutput(self.output_file)]\n", + "\n", + " def add_observer(self, observer: OutputObserver):\n", + " \"\"\"添加观察者\"\"\"\n", + " self.observers.append(observer)\n", + "\n", + " def notify_observers(self, top_words: List[Tuple[str, int]]):\n", + " \"\"\"通知所有观察者\"\"\"\n", + " for observer in self.observers:\n", + " observer.update(top_words)\n", + "\n", + " def run(self):\n", + " \"\"\"执行词频统计并通知观察者\"\"\"\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " self.notify_observers(top_words)\n", + "\n", + " # 其余方法同上" + ] + }, + { + "cell_type": "markdown", + "id": "02b5cfba-431c-4a01-a454-099e4f41922c", + "metadata": {}, + "source": [ + "### 分析\n", + "\n", + "工程质量提升:\n", + " - 可扩展性:添加新输出方式只需实现 OutputObserver 接口。\n", + " - 解耦性:输出逻辑与统计逻辑分离,修改输出不影响核心功能。\n", + "\n", + "适用场景:适合需要多种输出或通知的场景。\n", + "\n", + "局限性:观察者模式增加代码复杂性,适合复杂输出需求。" + ] + }, + { + "cell_type": "markdown", + "id": "11669305-8cd5-4317-afd5-e85c3f0a5a81", + "metadata": {}, + "source": [ + "## 工厂模式(动态创建分词器)\n", + "\n", + "工厂模式可用于动态创建分词器,简化策略模式中的初始化逻辑。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fa50633-de22-40c8-912d-3ded5ebcedfc", + "metadata": {}, + "outputs": [], + "source": [ + "class TokenizerFactory:\n", + " \"\"\"分词器工厂\"\"\"\n", + " @staticmethod\n", + " def create_tokenizer(name: str) -> Tokenizer:\n", + " tokenizers = {\n", + " 'jieba': JiebaTokenizer(),\n", + " 'simple': SimpleTokenizer()\n", + " }\n", + " return tokenizers.get(name, 
JiebaTokenizer())\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " self.tokenizer = TokenizerFactory.create_tokenizer(config.get('tokenizer', 'jieba'))\n", + "\n", + " # 其余方法同上" + ] + }, + { + "cell_type": "markdown", + "id": "a4db7046-dfe2-4bd8-81d1-49a42e2eeb5c", + "metadata": {}, + "source": [ + "### 分析\n", + "\n", + "工程质量提升:\n", + " - 可维护性:分词器创建逻辑集中于工厂,易于修改。\n", + " - 可扩展性:添加新分词器只需更新工厂方法。\n", + "\n", + "适用场景:适合需要动态创建对象的场景。\n", + "\n", + "局限性:对于简单场景,工厂模式可能略显冗余。" + ] + }, + { + "cell_type": "markdown", + "id": "e5f2aef4-a055-43a9-917c-fa183de6db2d", + "metadata": {}, + "source": [ + "## 综合实现(整合特性与模式)\n", + "\n", + "整合上下文管理器、生成器、策略模式和观察者模式的最终实现(部分代码展示)。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa7f34e2-d355-4a22-8572-729c49b18605", + "metadata": {}, + "outputs": [], + "source": [ + "# text_analyzer.py\n", + "\n", + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "import yaml\n", + "from contextlib import contextmanager\n", + "from typing import List, Tuple\n", + "from abc import ABC, abstractmethod\n", + "\n", + "@contextmanager\n", + "def file_reader(file_path: str):\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " yield f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " yield \"\"\n", + "\n", + "class Tokenizer(ABC):\n", + " @abstractmethod\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " pass\n", + "\n", + "class JiebaTokenizer(Tokenizer):\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " for word in jieba.lcut(text):\n", + " if word not in stop_words:\n", + " yield word\n", + "\n", + "class SimpleTokenizer(Tokenizer):\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " for word in text.split():\n", + " if word not in stop_words:\n", + " yield word\n", + "\n", + "class TokenizerFactory:\n", + " @staticmethod\n", + " def create_tokenizer(name: str) -> Tokenizer:\n", + " return {'jieba': JiebaTokenizer(), 'simple': SimpleTokenizer()}.get(name, JiebaTokenizer())\n", + "\n", + "class OutputObserver(ABC):\n", + " @abstractmethod\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " pass\n", + "\n", + "class ConsoleOutput(OutputObserver):\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")\n", + "\n", + "class FileOutput(OutputObserver):\n", + " def __init__(self, output_file: str):\n", + " self.output_file = output_file\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = 
config['output_file']\n",
+    "        self.stop_words = self.load_stop_words()\n",
+    "        self.word_count = Counter()\n",
+    "        self.tokenizer = TokenizerFactory.create_tokenizer(config.get('tokenizer', 'jieba'))\n",
+    "        self.observers = [ConsoleOutput(), FileOutput(self.output_file)]\n",
+    "\n",
+    "    def load_stop_words(self) -> set:\n",
+    "        with file_reader(self.stop_words_file) as content:\n",
+    "            return set(line.strip() for line in content.splitlines() if line.strip())\n",
+    "\n",
+    "    def process_file(self, file_path: str):\n",
+    "        if file_path.endswith('.txt'):\n",
+    "            with file_reader(file_path) as text:\n",
+    "                words = self.tokenizer.tokenize(text, self.stop_words)\n",
+    "                self.word_count.update(words)\n",
+    "\n",
+    "    def process_directory(self):\n",
+    "        for file in os.listdir(self.data_dir):\n",
+    "            file_path = os.path.join(self.data_dir, file)\n",
+    "            self.process_file(file_path)\n",
+    "\n",
+    "    def get_top_words(self) -> List[Tuple[str, int]]:\n",
+    "        return self.word_count.most_common(self.top_n)\n",
+    "\n",
+    "    def notify_observers(self, top_words: List[Tuple[str, int]]):\n",
+    "        for observer in self.observers:\n",
+    "            observer.update(top_words)\n",
+    "\n",
+    "    def run(self):\n",
+    "        self.process_directory()\n",
+    "        top_words = self.get_top_words()\n",
+    "        self.notify_observers(top_words)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3d130312-b298-4c76-ae09-0fb4bd08b0c1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# main.py\n",
+    "\n",
+    "from text_analyzer import TextAnalyzer\n",
+    "\n",
+    "def main():\n",
+    "    analyzer = TextAnalyzer()\n",
+    "    analyzer.run()\n",
+    "\n",
+    "if __name__ == '__main__':\n",
+    "    main()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "770618c9-428e-454a-97de-00e3b49c9d03",
+   "metadata": {},
+   "source": [
+    "## 结论\n",
+    "\n",
+    "通过引入上下文管理器、生成器、策略模式、观察者模式和工厂模式(以及附录中的元编程),词频统计代码在可扩展性、可维护性和复用性上进一步提升。\n",
+    "这些特性和模式使代码更模块化、灵活,适合规模更大的项目,同时保持清晰的工程结构。结合之前的装饰器和函数式编程,代码组织已比较接近实际工程项目的做法。\n",
+    "\n",
+    "若需深入,可进一步考虑并发、缓存等性能优化手段。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cbeaa07d-272f-465b-a437-9c4b44827d23",
+   "metadata": {},
+   "source": [
+    "## 进一步练习\n",
+    "\n",
+    "实践练习:\n",
+    "- 实现新分词器(如 thulac)并通过策略模式或工厂模式集成。\n",
+    "- 添加新观察者(如 JSON 输出)。\n",
+    "\n",
+    "扩展任务:\n",
+    "- 使用生成器实现流式词频统计,比较内存占用。\n",
+    "- 实现缓存机制,缓存已处理文件的分词结果。\n",
+    "- 添加命令行接口(argparse),动态配置 top_n 和 tokenizer。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6a43b53d-1e07-4ebe-a6c8-104353fd5f7b",
+   "metadata": {},
+   "source": [
+    "## 附:元编程\n",
+    "\n",
+    "元编程允许动态修改类或函数行为,可用于动态配置分词器或输出格式。案例中,可通过元编程动态注册分词器。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4394008c-88da-44bd-aa0d-f1b7a6dbc7d6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class TokenizerRegistry(type):\n",
+    "    \"\"\"元类:维护分词器注册表\"\"\"\n",
+    "    tokenizers = {}\n",
+    "\n",
+    "    @classmethod\n",
+    "    def register_tokenizer(mcs, name):\n",
+    "        \"\"\"返回一个装饰器,把被装饰的方法按名字注册为分词器\"\"\"\n",
+    "        def decorator(func):\n",
+    "            mcs.tokenizers[name] = func\n",
+    "            return func\n",
+    "        return decorator\n",
+    "\n",
+    "class TextAnalyzer(metaclass=TokenizerRegistry):\n",
+    "    def __init__(self, config_path='config.yaml'):\n",
+    "        with open(config_path, 'r', encoding='utf-8') as f:\n",
+    "            config = yaml.safe_load(f)\n",
+    "        self.data_dir = config['data_dir']\n",
+    "        self.top_n = config['top_n']\n",
+    "        self.stop_words_file = config['stop_words_file']\n",
+    "        self.output_file = config['output_file']\n",
+    "        self.stop_words = self.load_stop_words()\n",
+    "        self.word_count = Counter()\n",
+    "        self.tokenizer_name = config.get('tokenizer', 'jieba')  # 从配置读取分词器\n",
+    "\n",
+    "    def tokenize(self, text: str) -> List[str]:\n",
+    "        \"\"\"按名字动态调用已注册的分词器,未注册时回退到 jieba\"\"\"\n",
+    "        tokenizer = self.__class__.tokenizers.get(self.tokenizer_name, TextAnalyzer.jieba_tokenizer)\n",
+    "        return tokenizer(self, text)\n",
+    "\n",
+    "    @TokenizerRegistry.register_tokenizer('jieba')\n",
+    "    def jieba_tokenizer(self, text: str) -> List[str]:\n",
+    "        \"\"\"jieba 分词\"\"\"\n",
+    "        return [w for w in jieba.lcut(text) if w not in self.stop_words]\n",
+    "\n",
+    "    @TokenizerRegistry.register_tokenizer('simple')\n",
+    "    def simple_tokenizer(self, text: str) -> List[str]:\n",
+    "        \"\"\"简单分词(按空格)\"\"\"\n",
+    "        return [w for w in text.split() if w not in self.stop_words]\n",
+    "\n",
+    "    # 其余方法(load_stop_words, process_file, etc.)同上"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2249f13a-7a3f-4376-ba2a-d92f11658d32",
+   "metadata": {},
+   "source": [
+    "### 分析\n",
+    "\n",
+    "功能:通过元类和装饰器动态注册分词器,支持配置切换(如 jieba 或 simple)。\n",
+    "\n",
+    "工程质量提升:\n",
+    "- 可扩展性:新分词器只需添加新方法并注册,无需修改核心逻辑。\n",
+    "- 灵活性:通过配置文件动态选择分词器。\n",
+    "\n",
+    "适用场景:适合需要动态配置或插件化系统的场景。\n",
+    "\n",
+    "局限性:元编程增加代码复杂性,可能降低可读性,需谨慎使用。"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/readme.MD b/readme.MD
index 14627ac..ba2d822 100644
--- a/readme.MD
+++ b/readme.MD
@@ -18,15 +18,16 @@ C 高性能模式
 可能的动机
 【 效率 】
-时间快
-内存占用少
+- 执行快
+- 内存占用少
 
 【 软件工程 】
-可读性强
-可复用高
-类型安全
-单元测试方便
+- 可读性强
+- 可复用高
+- 类型安全
+- 单元测试方便
 
 【可靠性】
-并发、线程安全
+- 并发、线程安全
+
 '''
\ No newline at end of file