From aaba98303b3685ad9adfa28a9b8972f0246c734d Mon Sep 17 00:00:00 2001 From: zj3D Date: Fri, 16 May 2025 22:57:57 +0800 Subject: [PATCH] grok01 --- .../00 å°è£…-checkpoint.ipynb | 478 +++++++++++++++++ ...¹å¼çš„语言特性.ipynb-checkpoint.ipynb | 312 +++++++++++ .../02 设计模å¼-checkpoint.ipynb | 493 ++++++++++++++++++ D Plus/00 å°è£….ipynb | 478 +++++++++++++++++ ...殊执行方å¼çš„语言特性.ipynb.ipynb | 312 +++++++++++ D Plus/02 设计模å¼.ipynb | 493 ++++++++++++++++++ readme.MD | 15 +- 7 files changed, 2574 insertions(+), 7 deletions(-) create mode 100644 D Plus/.ipynb_checkpoints/00 å°è£…-checkpoint.ipynb create mode 100644 D Plus/.ipynb_checkpoints/01 特殊执行方å¼çš„语言特性.ipynb-checkpoint.ipynb create mode 100644 D Plus/.ipynb_checkpoints/02 设计模å¼-checkpoint.ipynb create mode 100644 D Plus/00 å°è£….ipynb create mode 100644 D Plus/01 特殊执行方å¼çš„语言特性.ipynb.ipynb create mode 100644 D Plus/02 设计模å¼.ipynb diff --git a/D Plus/.ipynb_checkpoints/00 å°è£…-checkpoint.ipynb b/D Plus/.ipynb_checkpoints/00 å°è£…-checkpoint.ipynb new file mode 100644 index 0000000..4d255e0 --- /dev/null +++ b/D Plus/.ipynb_checkpoints/00 å°è£…-checkpoint.ipynb @@ -0,0 +1,478 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "69e76aa7-2c5d-4114-a302-85e17cc83e2c", + "metadata": {}, + "source": [ + "æœ¬æ–‡æ—¨åœ¨é€šè¿‡ä¸€ä¸ªæ¡ˆä¾‹ï¼ˆè¯»å– data 目录下 100 篇å°è¯´æ–‡æœ¬ï¼Œç»Ÿè®¡è¯é¢‘å¹¶è¾“å‡ºå‰ 10 高频è¯ï¼‰æ¥è¯´æ˜Žç»“构化编程和å°è£…方法如何æå‡ä»£ç å·¥ç¨‹è´¨é‡ã€‚\n", + "æ•™æ¡ˆå°†é€æ­¥å±•示ä¸åŒç»“构化方法和å°è£…技术的应用,并分æžå…¶å¯¹ä»£ç å¯è¯»æ€§ã€å¯ç»´æŠ¤æ€§ã€å¯æ‰©å±•性和å¤ç”¨æ€§çš„æå‡ã€‚" + ] + }, + { + "cell_type": "markdown", + "id": "b9a9a366-7fd3-422b-b3bc-b0bc00374da6", + "metadata": {}, + "source": [ + "# 教学目标\n", + "- 掌æ¡å°è£…方法(函数ã€ç±»ã€æ¨¡å—)在代ç ç»„织中的作用。" + ] + }, + { + "cell_type": "markdown", + "id": "1387e026-c978-4217-9015-ab0e047c01a0", + "metadata": {}, + "source": [ + "## ç¬¬ä¸€éƒ¨åˆ†ï¼šåŸºç¡€å®žçŽ°ï¼ˆæ— ç»“æž„åŒ–ã€æ— å°è£…)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33803186-d890-4cd7-9636-8920fcb86e14", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "files = os.listdir('data')\n", + "word_count = {}\n", + "for file in files:\n", + " with open('data/' + file, 'r', encoding='utf-8') as f:\n", + " text = f.read()\n", + " words = text.split() # å‡è®¾ç®€å•按空格分è¯\n", + " for word in words:\n", + " if word in word_count:\n", + " word_count[word] += 1\n", + " else:\n", + " word_count[word] = 1\n", + "\n", + "# 排åºå¹¶è¾“出å‰10\n", + "sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)\n", + "for i in range(10):\n", + " print(sorted_words[i])" + ] + }, + { + "cell_type": "markdown", + "id": "471351e7-8645-4690-973a-7d8de53bda5f", + "metadata": {}, + "source": [ + "### 问题分æž\n", + "\n", + "- å¯è¯»æ€§å·®ï¼šæ²¡æœ‰æ¸…晰的功能划分,代ç é€»è¾‘æ··æ‚,难以阅读ç†è§£ç»´æŠ¤ã€‚\n", + "- æ‰©å±•æ€§å·®ï¼šå¦‚æžœéœ€è¦æ›´æ”¹åˆ†è¯é€»è¾‘ã€æ–‡ä»¶è·¯å¾„或输出格å¼ï¼Œéœ€ä¿®æ”¹å¤šå¤„代ç ã€‚\n", + "- å®¹é”™æ€§å·®ï¼šæœªå¤„ç†æ–‡ä»¶è¯»å–失败ã€ç©ºæ–‡ä»¶ç­‰é—®é¢˜ã€‚\n", + "- å¤ç”¨æ€§ä½Žï¼šé€»è¾‘无法直接å¤ç”¨åœ¨å…¶ä»–类似任务中。" + ] + }, + { + "cell_type": "markdown", + "id": "a5881283-c295-4433-8edd-f915201a5f43", + "metadata": {}, + "source": [ + "## 第二部分:引入函数å°è£…\n", + "\n", + "æç‚¼å‡ºè‹¥å¹²å‡½æ•°ï¼Œå‡å°‘代ç çš„夿‚性,æé«˜å¯è¯»æ€§å’Œå¯ç»´æŠ¤æ€§ã€‚" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7beadc81-f939-4ac5-b885-407c6810b7de", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "def read_file(file_path):\n", + " \"\"\"读å–å•个文件内容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', 
encoding='utf-8') as f:\n", + " return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + "def get_words(text):\n", + " \"\"\"简å•分è¯ï¼ˆæŒ‰ç©ºæ ¼ï¼‰\"\"\"\n", + " return text.split()\n", + "\n", + "def count_words(words):\n", + " \"\"\"统计è¯é¢‘\"\"\"\n", + " word_count = {}\n", + " for word in words:\n", + " word_count[word] = word_count.get(word, 0) + 1\n", + " return word_count\n", + "\n", + "def get_top_n(word_count, n=10):\n", + " \"\"\"获å–å‰ N 高频è¯\"\"\"\n", + " return sorted(word_count.items(), key=lambda x: x[1], reverse=True)[:n]\n", + "\n", + "def main():\n", + " \"\"\"主函数,控制æµç¨‹\"\"\"\n", + " word_count = {}\n", + " data_dir = 'data'\n", + " \n", + " # 顺åºç»“æž„ï¼šæŒ‰æ­¥éª¤è¯»å–æ–‡ä»¶ã€å¤„ç†æ–‡æœ¬\n", + " for file in os.listdir(data_dir):\n", + " file_path = os.path.join(data_dir, file)\n", + " # 选择结构:检查文件是å¦ä¸º txt\n", + " if file_path.endswith('.txt'):\n", + " text = read_file(file_path)\n", + " # å¾ªçŽ¯ç»“æž„ï¼šå¤„ç†æ¯ä¸ªæ–‡ä»¶çš„è¯\n", + " words = get_words(text)\n", + " file_word_count = count_words(words)\n", + " # åˆå¹¶è¯é¢‘\n", + " for word, count in file_word_count.items():\n", + " word_count[word] = word_count.get(word, 0) + count\n", + " \n", + " # 输出结果\n", + " top_words = get_top_n(word_count)\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "id": "4f7218a3-43d2-4159-9854-9880020c42fc", + "metadata": {}, + "source": [ + "### 改进分æž\n", + " - 逻辑分层:main() å‡½æ•°æ¸…æ™°å®šä¹‰äº†ç¨‹åºæ‰§è¡Œæ­¥éª¤ï¼ˆè¯»å–文件 -> åˆ†è¯ -> 统计 -> 输出)。\n", + " - 模å—化:将功能拆分为函数(read_fileã€get_wordsã€count_wordsã€get_top_n),æé«˜ä»£ç å¤ç”¨æ€§å’Œå¯è¯»æ€§ã€‚\n", + " - 错误处ç†ï¼šå¢žåŠ  try-except å¤„ç†æ–‡ä»¶è¯»å–异常。\n", + " - å·¥ç¨‹è´¨é‡æå‡ï¼š\n", + " - å¯è¯»æ€§ï¼šå‡½æ•°å‘½å本身就帮助ç†è§£ä»£ç ï¼Œé€»è¾‘分å—。\n", + " - å¯ç»´æŠ¤æ€§ï¼šä¿®æ”¹æŸéƒ¨åˆ†åŠŸèƒ½ï¼ˆå¦‚åˆ†è¯é€»è¾‘)åªéœ€æ”¹å¯¹åº”函数。\n", + " - å¤ç”¨æ€§ï¼šå‡½æ•°å¯å¤ç”¨åœ¨å…¶ä»–类似任务中。" + ] + }, + { + "cell_type": "markdown", + "id": "50737966-57c9-4daf-ac3b-6a1c73b18136", + "metadata": {}, + "source": [ + "## 第三部分:引入类å°è£…\n", + "\n", + "通过类å°è£…功能,进一步æé«˜ä»£ç çš„æ¨¡å—化ã€å¯æ‰©å±•性和å¤ç”¨æ€§ã€‚" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81aa7f9c-de28-4a7a-8ba1-130c3e5e4f7f", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "\n", + "class TextAnalyzer:\n", + " \"\"\"文本分æžç±»ï¼Œå°è£…è¯é¢‘统计功能\"\"\"\n", + " def __init__(self, data_dir='data', top_n=10):\n", + " self.data_dir = data_dir\n", + " self.top_n = top_n\n", + " self.word_count = Counter()\n", + "\n", + " def read_file(self, file_path):\n", + " \"\"\"è¯»å–æ–‡ä»¶å†…容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + " def tokenize(self, text):\n", + " \"\"\"使用 jieba 进行中文分è¯\"\"\"\n", + " return jieba.lcut(text)\n", + "\n", + " def process_file(self, file_path):\n", + " \"\"\"处ç†å•个文件\"\"\"\n", + " if file_path.endswith('.txt'):\n", + " text = self.read_file(file_path)\n", + " words = self.tokenize(text)\n", + " self.word_count.update(words)\n", + "\n", + " def process_directory(self):\n", + " \"\"\"处ç†ç›®å½•下所有文件\"\"\"\n", + " for file in os.listdir(self.data_dir):\n", + " file_path = os.path.join(self.data_dir, file)\n", + " 
self.process_file(file_path)\n", + "\n", + " def get_top_words(self):\n", + " \"\"\"获å–å‰ N 高频è¯\"\"\"\n", + " return self.word_count.most_common(self.top_n)\n", + "\n", + " def run(self):\n", + " \"\"\"执行è¯é¢‘统计\"\"\"\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")\n", + "\n", + "def main():\n", + " analyzer = TextAnalyzer(data_dir='data', top_n=10)\n", + " analyzer.run()\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "id": "62e780d4-94de-4830-89c2-ab2c96500fc5", + "metadata": {}, + "source": [ + "### 改进分æž\n", + "- é¢å‘对象å°è£…:\n", + " - 使用 TextAnalyzer 类将所有功能å°è£…为一个对象,数æ®ï¼ˆå¦‚ word_count)和方法(如 tokenize)绑定在一起。\n", + " - 通过 __init__ æä¾›é…置(如 data_dir å’Œ top_n),æé«˜çµæ´»æ€§ã€‚\n", + " \n", + "- 模å—化:类方法分工明确(如 read_fileã€tokenizeã€process_file),便于扩展。\n", + "- å·¥ç¨‹è´¨é‡æå‡ï¼š\n", + " - 坿‰©å±•性:å¯é€šè¿‡ç»§æ‰¿ TextAnalyzer 添加新功能(如支æŒå…¶ä»–分è¯å™¨æˆ–文件格å¼ï¼‰ã€‚\n", + " - å¤ç”¨æ€§ï¼šç±»å¯å®žä¾‹åŒ–多次,用于ä¸åŒç›®å½•æˆ–å‚æ•°ã€‚\n", + " - å¯ç»´æŠ¤æ€§ï¼šé€»è¾‘集中在类中,修改相对安全。" + ] + }, + { + "cell_type": "markdown", + "id": "9b4e17c4-f47e-4245-b3d9-e40fde0a2e04", + "metadata": {}, + "source": [ + "# 第四部分:引入文件模å—å°è£…\n", + "将代ç è¿›ä¸€æ­¥æ¨¡å—化到ä¸åŒæ–‡ä»¶ï¼Œå¼•å…¥é…置文件和åœç”¨è¯è¿‡æ»¤ã€‚" + ] + }, + { + "cell_type": "raw", + "id": "aadb5aea-8cc5-4a0f-9f5b-7eab28e90f1a", + "metadata": {}, + "source": [ + "目录结构\n", + "\n", + "project/\n", + "├── data/ # å°è¯´æ–‡æœ¬ç›®å½•\n", + "├── config.yaml # é…置文件\n", + "├── stop_words.txt # åœç”¨è¯æ–‡ä»¶\n", + "├── text_analyzer.py # åˆ†æžæ¨¡å—\n", + "├── main.py # 主程åº" + ] + }, + { + "cell_type": "raw", + "id": "2de4767b-8928-4f3f-8c8b-3c3cba2bc98a", + "metadata": {}, + "source": [ + "# config.yaml\n", + "\n", + "data_dir: data\n", + "top_n: 10\n", + "stop_words_file: stop_words.txt\n", + "output_file: output.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b442d61-c937-4757-b7b4-b6fc047c3529", + "metadata": {}, + "outputs": [], + "source": [ + "# text_analyzer.py\n", + "\n", + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "import yaml\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.word_count = Counter()\n", + " self.stop_words = self.load_stop_words()\n", + "\n", + " def load_stop_words(self):\n", + " \"\"\"加载åœç”¨è¯\"\"\"\n", + " try:\n", + " with open(self.stop_words_file, 'r', encoding='utf-8') as f:\n", + " return set(line.strip() for line in f if line.strip())\n", + " except Exception as e:\n", + " print(f\"Error loading stop words: {e}\")\n", + " return set()\n", + "\n", + " def read_file(self, file_path):\n", + " \"\"\"è¯»å–æ–‡ä»¶å†…容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + " def tokenize(self, text):\n", + " \"\"\"中文分è¯å¹¶è¿‡æ»¤åœç”¨è¯\"\"\"\n", + " words = jieba.lcut(text)\n", + " return [word for word in words if word not in self.stop_words]\n", + "\n", + " def process_file(self, file_path):\n", + " \"\"\"处ç†å•个文件\"\"\"\n", + " if 
file_path.endswith('.txt'):\n", + " text = self.read_file(file_path)\n", + " words = self.tokenize(text)\n", + " self.word_count.update(words)\n", + "\n", + " def process_directory(self):\n", + " \"\"\"处ç†ç›®å½•下所有文件\"\"\"\n", + " for file in os.listdir(self.data_dir):\n", + " file_path = os.path.join(self.data_dir, file)\n", + " self.process_file(file_path)\n", + "\n", + " def get_top_words(self):\n", + " \"\"\"获å–å‰ N 高频è¯\"\"\"\n", + " return self.word_count.most_common(self.top_n)\n", + "\n", + " def save_results(self, top_words):\n", + " \"\"\"ä¿å­˜ç»“果到文件\"\"\"\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + " def run(self):\n", + " \"\"\"执行è¯é¢‘统计并ä¿å­˜ç»“æžœ\"\"\"\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " self.save_results(top_words)\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22f58992-0108-4c90-894d-e756e7301a5a", + "metadata": {}, + "outputs": [], + "source": [ + "# main.py\n", + "\n", + "from text_analyzer import TextAnalyzer\n", + "\n", + "def main():\n", + " analyzer = TextAnalyzer()\n", + " analyzer.run()\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "id": "18d27410-8923-4662-a6b7-8e027609506e", + "metadata": {}, + "source": [ + "## 改进分æž\n", + "\n", + "- 模å—化:将分æžé€»è¾‘放入 text_analyzer.pyï¼Œä¸»ç¨‹åº main.py 仅负责调用,符åˆå·¥ç¨‹åŒ–项目结构。\n", + "- é…置文件:通过 config.yaml é…ç½®å‚æ•°ï¼Œå¢žå¼ºçµæ´»æ€§ï¼Œæ— éœ€ä¿®æ”¹ä»£ç å³å¯æ›´æ”¹ç›®å½•ã€è¾“出文件等。\n", + "- 输出到文件:增加 save_results 方法,支æŒç»“æžœæŒä¹…化。\n", + "- å·¥ç¨‹è´¨é‡æå‡ï¼š\n", + " - å¯ç»´æŠ¤æ€§ï¼šé…置文件和模å—化分离了é…置与逻辑,修改é…置无需动代ç ã€‚ \n", + " - å¤ç”¨æ€§ï¼šæ¨¡å—å¯å¯¼å…¥åˆ°å…¶ä»–项目,类å¯é‡å¤å®žä¾‹åŒ–。" + ] + }, + { + "cell_type": "markdown", + "id": "10876929-69f9-43bf-ba2d-a5d7bb11f22b", + "metadata": {}, + "source": [ + "### å°è£…的总节\n", + "\n", + "å°è£…方法:\n", + "- 模å—化:函数划分逻辑,é™ä½Žè€¦åˆã€‚\n", + "- 函数å°è£…:将é‡å¤é€»è¾‘å°è£…为函数,æé«˜å¤ç”¨æ€§ã€‚\n", + "- ç±»å°è£…:将数æ®å’Œæ–¹æ³•绑定,增强代ç ç»„织性和扩展性。\n", + "- 文件å°è£…:通过文件模å—化,符åˆå·¥ç¨‹åŒ–å¼€å‘规范。\n", + "\n", + "å·¥ç¨‹è´¨é‡æå‡ï¼š\n", + "- 分离é…置与逻辑,é™ä½Žç»´æŠ¤æˆæœ¬ã€‚\n", + "- 模å—化和é¢å‘对象设计支æŒåŠŸèƒ½æ‰©å±•ã€‚\n", + "- é”™è¯¯å¤„ç†æé«˜ç¨‹åºé²æ£’性。" + ] + }, + { + "cell_type": "raw", + "id": "60ba30d8-d8c2-4183-996e-376ff71716bf", + "metadata": {}, + "source": [ + "## å¦å¤–ä¸€ç§æ–‡ä»¶æ¨¡å—化设计(分层架构)示例\n", + "\n", + "å°†ä»£ç æ‹†åˆ†ä¸ºç‹¬ç«‹æ¨¡å—,æ¯ä¸ªæ¨¡å—ä»…è´Ÿè´£å•一èŒè´£ï¼š\n", + " - æ•°æ®è¯»å–层:é历目录ã€è¯»å–文件内容\n", + " - æ•°æ®å¤„ç†å±‚:文本清洗ã€åˆ†è¯ã€ç»Ÿè®¡è¯é¢‘\n", + " - 结果输出层:排åºå¹¶è¾“出å‰10高频è¯\n", + "\n", + "目录结构:\n", + "project/\n", + "├── data_loader.py # æ•°æ®è¯»å–模å—\n", + "├── text_processor.py # æ•°æ®å¤„ç†æ¨¡å—\n", + "├── output_handler.py # 结果输出模å—\n", + "└── main.py # 主程åºå…¥å£" + ] + }, + { + "cell_type": "markdown", + "id": "517759ac-c4cf-402e-86f1-a9fae0d88bbb", + "metadata": {}, + "source": [ + "# 第七部分:è¿è¡Œè¯´æ˜Ž\n", + "\n", + "环境准备:\n", + "- 安装 Python 3.8+。\n", + "- 安装ä¾èµ–:pip install jieba pyyaml。\n", + "- 准备 data 目录,放入 100 个 txt 文件。\n", + "- 创建 stop_words.txt å’Œ config.yaml。" + ] + }, + { + "cell_type": "markdown", + "id": "a7e1836b-42a1-45f9-bf8c-2e04a38744e4", + "metadata": {}, + "source": [ + "通过从无结构到结构化,å†åˆ°é¢å‘对象和模å—åŒ–çš„é€æ­¥ä¼˜åŒ–,展示了结构化编程和å°è£…方法如何显著æå‡ä»£ç å·¥ç¨‹è´¨é‡ã€‚最终实现ä¸ä»…满足了è¯é¢‘统计需求,还具备高å¯è¯»æ€§ã€å¯ç»´æŠ¤æ€§ã€å¯æ‰©å±•性和å¤ç”¨æ€§ï¼Œé€‚åˆå®žé™…工程应用。" + ] + } + ], + 
"metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/D Plus/.ipynb_checkpoints/01 特殊执行方å¼çš„语言特性.ipynb-checkpoint.ipynb b/D Plus/.ipynb_checkpoints/01 特殊执行方å¼çš„语言特性.ipynb-checkpoint.ipynb new file mode 100644 index 0000000..65b84ab --- /dev/null +++ b/D Plus/.ipynb_checkpoints/01 特殊执行方å¼çš„语言特性.ipynb-checkpoint.ipynb @@ -0,0 +1,312 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "86405617-889a-40c2-a895-7b51fb14b65d", + "metadata": {}, + "source": [ + "# 教学目标\n", + "\n", + "- 在è¯é¢‘统计案例中引入装饰器和函数å¼ç¼–程 。\n", + "- 分æžè¿™äº›ç‰¹æ€§å’Œæ¨¡å¼å¦‚何进一步优化代ç è´¨é‡ï¼ˆå¯è¯»æ€§ã€å¯ç»´æŠ¤æ€§ã€å¯æ‰©å±•性ã€å¤ç”¨æ€§ï¼‰ã€‚\n", + "- æŽ¢è®¨é«˜çº§ç‰¹æ€§åœ¨æ¡ˆä¾‹ä¸­çš„é€‚ç”¨æ€§ä¸Žå±€é™æ€§ã€‚" + ] + }, + { + "cell_type": "markdown", + "id": "e6a6a633-d3af-4778-815c-4490dff5f624", + "metadata": {}, + "source": [ + "## 第一部分:引入装饰器\n", + "\n", + "装饰器å¯ç”¨äºŽåœ¨ä¸ä¿®æ”¹å‡½æ•°ä»£ç çš„æƒ…å†µä¸‹æ·»åŠ åŠŸèƒ½ã€‚é€‚åˆæ—¥å¿—è®°å½•ã€æ€§èƒ½åˆ†æžã€é”™è¯¯å¤„ç†ç­‰åœºæ™¯ã€‚" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a5c7d69-d445-4a9c-bb48-7fde0a36c646", + "metadata": {}, + "outputs": [], + "source": [ + "# 为 TextAnalyzer 类添加一个装饰器,用于记录方法执行时间。\n", + "\n", + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "import yaml\n", + "import time\n", + "import functools\n", + "\n", + "def timing_decorator(func):\n", + " \"\"\"装饰器:记录函数执行时间\"\"\"\n", + " @functools.wraps(func)\n", + " def wrapper(*args, **kwargs):\n", + " start_time = time.time()\n", + " result = func(*args, **kwargs)\n", + " end_time = time.time()\n", + " print(f\"{func.__name__} took {end_time - start_time:.4f} seconds\")\n", + " return result\n", + " return wrapper\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.word_count = Counter()\n", + " self.stop_words = self.load_stop_words()\n", + "\n", + " def load_stop_words(self):\n", + " \"\"\"加载åœç”¨è¯\"\"\"\n", + " try:\n", + " with open(self.stop_words_file, 'r', encoding='utf-8') as f:\n", + " return set(line.strip() for line in f if line.strip())\n", + " except Exception as e:\n", + " print(f\"Error loading stop words: {e}\")\n", + " return set()\n", + "\n", + " @timing_decorator\n", + " def read_file(self, file_path):\n", + " \"\"\"è¯»å–æ–‡ä»¶å†…容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + " @timing_decorator\n", + " def tokenize(self, text):\n", + " \"\"\"中文分è¯å¹¶è¿‡æ»¤åœç”¨è¯\"\"\"\n", + " words = jieba.lcut(text)\n", + " return [word for word in words if word not in self.stop_words]\n", + "\n", + " def process_file(self, file_path):\n", + " \"\"\"处ç†å•个文件\"\"\"\n", + " if file_path.endswith('.txt'):\n", + " text = self.read_file(file_path)\n", + " words = 
self.tokenize(text)\n", + " self.word_count.update(words)\n", + "\n", + " def process_directory(self):\n", + " \"\"\"处ç†ç›®å½•下所有文件\"\"\"\n", + " for file in os.listdir(self.data_dir):\n", + " file_path = os.path.join(self.data_dir, file)\n", + " self.process_file(file_path)\n", + "\n", + " def get_top_words(self):\n", + " \"\"\"获å–å‰ N 高频è¯\"\"\"\n", + " return self.word_count.most_common(self.top_n)\n", + "\n", + " def save_results(self, top_words):\n", + " \"\"\"ä¿å­˜ç»“果到文件\"\"\"\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + " def run(self):\n", + " \"\"\"执行è¯é¢‘统计并ä¿å­˜ç»“æžœ\"\"\"\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " self.save_results(top_words)\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4dcabfd9-b8f9-4796-a060-9d9f6689e92f", + "metadata": {}, + "source": [ + "### 装饰器分æž\n", + "\n", + "功能:timing_decorator 记录 read_file å’Œ tokenize æ–¹æ³•çš„æ‰§è¡Œæ—¶é—´ï¼Œå¸®åŠ©åˆ†æžæ€§èƒ½ç“¶é¢ˆï¼ˆå¦‚分è¯è€—时较长)。\n", + "\n", + "å·¥ç¨‹è´¨é‡æå‡ï¼š\n", + " - å¯ç»´æŠ¤æ€§ï¼šæ— éœ€ä¿®æ”¹åŽŸæ–¹æ³•ä»£ç å³å¯æ·»åŠ æ€§èƒ½ç›‘æŽ§ï¼Œç¬¦åˆå¼€é—­åŽŸåˆ™,维护更方便。\n", + " - å¯è¯»æ€§ï¼šè£…é¥°å™¨å°†æ€§èƒ½ç›‘æŽ§é€»è¾‘ä¸Žä¸šåŠ¡é€»è¾‘åˆ†ç¦»ï¼Œä»£ç æ›´æ¸…晰。\n", + " - å¤ç”¨æ€§ï¼štiming_decorator å¯å¤ç”¨äºŽå…¶ä»–方法或项目。\n", + "\n", + "局陿€§ï¼šè£…é¥°å™¨å¢žåŠ å°‘é‡æ€§èƒ½å¼€é”€ï¼Œéœ€è°¨æ…Žç”¨äºŽé«˜é¢‘调用的函数。" + ] + }, + { + "cell_type": "markdown", + "id": "8fcbe48d-de8f-4387-9be3-f05f88553029", + "metadata": {}, + "source": [ + "## 第二部分:引入函数å¼ç¼–程\n", + "\n", + "函数å¼ç¼–程(如高阶函数ã€lambdaã€map/reduce)强调无å˜é‡æ±¡æŸ“ã€æ•°æ®è½¬æ¢ç®€æ´æ€§ã€‚在è¯é¢‘统计案例中,函数å¼ç¼–程å¯ç”¨äºŽï¼š\n", + "- æ•°æ®å¤„ç†ï¼šä½¿ç”¨ map å’Œ filter å¤„ç†æ–‡ä»¶å’Œå•è¯ã€‚\n", + "- è¯é¢‘统计:使用 reduce åˆå¹¶è¯é¢‘。\n", + "- 管é“å¼å¤„ç†ï¼šé€šè¿‡å‡½æ•°ç»„åˆå®žçŽ°æ•°æ®æµå¤„ç†ã€‚" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a6970b2-7488-43e3-ae9f-0174ff9b4b57", + "metadata": {}, + "outputs": [], + "source": [ + "# 函数å¼å¤„ç†æ–‡ä»¶å’Œè¯é¢‘\n", + "\n", + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "import yaml\n", + "from functools import reduce\n", + "from typing import List, Tuple\n", + "\n", + "def timing_decorator(func):\n", + " \"\"\"装饰器:记录函数执行时间\"\"\"\n", + " import time\n", + " import functools\n", + " @functools.wraps(func)\n", + " def wrapper(*args, **kwargs):\n", + " start_time = time.time()\n", + " result = func(*args, **kwargs)\n", + " end_time = time.time()\n", + " print(f\"{func.__name__} took {end_time - start_time:.4f} seconds\")\n", + " return result\n", + " return wrapper\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + "\n", + " def load_stop_words(self) -> set:\n", + " \"\"\"加载åœç”¨è¯\"\"\"\n", + " try:\n", + " with open(self.stop_words_file, 'r', encoding='utf-8') as f:\n", + " return set(line.strip() for line in f if line.strip())\n", + " except Exception as e:\n", + " print(f\"Error loading stop words: {e}\")\n", + " return set()\n", + "\n", + " @timing_decorator\n", + " def read_file(self, file_path: str) 
-> str:\n",
+    "        \"\"\"è¯»å–æ–‡ä»¶å†…容\"\"\"\n",
+    "        try:\n",
+    "            with open(file_path, 'r', encoding='utf-8') as f:\n",
+    "                return f.read()\n",
+    "        except Exception as e:\n",
+    "            print(f\"Error reading {file_path}: {e}\")\n",
+    "            return \"\"\n",
+    "\n",
+    "    def tokenize(self, text: str) -> List[str]:\n",
+    "        \"\"\"中文分è¯å¹¶è¿‡æ»¤åœç”¨è¯ï¼ˆå‡½æ•°å¼ï¼‰\"\"\"\n",
+    "        return list(filter(lambda w: w not in self.stop_words, jieba.lcut(text)))\n",
+    "\n",
+    "    def process_file(self, file_path: str) -> Counter:\n",
+    "        \"\"\"处ç†å•个文件,返回è¯é¢‘ Counter\"\"\"\n",
+    "        if file_path.endswith('.txt'):\n",
+    "            text = self.read_file(file_path)\n",
+    "            words = self.tokenize(text)\n",
+    "            return Counter(words)\n",
+    "        return Counter()\n",
+    "\n",
+    "    def process_directory(self) -> Counter:\n",
+    "        \"\"\"处ç†ç›®å½•下所有文件(函数å¼ï¼‰\"\"\"\n",
+    "        file_paths = (os.path.join(self.data_dir, f) for f in os.listdir(self.data_dir))\n",
+    "        counters = map(self.process_file, file_paths)\n",
+    "        return reduce(lambda c1, c2: c1 + c2, counters, Counter())\n",
+    "\n",
+    "    def get_top_words(self, word_count: Counter) -> List[Tuple[str, int]]:\n",
+    "        \"\"\"获å–å‰ N 高频è¯\"\"\"\n",
+    "        return word_count.most_common(self.top_n)\n",
+    "\n",
+    "    def save_results(self, top_words: List[Tuple[str, int]]):\n",
+    "        \"\"\"ä¿å­˜ç»“果到文件\"\"\"\n",
+    "        with open(self.output_file, 'w', encoding='utf-8') as f:\n",
+    "            for word, count in top_words:\n",
+    "                f.write(f\"{word}: {count}\\n\")\n",
+    "\n",
+    "    def run(self):\n",
+    "        \"\"\"执行è¯é¢‘统计并ä¿å­˜ç»“æžœ\"\"\"\n",
+    "        word_count = self.process_directory()\n",
+    "        top_words = self.get_top_words(word_count)\n",
+    "        self.save_results(top_words)\n",
+    "        for word, count in top_words:\n",
+    "            print(f\"{word}: {count}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6ce3b7c3-f099-4e2c-b415-18b0e3ab492a",
+   "metadata": {},
+   "source": [
+    "### 函数å¼ç¼–程分æž\n",
+    "\n",
+    "改进:\n",
+    "- map:在 process_directory 中,使用 map(self.process_file, file_paths) å¹¶è¡Œå¤„ç†æ–‡ä»¶è·¯å¾„,生æˆè¯é¢‘ Counter 列表。\n",
+    "- reduce:使用 reduce(lambda c1, c2: c1 + c2, counters, Counter()) åˆå¹¶æ‰€æœ‰æ–‡ä»¶çš„è¯é¢‘,简æ´ä¸”无副作用。\n",
+    "- filter:在 tokenize 中,使用 filter(lambda w: w not in self.stop_words, ...) 
过滤åœç”¨è¯ï¼Œæ›¿ä»£åˆ—表推导å¼ã€‚\n", + "- 生æˆå™¨ï¼šfile_paths 使用生æˆå™¨è¡¨è¾¾å¼ï¼Œå‡å°‘内存å ç”¨ã€‚\n", + "\n", + "å·¥ç¨‹è´¨é‡æå‡ï¼š\n", + "- å¯è¯»æ€§ï¼šå‡½æ•°å¼ç¼–程使数æ®å¤„ç†é€»è¾‘更简æ´ï¼Œç®¡é“å¼å¤„ç†æ¸…æ™°è¡¨è¾¾æ•°æ®æµï¼ˆæ–‡ä»¶è·¯å¾„ -> è¯é¢‘ -> åˆå¹¶ï¼‰ã€‚\n", + "- 性能:生æˆå™¨å’Œ map 优化内存使用,适åˆå¤„ç†å¤§é‡æ–‡ä»¶ã€‚\n", + "- å¯ç»´æŠ¤æ€§ï¼šå‡½æ•°å¼ä»£ç æ— å‰¯ä½œç”¨ï¼Œæ˜“于测试和调试。\n", + "- é€‚ç”¨åœºæ™¯ï¼šé€‚åˆæ•°æ®è½¬æ¢å’Œæ‰¹é‡å¤„ç†ï¼ˆå¦‚文件读å–ã€è¯é¢‘åˆå¹¶ï¼‰ã€‚\n", + "- ç®€æ´æ€§ï¼šmapã€reduce 等使数æ®å¤„ç†é€»è¾‘更紧凑。\n", + "- 内存效率:生æˆå™¨å’Œæƒ°æ€§æ±‚值优化内存使用。\n", + "- 结åˆå¹¶å‘坿˜¾è‘—æå‡æ•ˆçŽ‡ã€‚\n", + "\n", + "é€‚ç”¨åœºæ™¯ï¼šæ•°æ®æµå¤„ç†ï¼ˆå¦‚文件处ç†ã€è¯é¢‘åˆå¹¶ï¼‰ã€æ— çŠ¶æ€æ“作。\n", + "\n", + "局陿€§ï¼š\n", + "- 函数å¼ä»£ç å¯¹åˆå­¦è€…å¯èƒ½ä¸å¤Ÿç›´è§‚,需熟悉 mapã€reduce 等概念。\n", + "- å¯¹äºŽå¤æ‚逻辑,函数å¼ç¼–程å¯èƒ½å¢žåŠ è°ƒè¯•éš¾åº¦ã€‚" + ] + }, + { + "cell_type": "markdown", + "id": "458e18ec-b536-4860-9e12-d0bf5ed9d876", + "metadata": {}, + "source": [ + "# 练习\n", + "\n", + "实践练习:\n", + "- æ·»åŠ æ—¥å¿—è£…é¥°å™¨ï¼Œè®°å½•æ¯æ¬¡æ–‡ä»¶å¤„ç†çš„详细信æ¯ã€‚\n", + "- 使用 functools.reduce é‡å†™ get_top_words,å°è¯•ä¸åŒæŽ’åºé€»è¾‘。\n", + "\n", + "扩展任务:\n", + "- 添加缓存装饰器,é¿å…é‡å¤åˆ†è¯ç›¸åŒæ–‡ä»¶ã€‚\n", + "- 实现函数å¼ç®¡é“,将文件读å–ã€åˆ†è¯ã€ç»Ÿè®¡ä¸²è”为å•一æµã€‚" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/D Plus/.ipynb_checkpoints/02 设计模å¼-checkpoint.ipynb b/D Plus/.ipynb_checkpoints/02 设计模å¼-checkpoint.ipynb new file mode 100644 index 0000000..8f14f9e --- /dev/null +++ b/D Plus/.ipynb_checkpoints/02 设计模å¼-checkpoint.ipynb @@ -0,0 +1,493 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "eccfe49f-de35-4241-90e3-a7095940b61a", + "metadata": {}, + "source": [ + "è®¾è®¡æ¨¡å¼æä¾›é«˜é¢‘é‡å¤å‡ºçŽ°éœ€æ±‚çš„æœ€ä½³è§£å†³æ–¹æ¡ˆã€‚ä»¥ä¸‹ä»‹ç»é€‚åˆè¯é¢‘统计案例的设计模å¼ï¼šç­–略模å¼ã€è§‚察者模å¼ã€å·¥åŽ‚æ¨¡å¼ã€‚" + ] + }, + { + "cell_type": "markdown", + "id": "c186171f-d1f2-433e-a3eb-b266e2909a2c", + "metadata": {}, + "source": [ + "## 策略模å¼ï¼ˆåЍæ€é€‰æ‹©åˆ†è¯ç­–略)\n", + "\n", + "策略模å¼å…许动æ€åˆ‡æ¢ç®—法(如分è¯å™¨ï¼‰ï¼Œæ¯”元编程简å•。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97c865cb-0b5a-4fa1-aa74-5ba2e65e7436", + "metadata": {}, + "outputs": [], + "source": [ + "from abc import ABC, abstractmethod\n", + "\n", + "class Tokenizer(ABC):\n", + " \"\"\"分è¯å™¨æŽ¥å£\"\"\"\n", + " @abstractmethod\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " pass\n", + "\n", + "class JiebaTokenizer(Tokenizer):\n", + " \"\"\"jieba 分è¯å™¨\"\"\"\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " return [w for w in jieba.lcut(text) if w not in stop_words]\n", + "\n", + "class SimpleTokenizer(Tokenizer):\n", + " \"\"\"简å•分è¯å™¨\"\"\"\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " return [w for w in text.split() if w not in stop_words]\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = 
config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " # 动æ€é€‰æ‹©åˆ†è¯å™¨\n", + " tokenizer_name = config.get('tokenizer', 'jieba')\n", + " self.tokenizer = {'jieba': JiebaTokenizer(), 'simple': SimpleTokenizer()}[tokenizer_name]\n", + "\n", + " def tokenize(self, text: str) -> List[str]:\n", + " \"\"\"使用策略分è¯\"\"\"\n", + " return self.tokenizer.tokenize(text, self.stop_words)\n", + "\n", + " # 其余方法åŒä¸Š" + ] + }, + { + "cell_type": "markdown", + "id": "5435ebc3-d3b0-4475-8bd5-cb45fb51638c", + "metadata": {}, + "source": [ + "å·¥ç¨‹è´¨é‡æå‡ï¼š\n", + "- 坿‰©å±•性:添加新分è¯å™¨åªéœ€å®žçް Tokenizer 接å£ã€‚\n", + "- å¯ç»´æŠ¤æ€§ï¼šåˆ†è¯é€»è¾‘与主类分离,修改更独立。\n", + "\n", + "适用场景:适åˆéœ€è¦åЍæ€åˆ‡æ¢ç®—法的场景。" + ] + }, + { + "cell_type": "markdown", + "id": "fbf53455-558c-40fb-8718-446dec989b5d", + "metadata": {}, + "source": [ + "## 观察者模å¼ï¼ˆç»“果输出解耦)\n", + "\n", + "观察者模å¼å¯ç”¨äºŽè§£è€¦ç»“果输出逻辑(如打å°ã€ä¿å­˜æ–‡ä»¶ã€å‘é€é€šçŸ¥ï¼‰ã€‚" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7a2bd4c-df73-4800-b45b-9b6c73d28d7b", + "metadata": {}, + "outputs": [], + "source": [ + "class OutputObserver(ABC):\n", + " \"\"\"输出观察者接å£\"\"\"\n", + " @abstractmethod\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " pass\n", + "\n", + "class ConsoleOutput(OutputObserver):\n", + " \"\"\"控制å°è¾“出\"\"\"\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")\n", + "\n", + "class FileOutput(OutputObserver):\n", + " \"\"\"文件输出\"\"\"\n", + " def __init__(self, output_file: str):\n", + " self.output_file = output_file\n", + "\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " self.observers = [ConsoleOutput(), FileOutput(self.output_file)]\n", + "\n", + " def add_observer(self, observer: OutputObserver):\n", + " \"\"\"添加观察者\"\"\"\n", + " self.observers.append(observer)\n", + "\n", + " def notify_observers(self, top_words: List[Tuple[str, int]]):\n", + " \"\"\"通知所有观察者\"\"\"\n", + " for observer in self.observers:\n", + " observer.update(top_words)\n", + "\n", + " def run(self):\n", + " \"\"\"执行è¯é¢‘统计并通知观察者\"\"\"\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " self.notify_observers(top_words)\n", + "\n", + " # 其余方法åŒä¸Š" + ] + }, + { + "cell_type": "markdown", + "id": "02b5cfba-431c-4a01-a454-099e4f41922c", + "metadata": {}, + "source": [ + "### 分æž\n", + "\n", + "å·¥ç¨‹è´¨é‡æå‡ï¼š\n", + " - 坿‰©å±•性:添加新输出方å¼åªéœ€å®žçް OutputObserver 接å£ã€‚\n", + " - 解耦性:输出逻辑与统计逻辑分离,修改输出ä¸å½±å“核心功能。\n", + "\n", + "适用场景:适åˆéœ€è¦å¤šç§è¾“出或通知的场景。\n", + "\n", + "局陿€§ï¼šè§‚察者模å¼å¢žåР代ç å¤æ‚性,适åˆå¤æ‚输出需求。" + ] + }, + { + "cell_type": "markdown", + "id": "11669305-8cd5-4317-afd5-e85c3f0a5a81", + "metadata": {}, + "source": [ + "## 工厂模å¼ï¼ˆåЍæ€åˆ›å»ºåˆ†è¯å™¨ï¼‰\n", + "\n", + "工厂模å¼å¯ç”¨äºŽåЍæ€åˆ›å»ºåˆ†è¯å™¨ï¼Œç®€åŒ–策略模å¼ä¸­çš„åˆå§‹åŒ–逻辑。" + ] + }, + 
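{ + "cell_type": "raw", + "metadata": {}, + "source": [ + "Both the strategy-based TextAnalyzer above and the TokenizerFactory version below pick the tokenizer via config.get('tokenizer', 'jieba'). A sketch of the corresponding config.yaml entry (the key name and the values 'jieba'/'simple' come from that call; this cell is an illustrative addition, not part of the original config.yaml):\n", + "\n", + "# config.yaml\n", + "tokenizer: jieba  # or: simple" + ] + }, + 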
{ + "cell_type": "code", + "execution_count": null, + "id": "2fa50633-de22-40c8-912d-3ded5ebcedfc", + "metadata": {}, + "outputs": [], + "source": [ + "class TokenizerFactory:\n", + " \"\"\"分è¯å™¨å·¥åŽ‚\"\"\"\n", + " @staticmethod\n", + " def create_tokenizer(name: str) -> Tokenizer:\n", + " tokenizers = {\n", + " 'jieba': JiebaTokenizer(),\n", + " 'simple': SimpleTokenizer()\n", + " }\n", + " return tokenizers.get(name, JiebaTokenizer())\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " self.tokenizer = TokenizerFactory.create_tokenizer(config.get('tokenizer', 'jieba'))\n", + "\n", + " # 其余方法åŒä¸Š" + ] + }, + { + "cell_type": "markdown", + "id": "a4db7046-dfe2-4bd8-81d1-49a42e2eeb5c", + "metadata": {}, + "source": [ + "### 分æž\n", + "\n", + "å·¥ç¨‹è´¨é‡æå‡ï¼š\n", + " - å¯ç»´æŠ¤æ€§ï¼šåˆ†è¯å™¨åˆ›å»ºé€»è¾‘集中于工厂,易于修改。\n", + " - 坿‰©å±•性:添加新分è¯å™¨åªéœ€æ›´æ–°å·¥åŽ‚æ–¹æ³•ã€‚\n", + "\n", + "适用场景:适åˆéœ€è¦åЍæ€åˆ›å»ºå¯¹è±¡çš„场景。\n", + "\n", + "局陿€§ï¼šå¯¹äºŽç®€å•场景,工厂模å¼å¯èƒ½ç•¥æ˜¾å†—余。" + ] + }, + { + "cell_type": "markdown", + "id": "e5f2aef4-a055-43a9-917c-fa183de6db2d", + "metadata": {}, + "source": [ + "## 综åˆå®žçŽ°ï¼ˆæ•´åˆç‰¹æ€§ä¸Žæ¨¡å¼ï¼‰\n", + "\n", + "æ•´åˆä¸Šä¸‹æ–‡ç®¡ç†å™¨ã€ç”Ÿæˆå™¨ã€ç­–略模å¼å’Œè§‚察者模å¼çš„æœ€ç»ˆå®žçŽ°ï¼ˆéƒ¨åˆ†ä»£ç å±•示)。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa7f34e2-d355-4a22-8572-729c49b18605", + "metadata": {}, + "outputs": [], + "source": [ + "# text_analyzer.py\n", + "\n", + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "import yaml\n", + "from contextlib import contextmanager\n", + "from typing import List, Tuple\n", + "from abc import ABC, abstractmethod\n", + "\n", + "@contextmanager\n", + "def file_reader(file_path: str):\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " yield f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " yield \"\"\n", + "\n", + "class Tokenizer(ABC):\n", + " @abstractmethod\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " pass\n", + "\n", + "class JiebaTokenizer(Tokenizer):\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " for word in jieba.lcut(text):\n", + " if word not in stop_words:\n", + " yield word\n", + "\n", + "class SimpleTokenizer(Tokenizer):\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " for word in text.split():\n", + " if word not in stop_words:\n", + " yield word\n", + "\n", + "class TokenizerFactory:\n", + " @staticmethod\n", + " def create_tokenizer(name: str) -> Tokenizer:\n", + " return {'jieba': JiebaTokenizer(), 'simple': SimpleTokenizer()}.get(name, JiebaTokenizer())\n", + "\n", + "class OutputObserver(ABC):\n", + " @abstractmethod\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " pass\n", + "\n", + "class ConsoleOutput(OutputObserver):\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")\n", + "\n", + "class FileOutput(OutputObserver):\n", + " def __init__(self, output_file: str):\n", + " 
self.output_file = output_file\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " self.tokenizer = TokenizerFactory.create_tokenizer(config.get('tokenizer', 'jieba'))\n", + " self.observers = [ConsoleOutput(), FileOutput(self.output_file)]\n", + "\n", + " def load_stop_words(self) -> set:\n", + " with file_reader(self.stop_words_file) as content:\n", + " return set(line.strip() for line in content.splitlines() if line.strip())\n", + "\n", + " def process_file(self, file_path: str):\n", + " if file_path.endswith('.txt'):\n", + " with file_reader(file_path) as text:\n", + " words = self.tokenizer.tokenize(text, self.stop_words)\n", + " self.word_count.update(words)\n", + "\n", + " def process_directory(self):\n", + " for file in os.listdir(self.data_dir):\n", + " file_path = os.path.join(self.data_dir, file)\n", + " self.process_file(file_path)\n", + "\n", + " def get_top_words(self) -> List[Tuple[str, int]]:\n", + " return self.word_count.most_common(self.top_n)\n", + "\n", + " def notify_observers(self, top_words: List[Tuple[str, int]]):\n", + " for observer in self.observers:\n", + " observer.update(top_words)\n", + "\n", + " def run(self):\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " self.notify_observers(top_words)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d130312-b298-4c76-ae09-0fb4bd08b0c1", + "metadata": {}, + "outputs": [], + "source": [ + "# main.py\n", + "\n", + "from text_analyzer import TextAnalyzer\n", + "\n", + "def main():\n", + " analyzer = TextAnalyzer()\n", + " analyzer.run()\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "id": "770618c9-428e-454a-97de-00e3b49c9d03", + "metadata": {}, + "source": [ + "## 结论\n", + "\n", + "通过引入上下文管ç†å™¨ã€ç”Ÿæˆå™¨ã€å…ƒç¼–程ã€ç­–略模å¼ã€è§‚察者模å¼å’Œå·¥åŽ‚æ¨¡å¼ï¼Œè¯é¢‘统计代ç åœ¨å¯æ‰©å±•性ã€å¯ç»´æŠ¤æ€§å’Œå¤ç”¨æ€§ä¸Šè¿›ä¸€æ­¥æå‡ã€‚\n", + "这些特性和模å¼ä½¿ä»£ç æ›´æ¨¡å—化ã€çµæ´»ï¼Œé€‚åˆå¤§åž‹é¡¹ç›®ï¼ŒåŒæ—¶ä¿æŒæ¸…晰的工程结构。结åˆä¹‹å‰çš„装饰器和函数å¼ç¼–程,代ç å·²è¾¾åˆ°å·¥ç¨‹åŒ–水平。\n", + "\n", + "若需深入,å¯ä»¥è¿›ä¸€æ­¥è€ƒè™‘其它性能特性." 
+ ] + }, + { + "cell_type": "markdown", + "id": "cbeaa07d-272f-465b-a437-9c4b44827d23", + "metadata": {}, + "source": [ + "## 进一步练习\n", + "\n", + "实践练习:\n", + "- 实现新分è¯å™¨ï¼ˆå¦‚ thulacï¼‰å¹¶é€šè¿‡ç­–ç•¥æ¨¡å¼æˆ–工厂模å¼é›†æˆã€‚\n", + "- 添加新观察者(如 JSON 输出)。\n", + "\n", + "使用生æˆå™¨å®žçްæµå¼è¯é¢‘统计,比较内存å ç”¨ã€‚\n", + "å®žçŽ°ç¼“å­˜æœºåˆ¶ï¼Œç¼“å­˜å·²å¤„ç†æ–‡ä»¶çš„分è¯ç»“果。\n", + "\n", + "添加命令行接å£ï¼ˆargparse),动æ€é…ç½® top_n å’Œ tokenizer。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a43b53d-1e07-4ebe-a6c8-104353fd5f7b", + "metadata": {}, + "outputs": [], + "source": [ + "## 附:元编程\n", + "\n", + "元编程å…许动æ€ä¿®æ”¹ç±»æˆ–函数行为,å¯ç”¨äºŽåЍæ€é…置分è¯å™¨æˆ–输出格å¼ã€‚案例中,å¯é€šè¿‡å…ƒç¼–ç¨‹åŠ¨æ€æ³¨å†Œåˆ†è¯å™¨ã€‚" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4394008c-88da-44bd-aa0d-f1b7a6dbc7d6", + "metadata": {}, + "outputs": [], + "source": [ + "class TokenizerRegistry(type):\n", + " \"\"\"å…ƒç±»ï¼šåŠ¨æ€æ³¨å†Œåˆ†è¯å™¨\"\"\"\n", + " tokenizers = {}\n", + "\n", + " def register_tokenizer(cls, name):\n", + " def decorator(func):\n", + " cls.tokenizers[name] = func\n", + " return func\n", + " return decorator\n", + "\n", + "class TextAnalyzer(metaclass=TokenizerRegistry):\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " self.tokenizer_name = config.get('tokenizer', 'jieba') # 从é…置读å–分è¯å™¨\n", + "\n", + " @classmethod\n", + " def register_tokenizer(cls, name):\n", + " return cls.__class__.register_tokenizer(name)\n", + "\n", + " def tokenize(self, text: str) -> List[str]:\n", + " \"\"\"动æ€è°ƒç”¨åˆ†è¯å™¨\"\"\"\n", + " tokenizer = self.__class__.tokenizers.get(self.tokenizer_name, self.jieba_tokenizer)\n", + " return tokenizer(self, text)\n", + "\n", + " @register_tokenizer('jieba')\n", + " def jieba_tokenizer(self, text: str) -> List[str]:\n", + " \"\"\"jieba 分è¯\"\"\"\n", + " return [w for w in jieba.lcut(text) if w not in self.stop_words]\n", + "\n", + " @register_tokenizer('simple')\n", + " def simple_tokenizer(self, text: str) -> List[str]:\n", + " \"\"\"简å•分è¯ï¼ˆæŒ‰ç©ºæ ¼ï¼‰\"\"\"\n", + " return [w for w in text.split() if w not in self.stop_words]\n", + "\n", + " # 其余方法(load_stop_words, process_file, etc.)åŒä¸Š" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2249f13a-7a3f-4376-ba2a-d92f11658d32", + "metadata": {}, + "outputs": [], + "source": [ + "### 分æž\n", + "\n", + "åŠŸèƒ½ï¼šé€šè¿‡å…ƒç±»å’Œè£…é¥°å™¨åŠ¨æ€æ³¨å†Œåˆ†è¯å™¨ï¼Œæ”¯æŒé…置切æ¢ï¼ˆå¦‚ jieba 或 simple)。\n", + "\n", + "å·¥ç¨‹è´¨é‡æå‡ï¼š\n", + " 坿‰©å±•性:新分è¯å™¨åªéœ€æ·»åŠ æ–°æ–¹æ³•å¹¶æ³¨å†Œï¼Œæ— éœ€ä¿®æ”¹æ ¸å¿ƒé€»è¾‘ã€‚\n", + " çµæ´»æ€§ï¼šé€šè¿‡é…置文件动æ€é€‰æ‹©åˆ†è¯å™¨ã€‚\n", + "\n", + "适用场景:适åˆéœ€è¦åЍæ€é…置或æ’件化系统的场景。\n", + "\n", + "局陿€§ï¼šå…ƒç¼–程增加代ç å¤æ‚性,å¯èƒ½é™ä½Žå¯è¯»æ€§ï¼Œéœ€è°¨æ…Žä½¿ç”¨ã€‚" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + 
"nbformat_minor": 5 +} diff --git a/D Plus/00 å°è£….ipynb b/D Plus/00 å°è£….ipynb new file mode 100644 index 0000000..4d255e0 --- /dev/null +++ b/D Plus/00 å°è£….ipynb @@ -0,0 +1,478 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "69e76aa7-2c5d-4114-a302-85e17cc83e2c", + "metadata": {}, + "source": [ + "æœ¬æ–‡æ—¨åœ¨é€šè¿‡ä¸€ä¸ªæ¡ˆä¾‹ï¼ˆè¯»å– data 目录下 100 篇å°è¯´æ–‡æœ¬ï¼Œç»Ÿè®¡è¯é¢‘å¹¶è¾“å‡ºå‰ 10 高频è¯ï¼‰æ¥è¯´æ˜Žç»“构化编程和å°è£…方法如何æå‡ä»£ç å·¥ç¨‹è´¨é‡ã€‚\n", + "æ•™æ¡ˆå°†é€æ­¥å±•示ä¸åŒç»“构化方法和å°è£…技术的应用,并分æžå…¶å¯¹ä»£ç å¯è¯»æ€§ã€å¯ç»´æŠ¤æ€§ã€å¯æ‰©å±•性和å¤ç”¨æ€§çš„æå‡ã€‚" + ] + }, + { + "cell_type": "markdown", + "id": "b9a9a366-7fd3-422b-b3bc-b0bc00374da6", + "metadata": {}, + "source": [ + "# 教学目标\n", + "- 掌æ¡å°è£…方法(函数ã€ç±»ã€æ¨¡å—)在代ç ç»„织中的作用。" + ] + }, + { + "cell_type": "markdown", + "id": "1387e026-c978-4217-9015-ab0e047c01a0", + "metadata": {}, + "source": [ + "## ç¬¬ä¸€éƒ¨åˆ†ï¼šåŸºç¡€å®žçŽ°ï¼ˆæ— ç»“æž„åŒ–ã€æ— å°è£…)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33803186-d890-4cd7-9636-8920fcb86e14", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "files = os.listdir('data')\n", + "word_count = {}\n", + "for file in files:\n", + " with open('data/' + file, 'r', encoding='utf-8') as f:\n", + " text = f.read()\n", + " words = text.split() # å‡è®¾ç®€å•按空格分è¯\n", + " for word in words:\n", + " if word in word_count:\n", + " word_count[word] += 1\n", + " else:\n", + " word_count[word] = 1\n", + "\n", + "# 排åºå¹¶è¾“出å‰10\n", + "sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)\n", + "for i in range(10):\n", + " print(sorted_words[i])" + ] + }, + { + "cell_type": "markdown", + "id": "471351e7-8645-4690-973a-7d8de53bda5f", + "metadata": {}, + "source": [ + "### 问题分æž\n", + "\n", + "- å¯è¯»æ€§å·®ï¼šæ²¡æœ‰æ¸…晰的功能划分,代ç é€»è¾‘æ··æ‚,难以阅读ç†è§£ç»´æŠ¤ã€‚\n", + "- æ‰©å±•æ€§å·®ï¼šå¦‚æžœéœ€è¦æ›´æ”¹åˆ†è¯é€»è¾‘ã€æ–‡ä»¶è·¯å¾„或输出格å¼ï¼Œéœ€ä¿®æ”¹å¤šå¤„代ç ã€‚\n", + "- å®¹é”™æ€§å·®ï¼šæœªå¤„ç†æ–‡ä»¶è¯»å–失败ã€ç©ºæ–‡ä»¶ç­‰é—®é¢˜ã€‚\n", + "- å¤ç”¨æ€§ä½Žï¼šé€»è¾‘无法直接å¤ç”¨åœ¨å…¶ä»–类似任务中。" + ] + }, + { + "cell_type": "markdown", + "id": "a5881283-c295-4433-8edd-f915201a5f43", + "metadata": {}, + "source": [ + "## 第二部分:引入函数å°è£…\n", + "\n", + "æç‚¼å‡ºè‹¥å¹²å‡½æ•°ï¼Œå‡å°‘代ç çš„夿‚性,æé«˜å¯è¯»æ€§å’Œå¯ç»´æŠ¤æ€§ã€‚" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7beadc81-f939-4ac5-b885-407c6810b7de", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "def read_file(file_path):\n", + " \"\"\"读å–å•个文件内容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + "def get_words(text):\n", + " \"\"\"简å•分è¯ï¼ˆæŒ‰ç©ºæ ¼ï¼‰\"\"\"\n", + " return text.split()\n", + "\n", + "def count_words(words):\n", + " \"\"\"统计è¯é¢‘\"\"\"\n", + " word_count = {}\n", + " for word in words:\n", + " word_count[word] = word_count.get(word, 0) + 1\n", + " return word_count\n", + "\n", + "def get_top_n(word_count, n=10):\n", + " \"\"\"获å–å‰ N 高频è¯\"\"\"\n", + " return sorted(word_count.items(), key=lambda x: x[1], reverse=True)[:n]\n", + "\n", + "def main():\n", + " \"\"\"主函数,控制æµç¨‹\"\"\"\n", + " word_count = {}\n", + " data_dir = 'data'\n", + " \n", + " # 顺åºç»“æž„ï¼šæŒ‰æ­¥éª¤è¯»å–æ–‡ä»¶ã€å¤„ç†æ–‡æœ¬\n", + " for file in os.listdir(data_dir):\n", + " file_path = os.path.join(data_dir, file)\n", + " # 选择结构:检查文件是å¦ä¸º txt\n", + " if file_path.endswith('.txt'):\n", + " text = 
read_file(file_path)\n", + " # å¾ªçŽ¯ç»“æž„ï¼šå¤„ç†æ¯ä¸ªæ–‡ä»¶çš„è¯\n", + " words = get_words(text)\n", + " file_word_count = count_words(words)\n", + " # åˆå¹¶è¯é¢‘\n", + " for word, count in file_word_count.items():\n", + " word_count[word] = word_count.get(word, 0) + count\n", + " \n", + " # 输出结果\n", + " top_words = get_top_n(word_count)\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "id": "4f7218a3-43d2-4159-9854-9880020c42fc", + "metadata": {}, + "source": [ + "### 改进分æž\n", + " - 逻辑分层:main() å‡½æ•°æ¸…æ™°å®šä¹‰äº†ç¨‹åºæ‰§è¡Œæ­¥éª¤ï¼ˆè¯»å–文件 -> åˆ†è¯ -> 统计 -> 输出)。\n", + " - 模å—化:将功能拆分为函数(read_fileã€get_wordsã€count_wordsã€get_top_n),æé«˜ä»£ç å¤ç”¨æ€§å’Œå¯è¯»æ€§ã€‚\n", + " - 错误处ç†ï¼šå¢žåŠ  try-except å¤„ç†æ–‡ä»¶è¯»å–异常。\n", + " - å·¥ç¨‹è´¨é‡æå‡ï¼š\n", + " - å¯è¯»æ€§ï¼šå‡½æ•°å‘½å本身就帮助ç†è§£ä»£ç ï¼Œé€»è¾‘分å—。\n", + " - å¯ç»´æŠ¤æ€§ï¼šä¿®æ”¹æŸéƒ¨åˆ†åŠŸèƒ½ï¼ˆå¦‚åˆ†è¯é€»è¾‘)åªéœ€æ”¹å¯¹åº”函数。\n", + " - å¤ç”¨æ€§ï¼šå‡½æ•°å¯å¤ç”¨åœ¨å…¶ä»–类似任务中。" + ] + }, + { + "cell_type": "markdown", + "id": "50737966-57c9-4daf-ac3b-6a1c73b18136", + "metadata": {}, + "source": [ + "## 第三部分:引入类å°è£…\n", + "\n", + "通过类å°è£…功能,进一步æé«˜ä»£ç çš„æ¨¡å—化ã€å¯æ‰©å±•性和å¤ç”¨æ€§ã€‚" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81aa7f9c-de28-4a7a-8ba1-130c3e5e4f7f", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "\n", + "class TextAnalyzer:\n", + " \"\"\"文本分æžç±»ï¼Œå°è£…è¯é¢‘统计功能\"\"\"\n", + " def __init__(self, data_dir='data', top_n=10):\n", + " self.data_dir = data_dir\n", + " self.top_n = top_n\n", + " self.word_count = Counter()\n", + "\n", + " def read_file(self, file_path):\n", + " \"\"\"è¯»å–æ–‡ä»¶å†…容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + " def tokenize(self, text):\n", + " \"\"\"使用 jieba 进行中文分è¯\"\"\"\n", + " return jieba.lcut(text)\n", + "\n", + " def process_file(self, file_path):\n", + " \"\"\"处ç†å•个文件\"\"\"\n", + " if file_path.endswith('.txt'):\n", + " text = self.read_file(file_path)\n", + " words = self.tokenize(text)\n", + " self.word_count.update(words)\n", + "\n", + " def process_directory(self):\n", + " \"\"\"处ç†ç›®å½•下所有文件\"\"\"\n", + " for file in os.listdir(self.data_dir):\n", + " file_path = os.path.join(self.data_dir, file)\n", + " self.process_file(file_path)\n", + "\n", + " def get_top_words(self):\n", + " \"\"\"获å–å‰ N 高频è¯\"\"\"\n", + " return self.word_count.most_common(self.top_n)\n", + "\n", + " def run(self):\n", + " \"\"\"执行è¯é¢‘统计\"\"\"\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")\n", + "\n", + "def main():\n", + " analyzer = TextAnalyzer(data_dir='data', top_n=10)\n", + " analyzer.run()\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "id": "62e780d4-94de-4830-89c2-ab2c96500fc5", + "metadata": {}, + "source": [ + "### 改进分æž\n", + "- é¢å‘对象å°è£…:\n", + " - 使用 TextAnalyzer 类将所有功能å°è£…为一个对象,数æ®ï¼ˆå¦‚ word_count)和方法(如 tokenize)绑定在一起。\n", + " - 通过 __init__ æä¾›é…置(如 data_dir å’Œ top_n),æé«˜çµæ´»æ€§ã€‚\n", + " \n", + "- 模å—化:类方法分工明确(如 read_fileã€tokenizeã€process_file),便于扩展。\n", + "- å·¥ç¨‹è´¨é‡æå‡ï¼š\n", + " - 坿‰©å±•性:å¯é€šè¿‡ç»§æ‰¿ 
TextAnalyzer 添加新功能(如支æŒå…¶ä»–分è¯å™¨æˆ–文件格å¼ï¼‰ã€‚\n", + " - å¤ç”¨æ€§ï¼šç±»å¯å®žä¾‹åŒ–多次,用于ä¸åŒç›®å½•æˆ–å‚æ•°ã€‚\n", + " - å¯ç»´æŠ¤æ€§ï¼šé€»è¾‘集中在类中,修改相对安全。" + ] + }, + { + "cell_type": "markdown", + "id": "9b4e17c4-f47e-4245-b3d9-e40fde0a2e04", + "metadata": {}, + "source": [ + "# 第四部分:引入文件模å—å°è£…\n", + "将代ç è¿›ä¸€æ­¥æ¨¡å—化到ä¸åŒæ–‡ä»¶ï¼Œå¼•å…¥é…置文件和åœç”¨è¯è¿‡æ»¤ã€‚" + ] + }, + { + "cell_type": "raw", + "id": "aadb5aea-8cc5-4a0f-9f5b-7eab28e90f1a", + "metadata": {}, + "source": [ + "目录结构\n", + "\n", + "project/\n", + "├── data/ # å°è¯´æ–‡æœ¬ç›®å½•\n", + "├── config.yaml # é…置文件\n", + "├── stop_words.txt # åœç”¨è¯æ–‡ä»¶\n", + "├── text_analyzer.py # åˆ†æžæ¨¡å—\n", + "├── main.py # 主程åº" + ] + }, + { + "cell_type": "raw", + "id": "2de4767b-8928-4f3f-8c8b-3c3cba2bc98a", + "metadata": {}, + "source": [ + "# config.yaml\n", + "\n", + "data_dir: data\n", + "top_n: 10\n", + "stop_words_file: stop_words.txt\n", + "output_file: output.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b442d61-c937-4757-b7b4-b6fc047c3529", + "metadata": {}, + "outputs": [], + "source": [ + "# text_analyzer.py\n", + "\n", + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "import yaml\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.word_count = Counter()\n", + " self.stop_words = self.load_stop_words()\n", + "\n", + " def load_stop_words(self):\n", + " \"\"\"加载åœç”¨è¯\"\"\"\n", + " try:\n", + " with open(self.stop_words_file, 'r', encoding='utf-8') as f:\n", + " return set(line.strip() for line in f if line.strip())\n", + " except Exception as e:\n", + " print(f\"Error loading stop words: {e}\")\n", + " return set()\n", + "\n", + " def read_file(self, file_path):\n", + " \"\"\"è¯»å–æ–‡ä»¶å†…容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + " def tokenize(self, text):\n", + " \"\"\"中文分è¯å¹¶è¿‡æ»¤åœç”¨è¯\"\"\"\n", + " words = jieba.lcut(text)\n", + " return [word for word in words if word not in self.stop_words]\n", + "\n", + " def process_file(self, file_path):\n", + " \"\"\"处ç†å•个文件\"\"\"\n", + " if file_path.endswith('.txt'):\n", + " text = self.read_file(file_path)\n", + " words = self.tokenize(text)\n", + " self.word_count.update(words)\n", + "\n", + " def process_directory(self):\n", + " \"\"\"处ç†ç›®å½•下所有文件\"\"\"\n", + " for file in os.listdir(self.data_dir):\n", + " file_path = os.path.join(self.data_dir, file)\n", + " self.process_file(file_path)\n", + "\n", + " def get_top_words(self):\n", + " \"\"\"获å–å‰ N 高频è¯\"\"\"\n", + " return self.word_count.most_common(self.top_n)\n", + "\n", + " def save_results(self, top_words):\n", + " \"\"\"ä¿å­˜ç»“果到文件\"\"\"\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + " def run(self):\n", + " \"\"\"执行è¯é¢‘统计并ä¿å­˜ç»“æžœ\"\"\"\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " self.save_results(top_words)\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")" + ] + }, + 
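{ + "cell_type": "raw", + "metadata": {}, + "source": [ + "text_analyzer.py loads stop words one per line via line.strip(). A minimal illustrative stop_words.txt to place beside config.yaml -- the entries below are placeholder examples of common Chinese function words, not a recommended list:\n", + "\n", + "的\n", + "了\n", + "是\n", + "在\n", + "和\n", + "我" + ] + }, + 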
{ + "cell_type": "code", + "execution_count": null, + "id": "22f58992-0108-4c90-894d-e756e7301a5a", + "metadata": {}, + "outputs": [], + "source": [ + "# main.py\n", + "\n", + "from text_analyzer import TextAnalyzer\n", + "\n", + "def main():\n", + " analyzer = TextAnalyzer()\n", + " analyzer.run()\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "id": "18d27410-8923-4662-a6b7-8e027609506e", + "metadata": {}, + "source": [ + "## 改进分æž\n", + "\n", + "- 模å—化:将分æžé€»è¾‘放入 text_analyzer.pyï¼Œä¸»ç¨‹åº main.py 仅负责调用,符åˆå·¥ç¨‹åŒ–项目结构。\n", + "- é…置文件:通过 config.yaml é…ç½®å‚æ•°ï¼Œå¢žå¼ºçµæ´»æ€§ï¼Œæ— éœ€ä¿®æ”¹ä»£ç å³å¯æ›´æ”¹ç›®å½•ã€è¾“出文件等。\n", + "- 输出到文件:增加 save_results 方法,支æŒç»“æžœæŒä¹…化。\n", + "- å·¥ç¨‹è´¨é‡æå‡ï¼š\n", + " - å¯ç»´æŠ¤æ€§ï¼šé…置文件和模å—化分离了é…置与逻辑,修改é…置无需动代ç ã€‚ \n", + " - å¤ç”¨æ€§ï¼šæ¨¡å—å¯å¯¼å…¥åˆ°å…¶ä»–项目,类å¯é‡å¤å®žä¾‹åŒ–。" + ] + }, + { + "cell_type": "markdown", + "id": "10876929-69f9-43bf-ba2d-a5d7bb11f22b", + "metadata": {}, + "source": [ + "### å°è£…的总节\n", + "\n", + "å°è£…方法:\n", + "- 模å—化:函数划分逻辑,é™ä½Žè€¦åˆã€‚\n", + "- 函数å°è£…:将é‡å¤é€»è¾‘å°è£…为函数,æé«˜å¤ç”¨æ€§ã€‚\n", + "- ç±»å°è£…:将数æ®å’Œæ–¹æ³•绑定,增强代ç ç»„织性和扩展性。\n", + "- 文件å°è£…:通过文件模å—化,符åˆå·¥ç¨‹åŒ–å¼€å‘规范。\n", + "\n", + "å·¥ç¨‹è´¨é‡æå‡ï¼š\n", + "- 分离é…置与逻辑,é™ä½Žç»´æŠ¤æˆæœ¬ã€‚\n", + "- 模å—化和é¢å‘对象设计支æŒåŠŸèƒ½æ‰©å±•ã€‚\n", + "- é”™è¯¯å¤„ç†æé«˜ç¨‹åºé²æ£’性。" + ] + }, + { + "cell_type": "raw", + "id": "60ba30d8-d8c2-4183-996e-376ff71716bf", + "metadata": {}, + "source": [ + "## å¦å¤–ä¸€ç§æ–‡ä»¶æ¨¡å—化设计(分层架构)示例\n", + "\n", + "å°†ä»£ç æ‹†åˆ†ä¸ºç‹¬ç«‹æ¨¡å—,æ¯ä¸ªæ¨¡å—ä»…è´Ÿè´£å•一èŒè´£ï¼š\n", + " - æ•°æ®è¯»å–层:é历目录ã€è¯»å–文件内容\n", + " - æ•°æ®å¤„ç†å±‚:文本清洗ã€åˆ†è¯ã€ç»Ÿè®¡è¯é¢‘\n", + " - 结果输出层:排åºå¹¶è¾“出å‰10高频è¯\n", + "\n", + "目录结构:\n", + "project/\n", + "├── data_loader.py # æ•°æ®è¯»å–模å—\n", + "├── text_processor.py # æ•°æ®å¤„ç†æ¨¡å—\n", + "├── output_handler.py # 结果输出模å—\n", + "└── main.py # 主程åºå…¥å£" + ] + }, + { + "cell_type": "markdown", + "id": "517759ac-c4cf-402e-86f1-a9fae0d88bbb", + "metadata": {}, + "source": [ + "# 第七部分:è¿è¡Œè¯´æ˜Ž\n", + "\n", + "环境准备:\n", + "- 安装 Python 3.8+。\n", + "- 安装ä¾èµ–:pip install jieba pyyaml。\n", + "- 准备 data 目录,放入 100 个 txt 文件。\n", + "- 创建 stop_words.txt å’Œ config.yaml。" + ] + }, + { + "cell_type": "markdown", + "id": "a7e1836b-42a1-45f9-bf8c-2e04a38744e4", + "metadata": {}, + "source": [ + "通过从无结构到结构化,å†åˆ°é¢å‘对象和模å—åŒ–çš„é€æ­¥ä¼˜åŒ–,展示了结构化编程和å°è£…方法如何显著æå‡ä»£ç å·¥ç¨‹è´¨é‡ã€‚最终实现ä¸ä»…满足了è¯é¢‘统计需求,还具备高å¯è¯»æ€§ã€å¯ç»´æŠ¤æ€§ã€å¯æ‰©å±•性和å¤ç”¨æ€§ï¼Œé€‚åˆå®žé™…工程应用。" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/D Plus/01 特殊执行方å¼çš„语言特性.ipynb.ipynb b/D Plus/01 特殊执行方å¼çš„语言特性.ipynb.ipynb new file mode 100644 index 0000000..65b84ab --- /dev/null +++ b/D Plus/01 特殊执行方å¼çš„语言特性.ipynb.ipynb @@ -0,0 +1,312 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "86405617-889a-40c2-a895-7b51fb14b65d", + "metadata": {}, + "source": [ + "# 教学目标\n", + "\n", + "- 在è¯é¢‘统计案例中引入装饰器和函数å¼ç¼–程 。\n", + "- 分æžè¿™äº›ç‰¹æ€§å’Œæ¨¡å¼å¦‚何进一步优化代ç è´¨é‡ï¼ˆå¯è¯»æ€§ã€å¯ç»´æŠ¤æ€§ã€å¯æ‰©å±•性ã€å¤ç”¨æ€§ï¼‰ã€‚\n", + "- æŽ¢è®¨é«˜çº§ç‰¹æ€§åœ¨æ¡ˆä¾‹ä¸­çš„é€‚ç”¨æ€§ä¸Žå±€é™æ€§ã€‚" + ] + }, + 
{ + "cell_type": "markdown", + "id": "e6a6a633-d3af-4778-815c-4490dff5f624", + "metadata": {}, + "source": [ + "## 第一部分:引入装饰器\n", + "\n", + "装饰器å¯ç”¨äºŽåœ¨ä¸ä¿®æ”¹å‡½æ•°ä»£ç çš„æƒ…å†µä¸‹æ·»åŠ åŠŸèƒ½ã€‚é€‚åˆæ—¥å¿—è®°å½•ã€æ€§èƒ½åˆ†æžã€é”™è¯¯å¤„ç†ç­‰åœºæ™¯ã€‚" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a5c7d69-d445-4a9c-bb48-7fde0a36c646", + "metadata": {}, + "outputs": [], + "source": [ + "# 为 TextAnalyzer 类添加一个装饰器,用于记录方法执行时间。\n", + "\n", + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "import yaml\n", + "import time\n", + "import functools\n", + "\n", + "def timing_decorator(func):\n", + " \"\"\"装饰器:记录函数执行时间\"\"\"\n", + " @functools.wraps(func)\n", + " def wrapper(*args, **kwargs):\n", + " start_time = time.time()\n", + " result = func(*args, **kwargs)\n", + " end_time = time.time()\n", + " print(f\"{func.__name__} took {end_time - start_time:.4f} seconds\")\n", + " return result\n", + " return wrapper\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.word_count = Counter()\n", + " self.stop_words = self.load_stop_words()\n", + "\n", + " def load_stop_words(self):\n", + " \"\"\"加载åœç”¨è¯\"\"\"\n", + " try:\n", + " with open(self.stop_words_file, 'r', encoding='utf-8') as f:\n", + " return set(line.strip() for line in f if line.strip())\n", + " except Exception as e:\n", + " print(f\"Error loading stop words: {e}\")\n", + " return set()\n", + "\n", + " @timing_decorator\n", + " def read_file(self, file_path):\n", + " \"\"\"è¯»å–æ–‡ä»¶å†…容\"\"\"\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " return f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " return \"\"\n", + "\n", + " @timing_decorator\n", + " def tokenize(self, text):\n", + " \"\"\"中文分è¯å¹¶è¿‡æ»¤åœç”¨è¯\"\"\"\n", + " words = jieba.lcut(text)\n", + " return [word for word in words if word not in self.stop_words]\n", + "\n", + " def process_file(self, file_path):\n", + " \"\"\"处ç†å•个文件\"\"\"\n", + " if file_path.endswith('.txt'):\n", + " text = self.read_file(file_path)\n", + " words = self.tokenize(text)\n", + " self.word_count.update(words)\n", + "\n", + " def process_directory(self):\n", + " \"\"\"处ç†ç›®å½•下所有文件\"\"\"\n", + " for file in os.listdir(self.data_dir):\n", + " file_path = os.path.join(self.data_dir, file)\n", + " self.process_file(file_path)\n", + "\n", + " def get_top_words(self):\n", + " \"\"\"获å–å‰ N 高频è¯\"\"\"\n", + " return self.word_count.most_common(self.top_n)\n", + "\n", + " def save_results(self, top_words):\n", + " \"\"\"ä¿å­˜ç»“果到文件\"\"\"\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + " def run(self):\n", + " \"\"\"执行è¯é¢‘统计并ä¿å­˜ç»“æžœ\"\"\"\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " self.save_results(top_words)\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4dcabfd9-b8f9-4796-a060-9d9f6689e92f", + "metadata": {}, + "source": [ + "### 装饰器分æž\n", + "\n", + "功能:timing_decorator 记录 read_file å’Œ tokenize 
方法的执行时间,帮助分析性能瓶颈(如分词耗时较长)。\n",
+    "\n",
+    "工程质量提升:\n",
+    " - 可维护性:无需修改原方法代码即可添加性能监控,符合开闭原则,维护更方便。\n",
+    " - 可读性:装饰器将性能监控逻辑与业务逻辑分离,代码更清晰。\n",
+    " - 复用性:timing_decorator 可复用于其他方法或项目。\n",
+    "\n",
+    "局限性:装饰器增加少量性能开销,需谨慎用于高频调用的函数。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8fcbe48d-de8f-4387-9be3-f05f88553029",
+   "metadata": {},
+   "source": [
+    "## 第二部分:引入函数式编程\n",
+    "\n",
+    "函数式编程(如高阶函数、lambda、map/reduce)强调无变量污染、数据转换的简洁性。在词频统计案例中,函数式编程可用于:\n",
+    "- 数据处理:使用 map 和 filter 处理文件和单词。\n",
+    "- 词频统计:使用 reduce 合并词频。\n",
+    "- 管道式处理:通过函数组合实现数据流处理。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1a6970b2-7488-43e3-ae9f-0174ff9b4b57",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 函数式处理文件和词频\n",
+    "\n",
+    "import os\n",
+    "import jieba\n",
+    "from collections import Counter\n",
+    "import yaml\n",
+    "from functools import reduce\n",
+    "from typing import List, Tuple\n",
+    "\n",
+    "def timing_decorator(func):\n",
+    "    \"\"\"装饰器:记录函数执行时间\"\"\"\n",
+    "    import time\n",
+    "    import functools\n",
+    "    @functools.wraps(func)\n",
+    "    def wrapper(*args, **kwargs):\n",
+    "        start_time = time.time()\n",
+    "        result = func(*args, **kwargs)\n",
+    "        end_time = time.time()\n",
+    "        print(f\"{func.__name__} took {end_time - start_time:.4f} seconds\")\n",
+    "        return result\n",
+    "    return wrapper\n",
+    "\n",
+    "class TextAnalyzer:\n",
+    "    def __init__(self, config_path='config.yaml'):\n",
+    "        with open(config_path, 'r', encoding='utf-8') as f:\n",
+    "            config = yaml.safe_load(f)\n",
+    "        self.data_dir = config['data_dir']\n",
+    "        self.top_n = config['top_n']\n",
+    "        self.stop_words_file = config['stop_words_file']\n",
+    "        self.output_file = config['output_file']\n",
+    "        self.stop_words = self.load_stop_words()\n",
+    "\n",
+    "    def load_stop_words(self) -> set:\n",
+    "        \"\"\"加载停用词\"\"\"\n",
+    "        try:\n",
+    "            with open(self.stop_words_file, 'r', encoding='utf-8') as f:\n",
+    "                return set(line.strip() for line in f if line.strip())\n",
+    "        except Exception as e:\n",
+    "            print(f\"Error loading stop words: {e}\")\n",
+    "            return set()\n",
+    "\n",
+    "    @timing_decorator\n",
+    "    def read_file(self, file_path: str) -> str:\n",
+    "        \"\"\"读取文件内容\"\"\"\n",
+    "        try:\n",
+    "            with open(file_path, 'r', encoding='utf-8') as f:\n",
+    "                return f.read()\n",
+    "        except Exception as e:\n",
+    "            print(f\"Error reading {file_path}: {e}\")\n",
+    "            return \"\"\n",
+    "\n",
+    "    def tokenize(self, text: str) -> List[str]:\n",
+    "        \"\"\"中文分词并过滤停用词(函数式)\"\"\"\n",
+    "        return list(filter(lambda w: w not in self.stop_words, jieba.lcut(text)))\n",
+    "\n",
+    "    def process_file(self, file_path: str) -> Counter:\n",
+    "        \"\"\"处理单个文件,返回词频 Counter\"\"\"\n",
+    "        if file_path.endswith('.txt'):\n",
+    "            text = self.read_file(file_path)\n",
+    "            words = self.tokenize(text)\n",
+    "            return Counter(words)\n",
+    "        return Counter()\n",
+    "\n",
+    "    def process_directory(self) -> Counter:\n",
+    "        \"\"\"处理目录下所有文件(函数式)\"\"\"\n",
+    "        file_paths = (os.path.join(self.data_dir, f) for f in os.listdir(self.data_dir))\n",
+    "        counters = map(self.process_file, file_paths)\n",
+    "        return reduce(lambda c1, c2: c1 + c2, counters, Counter())\n",
+    "\n",
+    "    def get_top_words(self, 
word_count: Counter) -> List[Tuple[str, int]]:\n",
+    "        \"\"\"获取前 N 高频词\"\"\"\n",
+    "        return word_count.most_common(self.top_n)\n",
+    "\n",
+    "    def save_results(self, top_words: List[Tuple[str, int]]):\n",
+    "        \"\"\"保存结果到文件\"\"\"\n",
+    "        with open(self.output_file, 'w', encoding='utf-8') as f:\n",
+    "            for word, count in top_words:\n",
+    "                f.write(f\"{word}: {count}\\n\")\n",
+    "\n",
+    "    def run(self):\n",
+    "        \"\"\"执行词频统计并保存结果\"\"\"\n",
+    "        word_count = self.process_directory()\n",
+    "        top_words = self.get_top_words(word_count)\n",
+    "        self.save_results(top_words)\n",
+    "        for word, count in top_words:\n",
+    "            print(f\"{word}: {count}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6ce3b7c3-f099-4e2c-b415-18b0e3ab492a",
+   "metadata": {},
+   "source": [
+    "### 函数式编程分析\n",
+    "\n",
+    "改进:\n",
+    "- map:在 process_directory 中,使用 map(self.process_file, file_paths) 逐个处理文件路径,惰性生成各文件的词频 Counter。\n",
+    "- reduce:使用 reduce(lambda c1, c2: c1 + c2, counters, Counter()) 合并所有文件的词频,简洁且无副作用。\n",
+    "- filter:在 tokenize 中,使用 filter(lambda w: w not in self.stop_words, ...) 过滤停用词,替代列表推导式。\n",
+    "- 生成器:file_paths 使用生成器表达式,减少内存占用。\n",
+    "\n",
+    "工程质量提升:\n",
+    "- 可读性:函数式编程使数据处理逻辑更简洁紧凑,管道式处理清晰表达数据流(文件路径 -> 词频 -> 合并)。\n",
+    "- 性能:生成器和惰性求值优化内存使用,适合处理大量文件;如需进一步提速,可在此基础上再引入并发处理。\n",
+    "- 可维护性:函数式代码无副作用,易于测试和调试。\n",
+    "\n",
+    "适用场景:数据转换与数据流处理(如文件读取、词频合并)、无状态操作。\n",
+    "\n",
+    "局限性:\n",
+    "- 函数式代码对初学者可能不够直观,需熟悉 map、reduce 等概念。\n",
+    "- 对于复杂逻辑,函数式编程可能增加调试难度。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "458e18ec-b536-4860-9e12-d0bf5ed9d876",
+   "metadata": {},
+   "source": [
+    "# 练习\n",
+    "\n",
+    "实践练习:\n",
+    "- 添加日志装饰器,记录每次文件处理的详细信息。\n",
+    "- 使用 functools.reduce 重写 get_top_words,尝试不同排序逻辑。\n",
+    "\n",
+    "扩展任务:\n",
+    "- 添加缓存装饰器,避免重复分词相同文件。\n",
+    "- 实现函数式管道,将文件读取、分词、统计串联为单一流。"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/D Plus/02 设计模å¼.ipynb b/D Plus/02 设计模å¼.ipynb
new file mode 100644
index 0000000..8f14f9e
--- /dev/null
+++ b/D Plus/02 设计模å¼.ipynb
@@ -0,0 +1,493 @@
+{
+ "cells": [
+  {
+   "cell_type": "raw",
+   "id": "eccfe49f-de35-4241-90e3-a7095940b61a",
+   "metadata": {},
+   "source": [
+    "设计模式为高频重复出现的需求提供成熟的解决方案。以下介绍适合词频统计案例的设计模式:策略模式、观察者模式、工厂模式。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c186171f-d1f2-433e-a3eb-b266e2909a2c",
+   "metadata": {},
+   "source": [
+    "## 策略模式(动态选择分词策略)\n",
+    "\n",
+    "策略模式允许动态切换算法(如分词器),实现上比元编程简单。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "97c865cb-0b5a-4fa1-aa74-5ba2e65e7436",
+   "metadata": {},
+ "outputs": [], + "source": [ + "from abc import ABC, abstractmethod\n", + "\n", + "class Tokenizer(ABC):\n", + " \"\"\"分è¯å™¨æŽ¥å£\"\"\"\n", + " @abstractmethod\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " pass\n", + "\n", + "class JiebaTokenizer(Tokenizer):\n", + " \"\"\"jieba 分è¯å™¨\"\"\"\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " return [w for w in jieba.lcut(text) if w not in stop_words]\n", + "\n", + "class SimpleTokenizer(Tokenizer):\n", + " \"\"\"简å•分è¯å™¨\"\"\"\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " return [w for w in text.split() if w not in stop_words]\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " # 动æ€é€‰æ‹©åˆ†è¯å™¨\n", + " tokenizer_name = config.get('tokenizer', 'jieba')\n", + " self.tokenizer = {'jieba': JiebaTokenizer(), 'simple': SimpleTokenizer()}[tokenizer_name]\n", + "\n", + " def tokenize(self, text: str) -> List[str]:\n", + " \"\"\"使用策略分è¯\"\"\"\n", + " return self.tokenizer.tokenize(text, self.stop_words)\n", + "\n", + " # 其余方法åŒä¸Š" + ] + }, + { + "cell_type": "markdown", + "id": "5435ebc3-d3b0-4475-8bd5-cb45fb51638c", + "metadata": {}, + "source": [ + "å·¥ç¨‹è´¨é‡æå‡ï¼š\n", + "- 坿‰©å±•性:添加新分è¯å™¨åªéœ€å®žçް Tokenizer 接å£ã€‚\n", + "- å¯ç»´æŠ¤æ€§ï¼šåˆ†è¯é€»è¾‘与主类分离,修改更独立。\n", + "\n", + "适用场景:适åˆéœ€è¦åЍæ€åˆ‡æ¢ç®—法的场景。" + ] + }, + { + "cell_type": "markdown", + "id": "fbf53455-558c-40fb-8718-446dec989b5d", + "metadata": {}, + "source": [ + "## 观察者模å¼ï¼ˆç»“果输出解耦)\n", + "\n", + "观察者模å¼å¯ç”¨äºŽè§£è€¦ç»“果输出逻辑(如打å°ã€ä¿å­˜æ–‡ä»¶ã€å‘é€é€šçŸ¥ï¼‰ã€‚" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7a2bd4c-df73-4800-b45b-9b6c73d28d7b", + "metadata": {}, + "outputs": [], + "source": [ + "class OutputObserver(ABC):\n", + " \"\"\"输出观察者接å£\"\"\"\n", + " @abstractmethod\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " pass\n", + "\n", + "class ConsoleOutput(OutputObserver):\n", + " \"\"\"控制å°è¾“出\"\"\"\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")\n", + "\n", + "class FileOutput(OutputObserver):\n", + " \"\"\"文件输出\"\"\"\n", + " def __init__(self, output_file: str):\n", + " self.output_file = output_file\n", + "\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " self.observers = [ConsoleOutput(), FileOutput(self.output_file)]\n", + "\n", + " def add_observer(self, observer: OutputObserver):\n", + " \"\"\"添加观察者\"\"\"\n", + " 
self.observers.append(observer)\n", + "\n", + " def notify_observers(self, top_words: List[Tuple[str, int]]):\n", + " \"\"\"通知所有观察者\"\"\"\n", + " for observer in self.observers:\n", + " observer.update(top_words)\n", + "\n", + " def run(self):\n", + " \"\"\"执行è¯é¢‘统计并通知观察者\"\"\"\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " self.notify_observers(top_words)\n", + "\n", + " # 其余方法åŒä¸Š" + ] + }, + { + "cell_type": "markdown", + "id": "02b5cfba-431c-4a01-a454-099e4f41922c", + "metadata": {}, + "source": [ + "### 分æž\n", + "\n", + "å·¥ç¨‹è´¨é‡æå‡ï¼š\n", + " - 坿‰©å±•性:添加新输出方å¼åªéœ€å®žçް OutputObserver 接å£ã€‚\n", + " - 解耦性:输出逻辑与统计逻辑分离,修改输出ä¸å½±å“核心功能。\n", + "\n", + "适用场景:适åˆéœ€è¦å¤šç§è¾“出或通知的场景。\n", + "\n", + "局陿€§ï¼šè§‚察者模å¼å¢žåР代ç å¤æ‚性,适åˆå¤æ‚输出需求。" + ] + }, + { + "cell_type": "markdown", + "id": "11669305-8cd5-4317-afd5-e85c3f0a5a81", + "metadata": {}, + "source": [ + "## 工厂模å¼ï¼ˆåЍæ€åˆ›å»ºåˆ†è¯å™¨ï¼‰\n", + "\n", + "工厂模å¼å¯ç”¨äºŽåЍæ€åˆ›å»ºåˆ†è¯å™¨ï¼Œç®€åŒ–策略模å¼ä¸­çš„åˆå§‹åŒ–逻辑。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fa50633-de22-40c8-912d-3ded5ebcedfc", + "metadata": {}, + "outputs": [], + "source": [ + "class TokenizerFactory:\n", + " \"\"\"分è¯å™¨å·¥åŽ‚\"\"\"\n", + " @staticmethod\n", + " def create_tokenizer(name: str) -> Tokenizer:\n", + " tokenizers = {\n", + " 'jieba': JiebaTokenizer(),\n", + " 'simple': SimpleTokenizer()\n", + " }\n", + " return tokenizers.get(name, JiebaTokenizer())\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " self.tokenizer = TokenizerFactory.create_tokenizer(config.get('tokenizer', 'jieba'))\n", + "\n", + " # 其余方法åŒä¸Š" + ] + }, + { + "cell_type": "markdown", + "id": "a4db7046-dfe2-4bd8-81d1-49a42e2eeb5c", + "metadata": {}, + "source": [ + "### 分æž\n", + "\n", + "å·¥ç¨‹è´¨é‡æå‡ï¼š\n", + " - å¯ç»´æŠ¤æ€§ï¼šåˆ†è¯å™¨åˆ›å»ºé€»è¾‘集中于工厂,易于修改。\n", + " - 坿‰©å±•性:添加新分è¯å™¨åªéœ€æ›´æ–°å·¥åŽ‚æ–¹æ³•ã€‚\n", + "\n", + "适用场景:适åˆéœ€è¦åЍæ€åˆ›å»ºå¯¹è±¡çš„场景。\n", + "\n", + "局陿€§ï¼šå¯¹äºŽç®€å•场景,工厂模å¼å¯èƒ½ç•¥æ˜¾å†—余。" + ] + }, + { + "cell_type": "markdown", + "id": "e5f2aef4-a055-43a9-917c-fa183de6db2d", + "metadata": {}, + "source": [ + "## 综åˆå®žçŽ°ï¼ˆæ•´åˆç‰¹æ€§ä¸Žæ¨¡å¼ï¼‰\n", + "\n", + "æ•´åˆä¸Šä¸‹æ–‡ç®¡ç†å™¨ã€ç”Ÿæˆå™¨ã€ç­–略模å¼å’Œè§‚察者模å¼çš„æœ€ç»ˆå®žçŽ°ï¼ˆéƒ¨åˆ†ä»£ç å±•示)。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa7f34e2-d355-4a22-8572-729c49b18605", + "metadata": {}, + "outputs": [], + "source": [ + "# text_analyzer.py\n", + "\n", + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "import yaml\n", + "from contextlib import contextmanager\n", + "from typing import List, Tuple\n", + "from abc import ABC, abstractmethod\n", + "\n", + "@contextmanager\n", + "def file_reader(file_path: str):\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " yield f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " yield \"\"\n", + "\n", + "class Tokenizer(ABC):\n", + " @abstractmethod\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " pass\n", + "\n", + "class 
JiebaTokenizer(Tokenizer):\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " for word in jieba.lcut(text):\n", + " if word not in stop_words:\n", + " yield word\n", + "\n", + "class SimpleTokenizer(Tokenizer):\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " for word in text.split():\n", + " if word not in stop_words:\n", + " yield word\n", + "\n", + "class TokenizerFactory:\n", + " @staticmethod\n", + " def create_tokenizer(name: str) -> Tokenizer:\n", + " return {'jieba': JiebaTokenizer(), 'simple': SimpleTokenizer()}.get(name, JiebaTokenizer())\n", + "\n", + "class OutputObserver(ABC):\n", + " @abstractmethod\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " pass\n", + "\n", + "class ConsoleOutput(OutputObserver):\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")\n", + "\n", + "class FileOutput(OutputObserver):\n", + " def __init__(self, output_file: str):\n", + " self.output_file = output_file\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " self.tokenizer = TokenizerFactory.create_tokenizer(config.get('tokenizer', 'jieba'))\n", + " self.observers = [ConsoleOutput(), FileOutput(self.output_file)]\n", + "\n", + " def load_stop_words(self) -> set:\n", + " with file_reader(self.stop_words_file) as content:\n", + " return set(line.strip() for line in content.splitlines() if line.strip())\n", + "\n", + " def process_file(self, file_path: str):\n", + " if file_path.endswith('.txt'):\n", + " with file_reader(file_path) as text:\n", + " words = self.tokenizer.tokenize(text, self.stop_words)\n", + " self.word_count.update(words)\n", + "\n", + " def process_directory(self):\n", + " for file in os.listdir(self.data_dir):\n", + " file_path = os.path.join(self.data_dir, file)\n", + " self.process_file(file_path)\n", + "\n", + " def get_top_words(self) -> List[Tuple[str, int]]:\n", + " return self.word_count.most_common(self.top_n)\n", + "\n", + " def notify_observers(self, top_words: List[Tuple[str, int]]):\n", + " for observer in self.observers:\n", + " observer.update(top_words)\n", + "\n", + " def run(self):\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " self.notify_observers(top_words)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d130312-b298-4c76-ae09-0fb4bd08b0c1", + "metadata": {}, + "outputs": [], + "source": [ + "# main.py\n", + "\n", + "from text_analyzer import TextAnalyzer\n", + "\n", + "def main():\n", + " analyzer = TextAnalyzer()\n", + " analyzer.run()\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "id": "770618c9-428e-454a-97de-00e3b49c9d03", + "metadata": {}, + "source": [ + "## 结论\n", + "\n", + "通过引入上下文管ç†å™¨ã€ç”Ÿæˆå™¨ã€å…ƒç¼–程ã€ç­–略模å¼ã€è§‚察者模å¼å’Œå·¥åŽ‚æ¨¡å¼ï¼Œè¯é¢‘统计代ç 
在可扩展性、可维护性和复用性上进一步提升。\n",
+    "这些特性和模式使代码更模块化、灵活,适合大型项目,同时保持清晰的工程结构。结合之前的装饰器和函数式编程,代码已达到较高的工程化水平。\n",
+    "\n",
+    "若需深入,可以进一步考虑其它性能特性。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cbeaa07d-272f-465b-a437-9c4b44827d23",
+   "metadata": {},
+   "source": [
+    "## 进一步练习\n",
+    "\n",
+    "实践练习:\n",
+    "- 实现新分词器(如 thulac)并通过策略模式或工厂模式集成。\n",
+    "- 添加新观察者(如 JSON 输出)。\n",
+    "- 使用生成器实现流式词频统计,比较内存占用。\n",
+    "- 实现缓存机制,缓存已处理文件的分词结果。\n",
+    "- 添加命令行接口(argparse),动态配置 top_n 和 tokenizer。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6a43b53d-1e07-4ebe-a6c8-104353fd5f7b",
+   "metadata": {},
+   "source": [
+    "## 附:元编程\n",
+    "\n",
+    "元编程允许动态修改类或函数行为,可用于动态配置分词器或输出格式。案例中,可通过元编程动态注册分词器。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4394008c-88da-44bd-aa0d-f1b7a6dbc7d6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def register_tokenizer(name):\n",
+    "    \"\"\"装饰器:为分词方法标记注册名\"\"\"\n",
+    "    def decorator(func):\n",
+    "        func._tokenizer_name = name\n",
+    "        return func\n",
+    "    return decorator\n",
+    "\n",
+    "class TokenizerRegistry(type):\n",
+    "    \"\"\"元类:收集被标记的方法,动态注册分词器\"\"\"\n",
+    "    def __new__(mcs, cls_name, bases, namespace):\n",
+    "        cls = super().__new__(mcs, cls_name, bases, namespace)\n",
+    "        cls.tokenizers = {}\n",
+    "        for attr in namespace.values():\n",
+    "            tokenizer_name = getattr(attr, '_tokenizer_name', None)\n",
+    "            if tokenizer_name:\n",
+    "                cls.tokenizers[tokenizer_name] = attr\n",
+    "        return cls\n",
+    "\n",
+    "class TextAnalyzer(metaclass=TokenizerRegistry):\n",
+    "    def __init__(self, config_path='config.yaml'):\n",
+    "        with open(config_path, 'r', encoding='utf-8') as f:\n",
+    "            config = yaml.safe_load(f)\n",
+    "        self.data_dir = config['data_dir']\n",
+    "        self.top_n = config['top_n']\n",
+    "        self.stop_words_file = config['stop_words_file']\n",
+    "        self.output_file = config['output_file']\n",
+    "        self.stop_words = self.load_stop_words()\n",
+    "        self.word_count = Counter()\n",
+    "        self.tokenizer_name = config.get('tokenizer', 'jieba')  # 从配置读取分词器\n",
+    "\n",
+    "    def tokenize(self, text: str) -> List[str]:\n",
+    "        \"\"\"按配置动态调用已注册的分词器\"\"\"\n",
+    "        tokenizer = self.tokenizers.get(self.tokenizer_name, self.tokenizers['jieba'])\n",
+    "        return tokenizer(self, text)\n",
+    "\n",
+    "    @register_tokenizer('jieba')\n",
+    "    def jieba_tokenizer(self, text: str) -> List[str]:\n",
+    "        \"\"\"jieba 分词\"\"\"\n",
+    "        return [w for w in jieba.lcut(text) if w not in self.stop_words]\n",
+    "\n",
+    "    @register_tokenizer('simple')\n",
+    "    def simple_tokenizer(self, text: str) -> List[str]:\n",
+    "        \"\"\"简单分词(按空格)\"\"\"\n",
+    "        return [w for w in text.split() if w not in self.stop_words]\n",
+    "\n",
+    "    # 其余方法(load_stop_words, process_file, etc.)同上"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2249f13a-7a3f-4376-ba2a-d92f11658d32",
+   "metadata": {},
+   "source": [
+    "### 分析\n",
+    "\n",
+    "功能:通过模块级装饰器标记分词方法,由元类在创建类时收集并注册,支持按配置切换(如 jieba 或 simple)。\n",
+    "\n",
+    "工程质量提升:\n",
+    "- 可扩展性:新分词器只需添加新方法并注册,无需修改核心逻辑。\n",
+    "- 灵活性:通过配置文件动态选择分词器。\n",
+    "\n",
+    "适用场景:适合需要动态配置或插件化系统的场景。\n",
+    "\n",
+    "局限性:元编程增加代码复杂性,可能降低可读性,需谨慎使用。"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/readme.MD b/readme.MD index 14627ac..ba2d822 100644 --- a/readme.MD +++ b/readme.MD @@ -18,15 +18,16 @@ C é«˜æ€§èƒ½æ¨¡å¼ å¯èƒ½çš„动机 〠效率 】 -æ—¶é—´å¿« -内存å ç”¨å°‘ +- 执行快 +- 内存å ç”¨å°‘ 〠软件工程 】 -å¯è¯»æ€§å¼º -å¯å¤ç”¨é«˜ -类型安全 -å•元测试方便 +- å¯è¯»æ€§å¼º +- å¯å¤ç”¨é«˜ +- 类型安全 +- å•元测试方便 ã€å¯é æ€§ã€‘ -å¹¶å‘ã€çº¿ç¨‹å®‰å…¨ +- å¹¶å‘ã€çº¿ç¨‹å®‰å…¨ + ''' \ No newline at end of file