From d727f0cba282d4b97e84b6eadbe9944b57bc07a3 Mon Sep 17 00:00:00 2001 From: zj3D Date: Wed, 21 May 2025 16:56:29 +0800 Subject: [PATCH] 0521 --- .ipynb_checkpoints/readme-checkpoint.MD | 14 + .../3 Hacker-checkpoint.py} | 0 .../1 最基础的写法.py | 0 .../2 使用一些函数.py} | 0 A 代码模式/10 跑起来了/3 Hacker.py | 15 + .../函数封装}/1 全局变量.py | 0 .../函数封装}/2 临时变量传递.py | 0 .../函数封装}/3 常见风格.py | 0 .../函数封装}/4 递归.py | 0 .../对象封装}/1 类对象.py | 0 .../对象封装}/2 字典对象.py | 0 .../流式调用/1 嵌套调用.py | 0 .../流式调用/2 方法bind.py | 0 .../流式调用/3 重载管道.py | 0 .../流式调用/4 类方法.py | 0 .../流式调用/5 类方法.py | 0 .../元类.py | 0 .../1 复用/函数调用复用.py | 0 .../1 复用/对象复用.py | 0 .../2 松耦合/1 观察者/1 观察者.py | 0 .../2 松耦合/1 观察者/2 观察者.py | 0 .../2 松耦合/1 观察者/3 注册回调.py | 0 .../2 松耦合/1 观察者/4 订阅发布.py | 0 .../2 松耦合/1 观察者/readme.MD | 0 .../2 消息链/1 只有消息接口.py | 0 .../2 松耦合/2 消息链/2 调用链.py | 0 .../2 松耦合/2 消息链/3 消息链.py | 0 .../2 松耦合/2 消息链/4 消息队列.py | 0 .../2 松耦合/3 微服务/client_app.py | 0 .../3 微服务/counter_service.py | 0 .../2 松耦合/3 微服务/sorter_service.py | 0 .../3 微服务/tokenizer_service.py | 0 .../2 松耦合/4 插件/config.ini | 0 .../2 松耦合/4 插件/plugin.py | 0 .../4 插件/plugins-src/buildingPyc.py | 0 .../2 松耦合/4 插件/plugins-src/f1.py | 0 .../2 松耦合/4 插件/plugins-src/f2.py | 0 .../2 松耦合/4 插件/plugins/f1.pyc | Bin .../2 松耦合/4 插件/plugins/f2.pyc | Bin .../3 类型申明/参数类型申明.py | 0 .../4 对象接口/tf-14A.py | 0 .../4 对象接口/tf-14B.py | 0 .../5 对象属性 .py | 0 .../03 多线程的应用场景.md | 1 - C 高性能模式/readme.md | 11 +- .../02 设计模式-checkpoint.ipynb | 493 ------------- .../99 工业级代码-checkpoint.ipynb | 665 ++++++++++++++++++ D Plus/00 封装.ipynb | 478 ------------- ...装-checkpoint.ipynb => 01 封装.ipynb} | 100 +-- ...性.ipynb => 02 利用语言特性.ipynb} | 2 +- ...计模式.ipynb => 03 设计模式.ipynb} | 190 +---- D Plus/99 工业级代码.ipynb | 665 ++++++++++++++++++ D Plus/readme.MD | 34 + D Plus/测试驱动开发.ipynb | 35 + readme.MD | 29 +- 55 files changed, 1467 insertions(+), 1265 deletions(-) create mode 100644 .ipynb_checkpoints/readme-checkpoint.MD rename A 代码模式/{10 一盘大棋/3 Hacker.py => 10 跑起来了/.ipynb_checkpoints/3 Hacker-checkpoint.py} (100%) rename A 代码模式/{10 一盘大棋 => 10 跑起来了}/1 最基础的写法.py (100%) rename A 代码模式/{10 一盘大棋/2 加入语言特性.py => 10 跑起来了/2 使用一些函数.py} (100%) create mode 100644 A 代码模式/10 跑起来了/3 Hacker.py rename A 代码模式/{11 基础结构/函数 => 11 封装/函数封装}/1 全局变量.py (100%) rename A 代码模式/{11 基础结构/函数 => 11 封装/函数封装}/2 临时变量传递.py (100%) rename A 代码模式/{11 基础结构/函数 => 11 封装/函数封装}/3 常见风格.py (100%) rename A 代码模式/{11 基础结构/函数 => 11 封装/函数封装}/4 递归.py (100%) rename A 代码模式/{11 基础结构/对象化 => 11 封装/对象封装}/1 类对象.py (100%) rename A 代码模式/{11 基础结构/对象化 => 11 封装/对象封装}/2 字典对象.py (100%) rename A 代码模式/{11 基础结构 => 11 封装}/流式调用/1 嵌套调用.py (100%) rename A 代码模式/{11 基础结构 => 11 封装}/流式调用/2 方法bind.py (100%) rename A 代码模式/{11 基础结构 => 11 封装}/流式调用/3 重载管道.py (100%) rename A 代码模式/{11 基础结构 => 11 封装}/流式调用/4 类方法.py (100%) rename A 代码模式/{11 基础结构 => 11 封装}/流式调用/5 类方法.py (100%) rename A 代码模式/12 语言特性/{1 类生成器 => 1 动态定制类}/元类.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/1 复用/函数调用复用.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/1 复用/对象复用.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/1 观察者/1 观察者.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/1 观察者/2 观察者.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/1 观察者/3 注册回调.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/1 观察者/4 订阅发布.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/1 观察者/readme.MD (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/2 消息链/1 只有消息接口.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/2 消息链/2 调用链.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/2 消息链/3 消息链.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/2 消息链/4 消息队列.py (100%) rename A 
代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/3 微服务/client_app.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/3 微服务/counter_service.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/3 微服务/sorter_service.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/3 微服务/tokenizer_service.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/4 插件/config.ini (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/4 插件/plugin.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/4 插件/plugins-src/buildingPyc.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/4 插件/plugins-src/f1.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/4 插件/plugins-src/f2.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/4 插件/plugins/f1.pyc (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/2 松耦合/4 插件/plugins/f2.pyc (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/3 类型申明/参数类型申明.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/4 对象接口/tf-14A.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/4 对象接口/tf-14B.py (100%) rename A 代码模式/{15 工程化 => 15 工程化考虑}/5 对象属性 .py (100%) delete mode 100644 D Plus/.ipynb_checkpoints/02 设计模式-checkpoint.ipynb create mode 100644 D Plus/.ipynb_checkpoints/99 工业级代码-checkpoint.ipynb delete mode 100644 D Plus/00 封装.ipynb rename D Plus/{.ipynb_checkpoints/00 封装-checkpoint.ipynb => 01 封装.ipynb} (81%) rename D Plus/{01 简洁的语言特性.ipynb => 02 利用语言特性.ipynb} (99%) rename D Plus/{02 设计模式.ipynb => 03 设计模式.ipynb} (60%) create mode 100644 D Plus/99 工业级代码.ipynb create mode 100644 D Plus/readme.MD create mode 100644 D Plus/测试驱动开发.ipynb diff --git a/.ipynb_checkpoints/readme-checkpoint.MD b/.ipynb_checkpoints/readme-checkpoint.MD new file mode 100644 index 0000000..ba82ebe --- /dev/null +++ b/.ipynb_checkpoints/readme-checkpoint.MD @@ -0,0 +1,14 @@ + +## 代码为啥要这样写,我要这样写代码 + +A 代码模式 +用一个简单任务,展示各种软件工程需求(完成任务简单、可读性强、可复用高、维护成本低等)下的代码写法 + +B 面向对象设计模式 +用一个业务场景复现面向对象的经典设计模式 + +C 高性能模式 +考虑执行时间快,资源占用少的一些思路、办法和结论 + +D 制造工业级代码 +问题同 A ,以构建工业级的代码为目标,用多种方式做了优化提升演示 \ No newline at end of file diff --git a/A 代码模式/10 一盘大棋/3 Hacker.py b/A 代码模式/10 跑起来了/.ipynb_checkpoints/3 Hacker-checkpoint.py similarity index 100% rename from A 代码模式/10 一盘大棋/3 Hacker.py rename to A 代码模式/10 跑起来了/.ipynb_checkpoints/3 Hacker-checkpoint.py diff --git a/A 代码模式/10 一盘大棋/1 最基础的写法.py b/A 代码模式/10 跑起来了/1 最基础的写法.py similarity index 100% rename from A 代码模式/10 一盘大棋/1 最基础的写法.py rename to A 代码模式/10 跑起来了/1 最基础的写法.py diff --git a/A 代码模式/10 一盘大棋/2 加入语言特性.py b/A 代码模式/10 跑起来了/2 使用一些函数.py similarity index 100% rename from A 代码模式/10 一盘大棋/2 加入语言特性.py rename to A 代码模式/10 跑起来了/2 使用一些函数.py diff --git a/A 代码模式/10 跑起来了/3 Hacker.py b/A 代码模式/10 跑起来了/3 Hacker.py new file mode 100644 index 0000000..1f1598f --- /dev/null +++ b/A 代码模式/10 跑起来了/3 Hacker.py @@ -0,0 +1,15 @@ +import re +import collections +from cppy.cp_util import stopwordfilepath, testfilepath + +stopwords = set(open(stopwordfilepath, encoding='utf8').read().split(',')) +words = re.findall('[a-z]{2,}', + open(testfilepath, encoding='utf8').read().lower()) +counts = collections.Counter(w for w in words if w not in stopwords) +for (w, c) in counts.most_common(10): + print(w, '-', c) + +''' +熟练的软件工程师,会如此简单完成任务 +后面的例子,我们必须变的啰嗦一些,不能用这种太 hacker 的写法 +''' diff --git a/A 代码模式/11 基础结构/函数/1 全局变量.py b/A 代码模式/11 封装/函数封装/1 全局变量.py similarity index 100% rename from A 代码模式/11 基础结构/函数/1 全局变量.py rename to A 代码模式/11 封装/函数封装/1 全局变量.py diff --git a/A 代码模式/11 基础结构/函数/2 临时变量传递.py b/A 代码模式/11 封装/函数封装/2 临时变量传递.py similarity index 100% rename from A 代码模式/11 基础结构/函数/2 临时变量传递.py rename to A 代码模式/11 封装/函数封装/2 临时变量传递.py diff --git a/A 代码模式/11 基础结构/函数/3 常见风格.py b/A 代码模式/11 封装/函数封装/3 常见风格.py similarity index 
100% rename from A 代码模式/11 基础结构/函数/3 常见风格.py rename to A 代码模式/11 封装/函数封装/3 常见风格.py diff --git a/A 代码模式/11 基础结构/函数/4 递归.py b/A 代码模式/11 封装/函数封装/4 递归.py similarity index 100% rename from A 代码模式/11 基础结构/函数/4 递归.py rename to A 代码模式/11 封装/函数封装/4 递归.py diff --git a/A 代码模式/11 基础结构/对象化/1 类对象.py b/A 代码模式/11 封装/对象封装/1 类对象.py similarity index 100% rename from A 代码模式/11 基础结构/对象化/1 类对象.py rename to A 代码模式/11 封装/对象封装/1 类对象.py diff --git a/A 代码模式/11 基础结构/对象化/2 字典对象.py b/A 代码模式/11 封装/对象封装/2 字典对象.py similarity index 100% rename from A 代码模式/11 基础结构/对象化/2 字典对象.py rename to A 代码模式/11 封装/对象封装/2 字典对象.py diff --git a/A 代码模式/11 基础结构/流式调用/1 嵌套调用.py b/A 代码模式/11 封装/流式调用/1 嵌套调用.py similarity index 100% rename from A 代码模式/11 基础结构/流式调用/1 嵌套调用.py rename to A 代码模式/11 封装/流式调用/1 嵌套调用.py diff --git a/A 代码模式/11 基础结构/流式调用/2 方法bind.py b/A 代码模式/11 封装/流式调用/2 方法bind.py similarity index 100% rename from A 代码模式/11 基础结构/流式调用/2 方法bind.py rename to A 代码模式/11 封装/流式调用/2 方法bind.py diff --git a/A 代码模式/11 基础结构/流式调用/3 重载管道.py b/A 代码模式/11 封装/流式调用/3 重载管道.py similarity index 100% rename from A 代码模式/11 基础结构/流式调用/3 重载管道.py rename to A 代码模式/11 封装/流式调用/3 重载管道.py diff --git a/A 代码模式/11 基础结构/流式调用/4 类方法.py b/A 代码模式/11 封装/流式调用/4 类方法.py similarity index 100% rename from A 代码模式/11 基础结构/流式调用/4 类方法.py rename to A 代码模式/11 封装/流式调用/4 类方法.py diff --git a/A 代码模式/11 基础结构/流式调用/5 类方法.py b/A 代码模式/11 封装/流式调用/5 类方法.py similarity index 100% rename from A 代码模式/11 基础结构/流式调用/5 类方法.py rename to A 代码模式/11 封装/流式调用/5 类方法.py diff --git a/A 代码模式/12 语言特性/1 类生成器/元类.py b/A 代码模式/12 语言特性/1 动态定制类/元类.py similarity index 100% rename from A 代码模式/12 语言特性/1 类生成器/元类.py rename to A 代码模式/12 语言特性/1 动态定制类/元类.py diff --git a/A 代码模式/15 工程化/1 复用/函数调用复用.py b/A 代码模式/15 工程化考虑/1 复用/函数调用复用.py similarity index 100% rename from A 代码模式/15 工程化/1 复用/函数调用复用.py rename to A 代码模式/15 工程化考虑/1 复用/函数调用复用.py diff --git a/A 代码模式/15 工程化/1 复用/对象复用.py b/A 代码模式/15 工程化考虑/1 复用/对象复用.py similarity index 100% rename from A 代码模式/15 工程化/1 复用/对象复用.py rename to A 代码模式/15 工程化考虑/1 复用/对象复用.py diff --git a/A 代码模式/15 工程化/2 松耦合/1 观察者/1 观察者.py b/A 代码模式/15 工程化考虑/2 松耦合/1 观察者/1 观察者.py similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/1 观察者/1 观察者.py rename to A 代码模式/15 工程化考虑/2 松耦合/1 观察者/1 观察者.py diff --git a/A 代码模式/15 工程化/2 松耦合/1 观察者/2 观察者.py b/A 代码模式/15 工程化考虑/2 松耦合/1 观察者/2 观察者.py similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/1 观察者/2 观察者.py rename to A 代码模式/15 工程化考虑/2 松耦合/1 观察者/2 观察者.py diff --git a/A 代码模式/15 工程化/2 松耦合/1 观察者/3 注册回调.py b/A 代码模式/15 工程化考虑/2 松耦合/1 观察者/3 注册回调.py similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/1 观察者/3 注册回调.py rename to A 代码模式/15 工程化考虑/2 松耦合/1 观察者/3 注册回调.py diff --git a/A 代码模式/15 工程化/2 松耦合/1 观察者/4 订阅发布.py b/A 代码模式/15 工程化考虑/2 松耦合/1 观察者/4 订阅发布.py similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/1 观察者/4 订阅发布.py rename to A 代码模式/15 工程化考虑/2 松耦合/1 观察者/4 订阅发布.py diff --git a/A 代码模式/15 工程化/2 松耦合/1 观察者/readme.MD b/A 代码模式/15 工程化考虑/2 松耦合/1 观察者/readme.MD similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/1 观察者/readme.MD rename to A 代码模式/15 工程化考虑/2 松耦合/1 观察者/readme.MD diff --git a/A 代码模式/15 工程化/2 松耦合/2 消息链/1 只有消息接口.py b/A 代码模式/15 工程化考虑/2 松耦合/2 消息链/1 只有消息接口.py similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/2 消息链/1 只有消息接口.py rename to A 代码模式/15 工程化考虑/2 松耦合/2 消息链/1 只有消息接口.py diff --git a/A 代码模式/15 工程化/2 松耦合/2 消息链/2 调用链.py b/A 代码模式/15 工程化考虑/2 松耦合/2 消息链/2 调用链.py similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/2 消息链/2 调用链.py rename to A 代码模式/15 工程化考虑/2 松耦合/2 消息链/2 调用链.py diff --git a/A 代码模式/15 工程化/2 松耦合/2 消息链/3 消息链.py b/A 代码模式/15 工程化考虑/2 松耦合/2 消息链/3 消息链.py similarity index 100% rename from A 
代码模式/15 工程化/2 松耦合/2 消息链/3 消息链.py rename to A 代码模式/15 工程化考虑/2 松耦合/2 消息链/3 消息链.py diff --git a/A 代码模式/15 工程化/2 松耦合/2 消息链/4 消息队列.py b/A 代码模式/15 工程化考虑/2 松耦合/2 消息链/4 消息队列.py similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/2 消息链/4 消息队列.py rename to A 代码模式/15 工程化考虑/2 松耦合/2 消息链/4 消息队列.py diff --git a/A 代码模式/15 工程化/2 松耦合/3 微服务/client_app.py b/A 代码模式/15 工程化考虑/2 松耦合/3 微服务/client_app.py similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/3 微服务/client_app.py rename to A 代码模式/15 工程化考虑/2 松耦合/3 微服务/client_app.py diff --git a/A 代码模式/15 工程化/2 松耦合/3 微服务/counter_service.py b/A 代码模式/15 工程化考虑/2 松耦合/3 微服务/counter_service.py similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/3 微服务/counter_service.py rename to A 代码模式/15 工程化考虑/2 松耦合/3 微服务/counter_service.py diff --git a/A 代码模式/15 工程化/2 松耦合/3 微服务/sorter_service.py b/A 代码模式/15 工程化考虑/2 松耦合/3 微服务/sorter_service.py similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/3 微服务/sorter_service.py rename to A 代码模式/15 工程化考虑/2 松耦合/3 微服务/sorter_service.py diff --git a/A 代码模式/15 工程化/2 松耦合/3 微服务/tokenizer_service.py b/A 代码模式/15 工程化考虑/2 松耦合/3 微服务/tokenizer_service.py similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/3 微服务/tokenizer_service.py rename to A 代码模式/15 工程化考虑/2 松耦合/3 微服务/tokenizer_service.py diff --git a/A 代码模式/15 工程化/2 松耦合/4 插件/config.ini b/A 代码模式/15 工程化考虑/2 松耦合/4 插件/config.ini similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/4 插件/config.ini rename to A 代码模式/15 工程化考虑/2 松耦合/4 插件/config.ini diff --git a/A 代码模式/15 工程化/2 松耦合/4 插件/plugin.py b/A 代码模式/15 工程化考虑/2 松耦合/4 插件/plugin.py similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/4 插件/plugin.py rename to A 代码模式/15 工程化考虑/2 松耦合/4 插件/plugin.py diff --git a/A 代码模式/15 工程化/2 松耦合/4 插件/plugins-src/buildingPyc.py b/A 代码模式/15 工程化考虑/2 松耦合/4 插件/plugins-src/buildingPyc.py similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/4 插件/plugins-src/buildingPyc.py rename to A 代码模式/15 工程化考虑/2 松耦合/4 插件/plugins-src/buildingPyc.py diff --git a/A 代码模式/15 工程化/2 松耦合/4 插件/plugins-src/f1.py b/A 代码模式/15 工程化考虑/2 松耦合/4 插件/plugins-src/f1.py similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/4 插件/plugins-src/f1.py rename to A 代码模式/15 工程化考虑/2 松耦合/4 插件/plugins-src/f1.py diff --git a/A 代码模式/15 工程化/2 松耦合/4 插件/plugins-src/f2.py b/A 代码模式/15 工程化考虑/2 松耦合/4 插件/plugins-src/f2.py similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/4 插件/plugins-src/f2.py rename to A 代码模式/15 工程化考虑/2 松耦合/4 插件/plugins-src/f2.py diff --git a/A 代码模式/15 工程化/2 松耦合/4 插件/plugins/f1.pyc b/A 代码模式/15 工程化考虑/2 松耦合/4 插件/plugins/f1.pyc similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/4 插件/plugins/f1.pyc rename to A 代码模式/15 工程化考虑/2 松耦合/4 插件/plugins/f1.pyc diff --git a/A 代码模式/15 工程化/2 松耦合/4 插件/plugins/f2.pyc b/A 代码模式/15 工程化考虑/2 松耦合/4 插件/plugins/f2.pyc similarity index 100% rename from A 代码模式/15 工程化/2 松耦合/4 插件/plugins/f2.pyc rename to A 代码模式/15 工程化考虑/2 松耦合/4 插件/plugins/f2.pyc diff --git a/A 代码模式/15 工程化/3 类型申明/参数类型申明.py b/A 代码模式/15 工程化考虑/3 类型申明/参数类型申明.py similarity index 100% rename from A 代码模式/15 工程化/3 类型申明/参数类型申明.py rename to A 代码模式/15 工程化考虑/3 类型申明/参数类型申明.py diff --git a/A 代码模式/15 工程化/4 对象接口/tf-14A.py b/A 代码模式/15 工程化考虑/4 对象接口/tf-14A.py similarity index 100% rename from A 代码模式/15 工程化/4 对象接口/tf-14A.py rename to A 代码模式/15 工程化考虑/4 对象接口/tf-14A.py diff --git a/A 代码模式/15 工程化/4 对象接口/tf-14B.py b/A 代码模式/15 工程化考虑/4 对象接口/tf-14B.py similarity index 100% rename from A 代码模式/15 工程化/4 对象接口/tf-14B.py rename to A 代码模式/15 工程化考虑/4 对象接口/tf-14B.py diff --git a/A 代码模式/15 工程化/5 对象属性 .py b/A 代码模式/15 工程化考虑/5 对象属性 .py similarity index 100% rename from A 代码模式/15 工程化/5 对象属性 .py rename to A 代码模式/15 工程化考虑/5 
对象属性 .py diff --git a/C 高性能模式/03 多线程的应用场景.md b/C 高性能模式/03 多线程的应用场景.md index 76343ca..95fc10a 100644 --- a/C 高性能模式/03 多线程的应用场景.md +++ b/C 高性能模式/03 多线程的应用场景.md @@ -1,5 +1,4 @@ - Python的多线程时间切片间隔可以通过 sys.setswitchinterval() 设置。其他切换触发条件 : - 当线程等待I/O操作(如网络请求或磁盘读写) - 某些函数(如 time.sleep())会触发切换 diff --git a/C 高性能模式/readme.md b/C 高性能模式/readme.md index d9eea22..ef89afe 100644 --- a/C 高性能模式/readme.md +++ b/C 高性能模式/readme.md @@ -1,6 +1,7 @@ 从计算机系统结构的角度,提高 Python 任务执行速度的核心在于:减少解释器开销(编译/JIT)、提升并行性(多核/GPU)、优化内存访问(缓存友好)、降低 I/O 瓶颈以及适配硬件特性等。当前主要办法如下: + ### 计算单元层面利用多核并行计算 对于 CPU 密集型任务,使用多进程,每个进程拥有独立的 Python 解释器和内存空间,运行在独立的内核上,实现并行计算。 @@ -27,15 +28,17 @@ ### 使用第三方高性能库 -- NumPy、Pandas这些库用 C/C++ 编写并经过优化。 -- NumPy 使用连续内存块存储数据,向量化操作来代替显式的Python循环,更高效 。 +- NumPy/Pandas 用 C/C++ 编写并经过优化,使用连续内存块存储数据,向量化操作比显式的Python循环更高效。 - SIMD 指令加速,NumPy、Numba、Pandas/SciPy 都使用了 SIMD。Cython 可以直接用 C 代码使用 SIMD 。 - `gzip` 模块可压缩数据,减少网络传输的数据量,提高网络传输速度。 - `mmap` 模块实现内存映射文件,在处理超大文件、优化I/O性能以及进程间通信方面具有显著优势。 - `functools.lru_cache` 缓存计算结果,避免重复计算 。 +### 使用性能分析工具 +如 cProfile 、Py-Spy、timeit 或 line_profiler + ## 总结 -具体实施时,应根据任务特点选择合适的策略,并结合性能分析工具(如 cProfile 、timeit或 line_profiler)定位瓶颈。 -计算设备方面的简单提升办法:使用多机、更快的 CPU、更多核的CPU、更多的内存、更快的存储、增加 GPU/FPGA/TPU 。 +具体实施时,应结合性能分析工具定位瓶颈,并根据任务特点选择合适的策略 。 +当然计算设备方面也可以简单提升:多机、更快的 CPU、更多核的CPU、更多的内存、更快的存储、增加 GPU/FPGA/TPU 。 此外,随着Python社区的发展,新的技术和工具不断涌现,开发者应持续关注最新进展,以便更好地优化自己的代码 。 \ No newline at end of file diff --git a/D Plus/.ipynb_checkpoints/02 设计模式-checkpoint.ipynb b/D Plus/.ipynb_checkpoints/02 设计模式-checkpoint.ipynb deleted file mode 100644 index 8f14f9e..0000000 --- a/D Plus/.ipynb_checkpoints/02 设计模式-checkpoint.ipynb +++ /dev/null @@ -1,493 +0,0 @@ -{ - "cells": [ - { - "cell_type": "raw", - "id": "eccfe49f-de35-4241-90e3-a7095940b61a", - "metadata": {}, - "source": [ - "设计模式提供高频重复出现需求的最佳解决方案。以下介绍适合词频统计案例的设计模式:策略模式、观察者模式、工厂模式。" - ] - }, - { - "cell_type": "markdown", - "id": "c186171f-d1f2-433e-a3eb-b266e2909a2c", - "metadata": {}, - "source": [ - "## 策略模式(动态选择分词策略)\n", - "\n", - "策略模式允许动态切换算法(如分词器),比元编程简单。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "97c865cb-0b5a-4fa1-aa74-5ba2e65e7436", - "metadata": {}, - "outputs": [], - "source": [ - "from abc import ABC, abstractmethod\n", - "\n", - "class Tokenizer(ABC):\n", - " \"\"\"分词器接口\"\"\"\n", - " @abstractmethod\n", - " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", - " pass\n", - "\n", - "class JiebaTokenizer(Tokenizer):\n", - " \"\"\"jieba 分词器\"\"\"\n", - " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", - " return [w for w in jieba.lcut(text) if w not in stop_words]\n", - "\n", - "class SimpleTokenizer(Tokenizer):\n", - " \"\"\"简单分词器\"\"\"\n", - " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", - " return [w for w in text.split() if w not in stop_words]\n", - "\n", - "class TextAnalyzer:\n", - " def __init__(self, config_path='config.yaml'):\n", - " with open(config_path, 'r', encoding='utf-8') as f:\n", - " config = yaml.safe_load(f)\n", - " self.data_dir = config['data_dir']\n", - " self.top_n = config['top_n']\n", - " self.stop_words_file = config['stop_words_file']\n", - " self.output_file = config['output_file']\n", - " self.stop_words = self.load_stop_words()\n", - " self.word_count = Counter()\n", - " # 动态选择分词器\n", - " tokenizer_name = config.get('tokenizer', 'jieba')\n", - " self.tokenizer = {'jieba': JiebaTokenizer(), 'simple': SimpleTokenizer()}[tokenizer_name]\n", - "\n", - " def tokenize(self, text: str) -> List[str]:\n", - " \"\"\"使用策略分词\"\"\"\n", - " return 
self.tokenizer.tokenize(text, self.stop_words)\n", - "\n", - " # 其余方法同上" - ] - }, - { - "cell_type": "markdown", - "id": "5435ebc3-d3b0-4475-8bd5-cb45fb51638c", - "metadata": {}, - "source": [ - "工程质量提升:\n", - "- 可扩展性:添加新分词器只需实现 Tokenizer 接口。\n", - "- 可维护性:分词逻辑与主类分离,修改更独立。\n", - "\n", - "适用场景:适合需要动态切换算法的场景。" - ] - }, - { - "cell_type": "markdown", - "id": "fbf53455-558c-40fb-8718-446dec989b5d", - "metadata": {}, - "source": [ - "## 观察者模式(结果输出解耦)\n", - "\n", - "观察者模式可用于解耦结果输出逻辑(如打印、保存文件、发送通知)。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d7a2bd4c-df73-4800-b45b-9b6c73d28d7b", - "metadata": {}, - "outputs": [], - "source": [ - "class OutputObserver(ABC):\n", - " \"\"\"输出观察者接口\"\"\"\n", - " @abstractmethod\n", - " def update(self, top_words: List[Tuple[str, int]]):\n", - " pass\n", - "\n", - "class ConsoleOutput(OutputObserver):\n", - " \"\"\"控制台输出\"\"\"\n", - " def update(self, top_words: List[Tuple[str, int]]):\n", - " for word, count in top_words:\n", - " print(f\"{word}: {count}\")\n", - "\n", - "class FileOutput(OutputObserver):\n", - " \"\"\"文件输出\"\"\"\n", - " def __init__(self, output_file: str):\n", - " self.output_file = output_file\n", - "\n", - " def update(self, top_words: List[Tuple[str, int]]):\n", - " with open(self.output_file, 'w', encoding='utf-8') as f:\n", - " for word, count in top_words:\n", - " f.write(f\"{word}: {count}\\n\")\n", - "\n", - "class TextAnalyzer:\n", - " def __init__(self, config_path='config.yaml'):\n", - " with open(config_path, 'r', encoding='utf-8') as f:\n", - " config = yaml.safe_load(f)\n", - " self.data_dir = config['data_dir']\n", - " self.top_n = config['top_n']\n", - " self.stop_words_file = config['stop_words_file']\n", - " self.output_file = config['output_file']\n", - " self.stop_words = self.load_stop_words()\n", - " self.word_count = Counter()\n", - " self.observers = [ConsoleOutput(), FileOutput(self.output_file)]\n", - "\n", - " def add_observer(self, observer: OutputObserver):\n", - " \"\"\"添加观察者\"\"\"\n", - " self.observers.append(observer)\n", - "\n", - " def notify_observers(self, top_words: List[Tuple[str, int]]):\n", - " \"\"\"通知所有观察者\"\"\"\n", - " for observer in self.observers:\n", - " observer.update(top_words)\n", - "\n", - " def run(self):\n", - " \"\"\"执行词频统计并通知观察者\"\"\"\n", - " self.process_directory()\n", - " top_words = self.get_top_words()\n", - " self.notify_observers(top_words)\n", - "\n", - " # 其余方法同上" - ] - }, - { - "cell_type": "markdown", - "id": "02b5cfba-431c-4a01-a454-099e4f41922c", - "metadata": {}, - "source": [ - "### 分析\n", - "\n", - "工程质量提升:\n", - " - 可扩展性:添加新输出方式只需实现 OutputObserver 接口。\n", - " - 解耦性:输出逻辑与统计逻辑分离,修改输出不影响核心功能。\n", - "\n", - "适用场景:适合需要多种输出或通知的场景。\n", - "\n", - "局限性:观察者模式增加代码复杂性,适合复杂输出需求。" - ] - }, - { - "cell_type": "markdown", - "id": "11669305-8cd5-4317-afd5-e85c3f0a5a81", - "metadata": {}, - "source": [ - "## 工厂模式(动态创建分词器)\n", - "\n", - "工厂模式可用于动态创建分词器,简化策略模式中的初始化逻辑。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2fa50633-de22-40c8-912d-3ded5ebcedfc", - "metadata": {}, - "outputs": [], - "source": [ - "class TokenizerFactory:\n", - " \"\"\"分词器工厂\"\"\"\n", - " @staticmethod\n", - " def create_tokenizer(name: str) -> Tokenizer:\n", - " tokenizers = {\n", - " 'jieba': JiebaTokenizer(),\n", - " 'simple': SimpleTokenizer()\n", - " }\n", - " return tokenizers.get(name, JiebaTokenizer())\n", - "\n", - "class TextAnalyzer:\n", - " def __init__(self, config_path='config.yaml'):\n", - " with open(config_path, 'r', encoding='utf-8') as f:\n", - " 
config = yaml.safe_load(f)\n", - " self.data_dir = config['data_dir']\n", - " self.top_n = config['top_n']\n", - " self.stop_words_file = config['stop_words_file']\n", - " self.output_file = config['output_file']\n", - " self.stop_words = self.load_stop_words()\n", - " self.word_count = Counter()\n", - " self.tokenizer = TokenizerFactory.create_tokenizer(config.get('tokenizer', 'jieba'))\n", - "\n", - " # 其余方法同上" - ] - }, - { - "cell_type": "markdown", - "id": "a4db7046-dfe2-4bd8-81d1-49a42e2eeb5c", - "metadata": {}, - "source": [ - "### 分析\n", - "\n", - "工程质量提升:\n", - " - 可维护性:分词器创建逻辑集中于工厂,易于修改。\n", - " - 可扩展性:添加新分词器只需更新工厂方法。\n", - "\n", - "适用场景:适合需要动态创建对象的场景。\n", - "\n", - "局限性:对于简单场景,工厂模式可能略显冗余。" - ] - }, - { - "cell_type": "markdown", - "id": "e5f2aef4-a055-43a9-917c-fa183de6db2d", - "metadata": {}, - "source": [ - "## 综合实现(整合特性与模式)\n", - "\n", - "整合上下文管理器、生成器、策略模式和观察者模式的最终实现(部分代码展示)。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa7f34e2-d355-4a22-8572-729c49b18605", - "metadata": {}, - "outputs": [], - "source": [ - "# text_analyzer.py\n", - "\n", - "import os\n", - "import jieba\n", - "from collections import Counter\n", - "import yaml\n", - "from contextlib import contextmanager\n", - "from typing import List, Tuple\n", - "from abc import ABC, abstractmethod\n", - "\n", - "@contextmanager\n", - "def file_reader(file_path: str):\n", - " try:\n", - " with open(file_path, 'r', encoding='utf-8') as f:\n", - " yield f.read()\n", - " except Exception as e:\n", - " print(f\"Error reading {file_path}: {e}\")\n", - " yield \"\"\n", - "\n", - "class Tokenizer(ABC):\n", - " @abstractmethod\n", - " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", - " pass\n", - "\n", - "class JiebaTokenizer(Tokenizer):\n", - " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", - " for word in jieba.lcut(text):\n", - " if word not in stop_words:\n", - " yield word\n", - "\n", - "class SimpleTokenizer(Tokenizer):\n", - " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", - " for word in text.split():\n", - " if word not in stop_words:\n", - " yield word\n", - "\n", - "class TokenizerFactory:\n", - " @staticmethod\n", - " def create_tokenizer(name: str) -> Tokenizer:\n", - " return {'jieba': JiebaTokenizer(), 'simple': SimpleTokenizer()}.get(name, JiebaTokenizer())\n", - "\n", - "class OutputObserver(ABC):\n", - " @abstractmethod\n", - " def update(self, top_words: List[Tuple[str, int]]):\n", - " pass\n", - "\n", - "class ConsoleOutput(OutputObserver):\n", - " def update(self, top_words: List[Tuple[str, int]]):\n", - " for word, count in top_words:\n", - " print(f\"{word}: {count}\")\n", - "\n", - "class FileOutput(OutputObserver):\n", - " def __init__(self, output_file: str):\n", - " self.output_file = output_file\n", - " def update(self, top_words: List[Tuple[str, int]]):\n", - " with open(self.output_file, 'w', encoding='utf-8') as f:\n", - " for word, count in top_words:\n", - " f.write(f\"{word}: {count}\\n\")\n", - "\n", - "class TextAnalyzer:\n", - " def __init__(self, config_path='config.yaml'):\n", - " with open(config_path, 'r', encoding='utf-8') as f:\n", - " config = yaml.safe_load(f)\n", - " self.data_dir = config['data_dir']\n", - " self.top_n = config['top_n']\n", - " self.stop_words_file = config['stop_words_file']\n", - " self.output_file = config['output_file']\n", - " self.stop_words = self.load_stop_words()\n", - " self.word_count = Counter()\n", - " self.tokenizer = TokenizerFactory.create_tokenizer(config.get('tokenizer', 
'jieba'))\n", - " self.observers = [ConsoleOutput(), FileOutput(self.output_file)]\n", - "\n", - " def load_stop_words(self) -> set:\n", - " with file_reader(self.stop_words_file) as content:\n", - " return set(line.strip() for line in content.splitlines() if line.strip())\n", - "\n", - " def process_file(self, file_path: str):\n", - " if file_path.endswith('.txt'):\n", - " with file_reader(file_path) as text:\n", - " words = self.tokenizer.tokenize(text, self.stop_words)\n", - " self.word_count.update(words)\n", - "\n", - " def process_directory(self):\n", - " for file in os.listdir(self.data_dir):\n", - " file_path = os.path.join(self.data_dir, file)\n", - " self.process_file(file_path)\n", - "\n", - " def get_top_words(self) -> List[Tuple[str, int]]:\n", - " return self.word_count.most_common(self.top_n)\n", - "\n", - " def notify_observers(self, top_words: List[Tuple[str, int]]):\n", - " for observer in self.observers:\n", - " observer.update(top_words)\n", - "\n", - " def run(self):\n", - " self.process_directory()\n", - " top_words = self.get_top_words()\n", - " self.notify_observers(top_words)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3d130312-b298-4c76-ae09-0fb4bd08b0c1", - "metadata": {}, - "outputs": [], - "source": [ - "# main.py\n", - "\n", - "from text_analyzer import TextAnalyzer\n", - "\n", - "def main():\n", - " analyzer = TextAnalyzer()\n", - " analyzer.run()\n", - "\n", - "if __name__ == '__main__':\n", - " main()" - ] - }, - { - "cell_type": "markdown", - "id": "770618c9-428e-454a-97de-00e3b49c9d03", - "metadata": {}, - "source": [ - "## 结论\n", - "\n", - "通过引入上下文管理器、生成器、元编程、策略模式、观察者模式和工厂模式,词频统计代码在可扩展性、可维护性和复用性上进一步提升。\n", - "这些特性和模式使代码更模块化、灵活,适合大型项目,同时保持清晰的工程结构。结合之前的装饰器和函数式编程,代码已达到工程化水平。\n", - "\n", - "若需深入,可以进一步考虑其它性能特性." 
- ] - }, - { - "cell_type": "markdown", - "id": "cbeaa07d-272f-465b-a437-9c4b44827d23", - "metadata": {}, - "source": [ - "## 进一步练习\n", - "\n", - "实践练习:\n", - "- 实现新分词器(如 thulac)并通过策略模式或工厂模式集成。\n", - "- 添加新观察者(如 JSON 输出)。\n", - "\n", - "使用生成器实现流式词频统计,比较内存占用。\n", - "实现缓存机制,缓存已处理文件的分词结果。\n", - "\n", - "添加命令行接口(argparse),动态配置 top_n 和 tokenizer。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6a43b53d-1e07-4ebe-a6c8-104353fd5f7b", - "metadata": {}, - "outputs": [], - "source": [ - "## 附:元编程\n", - "\n", - "元编程允许动态修改类或函数行为,可用于动态配置分词器或输出格式。案例中,可通过元编程动态注册分词器。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4394008c-88da-44bd-aa0d-f1b7a6dbc7d6", - "metadata": {}, - "outputs": [], - "source": [ - "class TokenizerRegistry(type):\n", - " \"\"\"元类:动态注册分词器\"\"\"\n", - " tokenizers = {}\n", - "\n", - " def register_tokenizer(cls, name):\n", - " def decorator(func):\n", - " cls.tokenizers[name] = func\n", - " return func\n", - " return decorator\n", - "\n", - "class TextAnalyzer(metaclass=TokenizerRegistry):\n", - " def __init__(self, config_path='config.yaml'):\n", - " with open(config_path, 'r', encoding='utf-8') as f:\n", - " config = yaml.safe_load(f)\n", - " self.data_dir = config['data_dir']\n", - " self.top_n = config['top_n']\n", - " self.stop_words_file = config['stop_words_file']\n", - " self.output_file = config['output_file']\n", - " self.stop_words = self.load_stop_words()\n", - " self.word_count = Counter()\n", - " self.tokenizer_name = config.get('tokenizer', 'jieba') # 从配置读取分词器\n", - "\n", - " @classmethod\n", - " def register_tokenizer(cls, name):\n", - " return cls.__class__.register_tokenizer(name)\n", - "\n", - " def tokenize(self, text: str) -> List[str]:\n", - " \"\"\"动态调用分词器\"\"\"\n", - " tokenizer = self.__class__.tokenizers.get(self.tokenizer_name, self.jieba_tokenizer)\n", - " return tokenizer(self, text)\n", - "\n", - " @register_tokenizer('jieba')\n", - " def jieba_tokenizer(self, text: str) -> List[str]:\n", - " \"\"\"jieba 分词\"\"\"\n", - " return [w for w in jieba.lcut(text) if w not in self.stop_words]\n", - "\n", - " @register_tokenizer('simple')\n", - " def simple_tokenizer(self, text: str) -> List[str]:\n", - " \"\"\"简单分词(按空格)\"\"\"\n", - " return [w for w in text.split() if w not in self.stop_words]\n", - "\n", - " # 其余方法(load_stop_words, process_file, etc.)同上" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2249f13a-7a3f-4376-ba2a-d92f11658d32", - "metadata": {}, - "outputs": [], - "source": [ - "### 分析\n", - "\n", - "功能:通过元类和装饰器动态注册分词器,支持配置切换(如 jieba 或 simple)。\n", - "\n", - "工程质量提升:\n", - " 可扩展性:新分词器只需添加新方法并注册,无需修改核心逻辑。\n", - " 灵活性:通过配置文件动态选择分词器。\n", - "\n", - "适用场景:适合需要动态配置或插件化系统的场景。\n", - "\n", - "局限性:元编程增加代码复杂性,可能降低可读性,需谨慎使用。" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/D Plus/.ipynb_checkpoints/99 工业级代码-checkpoint.ipynb b/D Plus/.ipynb_checkpoints/99 工业级代码-checkpoint.ipynb new file mode 100644 index 0000000..b88eaf7 --- /dev/null +++ b/D Plus/.ipynb_checkpoints/99 工业级代码-checkpoint.ipynb @@ -0,0 +1,665 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": 
"b6bc2a3c-3b15-4bc5-83a2-adeae3b7b4d0", + "metadata": {}, + "outputs": [], + "source": [ + "## 项目结构\n", + "\n", + "word_frequency_project/\n", + "│\n", + "├── data/ # 小说文本存放目录\n", + "│ ├── novel1.txt\n", + "│ ├── novel2.txt\n", + "│ └── ...\n", + "├── src/ # 源代码目录\n", + "│ ├── __init__.py\n", + "│ ├── config.py # 配置文件\n", + "│ ├── data_loader.py # 数据加载模块\n", + "│ ├── preprocessor.py # 文本预处理模块\n", + "│ ├── word_counter.py # 词频统计模块\n", + "│ ├── output_formatter.py # 输出格式化模块\n", + "│ └── main.py # 主程序入口\n", + "├── tests/ # 单元测试目录\n", + "│ ├── __init__.py\n", + "│ ├── test_data_loader.py\n", + "│ ├── test_preprocessor.py\n", + "│ ├── test_word_counter.py\n", + "│ └── test_output_formatter.py\n", + "├── requirements.txt # 依赖文件\n", + "└── README.md # 项目说明" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0b55f2e-24ba-49da-8d11-f0f5eea611b0", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "src/config.py\n", + "定义配置项,便于扩展和修改。\n", + "'''\n", + "\n", + "import os\n", + "\n", + "class Config:\n", + " DATA_DIR = \"data\"\n", + " TOP_N_WORDS = 10\n", + " STOP_WORDS = {\"的\", \"了\", \"是\", \"在\", \"和\", \"我\", \"你\", \"他\", \"她\"} # 示例停用词\n", + " ENCODING = \"utf-8\"\n", + " LOG_LEVEL = \"INFO\"\n", + "\n", + " @classmethod\n", + " def get_data_dir(cls):\n", + " return os.path.join(os.path.dirname(__file__), \"..\", cls.DATA_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5bdcdf0-16a2-4dda-85f1-d018c6370aee", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "src/data_loader.py\n", + "负责加载小说文本,支持目录扫描和文件读取,提供扩展点以支持不同格式。\n", + "'''\n", + "\n", + "import os\n", + "import logging\n", + "from src.config import Config\n", + "\n", + "class DataLoader:\n", + " def __init__(self):\n", + " self.data_dir = Config.get_data_dir()\n", + " logging.basicConfig(level=Config.LOG_LEVEL)\n", + " self.logger = logging.getLogger(__name__)\n", + "\n", + " def load_texts(self):\n", + " \"\"\"加载 data 目录下的所有文本文件\"\"\"\n", + " texts = []\n", + " try:\n", + " for filename in os.listdir(self.data_dir):\n", + " if filename.endswith(\".txt\"):\n", + " file_path = os.path.join(self.data_dir, filename)\n", + " with open(file_path, \"r\", encoding=Config.ENCODING) as f:\n", + " texts.append(f.read())\n", + " self.logger.info(f\"Loaded file: {filename}\")\n", + " if not texts:\n", + " self.logger.warning(\"No text files found in data directory\")\n", + " return texts\n", + " except Exception as e:\n", + " self.logger.error(f\"Error loading files: {str(e)}\")\n", + " raise" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "786e7ffa-82bc-46b9-8ffc-444d6796b87b", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "src/preprocessor.py\n", + "文本预处理模块,负责分词和清理,支持扩展以添加更多预处理逻辑。\n", + "'''\n", + "\n", + "import jieba\n", + "import re\n", + "from src.config import Config\n", + "\n", + "def timing_decorator(func):\n", + " \"\"\"装饰器:记录方法执行时间\"\"\"\n", + " import time\n", + " def wrapper(*args, **kwargs):\n", + " start = time.time()\n", + " result = func(*args, **kwargs)\n", + " end = time.time()\n", + " print(f\"{func.__name__} took {end - start:.2f} seconds\")\n", + " return result\n", + " return wrapper\n", + "\n", + "class TextPreprocessor:\n", + " def __init__(self):\n", + " self.stop_words = Config.STOP_WORDS\n", + "\n", + " @timing_decorator\n", + " def preprocess(self, text):\n", + " \"\"\"预处理:分词、去除停用词和非中文字符\"\"\"\n", + " # 移除非中文字符\n", + " text = re.sub(r\"[^\\u4e00-\\u9fff]\", \" \", text)\n", + " # 分词\n", + " words = 
jieba.cut(text)\n", + " # 过滤停用词和空字符\n", + " return [word for word in words if word.strip() and word not in self.stop_words]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4edd5ca7-4ba7-4446-b93e-2cfd83efca2e", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "src/word_counter.py\n", + "词频统计模块,使用单例模式确保全局唯一计数器。\n", + "'''\n", + "\n", + "from collections import Counter\n", + "from typing import List, Dict\n", + "\n", + "class Singleton:  # 思考:为啥需要单例?\n", + " \"\"\"单例模式装饰器\"\"\"\n", + " def __init__(self, cls):\n", + " self._cls = cls\n", + " self._instance = None\n", + "\n", + " def __call__(self, *args, **kwargs):\n", + " if self._instance is None:\n", + " self._instance = self._cls(*args, **kwargs)\n", + " return self._instance\n", + "\n", + "@Singleton\n", + "class WordCounter:\n", + " def __init__(self):\n", + " self.counter = Counter()\n", + "\n", + " def count_words(self, words: List[str]) -> None:\n", + " \"\"\"更新词频统计\"\"\"\n", + " self.counter.update(words)\n", + "\n", + " def get_top_n(self, n: int = 10) -> Dict[str, int]:\n", + " \"\"\"获取前 N 个高频词\"\"\"\n", + " return dict(self.counter.most_common(n))\n", + "\n", + " def reset(self):\n", + " \"\"\"重置计数器\"\"\"\n", + " self.counter.clear()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41af3e0e-3153-4d23-9a9f-65b566b384e8", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "src/output_formatter.py\n", + "输出格式化模块,支持多种输出格式,便于扩展。\n", + "'''\n", + "\n", + "from typing import Dict\n", + "\n", + "class OutputFormatter:\n", + " @staticmethod\n", + " def format_json(data: Dict[str, int]) -> str:\n", + " import json\n", + " return json.dumps(data, ensure_ascii=False, indent=2)\n", + "\n", + " @staticmethod\n", + " def format_text(data: Dict[str, int]) -> str:\n", + " return \"\\n\".join(f\"{word}: {count}\" for word, count in data.items())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6596162c-fd42-4b32-b328-9987568b3846", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "src/main.py\n", + "主程序入口,协调各模块工作。\n", + "'''\n", + "\n", + "from src.data_loader import DataLoader\n", + "from src.preprocessor import TextPreprocessor\n", + "from src.word_counter import WordCounter\n", + "from src.output_formatter import OutputFormatter\n", + "from src.config import Config\n", + "\n", + "def main():\n", + " # 初始化模块\n", + " loader = DataLoader()\n", + " preprocessor = TextPreprocessor()\n", + " counter = WordCounter()\n", + " formatter = OutputFormatter()\n", + "\n", + " # 加载文本\n", + " texts = loader.load_texts()\n", + "\n", + " # 预处理并统计词频\n", + " for text in texts:\n", + " words = preprocessor.preprocess(text)\n", + " counter.count_words(words)\n", + "\n", + " # 获取结果\n", + " top_words = counter.get_top_n(Config.TOP_N_WORDS)\n", + "\n", + " # 输出结果\n", + " print(\"=== Top 10 Words (Text Format) ===\")\n", + " print(formatter.format_text(top_words))\n", + " print(\"\\n=== Top 10 Words (JSON Format) ===\")\n", + " print(formatter.format_json(top_words))\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36a32f17-5ce3-46e2-a563-f151454f6342", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "tests/test_data_loader.py\n", + "单元测试示例,确保数据加载模块的正确性。\n", + "'''\n", + "\n", + "import unittest\n", + "import os\n", + "from src.data_loader import DataLoader\n", + "from src.config import Config\n", + "\n", + "class TestDataLoader(unittest.TestCase):\n", + " def 
setUp(self):\n", + " self.loader = DataLoader()\n", + " # 创建临时测试文件\n", + " self.test_file = os.path.join(Config.get_data_dir(), \"test_novel.txt\")\n", + " with open(self.test_file, \"w\", encoding=Config.ENCODING) as f:\n", + " f.write(\"这是一个测试文本\")\n", + "\n", + " def test_load_texts(self):\n", + " texts = self.loader.load_texts()\n", + " self.assertGreater(len(texts), 0)\n", + " self.assertIn(\"这是一个测试文本\", texts)\n", + "\n", + " def tearDown(self):\n", + " if os.path.exists(self.test_file):\n", + " os.remove(self.test_file)\n", + "\n", + "if __name__ == \"__main__\":\n", + " unittest.main()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f550544-f0f4-4f0c-bdb7-9928b6820bdf", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "tests/test_preprocessor.py\n", + "测试文本预处理模块。\n", + "'''\n", + "\n", + "import unittest\n", + "from src.preprocessor import TextPreprocessor\n", + "\n", + "class TestTextPreprocessor(unittest.TestCase):\n", + " def setUp(self):\n", + " self.preprocessor = TextPreprocessor()\n", + "\n", + " def test_preprocess(self):\n", + " text = \"这是一个测试文本,包含了123和一些符号!\"\n", + " words = self.preprocessor.preprocess(text)\n", + " expected = [\"测试\", \"文本\", \"包含\", \"一些\", \"符号\"]\n", + " self.assertEqual(words, expected)\n", + "\n", + "if __name__ == \"__main__\":\n", + " unittest.main()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8fb8b4cd-0b27-426a-9556-8f21227c5374", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "tests/test_word_counter.py\n", + "测试词频统计模块。\n", + "'''\n", + "import unittest\n", + "from src.word_counter import WordCounter\n", + "\n", + "class TestWordCounter(unittest.TestCase):\n", + " def setUp(self):\n", + " self.counter = WordCounter()\n", + "\n", + " def test_count_words(self):\n", + " self.counter.count_words([\"测试\", \"文本\", \"测试\"])\n", + " result = self.counter.get_top_n(2)\n", + " expected = {\"测试\": 2, \"文本\": 1}\n", + " self.assertEqual(result, expected)\n", + "\n", + " def test_reset(self):\n", + " self.counter.count_words([\"测试\"])\n", + " self.counter.reset()\n", + " self.assertEqual(self.counter.get_top_n(1), {})\n", + "\n", + "if __name__ == \"__main__\":\n", + " unittest.main()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b7507dc-b693-4dbf-9a21-5f2833d13d0e", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "requirements.txt\n", + "列出项目依赖。\n", + "'''\n", + "jieba==0.42.1" + ] + }, + { + "cell_type": "markdown", + "id": "573c4ddd-800e-4b59-9e20-a87d6a2b14cd", + "metadata": {}, + "source": [ + "'''\n", + "README.md\n", + "提供项目说明和使用方法。\n", + "'''\n", + "# Word Frequency Analysis Project\n", + "\n", + "## Overview\n", + "This project processes 100 novels in the `data` directory, counts word frequencies, and outputs the top 10 words. It demonstrates software engineering principles like modularity, design patterns, and unit testing.\n", + "\n", + "## Setup\n", + "1. Install dependencies: `pip install -r requirements.txt`\n", + "2. Place novel files (.txt) in the `data` directory.\n", + "3. Run the program: `python src/main.py`\n", + "\n", + "## Testing\n", + "Run tests: `python -m unittest discover tests`\n", + "\n", + "## Extensibility\n", + "- Add new preprocessors in `preprocessor.py`.\n", + "- Support new output formats in `output_formatter.py`.\n", + "- Modify configurations in `config.py`."
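+ 回应上文 word_counter.py 中留下的问题「为啥需要单例?」:单例保证 main.py 和各测试用例拿到的是同一个 WordCounter,全局只有一份词频计数;也正因为状态是共享的,test_word_counter 里才需要 reset() 来清理。下面是一个最小演示,假设 src/word_counter.py 按上文实现:
+ ```python
+ # 最小演示(假设 src/word_counter.py 按上文实现):
+ # @Singleton 使得两次“实例化”返回同一个对象
+ from src.word_counter import WordCounter
+ 
+ a = WordCounter()
+ b = WordCounter()
+ print(a is b)          # True:Singleton.__call__ 缓存了唯一实例
+ a.count_words(["测试", "文本", "测试"])
+ print(b.get_top_n(2))  # {'测试': 2, '文本': 1},a 与 b 共享同一份计数
+ b.reset()              # 用完清空,避免影响后续统计(测试中的做法相同)
+ ```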
+ ] + }, + { + "cell_type": "markdown", + "id": "4bd74972-f9c4-4ac9-a557-de4198889047", + "metadata": {}, + "source": [ + "## 使用方法\n", + "\n", + "准备环境:\n", + "pip install -r requirements.txt\n", + "\n", + "准备数据:\n", + "- 在 data 目录下放入 100 个 .txt 小说文件(需为 UTF-8 编码)。\n", + "- 确保安装 jieba 分词库。\n", + "\n", + "运行程序:\n", + "python src/main.py\n", + "\n", + "运行测试:\n", + "python -m unittest discover tests" + ] + }, + { + "cell_type": "markdown", + "id": "16f7a973-7c49-4d11-ab3f-457d4622e5e6", + "metadata": {}, + "source": [ + "## 扩展建议\n", + "\n", + "- 支持多语言:在 TextPreprocessor 中添加英文分词(如使用 nltk 或 spacy)。\n", + "- 数据库存储:将词频结果保存到数据库(如 SQLite),在 WordCounter 中添加存储方法。\n", + "- 并行处理:使用 multiprocessing 加速大文件处理。\n", + "- 可视化:在 OutputFormatter 中添加图表输出(如使用 matplotlib)。\n", + "- 配置文件:将 Config 改为从外部 JSON/YAML 文件加载。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2ad0efb-4c7c-4f98-a809-ce6cdcefdb34", + "metadata": {}, + "outputs": [], + "source": [ + "## 设计说明\n", + "\n", + "模块化设计:\n", + "- 各模块(DataLoader, TextPreprocessor, WordCounter, OutputFormatter)职责单一,符合单一职责原则(SRP)。\n", + "- 模块间通过明确接口交互,易于替换或扩展。\n", + "\n", + "设计模式:\n", + "- 单例模式:WordCounter 使用单例模式,确保全局唯一计数器。\n", + "- 策略模式:OutputFormatter 支持多种输出格式(JSON、Text),易于添加新格式。\n", + "- 装饰器模式:timing_decorator 用于性能监控,便于扩展其他功能(如日志记录)。\n", + "\n", + "可扩展性:\n", + "- Config 类集中管理配置,便于调整参数(如停用词、输出数量)。\n", + "- DataLoader 支持动态扫描目录,新增文件无需改动代码。\n", + "- TextPreprocessor 可扩展以支持其他分词工具或预处理规则。\n", + "\n", + "单元测试:\n", + "- 每个模块都有对应的测试用例,确保功能正确性。\n", + "- 使用 unittest 框架,支持持续集成。\n", + "\n", + "语言特性利用:\n", + "- 使用 Python 的装饰器(timing_decorator)记录方法执行时间。\n", + "- 利用类型注解(typing 模块)提高代码可读性。\n", + "- 异常处理和日志记录(logging)增强鲁棒性。\n", + "\n", + "教学用途:\n", + "- 包含常见工程化实践:模块化、测试驱动开发、配置管理。\n", + "- 提供扩展点(如支持英文分词、数据库存储),便于学生实践。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1aac488-3a98-418c-8201-e7f77c392a1f", + "metadata": {}, + "outputs": [], + "source": [ + "# text_analyzer.py\n", + "\n", + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "import yaml\n", + "from contextlib import contextmanager\n", + "from typing import List, Tuple\n", + "from abc import ABC, abstractmethod\n", + "\n", + "@contextmanager\n", + "def file_reader(file_path: str):\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " yield f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " yield \"\"\n", + "\n", + "class Tokenizer(ABC):\n", + " @abstractmethod\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " pass\n", + "\n", + "class JiebaTokenizer(Tokenizer):\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " for word in jieba.lcut(text):\n", + " if word not in stop_words:\n", + " yield word\n", + "\n", + "class SimpleTokenizer(Tokenizer):\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " for word in text.split():\n", + " if word not in stop_words:\n", + " yield word\n", + "\n", + "class TokenizerFactory:\n", + " @staticmethod\n", + " def create_tokenizer(name: str) -> Tokenizer:\n", + " return {'jieba': JiebaTokenizer(), 'simple': SimpleTokenizer()}.get(name, JiebaTokenizer())\n", + "\n", + "class OutputObserver(ABC):\n", + " @abstractmethod\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " pass\n", + "\n", + "class ConsoleOutput(OutputObserver):\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")\n", + 
"\n", + "class FileOutput(OutputObserver):\n", + " def __init__(self, output_file: str):\n", + " self.output_file = output_file\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " self.tokenizer = TokenizerFactory.create_tokenizer(config.get('tokenizer', 'jieba'))\n", + " self.observers = [ConsoleOutput(), FileOutput(self.output_file)]\n", + "\n", + " def load_stop_words(self) -> set:\n", + " with file_reader(self.stop_words_file) as content:\n", + " return set(line.strip() for line in content.splitlines() if line.strip())\n", + "\n", + " def process_file(self, file_path: str):\n", + " if file_path.endswith('.txt'):\n", + " with file_reader(file_path) as text:\n", + " words = self.tokenizer.tokenize(text, self.stop_words)\n", + " self.word_count.update(words)\n", + "\n", + " def process_directory(self):\n", + " for file in os.listdir(self.data_dir):\n", + " file_path = os.path.join(self.data_dir, file)\n", + " self.process_file(file_path)\n", + "\n", + " def get_top_words(self) -> List[Tuple[str, int]]:\n", + " return self.word_count.most_common(self.top_n)\n", + "\n", + " def notify_observers(self, top_words: List[Tuple[str, int]]):\n", + " for observer in self.observers:\n", + " observer.update(top_words)\n", + "\n", + " def run(self):\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " self.notify_observers(top_words)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5c689f4-e363-4327-9dc4-15c7157d4288", + "metadata": {}, + "outputs": [], + "source": [ + "# main.py\n", + "\n", + "from text_analyzer import TextAnalyzer\n", + "\n", + "def main():\n", + " analyzer = TextAnalyzer()\n", + " analyzer.run()\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc1d9fb1-3bb5-4f71-aeb3-e304511f4785", + "metadata": {}, + "outputs": [], + "source": [ + "## 结论\n", + "\n", + "通过引入上下文管理器、生成器、元编程、策略模式、观察者模式和工厂模式,词频统计代码在可扩展性、可维护性和复用性上进一步提升。\n", + "这些特性和模式使代码更模块化、灵活,适合大型项目,同时保持清晰的工程结构。结合之前的装饰器和函数式编程,代码已达到工程化水平。\n", + "\n", + "若需深入,可以进一步考虑其它性能特性." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7244afd0-4405-402a-b9be-75f5d7ff883c", + "metadata": {}, + "outputs": [], + "source": [ + "## 进一步练习\n", + "\n", + "实践练习:\n", + "- 实现新分词器(如 thulac)并通过策略模式或工厂模式集成。\n", + "- 添加新观察者(如 JSON 输出)。\n", + "\n", + "使用生成器实现流式词频统计,比较内存占用。\n", + "实现缓存机制,缓存已处理文件的分词结果。\n", + "\n", + "添加命令行接口(argparse),动态配置 top_n 和 tokenizer。" + ] + }, + { + "cell_type": "markdown", + "id": "09c10307-f162-4b36-85b6-6bc01d0001e0", + "metadata": {}, + "source": [ + "## 综合实现(整合特性与模式)\n", + "\n", + "整合上下文管理器、生成器、策略模式和观察者模式的最终实现(部分代码展示)。" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/D Plus/00 封装.ipynb b/D Plus/00 封装.ipynb deleted file mode 100644 index 4d255e0..0000000 --- a/D Plus/00 封装.ipynb +++ /dev/null @@ -1,478 +0,0 @@ -{ - "cells": [ - { - "cell_type": "raw", - "id": "69e76aa7-2c5d-4114-a302-85e17cc83e2c", - "metadata": {}, - "source": [ - "本文旨在通过一个案例(读取 data 目录下 100 篇小说文本,统计词频并输出前 10 高频词)来说明结构化编程和封装方法如何提升代码工程质量。\n", - "教案将逐步展示不同结构化方法和封装技术的应用,并分析其对代码可读性、可维护性、可扩展性和复用性的提升。" - ] - }, - { - "cell_type": "markdown", - "id": "b9a9a366-7fd3-422b-b3bc-b0bc00374da6", - "metadata": {}, - "source": [ - "# 教学目标\n", - "- 掌握封装方法(函数、类、模块)在代码组织中的作用。" - ] - }, - { - "cell_type": "markdown", - "id": "1387e026-c978-4217-9015-ab0e047c01a0", - "metadata": {}, - "source": [ - "## 第一部分:基础实现(无结构化、无封装)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "33803186-d890-4cd7-9636-8920fcb86e14", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "files = os.listdir('data')\n", - "word_count = {}\n", - "for file in files:\n", - " with open('data/' + file, 'r', encoding='utf-8') as f:\n", - " text = f.read()\n", - " words = text.split() # 假设简单按空格分词\n", - " for word in words:\n", - " if word in word_count:\n", - " word_count[word] += 1\n", - " else:\n", - " word_count[word] = 1\n", - "\n", - "# 排序并输出前10\n", - "sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)\n", - "for i in range(10):\n", - " print(sorted_words[i])" - ] - }, - { - "cell_type": "markdown", - "id": "471351e7-8645-4690-973a-7d8de53bda5f", - "metadata": {}, - "source": [ - "### 问题分析\n", - "\n", - "- 可读性差:没有清晰的功能划分,代码逻辑混杂,难以阅读理解维护。\n", - "- 扩展性差:如果需要更改分词逻辑、文件路径或输出格式,需修改多处代码。\n", - "- 容错性差:未处理文件读取失败、空文件等问题。\n", - "- 复用性低:逻辑无法直接复用在其他类似任务中。" - ] - }, - { - "cell_type": "markdown", - "id": "a5881283-c295-4433-8edd-f915201a5f43", - "metadata": {}, - "source": [ - "## 第二部分:引入函数封装\n", - "\n", - "提炼出若干函数,减少代码的复杂性,提高可读性和可维护性。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7beadc81-f939-4ac5-b885-407c6810b7de", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "def read_file(file_path):\n", - " \"\"\"读取单个文件内容\"\"\"\n", - " try:\n", - " with open(file_path, 'r', encoding='utf-8') as f:\n", - " return f.read()\n", - " except Exception as e:\n", - " print(f\"Error reading {file_path}: {e}\")\n", - " return \"\"\n", - "\n", - "def get_words(text):\n", - " \"\"\"简单分词(按空格)\"\"\"\n", - " return text.split()\n", - "\n", - "def count_words(words):\n", - " \"\"\"统计词频\"\"\"\n", - " word_count = {}\n", - " for word in 
words:\n", - " word_count[word] = word_count.get(word, 0) + 1\n", - " return word_count\n", - "\n", - "def get_top_n(word_count, n=10):\n", - " \"\"\"获取前 N 高频词\"\"\"\n", - " return sorted(word_count.items(), key=lambda x: x[1], reverse=True)[:n]\n", - "\n", - "def main():\n", - " \"\"\"主函数,控制流程\"\"\"\n", - " word_count = {}\n", - " data_dir = 'data'\n", - " \n", - " # 顺序结构:按步骤读取文件、处理文本\n", - " for file in os.listdir(data_dir):\n", - " file_path = os.path.join(data_dir, file)\n", - " # 选择结构:检查文件是否为 txt\n", - " if file_path.endswith('.txt'):\n", - " text = read_file(file_path)\n", - " # 循环结构:处理每个文件的词\n", - " words = get_words(text)\n", - " file_word_count = count_words(words)\n", - " # 合并词频\n", - " for word, count in file_word_count.items():\n", - " word_count[word] = word_count.get(word, 0) + count\n", - " \n", - " # 输出结果\n", - " top_words = get_top_n(word_count)\n", - " for word, count in top_words:\n", - " print(f\"{word}: {count}\")\n", - "\n", - "if __name__ == '__main__':\n", - " main()" - ] - }, - { - "cell_type": "markdown", - "id": "4f7218a3-43d2-4159-9854-9880020c42fc", - "metadata": {}, - "source": [ - "### 改进分析\n", - " - 逻辑分层:main() 函数清晰定义了程序执行步骤(读取文件 -> 分词 -> 统计 -> 输出)。\n", - " - 模块化:将功能拆分为函数(read_file、get_words、count_words、get_top_n),提高代码复用性和可读性。\n", - " - 错误处理:增加 try-except 处理文件读取异常。\n", - " - 工程质量提升:\n", - " - 可读性:函数命名本身就帮助理解代码,逻辑分块。\n", - " - 可维护性:修改某部分功能(如分词逻辑)只需改对应函数。\n", - " - 复用性:函数可复用在其他类似任务中。" - ] - }, - { - "cell_type": "markdown", - "id": "50737966-57c9-4daf-ac3b-6a1c73b18136", - "metadata": {}, - "source": [ - "## 第三部分:引入类封装\n", - "\n", - "通过类封装功能,进一步提高代码的模块化、可扩展性和复用性。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "81aa7f9c-de28-4a7a-8ba1-130c3e5e4f7f", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import jieba\n", - "from collections import Counter\n", - "\n", - "class TextAnalyzer:\n", - " \"\"\"文本分析类,封装词频统计功能\"\"\"\n", - " def __init__(self, data_dir='data', top_n=10):\n", - " self.data_dir = data_dir\n", - " self.top_n = top_n\n", - " self.word_count = Counter()\n", - "\n", - " def read_file(self, file_path):\n", - " \"\"\"读取文件内容\"\"\"\n", - " try:\n", - " with open(file_path, 'r', encoding='utf-8') as f:\n", - " return f.read()\n", - " except Exception as e:\n", - " print(f\"Error reading {file_path}: {e}\")\n", - " return \"\"\n", - "\n", - " def tokenize(self, text):\n", - " \"\"\"使用 jieba 进行中文分词\"\"\"\n", - " return jieba.lcut(text)\n", - "\n", - " def process_file(self, file_path):\n", - " \"\"\"处理单个文件\"\"\"\n", - " if file_path.endswith('.txt'):\n", - " text = self.read_file(file_path)\n", - " words = self.tokenize(text)\n", - " self.word_count.update(words)\n", - "\n", - " def process_directory(self):\n", - " \"\"\"处理目录下所有文件\"\"\"\n", - " for file in os.listdir(self.data_dir):\n", - " file_path = os.path.join(self.data_dir, file)\n", - " self.process_file(file_path)\n", - "\n", - " def get_top_words(self):\n", - " \"\"\"获取前 N 高频词\"\"\"\n", - " return self.word_count.most_common(self.top_n)\n", - "\n", - " def run(self):\n", - " \"\"\"执行词频统计\"\"\"\n", - " self.process_directory()\n", - " top_words = self.get_top_words()\n", - " for word, count in top_words:\n", - " print(f\"{word}: {count}\")\n", - "\n", - "def main():\n", - " analyzer = TextAnalyzer(data_dir='data', top_n=10)\n", - " analyzer.run()\n", - "\n", - "if __name__ == '__main__':\n", - " main()" - ] - }, - { - "cell_type": "markdown", - "id": "62e780d4-94de-4830-89c2-ab2c96500fc5", - "metadata": {}, - "source": [ - "### 改进分析\n", - "- 面向对象封装:\n", - 
" - 使用 TextAnalyzer 类将所有功能封装为一个对象,数据(如 word_count)和方法(如 tokenize)绑定在一起。\n", - " - 通过 __init__ 提供配置(如 data_dir 和 top_n),提高灵活性。\n", - " \n", - "- 模块化:类方法分工明确(如 read_file、tokenize、process_file),便于扩展。\n", - "- 工程质量提升:\n", - " - 可扩展性:可通过继承 TextAnalyzer 添加新功能(如支持其他分词器或文件格式)。\n", - " - 复用性:类可实例化多次,用于不同目录或参数。\n", - " - 可维护性:逻辑集中在类中,修改相对安全。" - ] - }, - { - "cell_type": "markdown", - "id": "9b4e17c4-f47e-4245-b3d9-e40fde0a2e04", - "metadata": {}, - "source": [ - "# 第四部分:引入文件模块封装\n", - "将代码进一步模块化到不同文件,引入配置文件和停用词过滤。" - ] - }, - { - "cell_type": "raw", - "id": "aadb5aea-8cc5-4a0f-9f5b-7eab28e90f1a", - "metadata": {}, - "source": [ - "目录结构\n", - "\n", - "project/\n", - "├── data/ # 小说文本目录\n", - "├── config.yaml # 配置文件\n", - "├── stop_words.txt # 停用词文件\n", - "├── text_analyzer.py # 分析模块\n", - "├── main.py # 主程序" - ] - }, - { - "cell_type": "raw", - "id": "2de4767b-8928-4f3f-8c8b-3c3cba2bc98a", - "metadata": {}, - "source": [ - "# config.yaml\n", - "\n", - "data_dir: data\n", - "top_n: 10\n", - "stop_words_file: stop_words.txt\n", - "output_file: output.txt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9b442d61-c937-4757-b7b4-b6fc047c3529", - "metadata": {}, - "outputs": [], - "source": [ - "# text_analyzer.py\n", - "\n", - "import os\n", - "import jieba\n", - "from collections import Counter\n", - "import yaml\n", - "\n", - "class TextAnalyzer:\n", - " def __init__(self, config_path='config.yaml'):\n", - " with open(config_path, 'r', encoding='utf-8') as f:\n", - " config = yaml.safe_load(f)\n", - " self.data_dir = config['data_dir']\n", - " self.top_n = config['top_n']\n", - " self.stop_words_file = config['stop_words_file']\n", - " self.output_file = config['output_file']\n", - " self.word_count = Counter()\n", - " self.stop_words = self.load_stop_words()\n", - "\n", - " def load_stop_words(self):\n", - " \"\"\"加载停用词\"\"\"\n", - " try:\n", - " with open(self.stop_words_file, 'r', encoding='utf-8') as f:\n", - " return set(line.strip() for line in f if line.strip())\n", - " except Exception as e:\n", - " print(f\"Error loading stop words: {e}\")\n", - " return set()\n", - "\n", - " def read_file(self, file_path):\n", - " \"\"\"读取文件内容\"\"\"\n", - " try:\n", - " with open(file_path, 'r', encoding='utf-8') as f:\n", - " return f.read()\n", - " except Exception as e:\n", - " print(f\"Error reading {file_path}: {e}\")\n", - " return \"\"\n", - "\n", - " def tokenize(self, text):\n", - " \"\"\"中文分词并过滤停用词\"\"\"\n", - " words = jieba.lcut(text)\n", - " return [word for word in words if word not in self.stop_words]\n", - "\n", - " def process_file(self, file_path):\n", - " \"\"\"处理单个文件\"\"\"\n", - " if file_path.endswith('.txt'):\n", - " text = self.read_file(file_path)\n", - " words = self.tokenize(text)\n", - " self.word_count.update(words)\n", - "\n", - " def process_directory(self):\n", - " \"\"\"处理目录下所有文件\"\"\"\n", - " for file in os.listdir(self.data_dir):\n", - " file_path = os.path.join(self.data_dir, file)\n", - " self.process_file(file_path)\n", - "\n", - " def get_top_words(self):\n", - " \"\"\"获取前 N 高频词\"\"\"\n", - " return self.word_count.most_common(self.top_n)\n", - "\n", - " def save_results(self, top_words):\n", - " \"\"\"保存结果到文件\"\"\"\n", - " with open(self.output_file, 'w', encoding='utf-8') as f:\n", - " for word, count in top_words:\n", - " f.write(f\"{word}: {count}\\n\")\n", - "\n", - " def run(self):\n", - " \"\"\"执行词频统计并保存结果\"\"\"\n", - " self.process_directory()\n", - " top_words = self.get_top_words()\n", - " self.save_results(top_words)\n", - " for word, 
count in top_words:\n", - " print(f\"{word}: {count}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22f58992-0108-4c90-894d-e756e7301a5a", - "metadata": {}, - "outputs": [], - "source": [ - "# main.py\n", - "\n", - "from text_analyzer import TextAnalyzer\n", - "\n", - "def main():\n", - " analyzer = TextAnalyzer()\n", - " analyzer.run()\n", - "\n", - "if __name__ == '__main__':\n", - " main()" - ] - }, - { - "cell_type": "markdown", - "id": "18d27410-8923-4662-a6b7-8e027609506e", - "metadata": {}, - "source": [ - "## 改进分析\n", - "\n", - "- 模块化:将分析逻辑放入 text_analyzer.py,主程序 main.py 仅负责调用,符合工程化项目结构。\n", - "- 配置文件:通过 config.yaml 配置参数,增强灵活性,无需修改代码即可更改目录、输出文件等。\n", - "- 输出到文件:增加 save_results 方法,支持结果持久化。\n", - "- 工程质量提升:\n", - " - 可维护性:配置文件和模块化分离了配置与逻辑,修改配置无需动代码。 \n", - " - 复用性:模块可导入到其他项目,类可重复实例化。" - ] - }, - { - "cell_type": "markdown", - "id": "10876929-69f9-43bf-ba2d-a5d7bb11f22b", - "metadata": {}, - "source": [ - "### 封装的总节\n", - "\n", - "封装方法:\n", - "- 模块化:函数划分逻辑,降低耦合。\n", - "- 函数封装:将重复逻辑封装为函数,提高复用性。\n", - "- 类封装:将数据和方法绑定,增强代码组织性和扩展性。\n", - "- 文件封装:通过文件模块化,符合工程化开发规范。\n", - "\n", - "工程质量提升:\n", - "- 分离配置与逻辑,降低维护成本。\n", - "- 模块化和面向对象设计支持功能扩展。\n", - "- 错误处理提高程序鲁棒性。" - ] - }, - { - "cell_type": "raw", - "id": "60ba30d8-d8c2-4183-996e-376ff71716bf", - "metadata": {}, - "source": [ - "## 另外一种文件模块化设计(分层架构)示例\n", - "\n", - "将代码拆分为独立模块,每个模块仅负责单一职责:\n", - " - 数据读取层:遍历目录、读取文件内容\n", - " - 数据处理层:文本清洗、分词、统计词频\n", - " - 结果输出层:排序并输出前10高频词\n", - "\n", - "目录结构:\n", - "project/\n", - "├── data_loader.py # 数据读取模块\n", - "├── text_processor.py # 数据处理模块\n", - "├── output_handler.py # 结果输出模块\n", - "└── main.py # 主程序入口" - ] - }, - { - "cell_type": "markdown", - "id": "517759ac-c4cf-402e-86f1-a9fae0d88bbb", - "metadata": {}, - "source": [ - "# 第七部分:运行说明\n", - "\n", - "环境准备:\n", - "- 安装 Python 3.8+。\n", - "- 安装依赖:pip install jieba pyyaml。\n", - "- 准备 data 目录,放入 100 个 txt 文件。\n", - "- 创建 stop_words.txt 和 config.yaml。" - ] - }, - { - "cell_type": "markdown", - "id": "a7e1836b-42a1-45f9-bf8c-2e04a38744e4", - "metadata": {}, - "source": [ - "通过从无结构到结构化,再到面向对象和模块化的逐步优化,展示了结构化编程和封装方法如何显著提升代码工程质量。最终实现不仅满足了词频统计需求,还具备高可读性、可维护性、可扩展性和复用性,适合实际工程应用。" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/D Plus/.ipynb_checkpoints/00 封装-checkpoint.ipynb b/D Plus/01 封装.ipynb similarity index 81% rename from D Plus/.ipynb_checkpoints/00 封装-checkpoint.ipynb rename to D Plus/01 封装.ipynb index 4d255e0..f1e30bf 100644 --- a/D Plus/.ipynb_checkpoints/00 封装-checkpoint.ipynb +++ b/D Plus/01 封装.ipynb @@ -1,69 +1,17 @@ { "cells": [ - { - "cell_type": "raw", - "id": "69e76aa7-2c5d-4114-a302-85e17cc83e2c", - "metadata": {}, - "source": [ - "本文旨在通过一个案例(读取 data 目录下 100 篇小说文本,统计词频并输出前 10 高频词)来说明结构化编程和封装方法如何提升代码工程质量。\n", - "教案将逐步展示不同结构化方法和封装技术的应用,并分析其对代码可读性、可维护性、可扩展性和复用性的提升。" - ] - }, { "cell_type": "markdown", "id": "b9a9a366-7fd3-422b-b3bc-b0bc00374da6", "metadata": {}, "source": [ - "# 教学目标\n", - "- 掌握封装方法(函数、类、模块)在代码组织中的作用。" - ] - }, - { - "cell_type": "markdown", - "id": "1387e026-c978-4217-9015-ab0e047c01a0", - "metadata": {}, - "source": [ - "## 第一部分:基础实现(无结构化、无封装)" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "id": "33803186-d890-4cd7-9636-8920fcb86e14", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "files = os.listdir('data')\n", - "word_count = {}\n", - "for file in files:\n", - " with open('data/' + file, 'r', encoding='utf-8') as f:\n", - " text = f.read()\n", - " words = text.split() # 假设简单按空格分词\n", - " for word in words:\n", - " if word in word_count:\n", - " word_count[word] += 1\n", - " else:\n", - " word_count[word] = 1\n", - "\n", - "# 排序并输出前10\n", - "sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)\n", - "for i in range(10):\n", - " print(sorted_words[i])" - ] - }, - { - "cell_type": "markdown", - "id": "471351e7-8645-4690-973a-7d8de53bda5f", - "metadata": {}, - "source": [ - "### 问题分析\n", + "# 目标\n", + "- 理解封装方法(函数、类、模块)在代码组织中的作用。\n", "\n", - "- 可读性差:没有清晰的功能划分,代码逻辑混杂,难以阅读理解维护。\n", - "- 扩展性差:如果需要更改分词逻辑、文件路径或输出格式,需修改多处代码。\n", - "- 容错性差:未处理文件读取失败、空文件等问题。\n", - "- 复用性低:逻辑无法直接复用在其他类似任务中。" + "封装方法:\n", + "- 函数封装:将重复逻辑封装为函数,降低耦合,提高复用性。\n", + "- 类封装:将数据和方法绑定,增强代码组织性和扩展性。\n", + "- 文件封装:通过文件模块化,符合工程化开发规范。" ] }, { @@ -71,7 +19,7 @@ "id": "a5881283-c295-4433-8edd-f915201a5f43", "metadata": {}, "source": [ - "## 第二部分:引入函数封装\n", + "## A:函数封装\n", "\n", "提炼出若干函数,减少代码的复杂性,提高可读性和可维护性。" ] @@ -144,7 +92,7 @@ "### 改进分析\n", " - 逻辑分层:main() 函数清晰定义了程序执行步骤(读取文件 -> 分词 -> 统计 -> 输出)。\n", " - 模块化:将功能拆分为函数(read_file、get_words、count_words、get_top_n),提高代码复用性和可读性。\n", - " - 错误处理:增加 try-except 处理文件读取异常。\n", + " - 错误处理:增加 try-except 处理文件读取异常,提高程序健壮性。\n", " - 工程质量提升:\n", " - 可读性:函数命名本身就帮助理解代码,逻辑分块。\n", " - 可维护性:修改某部分功能(如分词逻辑)只需改对应函数。\n", @@ -156,7 +104,7 @@ "id": "50737966-57c9-4daf-ac3b-6a1c73b18136", "metadata": {}, "source": [ - "## 第三部分:引入类封装\n", + "## B:类封装\n", "\n", "通过类封装功能,进一步提高代码的模块化、可扩展性和复用性。" ] @@ -233,8 +181,12 @@ "- 面向对象封装:\n", " - 使用 TextAnalyzer 类将所有功能封装为一个对象,数据(如 word_count)和方法(如 tokenize)绑定在一起。\n", " - 通过 __init__ 提供配置(如 data_dir 和 top_n),提高灵活性。\n", + "\n", " \n", - "- 模块化:类方法分工明确(如 read_file、tokenize、process_file),便于扩展。\n", + "- 模块化:\n", + " - 类方法分工明确(如 read_file、tokenize、process_file),便于扩展。\n", + " \n", + " \n", "- 工程质量提升:\n", " - 可扩展性:可通过继承 TextAnalyzer 添加新功能(如支持其他分词器或文件格式)。\n", " - 复用性:类可实例化多次,用于不同目录或参数。\n", " - 可维护性:逻辑集中在类中,修改相对安全。" @@ -246,7 +198,7 @@ "id": "9b4e17c4-f47e-4245-b3d9-e40fde0a2e04", "metadata": {}, "source": [ - "# 第四部分:引入文件模块封装\n", + "# C:文件模块封装\n", "将代码进一步模块化到不同文件,引入配置文件和停用词过滤。" ] }, @@ -389,7 +341,7 @@ "- 输出到文件:增加 save_results 方法,支持结果持久化。\n", "- 工程质量提升:\n", " - 可维护性:配置文件和模块化分离了配置与逻辑,修改配置无需动代码。 \n", - " - 复用性:模块可导入到其他项目,类可重复实例化。" + " - 复用性:模块可导入到其他项目。" ] }, { @@ -397,15 +349,7 @@ "id": "10876929-69f9-43bf-ba2d-a5d7bb11f22b", "metadata": {}, "source": [ - "### 封装的总节\n", - "\n", - "封装方法:\n", - "- 模块化:函数划分逻辑,降低耦合。\n", - "- 函数封装:将重复逻辑封装为函数,提高复用性。\n", - "- 类封装:将数据和方法绑定,增强代码组织性和扩展性。\n", - "- 文件封装:通过文件模块化,符合工程化开发规范。\n", - "\n", - "工程质量提升:\n", + "### 工程质量总结\n", "- 分离配置与逻辑,降低维护成本。\n", "- 模块化和面向对象设计支持功能扩展。\n", "- 错误处理提高程序鲁棒性。" ] @@ -436,7 +380,7 @@ "id": "517759ac-c4cf-402e-86f1-a9fae0d88bbb", "metadata": {}, "source": [ - "# 第七部分:运行说明\n", + "# 运行说明\n", "\n", "环境准备:\n", "- 安装 Python 3.8+。\n", @@ -444,14 +388,6 @@ "- 准备 data 目录,放入 100 个 txt 文件。\n", "- 创建 stop_words.txt 和 config.yaml。" ] - }, - { - "cell_type": "markdown", - "id": "a7e1836b-42a1-45f9-bf8c-2e04a38744e4", - "metadata": {}, - "source": [ - "通过从无结构到结构化,再到面向对象和模块化的逐步优化,展示了结构化编程和封装方法如何显著提升代码工程质量。最终实现不仅满足了词频统计需求,还具备高可读性、可维护性、可扩展性和复用性,适合实际工程应用。" - ] } ], "metadata": { diff --git a/D Plus/01 简洁的语言特性.ipynb b/D Plus/02 利用语言特性.ipynb similarity index 99% rename from D Plus/01 简洁的语言特性.ipynb rename to
D Plus/02 利用语言特性.ipynb index 65b84ab..5398df7 100644 --- a/D Plus/01 简洁的语言特性.ipynb +++ b/D Plus/02 利用语言特性.ipynb @@ -5,7 +5,7 @@ "id": "86405617-889a-40c2-a895-7b51fb14b65d", "metadata": {}, "source": [ - "# 教学目标\n", + "# 目标\n", "\n", "- 在词频统计案例中引入装饰器和函数式编程 。\n", "- 分析这些特性和模式如何进一步优化代码质量(可读性、可维护性、可扩展性、复用性)。\n", diff --git a/D Plus/02 设计模式.ipynb b/D Plus/03 设计模式.ipynb similarity index 60% rename from D Plus/02 设计模式.ipynb rename to D Plus/03 设计模式.ipynb index 8f14f9e..1a6f3a9 100644 --- a/D Plus/02 设计模式.ipynb +++ b/D Plus/03 设计模式.ipynb @@ -5,7 +5,7 @@ "id": "eccfe49f-de35-4241-90e3-a7095940b61a", "metadata": {}, "source": [ - "设计模式提供高频重复出现需求的最佳解决方案。以下介绍适合词频统计案例的设计模式:策略模式、观察者模式、工厂模式。" + "设计模式提供高频重复出现的需求的最佳解决方案。以下介绍适合词频统计案例的设计模式:策略模式、观察者模式、工厂模式。" ] }, { @@ -221,176 +221,8 @@ }, { "cell_type": "markdown", - "id": "e5f2aef4-a055-43a9-917c-fa183de6db2d", + "id": "07158f09-703e-4abb-ac1a-881ba1b3b26d", "metadata": {}, - "source": [ - "## 综合实现(整合特性与模式)\n", - "\n", - "整合上下文管理器、生成器、策略模式和观察者模式的最终实现(部分代码展示)。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa7f34e2-d355-4a22-8572-729c49b18605", - "metadata": {}, - "outputs": [], - "source": [ - "# text_analyzer.py\n", - "\n", - "import os\n", - "import jieba\n", - "from collections import Counter\n", - "import yaml\n", - "from contextlib import contextmanager\n", - "from typing import List, Tuple\n", - "from abc import ABC, abstractmethod\n", - "\n", - "@contextmanager\n", - "def file_reader(file_path: str):\n", - " try:\n", - " with open(file_path, 'r', encoding='utf-8') as f:\n", - " yield f.read()\n", - " except Exception as e:\n", - " print(f\"Error reading {file_path}: {e}\")\n", - " yield \"\"\n", - "\n", - "class Tokenizer(ABC):\n", - " @abstractmethod\n", - " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", - " pass\n", - "\n", - "class JiebaTokenizer(Tokenizer):\n", - " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", - " for word in jieba.lcut(text):\n", - " if word not in stop_words:\n", - " yield word\n", - "\n", - "class SimpleTokenizer(Tokenizer):\n", - " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", - " for word in text.split():\n", - " if word not in stop_words:\n", - " yield word\n", - "\n", - "class TokenizerFactory:\n", - " @staticmethod\n", - " def create_tokenizer(name: str) -> Tokenizer:\n", - " return {'jieba': JiebaTokenizer(), 'simple': SimpleTokenizer()}.get(name, JiebaTokenizer())\n", - "\n", - "class OutputObserver(ABC):\n", - " @abstractmethod\n", - " def update(self, top_words: List[Tuple[str, int]]):\n", - " pass\n", - "\n", - "class ConsoleOutput(OutputObserver):\n", - " def update(self, top_words: List[Tuple[str, int]]):\n", - " for word, count in top_words:\n", - " print(f\"{word}: {count}\")\n", - "\n", - "class FileOutput(OutputObserver):\n", - " def __init__(self, output_file: str):\n", - " self.output_file = output_file\n", - " def update(self, top_words: List[Tuple[str, int]]):\n", - " with open(self.output_file, 'w', encoding='utf-8') as f:\n", - " for word, count in top_words:\n", - " f.write(f\"{word}: {count}\\n\")\n", - "\n", - "class TextAnalyzer:\n", - " def __init__(self, config_path='config.yaml'):\n", - " with open(config_path, 'r', encoding='utf-8') as f:\n", - " config = yaml.safe_load(f)\n", - " self.data_dir = config['data_dir']\n", - " self.top_n = config['top_n']\n", - " self.stop_words_file = config['stop_words_file']\n", - " self.output_file = config['output_file']\n", - " self.stop_words = self.load_stop_words()\n", - " 
self.word_count = Counter()\n", - " self.tokenizer = TokenizerFactory.create_tokenizer(config.get('tokenizer', 'jieba'))\n", - " self.observers = [ConsoleOutput(), FileOutput(self.output_file)]\n", - "\n", - " def load_stop_words(self) -> set:\n", - " with file_reader(self.stop_words_file) as content:\n", - " return set(line.strip() for line in content.splitlines() if line.strip())\n", - "\n", - " def process_file(self, file_path: str):\n", - " if file_path.endswith('.txt'):\n", - " with file_reader(file_path) as text:\n", - " words = self.tokenizer.tokenize(text, self.stop_words)\n", - " self.word_count.update(words)\n", - "\n", - " def process_directory(self):\n", - " for file in os.listdir(self.data_dir):\n", - " file_path = os.path.join(self.data_dir, file)\n", - " self.process_file(file_path)\n", - "\n", - " def get_top_words(self) -> List[Tuple[str, int]]:\n", - " return self.word_count.most_common(self.top_n)\n", - "\n", - " def notify_observers(self, top_words: List[Tuple[str, int]]):\n", - " for observer in self.observers:\n", - " observer.update(top_words)\n", - "\n", - " def run(self):\n", - " self.process_directory()\n", - " top_words = self.get_top_words()\n", - " self.notify_observers(top_words)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3d130312-b298-4c76-ae09-0fb4bd08b0c1", - "metadata": {}, - "outputs": [], - "source": [ - "# main.py\n", - "\n", - "from text_analyzer import TextAnalyzer\n", - "\n", - "def main():\n", - " analyzer = TextAnalyzer()\n", - " analyzer.run()\n", - "\n", - "if __name__ == '__main__':\n", - " main()" - ] - }, - { - "cell_type": "markdown", - "id": "770618c9-428e-454a-97de-00e3b49c9d03", - "metadata": {}, - "source": [ - "## 结论\n", - "\n", - "通过引入上下文管理器、生成器、元编程、策略模式、观察者模式和工厂模式,词频统计代码在可扩展性、可维护性和复用性上进一步提升。\n", - "这些特性和模式使代码更模块化、灵活,适合大型项目,同时保持清晰的工程结构。结合之前的装饰器和函数式编程,代码已达到工程化水平。\n", - "\n", - "若需深入,可以进一步考虑其它性能特性." 
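下文"附:元编程"用装饰器把分词器登记到类级字典,再按配置名动态查找。其机制可以用下面这个最小示意说明(假设性示例:TokenizerRegistry 与注册名 'simple' 均为演示用,非项目代码);要点是查找时必须带回退,否则配置名未注册时 .get 返回 None,调用处会直接报错。

```
# 最小示意:装饰器注册 + 带回退的动态查找(假设性示例)
from typing import Callable, Dict, List

class TokenizerRegistry:
    tokenizers: Dict[str, Callable] = {}  # 类级注册表:名字 -> 分词函数

    @classmethod
    def register(cls, name: str):
        def decorator(func):
            cls.tokenizers[name] = func
            return func
        return decorator

    def __init__(self, name: str = 'simple'):
        self.tokenizer_name = name

    def tokenize(self, text: str) -> List[str]:
        # 名字未注册时回退到默认实现,避免 .get 返回 None 导致调用崩溃
        tokenizer = self.tokenizers.get(self.tokenizer_name) or self.tokenizers['simple']
        return tokenizer(self, text)

@TokenizerRegistry.register('simple')
def simple_tokenizer(self, text: str) -> List[str]:
    return text.split()

if __name__ == '__main__':
    # 'unknown' 未注册,自动回退到 simple
    print(TokenizerRegistry('unknown').tokenize('hello world hello'))
```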
- ] - }, - { - "cell_type": "markdown", - "id": "cbeaa07d-272f-465b-a437-9c4b44827d23", - "metadata": {}, - "source": [ - "## 进一步练习\n", - "\n", - "实践练习:\n", - "- 实现新分词器(如 thulac)并通过策略模式或工厂模式集成。\n", - "- 添加新观察者(如 JSON 输出)。\n", - "\n", - "使用生成器实现流式词频统计,比较内存占用。\n", - "实现缓存机制,缓存已处理文件的分词结果。\n", - "\n", - "添加命令行接口(argparse),动态配置 top_n 和 tokenizer。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6a43b53d-1e07-4ebe-a6c8-104353fd5f7b", - "metadata": {}, - "outputs": [], "source": [ "## 附:元编程\n", "\n", @@ -424,7 +256,7 @@ " self.output_file = config['output_file']\n", " self.stop_words = self.load_stop_words()\n", " self.word_count = Counter()\n", - " self.tokenizer_name = config.get('tokenizer', 'jieba') # 从配置读取分词器\n", + " self.tokenizer_name = config.get('tokenizer', 'jieba')\n", "\n", " @classmethod\n", " def register_tokenizer(cls, name):\n", @@ -432,7 +264,7 @@ "\n", " def tokenize(self, text: str) -> List[str]:\n", " \"\"\"动态调用分词器\"\"\"\n", - " tokenizer = self.__class__.tokenizers.get(self.tokenizer_name, self.jieba_tokenizer)\n", + " tokenizer = self.__class__.tokenizers.get(self.tokenizer_name, self.__class__.tokenizers['jieba']) # 未注册的名字回退到默认分词器\n", " return tokenizer(self, text)\n", "\n", " @register_tokenizer('jieba')\n", @@ -449,23 +281,19 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "2249f13a-7a3f-4376-ba2a-d92f11658d32", + "cell_type": "markdown", + "id": "30ba75ea-f769-4f90-9075-27670db9ada4", "metadata": {}, - "outputs": [], "source": [ "### 分析\n", "\n", - "功能:通过元类和装饰器动态注册分词器,支持配置切换(如 jieba 或 simple)。\n", - "\n", "工程质量提升:\n", - " 可扩展性:新分词器只需添加新方法并注册,无需修改核心逻辑。\n", - " 灵活性:通过配置文件动态选择分词器。\n", + "- 可扩展性:新分词器只需添加新方法并注册,无需修改核心部分。\n", + "- 灵活性:通过配置文件动态选择分词器。\n", "\n", "适用场景:适合需要动态配置或插件化系统的场景。\n", "\n", - "局限性:元编程增加代码复杂性,可能降低可读性,需谨慎使用。" + "局限性:元编程增加代码复杂性,需要团队整体技术能力支持。" ] } ], diff --git a/D Plus/99 工业级代码.ipynb b/D Plus/99 工业级代码.ipynb new file mode 100644 index 0000000..b88eaf7 --- /dev/null +++ b/D Plus/99 工业级代码.ipynb @@ -0,0 +1,665 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "b6bc2a3c-3b15-4bc5-83a2-adeae3b7b4d0", + "metadata": {}, + "outputs": [], + "source": [ + "## 项目结构\n", + "\n", + "word_frequency_project/\n", + "│\n", + "├── data/ # 小说文本存放目录\n", + "│ ├── novel1.txt\n", + "│ ├── novel2.txt\n", + "│ └── ...\n", + "├── src/ # 源代码目录\n", + "│ ├── __init__.py\n", + "│ ├── config.py # 配置文件\n", + "│ ├── data_loader.py # 数据加载模块\n", + "│ ├── preprocessor.py # 文本预处理模块\n", + "│ ├── word_counter.py # 词频统计模块\n", + "│ ├── output_formatter.py # 输出格式化模块\n", + "│ └── main.py # 主程序入口\n", + "├── tests/ # 单元测试目录\n", + "│ ├── __init__.py\n", + "│ ├── test_data_loader.py\n", + "│ ├── test_preprocessor.py\n", + "│ ├── test_word_counter.py\n", + "│ └── test_output_formatter.py\n", + "├── requirements.txt # 依赖文件\n", + "└── README.md # 项目说明" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0b55f2e-24ba-49da-8d11-f0f5eea611b0", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "src/config.py\n", + "定义配置项,便于扩展和修改。\n", + "'''\n", + "\n", + "import os\n", + "\n", + "class Config:\n", + " DATA_DIR = \"data\"\n", + " TOP_N_WORDS = 10\n", + " STOP_WORDS = {\"的\", \"了\", \"是\", \"在\", \"和\", \"我\", \"你\", \"他\", \"她\"} # 示例停用词\n", + " ENCODING = \"utf-8\"\n", + " LOG_LEVEL = \"INFO\"\n", + "\n", + " @classmethod\n", + " def get_data_dir(cls):\n", + " return os.path.join(os.path.dirname(__file__), \"..\", cls.DATA_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5bdcdf0-16a2-4dda-85f1-d018c6370aee", + "metadata": {}, +
"outputs": [], + "source": [ + "'''\n", + "src/data_loader.py\n", + "负责加载小说文本,支持目录扫描和文件读取,提供扩展点以支持不同格式。\n", + "'''\n", + "\n", + "import os\n", + "import logging\n", + "from src.config import Config\n", + "\n", + "class DataLoader:\n", + " def __init__(self):\n", + " self.data_dir = Config.get_data_dir()\n", + " logging.basicConfig(level=Config.LOG_LEVEL)\n", + " self.logger = logging.getLogger(__name__)\n", + "\n", + " def load_texts(self):\n", + " \"\"\"加载 data 目录下的所有文本文件\"\"\"\n", + " texts = []\n", + " try:\n", + " for filename in os.listdir(self.data_dir):\n", + " if filename.endswith(\".txt\"):\n", + " file_path = os.path.join(self.data_dir, filename)\n", + " with open(file_path, \"r\", encoding=Config.ENCODING) as f:\n", + " texts.append(f.read())\n", + " self.logger.info(f\"Loaded file: {filename}\")\n", + " if not texts:\n", + " self.logger.warning(\"No text files found in data directory\")\n", + " return texts\n", + " except Exception as e:\n", + " self.logger.error(f\"Error loading files: {str(e)}\")\n", + " raise" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "786e7ffa-82bc-46b9-8ffc-444d6796b87b", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "src/preprocessor.py\n", + "文本预处理模块,负责分词和清理,支持扩展以添加更多预处理逻辑。\n", + "'''\n", + "\n", + "import jieba\n", + "import re\n", + "from src.config import Config\n", + "\n", + "def timing_decorator(func):\n", + " \"\"\"装饰器:记录方法执行时间\"\"\"\n", + " import time\n", + " def wrapper(*args, **kwargs):\n", + " start = time.time()\n", + " result = func(*args, **kwargs)\n", + " end = time.time()\n", + " print(f\"{func.__name__} took {end - start:.2f} seconds\")\n", + " return result\n", + " return wrapper\n", + "\n", + "class TextPreprocessor:\n", + " def __init__(self):\n", + " self.stop_words = Config.STOP_WORDS\n", + "\n", + " @timing_decorator\n", + " def preprocess(self, text):\n", + " \"\"\"预处理:分词、去除停用词和非中文字符\"\"\"\n", + " # 移除非中文字符\n", + " text = re.sub(r\"[^\\u4e00-\\u9fff]\", \" \", text)\n", + " # 分词\n", + " words = jieba.cut(text)\n", + " # 过滤停用词和空字符\n", + " return [word for word in words if word.strip() and word not in self.stop_words]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4edd5ca7-4ba7-4446-b93e-2cfd83efca2e", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "src/word_counter.py\n", + "词频统计模块,使用单例模式确保全局唯一计数器。\n", + "'''\n", + "\n", + "from collections import Counter\n", + "from typing import List, Dict\n", + "\n", + "class Singleton: 为啥需要单例?\n", + " \"\"\"单例模式装饰器\"\"\"\n", + " def __init__(self, cls):\n", + " self._cls = cls\n", + " self._instance = None\n", + "\n", + " def __call__(self, *args, **kwargs):\n", + " if self._instance is None:\n", + " self._instance = self._cls(*args, **kwargs)\n", + " return self._instance\n", + "\n", + "@Singleton\n", + "class WordCounter:\n", + " def __init__(self):\n", + " self.counter = Counter()\n", + "\n", + " def count_words(self, words: List[str]) -> None:\n", + " \"\"\"更新词频统计\"\"\"\n", + " self.counter.update(words)\n", + "\n", + " def get_top_n(self, n: int = 10) -> Dict[str, int]:\n", + " \"\"\"获取前 N 个高频词\"\"\"\n", + " return dict(self.counter.most_common(n))\n", + "\n", + " def reset(self):\n", + " \"\"\"重置计数器\"\"\"\n", + " self.counter.clear()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41af3e0e-3153-4d23-9a9f-65b566b384e8", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "src/output_formatter.py\n", + "输出格式化模块,支持多种输出格式,便于扩展。\n", + "'''\n", + "\n", + "from typing import 
Dict\n", + "\n", + "class OutputFormatter:\n", + " @staticmethod\n", + " def format_json(data: Dict[str, int]) -> str:\n", + " import json\n", + " return json.dumps(data, ensure_ascii=False, indent=2)\n", + "\n", + " @staticmethod\n", + " def format_text(data: Dict[str, int]) -> str:\n", + " return \"\\n\".join(f\"{word}: {count}\" for word, count in data.items())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6596162c-fd42-4b32-b328-9987568b3846", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "src/main.py\n", + "主程序入口,协调各模块工作。\n", + "'''\n", + "\n", + "from src.data_loader import DataLoader\n", + "from src.preprocessor import TextPreprocessor\n", + "from src.word_counter import WordCounter\n", + "from src.output_formatter import OutputFormatter\n", + "from src.config import Config\n", + "\n", + "def main():\n", + " # 初始化模块\n", + " loader = DataLoader()\n", + " preprocessor = TextPreprocessor()\n", + " counter = WordCounter()\n", + " formatter = OutputFormatter()\n", + "\n", + " # 加载文本\n", + " texts = loader.load_texts()\n", + "\n", + " # 预处理并统计词频\n", + " for text in texts:\n", + " words = preprocessor.preprocess(text)\n", + " counter.count_words(words)\n", + "\n", + " # 获取结果\n", + " top_words = counter.get_top_n(Config.TOP_N_WORDS)\n", + "\n", + " # 输出结果\n", + " print(\"=== Top 10 Words (Text Format) ===\")\n", + " print(formatter.format_text(top_words))\n", + " print(\"\\n=== Top 10 Words (JSON Format) ===\")\n", + " print(formatter.format_json(top_words))\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36a32f17-5ce3-46e2-a563-f151454f6342", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "tests/test_data_loader.py\n", + "单元测试示例,确保数据加载模块的正确性。\n", + "'''\n", + "\n", + "import unittest\n", + "import os\n", + "from src.data_loader import DataLoader\n", + "from src.config import Config\n", + "\n", + "class TestDataLoader(unittest.TestCase):\n", + " def setUp(self):\n", + " self.loader = DataLoader()\n", + " # 创建临时测试文件\n", + " self.test_file = os.path.join(Config.get_data_dir(), \"test_novel.txt\")\n", + " with open(self.test_file, \"w\", encoding=Config.ENCODING) as f:\n", + " f.write(\"这是一个测试文本\")\n", + "\n", + " def test_load_texts(self):\n", + " texts = self.loader.load_texts()\n", + " self.assertGreater(len(texts), 0)\n", + " self.assertIn(\"这是一个测试文本\", texts)\n", + "\n", + " def tearDown(self):\n", + " if os.path.exists(self.test_file):\n", + " os.remove(self.test_file)\n", + "\n", + "if __name__ == \"__main__\":\n", + " unittest.main()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f550544-f0f4-4f0c-bdb7-9928b6820bdf", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "tests/test_preprocessor.py\n", + "测试文本预处理模块。\n", + "'''\n", + "\n", + "import unittest\n", + "from src.preprocessor import TextPreprocessor\n", + "\n", + "class TestTextPreprocessor(unittest.TestCase):\n", + " def setUp(self):\n", + " self.preprocessor = TextPreprocessor()\n", + "\n", + " def test_preprocess(self):\n", + " text = \"这是一个测试文本,包含了123和一些符号!\"\n", + " words = self.preprocessor.preprocess(text)\n", + " expected = [\"测试\", \"文本\", \"包含\", \"一些\", \"符号\"]\n", + " self.assertEqual(words, expected)\n", + "\n", + "if __name__ == \"__main__\":\n", + " unittest.main()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8fb8b4cd-0b27-426a-9556-8f21227c5374", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + 
"tests/test_word_counter.py\n", + "测试词频统计模块。\n", + "'''\n", + "import unittest\n", + "from src.word_counter import WordCounter\n", + "\n", + "class TestWordCounter(unittest.TestCase):\n", + " def setUp(self):\n", + " self.counter = WordCounter()\n", + "\n", + " def test_count_words(self):\n", + " self.counter.count_words([\"测试\", \"文本\", \"测试\"])\n", + " result = self.counter.get_top_n(2)\n", + " expected = {\"测试\": 2, \"文本\": 1}\n", + " self.assertEqual(result, expected)\n", + "\n", + " def test_reset(self):\n", + " self.counter.count_words([\"测试\"])\n", + " self.counter.reset()\n", + " self.assertEqual(self.counter.get_top_n(1), {})\n", + "\n", + "if __name__ == \"__main__\":\n", + " unittest.main()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b7507dc-b693-4dbf-9a21-5f2833d13d0e", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "requirements.txt\n", + "列出项目依赖。\n", + "''''\n", + "jieba==0.42.1" + ] + }, + { + "cell_type": "markdown", + "id": "573c4ddd-800e-4b59-9e20-a87d6a2b14cd", + "metadata": {}, + "source": [ + "'''\n", + "README.md\n", + "提供项目说明和使用方法。\n", + "'''\n", + "# Word Frequency Analysis Project\n", + "\n", + "## Overview\n", + "This project processes 100 novels in the `data` directory, counts word frequencies, and outputs the top 10 words. It demonstrates software engineering principles like modularity, design patterns, and unit testing.\n", + "\n", + "## Setup\n", + "1. Install dependencies: `pip install -r requirements.txt`\n", + "2. Place novel files (.txt) in the `data` directory.\n", + "3. Run the program: `python src/main.py`\n", + "\n", + "## Testing\n", + "Run tests: `python -m unittest discover tests`\n", + "\n", + "## Extensibility\n", + "- Add new preprocessors in `preprocessor.py`.\n", + "- Support new output formats in `output_formatter.py`.\n", + "- Modify configurations in `config.py`." 
+ ] + }, + { + "cell_type": "markdown", + "id": "4bd74972-f9c4-4ac9-a557-de4198889047", + "metadata": {}, + "source": [ + "## 使用方法\n", + "\n", + "准备环境:\n", + "pip install -r requirements.txt\n", + "\n", + "准备数据:\n", + "- 在 data 目录下放入 100 个 .txt 小说文件(需为 UTF-8 编码)。\n", + "- 确保安装 jieba 分词库。\n", + "\n", + "运行程序:\n", + "python src/main.py\n", + "\n", + "运行测试:\n", + "python -m unittest discover tests" + ] + }, + { + "cell_type": "markdown", + "id": "16f7a973-7c49-4d11-ab3f-457d4622e5e6", + "metadata": {}, + "source": [ + "## 扩展建议\n", + "\n", + "- 支持多语言:在 TextPreprocessor 中添加英文分词(如使用 nltk 或 spacy)。\n", + "- 数据库存储:将词频结果保存到数据库(如 SQLite),在 WordCounter 中添加存储方法。\n", + "- 并行处理:使用 multiprocessing 加速大文件处理。\n", + "- 可视化:在 OutputFormatter 中添加图表输出(如使用 matplotlib)。\n", + "- 配置文件:将 Config 改为从外部 JSON/YAML 文件加载。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2ad0efb-4c7c-4f98-a809-ce6cdcefdb34", + "metadata": {}, + "outputs": [], + "source": [ + "## 设计说明\n", + "\n", + "模块化设计:\n", + "- 各模块(DataLoader, TextPreprocessor, WordCounter, OutputFormatter)职责单一,符合单一职责原则(SRP)。\n", + "- 模块间通过明确接口交互,易于替换或扩展。\n", + "\n", + "设计模式:\n", + "- 单例模式:WordCounter 使用单例模式,确保全局唯一计数器。\n", + "- 策略模式:OutputFormatter 支持多种输出格式(JSON、Text),易于添加新格式。\n", + "- 装饰器模式:timing_decorator 用于性能监控,便于扩展其他功能(如日志记录)。\n", + "\n", + "可扩展性:\n", + "- Config 类集中管理配置,便于调整参数(如停用词、输出数量)。\n", + "- DataLoader 支持动态扫描目录,新增文件无需改动代码。\n", + "- TextPreprocessor 可扩展以支持其他分词工具或预处理规则。\n", + "\n", + "单元测试:\n", + "- 每个模块都有对应的测试用例,确保功能正确性。\n", + "- 使用 unittest 框架,支持持续集成。\n", + "\n", + "语言特性利用:\n", + "- 使用 Python 的装饰器(timing_decorator)记录方法执行时间。\n", + "- 利用类型注解(typing 模块)提高代码可读性。\n", + "- 异常处理和日志记录(logging)增强鲁棒性。\n", + "\n", + "教学用途:\n", + "- 包含常见工程化实践:模块化、测试驱动开发、配置管理。\n", + "- 提供扩展点(如支持英文分词、数据库存储),便于学生实践。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1aac488-3a98-418c-8201-e7f77c392a1f", + "metadata": {}, + "outputs": [], + "source": [ + "# text_analyzer.py\n", + "\n", + "import os\n", + "import jieba\n", + "from collections import Counter\n", + "import yaml\n", + "from contextlib import contextmanager\n", + "from typing import List, Tuple\n", + "from abc import ABC, abstractmethod\n", + "\n", + "@contextmanager\n", + "def file_reader(file_path: str):\n", + " try:\n", + " with open(file_path, 'r', encoding='utf-8') as f:\n", + " yield f.read()\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " yield \"\"\n", + "\n", + "class Tokenizer(ABC):\n", + " @abstractmethod\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " pass\n", + "\n", + "class JiebaTokenizer(Tokenizer):\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " for word in jieba.lcut(text):\n", + " if word not in stop_words:\n", + " yield word\n", + "\n", + "class SimpleTokenizer(Tokenizer):\n", + " def tokenize(self, text: str, stop_words: set) -> List[str]:\n", + " for word in text.split():\n", + " if word not in stop_words:\n", + " yield word\n", + "\n", + "class TokenizerFactory:\n", + " @staticmethod\n", + " def create_tokenizer(name: str) -> Tokenizer:\n", + " return {'jieba': JiebaTokenizer(), 'simple': SimpleTokenizer()}.get(name, JiebaTokenizer())\n", + "\n", + "class OutputObserver(ABC):\n", + " @abstractmethod\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " pass\n", + "\n", + "class ConsoleOutput(OutputObserver):\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " for word, count in top_words:\n", + " print(f\"{word}: {count}\")\n", + 
"\n", + "class FileOutput(OutputObserver):\n", + " def __init__(self, output_file: str):\n", + " self.output_file = output_file\n", + " def update(self, top_words: List[Tuple[str, int]]):\n", + " with open(self.output_file, 'w', encoding='utf-8') as f:\n", + " for word, count in top_words:\n", + " f.write(f\"{word}: {count}\\n\")\n", + "\n", + "class TextAnalyzer:\n", + " def __init__(self, config_path='config.yaml'):\n", + " with open(config_path, 'r', encoding='utf-8') as f:\n", + " config = yaml.safe_load(f)\n", + " self.data_dir = config['data_dir']\n", + " self.top_n = config['top_n']\n", + " self.stop_words_file = config['stop_words_file']\n", + " self.output_file = config['output_file']\n", + " self.stop_words = self.load_stop_words()\n", + " self.word_count = Counter()\n", + " self.tokenizer = TokenizerFactory.create_tokenizer(config.get('tokenizer', 'jieba'))\n", + " self.observers = [ConsoleOutput(), FileOutput(self.output_file)]\n", + "\n", + " def load_stop_words(self) -> set:\n", + " with file_reader(self.stop_words_file) as content:\n", + " return set(line.strip() for line in content.splitlines() if line.strip())\n", + "\n", + " def process_file(self, file_path: str):\n", + " if file_path.endswith('.txt'):\n", + " with file_reader(file_path) as text:\n", + " words = self.tokenizer.tokenize(text, self.stop_words)\n", + " self.word_count.update(words)\n", + "\n", + " def process_directory(self):\n", + " for file in os.listdir(self.data_dir):\n", + " file_path = os.path.join(self.data_dir, file)\n", + " self.process_file(file_path)\n", + "\n", + " def get_top_words(self) -> List[Tuple[str, int]]:\n", + " return self.word_count.most_common(self.top_n)\n", + "\n", + " def notify_observers(self, top_words: List[Tuple[str, int]]):\n", + " for observer in self.observers:\n", + " observer.update(top_words)\n", + "\n", + " def run(self):\n", + " self.process_directory()\n", + " top_words = self.get_top_words()\n", + " self.notify_observers(top_words)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5c689f4-e363-4327-9dc4-15c7157d4288", + "metadata": {}, + "outputs": [], + "source": [ + "# main.py\n", + "\n", + "from text_analyzer import TextAnalyzer\n", + "\n", + "def main():\n", + " analyzer = TextAnalyzer()\n", + " analyzer.run()\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc1d9fb1-3bb5-4f71-aeb3-e304511f4785", + "metadata": {}, + "outputs": [], + "source": [ + "## 结论\n", + "\n", + "通过引入上下文管理器、生成器、元编程、策略模式、观察者模式和工厂模式,词频统计代码在可扩展性、可维护性和复用性上进一步提升。\n", + "这些特性和模式使代码更模块化、灵活,适合大型项目,同时保持清晰的工程结构。结合之前的装饰器和函数式编程,代码已达到工程化水平。\n", + "\n", + "若需深入,可以进一步考虑其它性能特性." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7244afd0-4405-402a-b9be-75f5d7ff883c", + "metadata": {}, + "outputs": [], + "source": [ + "## 进一步练习\n", + "\n", + "实践练习:\n", + "- 实现新分词器(如 thulac)并通过策略模式或工厂模式集成。\n", + "- 添加新观察者(如 JSON 输出)。\n", + "\n", + "使用生成器实现流式词频统计,比较内存占用。\n", + "实现缓存机制,缓存已处理文件的分词结果。\n", + "\n", + "添加命令行接口(argparse),动态配置 top_n 和 tokenizer。" + ] + }, + { + "cell_type": "markdown", + "id": "09c10307-f162-4b36-85b6-6bc01d0001e0", + "metadata": {}, + "source": [ + "## 综合实现(整合特性与模式)\n", + "\n", + "整合上下文管理器、生成器、策略模式和观察者模式的最终实现(部分代码展示)。" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/D Plus/readme.MD b/D Plus/readme.MD new file mode 100644 index 0000000..f0f231d --- /dev/null +++ b/D Plus/readme.MD @@ -0,0 +1,34 @@ +本文旨在通过一个案例(读取 data 目录下 100 篇小说文本,统计词频并输出前 10 高频词)来说明如何提升代码工程质量。 +教案将逐步展示不同编程技术的应用,并分析其对代码可读性、可维护性、可扩展性和复用性的提升。 + +本案例不做性能提升方面的考量。 + + +## 起点:基础实现 + +``` +import os + +files = os.listdir('data') +word_count = {} +for file in files: + with open('data/' + file, 'r', encoding='utf-8') as f: + text = f.read() + words = text.split() # 假设简单按空格分词 + for word in words: + if word in word_count: + word_count[word] += 1 + else: + word_count[word] = 1 + +# 排序并输出前10 +sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True) +for i in range(10): + print(sorted_words[i]) +``` + +## 问题分析 +- 可读性差:没有清晰的功能划分,代码逻辑混杂,难以阅读理解维护。 +- 扩展性差:如果需要更改分词逻辑、文件路径或输出格式,需修改多处代码。 +- 容错性差:未处理文件读取失败、空文件等问题。 +- 复用性低:逻辑无法直接复用在其他类似任务中。 \ No newline at end of file diff --git a/D Plus/测试驱动开发.ipynb b/D Plus/测试驱动开发.ipynb new file mode 100644 index 0000000..2f3ac38 --- /dev/null +++ b/D Plus/测试驱动开发.ipynb @@ -0,0 +1,35 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "13037781-7175-4a52-9d26-6c7d9f068b5f", + "metadata": {}, + "outputs": [], + "source": [ + "单元测试、集成测试、性能测试" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/readme.MD b/readme.MD index aef69c2..ba82ebe 100644 --- a/readme.MD +++ b/readme.MD @@ -1,35 +1,14 @@ - ## 代码为啥要这样写,我要这样写代码 A 代码模式 -用一个简单任务,展示各种需求(完成任务简单、可读性强、可复用高、维护成本低等)下的代码写法 - +用一个简单任务,展示各种软件工程需求(完成任务简单、可读性强、可复用高、维护成本低等)下的代码写法 B 面向对象设计模式 用一个业务场景复现面向对象的经典设计模式 - C 高性能模式 -考虑执行时间快,内存占用少的一些办法 - -D plus -问题同 A ,从构建工业级的代码目标出发,分三个层面用多种方式做了优化 - -''' -可能的动机 - -【 效率 】 -- 执行快 -- 内存占用少 - -【 软件工程 】 -- 可读性强 -- 可复用高 -- 类型安全 -- 单元测试方便 - -【可靠性】 -- 并发、线程安全 +考虑执行时间快,资源占用少的一些思路、办法和结论 -''' \ No newline at end of file +D 制造工业级代码 +问题同 A ,以构建工业级的代码为目标,用多种方式做了优化提升演示 \ No newline at end of file
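附:新增的 D Plus/测试驱动开发.ipynb 目前只有"单元测试、集成测试、性能测试"一个占位单元。作为该教案的起点,这里给出一个最小的"测试先行"示意(假设性示例,被测函数 top_n_words 仅为演示):先写出期望行为的测试,再实现函数使其通过。

```
# 测试驱动开发最小示意(假设性示例):先写测试,再实现 top_n_words
import unittest
from collections import Counter
from typing import List, Tuple

def top_n_words(words: List[str], n: int = 10) -> List[Tuple[str, int]]:
    """按词频从高到低返回前 n 个 (词, 次数)"""
    return Counter(words).most_common(n)

class TestTopNWords(unittest.TestCase):
    def test_orders_by_frequency(self):
        self.assertEqual(top_n_words(['a', 'b', 'a'], 2), [('a', 2), ('b', 1)])

    def test_handles_empty_input(self):
        self.assertEqual(top_n_words([], 3), [])

if __name__ == '__main__':
    unittest.main()
```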