0521-3

2 months ago · a270414ff0
parent 8e7c4a3117
commit a270414ff0
82 changed files with 253 additions and 67 deletions
--- a/跑起来了/.ipynb_checkpoints/3
+++ b/跑起来了/.ipynb_checkpoints/3
@ -1,15 +0,0 @@
-import re
-import collections
-from cppy.cp_util import stopwordfilepath, testfilepath
-
-stopwords = set(open(stopwordfilepath, encoding='utf8').read().split(','))
-words = re.findall('[a-z]{2,}',
-                   open(testfilepath, encoding='utf8').read().lower())
-counts = collections.Counter(w for w in words if w not in stopwords)
-for (w, c) in counts.most_common(10):
-    print(w, '-', c)
-    
-'''
-熟练的软件工程师，会如此简单完成任务
-后面的例子，我们必须变的啰嗦一些，不能用这种太 hacker 的写法
-'''
--- a/最基础的写法.py
+++ b/最基础的写法.py
@ -51,6 +51,5 @@ for tf in word_freqs[:10]:


 '''
-想到哪里写到哪里
-用的最基础的编程思想，没有使用 Python 高级语法特性、数据结构和算法
+想到哪里写到哪里，只会Python语言最基础的语法，C语言编程风格 。
 '''
--- a/使用一些函数.py
+++ b/使用一些函数.py
@ -25,7 +25,6 @@ for word, freq in word_freqs.most_common(10):
    print(f"{word}-{freq}")
    
 '''
-相比 A01
 使用collections.Counter来计数单词频率，从而简化了代码并提高了效率。
 使用enumerate来获取行号和行内容，使用set来存储停用词，都有助于提高代码的性能和可读性。
 使用most_common方法来获取最常见的单词，使输出更为简洁。
--- a/动机与模式/10
+++ b/动机与模式/10
@ -12,4 +12,5 @@ for (w, c) in counts.most_common(10):
 '''
 熟练的软件工程师，会如此简单完成任务
 后面的例子，我们必须变的啰嗦一些，不能用这种太 hacker 的写法
+我们假设是要解决一个相对复杂的问题
 '''
--- a/封装/对象封装/2
+++ b/封装/对象封装/2
@ -1,5 +1,8 @@
 from cppy.cp_util import *

+#
+# 简单使用字典，也能满足对象的一些应用场景。而且更省资源
+#

 def extract_words(obj, path_to_file):
    """
@ -49,4 +52,4 @@ if __name__ == '__main__':

    # 获取排序后的单词频率并打印
    word_freqs = word_freqs_obj['sorted']()
-    print_word_freqs(word_freqs)
+    print_word_freqs(word_freqs)
--- a/对象接口/tf-14A.py
+++ b/对象接口/tf-14A.py
--- a/对象接口/tf-14B.py
+++ b/对象接口/tf-14B.py
--- a/封装/管道封装/1
+++ b/封装/管道封装/1
--- a/封装/管道封装/2
+++ b/封装/管道封装/2
--- a/封装/管道封装/3
+++ b/封装/管道封装/3
--- a/封装/管道封装/4
+++ b/封装/管道封装/4
--- a/封装/管道封装/5
+++ b/封装/管道封装/5
--- a/动态读写PyObject/1
+++ b/动态读写PyObject/1
--- a/动态读写PyObject/2
+++ b/动态读写PyObject/2
--- a/动态读写PyObject/2
+++ b/动态读写PyObject/2
--- a/动态读写PyObject/2
+++ b/动态读写PyObject/2
--- a/反射/反射_函数.py
+++ b/反射/反射_函数.py
--- a/动态读写PyObject/3
+++ b/动态读写PyObject/3
--- a/复用/函数调用复用.py
+++ b/复用/函数调用复用.py
--- a/复用/对象复用.py
+++ b/复用/对象复用.py
--- a/动机与模式/13
+++ b/动机与模式/13
--- a/动机与模式/13
+++ b/动机与模式/13
--- a/动机与模式/13
+++ b/动机与模式/13
--- a/动机与模式/13
+++ b/动机与模式/13
--- a/观察者/readme.MD
+++ b/观察者/readme.MD
--- a/只有消息接口.py
+++ b/只有消息接口.py
--- a/动机与模式/13
+++ b/动机与模式/13
--- a/动机与模式/13
+++ b/动机与模式/13
--- a/动机与模式/13
+++ b/动机与模式/13
--- a/微服务/client_app.py
+++ b/微服务/client_app.py
--- a/微服务/counter_service.py
+++ b/微服务/counter_service.py
--- a/微服务/sorter_service.py
+++ b/微服务/sorter_service.py
--- a/微服务/tokenizer_service.py
+++ b/微服务/tokenizer_service.py
--- a/动机与模式/13
+++ b/动机与模式/13
--- a/动机与模式/13
+++ b/动机与模式/13
--- a/插件/plugins-src/buildingPyc.py
+++ b/插件/plugins-src/buildingPyc.py
--- a/插件/plugins-src/f1.py
+++ b/插件/plugins-src/f1.py
--- a/插件/plugins-src/f2.py
+++ b/插件/plugins-src/f2.py
--- a/插件/plugins/f1.pyc
+++ b/插件/plugins/f1.pyc
--- a/插件/plugins/f2.pyc
+++ b/插件/plugins/f2.pyc
--- a/类型申明/参数类型申明.py
+++ b/类型申明/参数类型申明.py
--- a/多计算单元/数据共享/1
+++ b/多计算单元/数据共享/1
--- a/多计算单元/数据共享/2
+++ b/多计算单元/数据共享/2
--- a/多计算单元/数据共享/2
+++ b/多计算单元/数据共享/2
--- a/多计算单元/数据共享/3
+++ b/多计算单元/数据共享/3
--- a/多计算单元/数据共享/3
+++ b/多计算单元/数据共享/3
--- a/多计算单元/数据共享/3
+++ b/多计算单元/数据共享/3
--- a/多计算单元/数据共享/3
+++ b/多计算单元/数据共享/3
--- a/多计算单元/数据共享/3
+++ b/多计算单元/数据共享/3
--- a/多计算单元/数据共享/4
+++ b/多计算单元/数据共享/4
--- a/多计算单元/数据共享/4
+++ b/多计算单元/数据共享/4
--- a/多计算单元/数据分包/1
+++ b/多计算单元/数据分包/1
--- a/多计算单元/数据分包/2
+++ b/多计算单元/数据分包/2
--- a/多计算单元/数据分包/3
+++ b/多计算单元/数据分包/3
--- a/多计算单元/数据分包/4
+++ b/多计算单元/数据分包/4
--- a/终端/终端命令行/command_line_1.py
+++ b/终端/终端命令行/command_line_1.py
--- a/终端/终端命令行/command_line_2.py
+++ b/终端/终端命令行/command_line_2.py
--- a/终端/终端菜单/terminal_menu.py
+++ b/终端/终端菜单/terminal_menu.py
--- a/终端/终端菜单/test.txt
+++ b/终端/终端菜单/test.txt
--- a/动机与模式/15
+++ b/动机与模式/15
--- a/动机与模式/15
+++ b/动机与模式/15
--- a/动机与模式/15
+++ b/动机与模式/15
--- a/动机与模式/15
+++ b/动机与模式/15
--- a/Web/MVC/templates/index.html
+++ b/Web/MVC/templates/index.html
--- a/Web/simpleWeb/app.py
+++ b/Web/simpleWeb/app.py
--- a/Web/simpleWeb/templates/index.html
+++ b/Web/simpleWeb/templates/index.html
--- a/Web/simpleWeb/templates/result.html
+++ b/Web/simpleWeb/templates/result.html
--- a/软件不能挂掉.py
+++ b/软件不能挂掉.py
@ -2,7 +2,7 @@ import re, operator, string
 from cppy.cp_util import *

 #
-# The functions
+# 遇到异常，给出一个能让程序继续运行下去的默认值
 #
 def extract_words(path_to_file):
    try:
--- a/时间停止在那一刻.py
+++ b/时间停止在那一刻.py
@ -1,7 +1,9 @@
 from collections import Counter
 from cppy.cp_util import *

-
+#
+# 遇到异常，退出程序
+#
 def extract_words(path_to_file):
    assert(type(path_to_file) is str), "Must be a string!" 
    assert(path_to_file), "Must be a non-empty string!" 
--- a/预判可能的错误.py
+++ b/预判可能的错误.py
@ -1,6 +1,8 @@
 from cppy.cp_util import *

-
+#
+# 用断言从事发点给出出错的准确信息
+#
 def extractWords(path_to_file):
    assert(type(path_to_file) is str), "Must be a string" 
    assert(path_to_file), "Must be a non-empty string"         
--- a/动机与模式/17
+++ b/动机与模式/17
@ -2,6 +2,9 @@
 import cppy.cp_util as util
 from collections import Counter

+# 古老的编码风格：使用状态机来计算词频
+# 这种方法在Python中并不常见，但它展示了如何使用状态机来管理程序的状态和流程    
+

 class WordFrequencyStateMachine:
    def __init__(self, file_path):
--- a/动机与模式/17
+++ b/动机与模式/17
@ -3,6 +3,9 @@ from dataclasses import dataclass
 from collections import Counter
 import re

+# 对象属性是现代 Python编程喜欢的风格
+# 这里使用了dataclass来简化代码 
+
@dataclass
 class WordFrequency:
    text: str
--- a/高性能编程/.ipynb_checkpoints/readme-checkpoint.md
+++ b/高性能编程/.ipynb_checkpoints/readme-checkpoint.md
@ -1,44 +0,0 @@
-
-从计算机系统结构的角度，提高 Python 任务执行速度的核心在于：减少解释器开销（编译/JIT）、提升并行性（多核/GPU）、优化内存访问（缓存友好）、降低 I/O 瓶颈以及适配硬件特性等。当前主要办法如下：
-
-
-### 计算单元层面利用多核并行计算
-对于 CPU 密集型任务，使用多进程，每个进程拥有独立的 Python 解释器和内存空间，运行在独立的内核上，实现并行计算。
-
-
-### I/O 层面减少等待时间
- 异步编程：针对I/O请求等待，手工实现任务切换，完成并发执行.
- 多线程：解释器自动完成I/O请求的线程切换 。
- 批量处理，减少I/O请求数量 。
-
-
-### 编译层面减少解释器开销
- 使用 JIT 编译器：Just-In-Time（JIT）编译可以在运行时将Python代码编译成机器码，从而提升执行速度 。PyPy 是一种替代 CPython 的实现，PyPy 的 JIT 引擎可以分析代码执行路径，优化频繁调用的函数，充分利用处理器架构。
- **Cython 编译**：Cython 允许开发者为 Python 代码添加 C 类型注解，并编译为 C 代码，再由 C 编译器生成机器码。Cython 特别适合静态类型优化场景。
-
-
-### 利用Python的解释器特性
-  **使用内置数据类型和函数**：内置的数据类型（如列表、字典、集合等）和函数通常经过高度优化。
-  **选择合适的数据结构**：例如，一些类型执行一些操作更快，一些类型更省空间。
-  **减少全局变量的使用**：访问全局变量通常比局部变量慢，因为它们需要在更大的作用域中查找。
-  **减少函数调用**，可降低堆栈操作开销。
-  使用列表推导式替代循环，降低频繁创建和销毁临时对象的开销。
-  使用生成器而不是列表来处理大数据集，以减少内存占用。
-  使用XX池或预分配资源。
-
-
-### 使用第三方高性能库
-  NumPy/Pandas 用 C/C++ 编写并经过优化,使用连续内存块存储数据，向量化操作比显式的Python循环更高效。
-  SIMD 指令加速，NumPy、Numba、Pandas/SciPy 都使用了 SIMD。Cython 可以直接用 C 代码使用 SIMD 。
- `gzip` 模块可压缩数据，减少网络传输的数据量，提高网络传输速度。
- `mmap` 模块实现内存映射文件，在处理超大文件、优化I/O性能以及进程间通信方面具有显著优势。
- `functools.lru_cache` 缓存计算结果，避免重复计算 。
-
-
-### 使用性能分析工具
-如 cProfile 、Py-Spy、timeit 或 line_profiler
-
-## 总结
-具体实施时，应结合性能分析工具定位瓶颈，并根据任务特点选择合适的策略 。
-当然计算设备方面也可以简单提升：多机、更快的 CPU、更多核的CPU、更多的内存、更快的存储、增加 GPU/FPGA/TPU 。
-此外，随着Python社区的发展，新的技术和工具不断涌现，开发者应持续关注最新进展，以便更好地优化自己的代码 。
--- a/走向工业级代码/测试驱动的开发/config/settings.yaml
+++ b/走向工业级代码/测试驱动的开发/config/settings.yaml
@ -0,0 +1,7 @@
+
+data_dir: "./data"
+output_file: "./results.csv"
+stop_words_file: "./config/stopwords.txt"
+top_n: 10
+tokenizer: "simple"  # 可选 simple/jieba
+
--- a/走向工业级代码/测试驱动的开发/readme.MD
+++ b/走向工业级代码/测试驱动的开发/readme.MD
@ -0,0 +1,72 @@
+
+## 项目结构
+
+```
+wordcount/
+├── data/               # 存放小说文本（示例含生成测试数据的脚本）
+├── config/
+│   └── settings.yaml   # 配置文件
+├── src/
+│   ├── __init__.py
+│   ├── core.py         # 核心逻辑
+│   └── file_io.py      # 文件操作
+└── tests/
+    ├── __init__.py
+    ├── conftest.py     # pytest配置
+    ├── unit/
+    │   ├── test_tokenizer.py
+    │   └── test_counter.py
+    ├── integration/
+    │   └── test_pipeline.py
+    └── performance/
+        └── test_benchmark.py
+```
+
+## 关键测试点说明
+| 测试类型 | 测试目标 | 测试方法 |
+| --- | --- | --- |
+| 单元测试 | 验证分词器独立功能 | 提供固定输入，断言输出符合预期 |
+| 集成测试 | 验证完整流程的协同工作 | 使用临时目录模拟真实环境，验证端到端结果 |
+| 性能测试 | 确保算法时间复杂度合理 | 生成大数据量，监控执行时间与内存使用 |
+| 边界测试 | 处理空文件/无数据情况 | 添加空文件测试用例 |
+| 异常测试 | 处理无效路径/错误编码 | 故意传入错误参数验证异常处理 |
+
+## 开发流程演示
+
+### 先写测试（TDD核心）
+
+先写测试再实现功能
+```
+    def test_empty_file_processing():
+        counter = WordCounter()
+        counter.config = {"data_dir": "/empty"}
+        assert counter.process_files() == {}
+```
+
+### 实现功能直到测试通过
+```
+def process_files(self):
+    if not Path(self.config['data_dir']).exists():
+        return {}
+    # ...实际实现...
+```
+
+### 持续优化
+```
+# 发现性能瓶颈后优化
+def tokenize(self, text: str, stop_words: set):
+    # 优化为集合查找 O(1)
+    return [w for w in text.split() if w not in stop_words]
+```
+
+### 运行测试
+
+```
+# 安装依赖
+pip install pytest pytest-benchmark jieba pyyaml
+
+# 运行所有测试
+pytest tests/ -v
+
+# 运行性能测试
+pytest tests/performance/ -v --benchmark-only
+```
+
+这个项目展示了测试驱动开发的完整流程，涵盖单元测试、集成测试、性能测试等关键环节，符合工业级代码质量标准。
--- a/走向工业级代码/测试驱动的开发/src/core.py
+++ b/走向工业级代码/测试驱动的开发/src/core.py
@ -0,0 +1,52 @@
+from abc import ABC, abstractmethod
+from collections import Counter
+from pathlib import Path
+from typing import List, Set, Dict
+import yaml
+
+class BaseTokenizer(ABC):
+    """Abstract interface that every tokenizer implementation must provide."""
+
+    @abstractmethod
+    def tokenize(self, text: str, stop_words: Set[str]) -> List[str]:
+        """Turn ``text`` into a list of tokens; subclasses supply the logic.
+
+        The concrete tokenizers in this module leave out every token that
+        appears in ``stop_words``.
+        """
+
+class SimpleTokenizer(BaseTokenizer):
+    """Tokenizer that splits text on whitespace."""
+
+    def tokenize(self, text: str, stop_words: Set[str]) -> List[str]:
+        """Return the whitespace-separated words of ``text`` not in ``stop_words``."""
+        kept = []
+        for token in text.split():
+            if token in stop_words:
+                continue
+            kept.append(token)
+        return kept
+
+class JiebaTokenizer(BaseTokenizer):
+    """Tokenizer backed by the third-party ``jieba`` package."""
+
+    def tokenize(self, text: str, stop_words: Set[str]) -> List[str]:
+        """Segment ``text`` with ``jieba.lcut`` and drop tokens in ``stop_words``."""
+        # Imported inside the method, so jieba is only needed when this
+        # tokenizer is actually called.
+        import jieba
+        segments = jieba.lcut(text)
+        return list(filter(lambda seg: seg not in stop_words, segments))
+
+class WordCounter:
+    """Count word frequencies across the ``*.txt`` files of a data directory.
+
+    Settings are read from a YAML file into ``self.config``. The keys this
+    class reads are ``stop_words_file``, ``tokenizer``, ``data_dir`` and
+    ``top_n``.
+    """
+
+    def __init__(self, config_path: str = "config/settings.yaml"):
+        """Load the YAML settings, then the stop words and the tokenizer.
+
+        Args:
+            config_path: Path of the YAML settings file.
+
+        Raises:
+            OSError: If the settings or stop-words file cannot be opened.
+            KeyError: If ``stop_words_file`` or ``tokenizer`` is missing from
+                the settings, or ``tokenizer`` names an unknown tokenizer.
+        """
+        # Decode explicitly as UTF-8 so the result does not depend on the
+        # platform's default encoding.
+        with open(config_path, 'r', encoding='utf-8') as f:
+            self.config = yaml.safe_load(f)
+        self.stop_words = self._load_stop_words()
+        self.tokenizer = self._init_tokenizer()
+
+    def _load_stop_words(self) -> Set[str]:
+        """Return the stop words listed one per line in ``config['stop_words_file']``."""
+        with open(self.config['stop_words_file'], 'r', encoding='utf-8') as f:
+            stripped = (line.strip() for line in f)
+            # Blank lines would otherwise add an empty string to the set.
+            return {word for word in stripped if word}
+
+    def _init_tokenizer(self) -> BaseTokenizer:
+        """Return the tokenizer instance selected by ``config['tokenizer']``."""
+        tokenizers = {
+            "simple": SimpleTokenizer(),
+            "jieba": JiebaTokenizer()
+        }
+        return tokenizers[self.config['tokenizer']]
+
+    def process_files(self) -> Dict[str, int]:
+        """Tokenize every ``*.txt`` file in ``config['data_dir']`` and tally the tokens.
+
+        Returns:
+            A dict mapping the ``config['top_n']`` most frequent words to
+            their counts, most frequent first. All words are returned when
+            ``top_n`` is not configured, and an empty dict is returned when
+            the data directory does not exist.
+        """
+        data_dir = Path(self.config['data_dir'])
+        if not data_dir.is_dir():
+            return {}
+        counter = Counter()
+        # glob does not promise an order; sorting makes ties between equally
+        # frequent words come out the same way on every run.
+        for file_path in sorted(data_dir.glob("*.txt")):
+            with open(file_path, 'r', encoding='utf-8') as f:
+                counter.update(self.tokenizer.tokenize(f.read(), self.stop_words))
+        # most_common(None) returns every entry, so a missing top_n is allowed.
+        return dict(counter.most_common(self.config.get('top_n')))
--- a/走向工业级代码/测试驱动的开发/src/file_io.py
+++ b/走向工业级代码/测试驱动的开发/src/file_io.py
@ -0,0 +1,10 @@
+from pathlib import Path
+import csv
+
+def save_results(results: dict, output_path: str) -> None:
+    """Write word counts to a CSV file with a ``word,count`` header row.
+
+    Args:
+        results: Mapping of word to count; rows are written in iteration order.
+        output_path: Destination file; missing parent directories are created.
+    """
+    # parents=True so a nested, not-yet-existing output directory is created too.
+    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+    # UTF-8 keeps non-ASCII words writable whatever the platform default is;
+    # newline='' is what the csv module asks for on files it writes.
+    with open(output_path, 'w', newline='', encoding='utf-8') as f:
+        writer = csv.writer(f)
+        writer.writerow(["word", "count"])
+        writer.writerows(results.items())
--- a/走向工业级代码/测试驱动的开发/tests/integration/test_pipeline.py
+++ b/走向工业级代码/测试驱动的开发/tests/integration/test_pipeline.py
@ -0,0 +1,36 @@
+import pytest
+from src.core import WordCounter
+from src.file_io import save_results
+import csv
+
+@pytest.fixture
+def sample_data(tmp_path):
+    """Create a ``data`` directory under ``tmp_path`` holding two small text files."""
+    data_dir = tmp_path / "data"
+    data_dir.mkdir()
+    contents = {
+        "test1.txt": "apple banana apple",
+        "test2.txt": "banana cherry",
+    }
+    for file_name, text in contents.items():
+        (data_dir / file_name).write_text(text)
+    return data_dir
+
+def test_full_pipeline(sample_data, tmp_path):
+    """Run counting and CSV export end to end and check the written rows."""
+    config = {
+        "data_dir": str(sample_data),
+        "stop_words_file": str(tmp_path / "stopwords.txt"),
+        "output_file": str(tmp_path / "results.csv"),
+        # Larger than the number of expected words, so "cherry" can only be
+        # absent because it was filtered out as a stop word.
+        "top_n": 3,
+        "tokenizer": "simple"
+    }
+
+    # Create the stop-words file.
+    (tmp_path / "stopwords.txt").write_text("cherry")
+
+    # WordCounter() reads its default settings file; switch to the test
+    # configuration afterwards.
+    counter = WordCounter()
+    counter.config = config
+    # __init__ already loaded the stop words from the default settings, so
+    # reload them from the file named in the test configuration.
+    counter.stop_words = counter._load_stop_words()
+    results = counter.process_files()
+    save_results(results, config['output_file'])
+
+    # Verify the output. Both words occur twice, and their relative order
+    # depends on the order in which the files were read, so compare sorted rows.
+    with open(config['output_file']) as f:
+        reader = csv.reader(f)
+        next(reader)  # Skip header
+        assert sorted(reader) == [["apple", "2"], ["banana", "2"]]
--- a/走向工业级代码/测试驱动的开发/tests/performance/test_benchmark.py
+++ b/走向工业级代码/测试驱动的开发/tests/performance/test_benchmark.py
@ -0,0 +1,34 @@
+import pytest
+from src.core import WordCounter
+import tempfile
+import random
+
+@pytest.fixture(scope="module")
+def large_data():
+    """Generate ten text files of 100,000 random words each and yield their directory.
+
+    The files are written inside a temporary directory that is removed when
+    the fixture is torn down.
+    """
+    # Path is not imported at the top of this module; import it here so the
+    # fixture does not fail with NameError.
+    from pathlib import Path
+
+    words = [f"word{i}" for i in range(1000)]
+    with tempfile.TemporaryDirectory() as tmpdir:
+        data_dir = Path(tmpdir) / "data"
+        data_dir.mkdir()
+        for i in range(10):
+            content = " ".join(random.choices(words, k=100000))
+            (data_dir / f"bigfile{i}.txt").write_text(content)
+        yield str(data_dir)
+
+def test_processing_performance(benchmark, large_data):
+    """Benchmark ``process_files`` on the generated data set."""
+    settings = {
+        "data_dir": large_data,
+        "stop_words_file": "nonexistent",
+        "output_file": "/dev/null",
+        "top_n": 10,
+        "tokenizer": "simple"
+    }
+    counter = WordCounter()
+    counter.config = settings
+
+    # Run the benchmark.
+    result = benchmark(counter.process_files)
+
+    # Performance check: the mean run time must stay below one second.
+    mean_seconds = benchmark.stats['mean']
+    assert mean_seconds < 1.0
--- a/走向工业级代码/测试驱动的开发/tests/unit/test_tokenizer.py
+++ b/走向工业级代码/测试驱动的开发/tests/unit/test_tokenizer.py
@ -0,0 +1,16 @@
+import pytest
+from src.core import SimpleTokenizer, JiebaTokenizer
+
+@pytest.fixture
+def stop_words():
+    """Provide a small set of English stop words."""
+    return set(("the", "and", "a"))
+
+def test_simple_tokenizer(stop_words):
+    """SimpleTokenizer drops the stop words and keeps the other words in order."""
+    tokens = SimpleTokenizer().tokenize("the quick brown fox and a dog", stop_words)
+    expected = ["quick", "brown", "fox", "dog"]
+    assert tokens == expected
+
+def test_jieba_tokenizer(stop_words):
+    """JiebaTokenizer segments a Chinese sentence when no stop words are given."""
+    # The stop_words fixture is requested but not used; an empty set is passed.
+    segments = JiebaTokenizer().tokenize("我爱北京天安门", set())
+    assert segments == ["我", "爱", "北京", "天安门"]
--- a/走向工业级代码/测试驱动的开发/测试驱动开发.ipynb
+++ b/走向工业级代码/测试驱动的开发/测试驱动开发.ipynb
--- a/readme.MD
+++ b/readme.MD
@ -1,5 +1,11 @@

-## 代码为啥要这样写，我要这样写代码
+## Python 软件工程师的编程艺术
+
+初学编程者完成一门学校的标准课程学习后，会发现成熟的开源项目代码使用了完全不同于课堂教学练习的代码风格。本项目主要用来回答这个问题：
+<div align="center">
+代码为啥要这样写，我要这样写代码 。
+</div>
+

 A 代码模式
 用一个简单任务，展示各种软件工程需求（完成任务简单、可读性强、可复用高、维护成本低等）下的代码写法