From f170c936d8c972d3ecebf361b3d07c5b1777ddf3 Mon Sep 17 00:00:00 2001
From: Yao <1928814540@qq.com>
Date: Sun, 11 Aug 2024 21:00:11 +0800
Subject: [PATCH 1/3] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E4=BA=86?=
 =?UTF-8?q?=E6=A0=B9=E6=8D=AE=E5=85=B3=E9=94=AE=E8=AF=8D=E7=88=AC=E5=8F=96?=
 =?UTF-8?q?=E5=A4=A9=E6=B0=B4=E5=B8=82=E4=BA=BA=E6=B0=91=E6=94=BF=E5=BA=9C?=
 =?UTF-8?q?=E7=BD=91=E7=AB=99=E4=B8=8A=E6=8C=87=E5=AE=9A=E6=97=A5=E6=9C=9F?=
 =?UTF-8?q?=E5=86=85=E6=96=B0=E9=97=BB=E6=A0=87=E9=A2=98=E7=9A=84=E5=8A=9F?=
 =?UTF-8?q?=E8=83=BD=EF=BC=8C=E5=B9=B6=E6=8F=90=E4=BE=9B=E4=BA=86=E5=A4=9A?=
 =?UTF-8?q?=E7=BA=BF=E7=A8=8B=E3=80=81=E5=A4=9A=E8=BF=9B=E7=A8=8B=E3=80=81?=
 =?UTF-8?q?=E5=8D=8F=E7=A8=8B=E5=92=8C=E5=BC=82=E6=AD=A5=E5=9B=9B=E7=A7=8D?=
 =?UTF-8?q?=E5=AE=9E=E7=8E=B0=E6=96=B9=E5=BC=8F=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                            |   4 +
 B 高性能模式/000 普通做法.py |  74 ++++++++++
 B 高性能模式/010 多进程.py    |  86 ++++++++++++
 B 高性能模式/020 多线程.py    |  89 ++++++++++++
 B 高性能模式/030 协程.py       |  89 ++++++++++++
 B 高性能模式/040 异步.py       |  85 ++++++++++++
 B 高性能模式/readme.md           |  16 ++-
 B 高性能模式/util.py             | 190 +++++++++++++++++++++++++-
 8 files changed, 629 insertions(+), 4 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..638bc3f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+log.txt
+/test
+/.venv
+*/__pycache__
diff --git a/B 高性能模式/000 普通做法.py b/B 高性能模式/000 普通做法.py
index e69de29..a97e3b2 100644
--- a/B 高性能模式/000 普通做法.py	
+++ b/B 高性能模式/000 普通做法.py	
@@ -0,0 +1,74 @@
+"""
+根据提供的关键词列表，爬取天水市人民政府网站上指定日期内与关键词相关的新闻的标题，并将其存储至数据库中。
+
+考虑到相关因素，因此本代码只爬取前10页的新闻内容，即最多100条新闻作为测试。
+
+此方法为普通做法，即使用requests库通过Post请求爬取网页内容，再使用json提取新闻内容。
+
+注意：本代码中的关键词列表默认为['灾害']，日期范围默认为2018年1月1日至2018年12月31日。
+
+Args:
+    keywords: 用于搜索新闻的关键词列表
+    begin_date: 开始日期，用于搜索
+    end_date: 结束日期，用于搜索
+    size: 一次请求返回的新闻或政策的最大数量
+
+Examples:
+    ```
+    main(keywords=['灾害'],
+         begin_date='2018-01-01',
+         end_date='2018-12-31',
+         size=10)
+    ```
+"""
+import util
+import logging
+from typing import List
+
+import tqdm
+
+
+@util.timeit
+def main(keywords: List[str], begin_date: str, end_date: str, size: int = 10):
+    """
+    Crawl news titles for every keyword, one page at a time, sequentially.
+
+    Args:
+        keywords: keywords to search for
+        begin_date: start of the search date range
+        end_date: end of the search date range
+        size: maximum number of items returned by a single request
+    """
+    logging.basicConfig(level=logging.INFO,
+                        format="%(asctime)s - %(levelname)s - %(message)s",
+                        datefmt="%Y-%m-%d %H:%M:%S",
+                        filename='log.txt',
+                        encoding='utf-8')
+
+    logging.info("开始运行普通爬取")
+
+    spider = util.Spider(keywords=keywords,
+                         begin_date=begin_date,
+                         end_date=end_date,
+                         size=size)
+
+    # 10 pages are fetched per keyword, so the total scales with len(keywords).
+    total = size * 10 * len(keywords)
+    title_list = []
+    with tqdm.tqdm(total=total, desc='普通爬取进度', unit='条', ncols=80) as pbar:
+        for keyword in keywords:
+            for current in range(1, 11):
+                logging.info(f'keyword: {keyword}, current: {current}')
+                config = spider.get_config(keyword, current)
+                data = spider.fetch(config)
+                title_list += spider.parse(data)
+                pbar.update(size)
+    spider.save(title_list)
+    logging.info("爬取完成")
+
+
+if __name__ == "__main__":
+    main(keywords=['灾害'],
+         begin_date='2018-01-01',
+         end_date='2018-12-31',
+         size=10)
diff --git a/B 高性能模式/010 多进程.py b/B 高性能模式/010 多进程.py
index e69de29..94177bc 100644
--- a/B 高性能模式/010 多进程.py	
+++ b/B 高性能模式/010 多进程.py	
@@ -0,0 +1,86 @@
+"""
+根据提供的关键词列表，爬取天水市人民政府网站上指定日期内与关键词相关的新闻的标题，并将其存储至数据库中。
+
+考虑到相关因素，因此本代码只爬取前10页的新闻内容，即最多100条新闻作为测试。
+
+此方法为多进程做法，即使用多进程并发爬取网页内容，再使用json提取新闻内容。
+
+注意：本代码中的关键词列表默认为['灾害']，日期范围默认为2018年1月1日至2018年12月31日。
+
+Args:
+    keywords: 用于搜索新闻的关键词列表
+    begin_date: 开始日期，用于搜索
+    end_date: 结束日期，用于搜索
+    size: 一次请求返回的新闻或政策的最大数量
+
+Examples:
+    ```
+    main(keywords=['灾害'],
+         begin_date='2018-01-01',
+         end_date='2018-12-31',
+         size=10)
+    ```
+"""
+import util
+import logging
+from typing import List
+import multiprocessing
+
+import tqdm
+
+lock = multiprocessing.Lock()
+
+
+@util.timeit
+def main(keywords: List[str], begin_date: str, end_date: str, size: int = 10):
+    """
+    Crawl news titles for every keyword using a pool of worker processes.
+
+    Args:
+        keywords: keywords to search for
+        begin_date: start of the search date range
+        end_date: end of the search date range
+        size: maximum number of items returned by a single request
+    """
+    logging.basicConfig(level=logging.INFO,
+                        format="%(asctime)s - %(levelname)s - %(message)s",
+                        datefmt="%Y-%m-%d %H:%M:%S",
+                        filename='log.txt',
+                        encoding='utf-8')
+
+    logging.info("开始运行多进程爬取")
+
+    spider = util.Spider(keywords=keywords,
+                         begin_date=begin_date,
+                         end_date=end_date,
+                         size=size)
+
+    title_list = []
+    # 10 pages are fetched per keyword, so the total scales with len(keywords).
+    total = size * 10 * len(keywords)
+    pbar = tqdm.tqdm(total=total, desc='多进程爬取进度', unit='条', ncols=80)
+
+    with multiprocessing.Pool(processes=5) as pool:
+        results = []
+        for keyword in keywords:
+            for current in range(1, 11):
+                logging.info(f'keyword: {keyword}, current: {current}')
+                config = spider.get_config(keyword, current)
+                results.append(pool.apply_async(spider.fetch, (config, )))
+
+        # Only the parent process touches the bar, so no lock is needed.
+        for result in results:
+            data = result.get()
+            title_list += spider.parse(data)
+            pbar.update(size)
+
+    spider.save(title_list)
+    pbar.close()
+    logging.info("爬取完成")
+
+
+if __name__ == "__main__":
+    main(keywords=['灾害'],
+         begin_date='2018-01-01',
+         end_date='2018-12-31',
+         size=10)
diff --git a/B 高性能模式/020 多线程.py b/B 高性能模式/020 多线程.py
index e69de29..19bc3c4 100644
--- a/B 高性能模式/020 多线程.py	
+++ b/B 高性能模式/020 多线程.py	
@@ -0,0 +1,89 @@
+"""
+根据提供的关键词列表，爬取天水市人民政府网站上指定日期内与关键词相关的新闻的标题，并将其存储至数据库中。
+
+考虑到相关因素，因此本代码只爬取前10页的新闻内容，即最多100条新闻作为测试。
+
+此方法为多线程做法，即使用多线程并行爬取网页内容，再使用json提取新闻内容。
+
+注意：本代码中的关键词列表默认为['灾害']，日期范围默认为2018年1月1日至2018年12月31日。
+
+Args:
+    keywords: 用于搜索新闻的关键词列表
+    begin_date: 开始日期，用于搜索
+    end_date: 结束日期，用于搜索
+    size: 一次请求返回的新闻或政策的最大数量
+
+Examples:
+    ```
+    main(keywords=['灾害'],
+         begin_date='2018-01-01',
+         end_date='2018-12-31',
+         size=10)
+    ```
+"""
+
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+import util
+import logging
+from typing import List
+
+import tqdm
+
+lock = threading.Lock()
+
+
+@util.timeit
+def main(keywords: List[str], begin_date: str, end_date: str, size: int = 10):
+    """
+    Crawl news titles for every keyword using a pool of worker threads.
+
+    Args:
+        keywords: keywords to search for
+        begin_date: start of the search date range
+        end_date: end of the search date range
+        size: maximum number of items returned by a single request
+    """
+    logging.basicConfig(level=logging.INFO,
+                        format="%(asctime)s - %(levelname)s - %(message)s",
+                        datefmt="%Y-%m-%d %H:%M:%S",
+                        filename='log.txt',
+                        encoding='utf-8')
+
+    logging.info("开始运行多线程爬取")
+
+    spider = util.Spider(keywords=keywords,
+                         begin_date=begin_date,
+                         end_date=end_date,
+                         size=size)
+
+    # 10 pages are fetched per keyword, so the total scales with len(keywords).
+    total = size * 10 * len(keywords)
+    pbar = tqdm.tqdm(total=total, desc='多线程爬取进度', unit='条', ncols=80)
+    title_list = []
+    tasks = []
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        for keyword in keywords:
+            for current in range(1, 11):
+                logging.info(f'keyword: {keyword}, current: {current}')
+                config = spider.get_config(keyword, current)
+                future = executor.submit(spider.fetch, config)
+                tasks.append(future)
+
+        # Advance the bar as each request finishes (not when it is submitted);
+        # this loop runs in the main thread only, so no lock is needed.
+        for future in as_completed(tasks):
+            data = future.result()
+            title_list += spider.parse(data)
+            pbar.update(size)
+
+    spider.save(title_list)
+    pbar.close()
+    logging.info("爬取完成")
+
+
+if __name__ == "__main__":
+    main(keywords=['灾害'],
+         begin_date='2018-01-01',
+         end_date='2018-12-31',
+         size=10)
diff --git a/B 高性能模式/030 协程.py b/B 高性能模式/030 协程.py
index e69de29..291736f 100644
--- a/B 高性能模式/030 协程.py	
+++ b/B 高性能模式/030 协程.py	
@@ -0,0 +1,89 @@
+"""
+根据提供的关键词列表，爬取天水市人民政府网站上指定日期内与关键词相关的新闻的标题，并将其存储至数据库中。
+
+考虑到相关因素，因此本代码只爬取前10页的新闻内容，即最多100条新闻作为测试。
+
+此方法为协程做法，即使用gevent库通过协程并发爬取网页内容，再使用json提取新闻内容。
+
+注意：本代码中的关键词列表默认为['灾害']，日期范围默认为2018年1月1日至2018年12月31日。
+
+Args:
+    keywords: 用于搜索新闻的关键词列表
+    begin_date: 开始日期，用于搜索
+    end_date: 结束日期，用于搜索
+    size: 一次请求返回的新闻或政策的最大数量
+
+Examples:
+    ```
+    main(keywords=['灾害'],
+         begin_date='2018-01-01',
+         end_date='2018-12-31',
+         size=10)
+    ```
+"""
+
+import gevent
+from gevent import monkey
+
+# 打补丁，使标准库能够与gevent协同工作
+monkey.patch_all()
+
+import util
+import logging
+from typing import List
+
+import tqdm
+
+
+@util.timeit
+def main(keywords: List[str], begin_date: str, end_date: str, size: int = 10):
+    """
+    Crawl news titles for every keyword using one gevent greenlet per page.
+
+    Args:
+        keywords: keywords to search for
+        begin_date: start of the search date range
+        end_date: end of the search date range
+        size: maximum number of items returned by a single request
+    """
+    logging.basicConfig(level=logging.INFO,
+                        format="%(asctime)s - %(levelname)s - %(message)s",
+                        datefmt="%Y-%m-%d %H:%M:%S",
+                        filename='log.txt',
+                        encoding='utf-8')
+
+    logging.info("开始运行协程爬取")
+
+    spider = util.Spider(keywords=keywords,
+                         begin_date=begin_date,
+                         end_date=end_date,
+                         size=size)
+
+    # 10 pages are fetched per keyword, so the total scales with len(keywords).
+    total = size * 10 * len(keywords)
+    pbar = tqdm.tqdm(total=total, desc='协程爬取进度', unit='条', ncols=80)
+    title_list = []
+
+    def fetch_and_parse(keyword, current):
+        logging.info(f'keyword: {keyword}, current: {current}')
+        config = spider.get_config(keyword, current)
+        data = spider.fetch(config)
+        title_list.extend(spider.parse(data))
+        pbar.update(size)
+
+    jobs = [
+        gevent.spawn(fetch_and_parse, keyword, current) for keyword in keywords
+        for current in range(1, 11)
+    ]
+    # raise_error=True re-raises a greenlet's exception instead of dropping it.
+    gevent.joinall(jobs, raise_error=True)
+    spider.save(title_list)
+    pbar.close()
+    logging.info("爬取完成")
+
+
+if __name__ == "__main__":
+    main(keywords=['灾害'],
+         begin_date='2018-01-01',
+         end_date='2018-12-31',
+         size=10)
diff --git a/B 高性能模式/040 异步.py b/B 高性能模式/040 异步.py
index e69de29..fb2b1c1 100644
--- a/B 高性能模式/040 异步.py	
+++ b/B 高性能模式/040 异步.py	
@@ -0,0 +1,85 @@
+"""
+根据提供的关键词列表，爬取天水市人民政府网站上指定日期内与关键词相关的新闻的标题，并将其存储至数据库中。
+
+考虑到相关因素，因此本代码只爬取前10页的新闻内容，即最多100条新闻作为测试。
+
+此方法为异步做法，即使用asyncio并发爬取网页内容，再使用json提取新闻内容。
+
+注意：本代码中的关键词列表默认为['灾害']，日期范围默认为2018年1月1日至2018年12月31日。
+
+Args:
+    keywords: 用于搜索新闻的关键词列表
+    begin_date: 开始日期，用于搜索
+    end_date: 结束日期，用于搜索
+    size: 一次请求返回的新闻或政策的最大数量
+
+Examples:
+    ```
+    asyncio.run(
+        main_async(keywords=['灾害'],
+                   begin_date='2018-01-01',
+                   end_date='2018-12-31',
+                   size=10))
+    ```
+"""
+
+import asyncio
+import util
+import logging
+from typing import List
+import tqdm
+
+
+@util.timeit_async
+async def main_async(keywords: List[str],
+                     begin_date: str,
+                     end_date: str,
+                     size: int = 10):
+    """
+    Crawl news titles for every keyword using one asyncio task per page.
+
+    Args:
+        keywords: keywords to search for
+        begin_date: start of the search date range
+        end_date: end of the search date range
+        size: maximum number of items returned by a single request
+    """
+    logging.basicConfig(level=logging.INFO,
+                        format="%(asctime)s - %(levelname)s - %(message)s",
+                        datefmt="%Y-%m-%d %H:%M:%S",
+                        filename='log.txt',
+                        encoding='utf-8')
+
+    logging.info("开始运行异步爬取")
+
+    spider = util.Spider(keywords=keywords,
+                         begin_date=begin_date,
+                         end_date=end_date,
+                         size=size)
+
+    # 10 pages are fetched per keyword, so the total scales with len(keywords).
+    total = size * 10 * len(keywords)
+    pbar = tqdm.tqdm(total=total, desc='异步爬取进度', unit='条', ncols=80)
+    title_list = []
+    tasks = []
+    for keyword in keywords:
+        for current in range(1, 11):
+            logging.info(f'keyword: {keyword}, current: {current}')
+            config = spider.get_config(keyword, current)
+            tasks.append(asyncio.create_task(spider.fetch_async(config)))
+
+    for task in asyncio.as_completed(tasks):
+        title_list += spider.parse(await task)
+        pbar.update(size)
+
+    spider.save(title_list)
+    pbar.close()
+    logging.info("爬取完成")
+
+
+if __name__ == "__main__":
+    asyncio.run(
+        main_async(keywords=['灾害'],
+                   begin_date='2018-01-01',
+                   end_date='2018-12-31',
+                   size=10))
diff --git a/B 高性能模式/readme.md b/B 高性能模式/readme.md
index f464b8e..7470dbf 100644
--- a/B 高性能模式/readme.md	
+++ b/B 高性能模式/readme.md	
@@ -8,4 +8,18 @@
 # 任务
 
 
-# 讨论分析
\ No newline at end of file
+# 讨论分析
+普通做法连续进行了五次测试，时间分别为34.231s、34.091s、34.164s、34.226s、33.958s，平均时间为34.134s
+多进程（进程数=5）连续进行了五次测试，时间分别为7.719s、7.716s、7.690s、7.730s、7.711s，平均时间为7.7132s
+多线程（线程数=5）连续进行了五次测试，时间分别为7.185s、7.964s、6.983s、6.969s、7.035s，平均时间为7.2272s
+协程连续进行了五次测试，时间分别为3.775s、3.807s、3.733s、3.824s、3.744s，平均时间为3.7766s
+异步连续进行了五次测试，时间分别为6.975s、7.675s、7.018s、7.032s、7.049s，平均时间为7.1498s
+注：为保证公平性，每一次Post请求后休眠3秒
+
+可以看出，协程的性能最好，普通做法的性能最差，多线程、多进程和异步的性能介于两者之间。
+多进程和多线程的并发数被限制为5（10个请求需要分两批完成），而协程没有并发数限制，10个请求同时发出，所以协程的耗时最短。
+另外，异步的耗时与多线程接近而明显慢于协程，可能是因为测试时fetch_async中的requests.post是阻塞调用，请求实际上是逐个发出的，只有休眠部分并发执行。
+总的来说，协程的性能最好，多线程、多进程和异步的性能介于两者之间，普通做法的性能最差。
+
+# 总结
+协程的性能最好，多线程和多进程的性能介于两者之间，普通做法的性能最差。
\ No newline at end of file
diff --git a/B 高性能模式/util.py b/B 高性能模式/util.py
index 384717d..5d7495a 100644
--- a/B 高性能模式/util.py	
+++ b/B 高性能模式/util.py	
@@ -1,4 +1,188 @@
+"""
 
-################################################################################
-#  本主题通用代码
-################################################################################
+"""
+import re
+import time
+import functools
+import json
+import asyncio
+import requests
+from typing import Any, Dict, List
+
+
+class Spider:
+    """
+    Crawler for the search endpoint of the Tianshui government website.
+
+    Args:
+        keywords (List[str]): keywords to search for
+        begin_date (str): start of the search date range
+        end_date (str): end of the search date range
+        size (int): maximum number of items returned by a single request
+
+    Attributes:
+        URL (str): address of the search endpoint
+    """
+    # Tianshui Municipal People's Government website
+    URL = ('https://www.tianshui.gov.cn/aop_component/'
+           '/webber/search/search/search/queryPage')
+
+    def __init__(self, keywords: List[str], begin_date: str, end_date: str,
+                 size: int):
+        self.keywords = keywords
+        self.begin_date = begin_date
+        self.end_date = end_date
+        self.size = size
+
+    def get_config(self, keyword: str, current: int) -> Dict[str, Any]:
+        """
+        Build the JSON payload for one search request.
+
+        Args:
+            keyword (str): keyword to search for
+            current (int): number of the result page to request
+
+        Returns:
+            Dict[str, Any]: request payload
+        """
+
+        return {
+            "aliasName": "article_data,open_data,mailbox_data,article_file",
+            "keyWord": keyword,
+            "lastkeyWord": keyword,
+            "searchKeyWord": False,
+            "orderType": "score",
+            "searchType": "text",
+            "searchScope": "3",
+            "searchOperator": 0,
+            "searchDateType": "custom",
+            "searchDateName": f"{self.begin_date}-{self.end_date}",
+            "beginDate": self.begin_date,
+            "endDate": self.end_date,
+            "showId": "c2ee13065aae85d7a998b8a3cd645961",
+            "auditing": ["1"],
+            "owner": "1912126876",
+            "token": "tourist",
+            "urlPrefix": "/aop_component/",
+            "page": {
+                "current": current,
+                "size": self.size,
+                "pageSizes": [2, 5, 10, 20, 50, 100],
+                "total": 0,
+                "totalPage": 0,
+                "indexs": []
+            },
+            "advance": False,
+            "advanceKeyWord": "",
+            "lang": "i18n_zh_CN"
+        }
+
+    def generate_headers(self) -> dict:
+        """
+        Build the HTTP headers sent with every request.
+
+        Returns:
+            dict: request headers
+        """
+        return {
+            'Authorization':
+            'tourist',
+            'User-Agent':
+            ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit'
+             '/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari'
+             '/537.36 Edg/124.0.0.0')
+        }
+
+    def fetch(self, config: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Synchronous variant: POST the payload and return the decoded JSON.
+        Raises requests.HTTPError when the server answers with an error status.
+
+        Args:
+            config (Dict[str, Any]): request payload
+
+        Returns:
+            Dict[str, Any]: decoded JSON response
+        """
+        response = requests.post(self.URL, headers=self.generate_headers(),
+                                 json=config, timeout=30)
+        response.raise_for_status()
+        time.sleep(3)  # fixed pause after every request (benchmark fairness)
+        return json.loads(response.text)
+
+    async def fetch_async(self, config: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Asynchronous variant: POST in a worker thread so the loop stays free.
+
+        Args:
+            config (Dict[str, Any]): request payload
+
+        Returns:
+            Dict[str, Any]: decoded JSON response
+        """
+        post = functools.partial(requests.post, self.URL, json=config,
+                                 headers=self.generate_headers(), timeout=30)
+        response = await asyncio.get_running_loop().run_in_executor(None, post)
+        response.raise_for_status()
+        await asyncio.sleep(3)
+        return json.loads(response.text)
+
+    def parse(self, data: Dict[str, Any]) -> List[str]:
+        """
+        Extract the news titles from one decoded response.
+
+        Args:
+            data (Dict[str, Any]): decoded JSON response
+
+        Returns:
+            List[str]: titles with HTML tags removed
+        """
+        records = data['data']['page']['records']
+        # A page (typically the last one) may hold fewer than self.size
+        # records, so slice rather than index with range(self.size), which
+        # would raise IndexError on a short page.
+        return [
+            re.sub(r'<[^>]*>', '', record['title'])  # strip HTML tags
+            for record in records[:self.size]
+        ]
+
+    def save(self, title_list: List[str]):
+        """
+        Persist the collected titles (placeholder: currently does nothing).
+        """
+        pass
+
+
+# Timing decorator
+def timeit(func):
+    """
+    Print how long each call to the decorated function takes.
+
+    Args:
+        func: function to time
+
+    Returns:
+        wrapper that prints the elapsed seconds and returns func's result
+    """
+
+    @functools.wraps(func)  # keep func's __name__ and docstring on the wrapper
+    def wrapper(*args, **kwargs):
+        start = time.time()
+        result = func(*args, **kwargs)
+        print(f'{func.__name__} cost: {time.time() - start}')
+        return result
+
+    return wrapper
+
+
+def timeit_async(func):
+    """Print how long each awaited call to the coroutine function takes."""
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs):
+        started_at = time.time()
+        outcome = await func(*args, **kwargs)
+        elapsed = time.time() - started_at
+        print(f'{func.__name__} cost: {elapsed}')
+        return outcome
+
+    return wrapper

From 15736d7393992b24dcd38d3554cbd85fe04aa953 Mon Sep 17 00:00:00 2001
From: Yao <1928814540@qq.com>
Date: Tue, 13 Aug 2024 15:10:07 +0800
Subject: [PATCH 2/3] =?UTF-8?q?refactor(code):=20=E4=BC=98=E5=8C=96?=
 =?UTF-8?q?=E4=BB=A3=E7=A0=81=EF=BC=8C=E6=8F=90=E9=AB=98=E5=8F=AF=E8=AF=BB?=
 =?UTF-8?q?=E6=80=A7=E5=92=8C=E6=95=88=E7=8E=87?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                                    |   2 +-
 .../10 一盘大棋/1 最基础的写法.py   |  10 +--
 .../10 一盘大棋/2 加入语言特性.py   |   7 +-
 A 代码模式/10 一盘大棋/3 Hacker.py    |  18 +++--
 A 代码模式/{cppy_ => cppy}/cp_util.py     |  74 +++++++++++-------
 A 代码模式/{cppy_ => cppy}/data/Prey.txt  |   0
 .../data/pride-and-prejudice.txt              |   0
 .../{cppy_ => cppy}/data/stop_words.txt       |   0
 A 代码模式/{cppy_ => cppy}/data/test.txt  |   0
 .../cppy_/__pycache__/cp_util.cpython-38.pyc  | Bin 3069 -> 0 bytes
 10 files changed, 62 insertions(+), 49 deletions(-)
 rename A 代码模式/{cppy_ => cppy}/cp_util.py (60%)
 rename A 代码模式/{cppy_ => cppy}/data/Prey.txt (100%)
 rename A 代码模式/{cppy_ => cppy}/data/pride-and-prejudice.txt (100%)
 rename A 代码模式/{cppy_ => cppy}/data/stop_words.txt (100%)
 rename A 代码模式/{cppy_ => cppy}/data/test.txt (100%)
 delete mode 100644 A 代码模式/cppy_/__pycache__/cp_util.cpython-38.pyc

diff --git a/.gitignore b/.gitignore
index 638bc3f..61f4419 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
 log.txt
 /test
 /.venv
-*/__pycache__
+__pycache__
diff --git a/A 代码模式/10 一盘大棋/1 最基础的写法.py b/A 代码模式/10 一盘大棋/1 最基础的写法.py
index a0c4ab6..e70362d 100644
--- a/A 代码模式/10 一盘大棋/1 最基础的写法.py	
+++ b/A 代码模式/10 一盘大棋/1 最基础的写法.py	
@@ -7,10 +7,9 @@ with open(stopwordfilepath, encoding='utf-8') as f:
 for letter in 'abcdefghijklmnopqrstuvwxyz':
     stop_words.append(letter)
 
-
 # 读文件，逐行扫描文本，发现词，确定不是停用词，计数
 word_freqs = []
-for line in open( testfilepath, encoding='utf-8' ):
+for line in open(testfilepath, encoding='utf-8'):
     start_char = None
     i = 0
     for c in line:
@@ -42,10 +41,9 @@ for line in open( testfilepath, encoding='utf-8' ):
 # 使用冒泡排序对词频进行排序
 n = len(word_freqs)
 for i in range(n):
-    for j in range(0, n-i-1):
-        if word_freqs[j][1] < word_freqs[j+1][1]:
-            word_freqs[j], word_freqs[j+1] = word_freqs[j+1], word_freqs[j]            
-
+    for j in range(0, n - i - 1):
+        if word_freqs[j][1] < word_freqs[j + 1][1]:
+            word_freqs[j], word_freqs[j + 1] = word_freqs[j + 1], word_freqs[j]
 
 # 打印频率最高的前10个词
 for tf in word_freqs[:10]:
diff --git a/A 代码模式/10 一盘大棋/2 加入语言特性.py b/A 代码模式/10 一盘大棋/2 加入语言特性.py
index df26455..f48b2c4 100644
--- a/A 代码模式/10 一盘大棋/2 加入语言特性.py	
+++ b/A 代码模式/10 一盘大棋/2 加入语言特性.py	
@@ -1,4 +1,4 @@
-from cppy.cp_util import stopwordfilepath,testfilepath
+from cppy.cp_util import stopwordfilepath, testfilepath
 import string
 from collections import Counter
 
@@ -8,7 +8,7 @@ stop_words.update(list(string.ascii_lowercase))
 
 # 读取文件并计算单词频率
 word_freqs = Counter()
-with open(testfilepath,encoding = 'utf8') as f:
+with open(testfilepath, encoding='utf8') as f:
     for line_num, line in enumerate(f, 1):
         start_char = None
         for i, c in enumerate(line):
@@ -23,10 +23,9 @@ with open(testfilepath,encoding = 'utf8') as f:
 # 打印前10个最常见的单词
 for word, freq in word_freqs.most_common(10):
     print(f"{word}-{freq}")
-
 '''
 相比 A01
 使用collections.Counter来计数单词频率，从而简化了代码并提高了效率。
 使用enumerate来获取行号和行内容，使用set来存储停用词，都有助于提高代码的性能和可读性。
 使用most_common方法来获取最常见的单词，使输出更为简洁。
-'''
\ No newline at end of file
+'''
diff --git a/A 代码模式/10 一盘大棋/3 Hacker.py b/A 代码模式/10 一盘大棋/3 Hacker.py
index c70b4b5..2576218 100644
--- a/A 代码模式/10 一盘大棋/3 Hacker.py	
+++ b/A 代码模式/10 一盘大棋/3 Hacker.py	
@@ -1,12 +1,14 @@
-import re, collections
-from cppy.cp_util import stopwordfilepath,testfilepath
-
-stopwords = set(open( stopwordfilepath,encoding = 'utf8' ).read().split(','))
-words = re.findall('[a-z]{2,}', open( testfilepath,encoding = 'utf8').read().lower())
-counts = collections.Counter( w for w in words if w not in stopwords )
-for (w, c) in counts.most_common(10) :  print(w, '-', c)
+import re
+import collections
+from cppy.cp_util import stopwordfilepath, testfilepath
 
+stopwords = set(open(stopwordfilepath, encoding='utf8').read().split(','))
+words = re.findall('[a-z]{2,}',
+                   open(testfilepath, encoding='utf8').read().lower())
+counts = collections.Counter(w for w in words if w not in stopwords)
+for (w, c) in counts.most_common(10):
+    print(w, '-', c)
 '''
 熟练的软件工程师，会如此简单完成任务
 后面的例子，我们必须变的啰嗦一些，不能用这种太 hacker 的写法
-'''
\ No newline at end of file
+'''
diff --git a/A 代码模式/cppy_/cp_util.py b/A 代码模式/cppy/cp_util.py
similarity index 60%
rename from A 代码模式/cppy_/cp_util.py
rename to A 代码模式/cppy/cp_util.py
index 6554c2c..b85ec2a 100644
--- a/A 代码模式/cppy_/cp_util.py	
+++ b/A 代码模式/cppy/cp_util.py	
@@ -1,7 +1,6 @@
-
 import site
-import os,re,time
-import string,operator
+import os, re, time
+import string, operator
 
 ################################################################################
 #  变量
@@ -10,76 +9,89 @@ testfilename = 'test.txt'
 testfilename = 'pride-and-prejudice.txt'
 testfilename = 'Prey.txt'
 
-db_filename = "tf.db"  
+db_filename = "tf.db"
 
 site_packages = site.getsitepackages()
 for package in site_packages:
-    if 'package' in  package:
+    if 'package' in package:
         basePath = package
-stopwordfilepath = os.path.join(basePath, 'cppy','data','stop_words.txt')
-testfilepath = os.path.join(basePath, 'cppy','data',testfilename )
+stopwordfilepath = os.path.join(basePath, 'cppy', 'data', 'stop_words.txt')
+testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)
 
 
 ################################################################################
 #  项目函数
 ################################################################################
-def read_file(path_to_file):    
-    with open(path_to_file,encoding='utf-8') as f:
+def read_file(path_to_file):
+    with open(path_to_file, encoding='utf-8') as f:
         data = f.read()
     return data
 
-def re_split( data ):
+
+def re_split(data):
     pattern = re.compile('[\W_]+')
     data = pattern.sub(' ', data).lower()
     return data.split()
 
-def get_stopwords( path_to_file = stopwordfilepath ):
-    with open(path_to_file,encoding='utf-8') as f:
-        data = f.read().split(',')        
+
+def get_stopwords(path_to_file=stopwordfilepath):
+    with open(path_to_file, encoding='utf-8') as f:
+        data = f.read().split(',')
     data.extend(list(string.ascii_lowercase))
     return data
 
-def get_chunks( file_path = testfilepath, chunk_size = 1000):
+
+def get_chunks(file_path=testfilepath, chunk_size=1000):
     # 读取文件内容，分割文件内容为多个块，每个块由一个进程处理
     # 可以根据实际情况调整块大小
-    content = re_split(read_file(file_path))         
-    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
+    content = re_split(read_file(file_path))
+    chunks = [
+        content[i:i + chunk_size] for i in range(0, len(content), chunk_size)
+    ]
     return chunks
 
+
 def extract_file_words(path_to_file):
-    word_list = re_split( read_file(path_to_file) )
+    word_list = re_split(read_file(path_to_file))
     stop_words = get_stopwords()
-    return [ w for w in word_list if ( not w in stop_words ) and len(w) >= 3 ]
+    return [w for w in word_list if (not w in stop_words) and len(w) >= 3]
+
 
 def extract_str_words(data_str):
-    word_list = re_split( data_str )
+    word_list = re_split(data_str)
     stop_words = get_stopwords()
-    return [ w for w in word_list if ( not w in stop_words ) and len(w) >= 3 ]
+    return [w for w in word_list if (not w in stop_words) and len(w) >= 3]
+
 
 def count_word(word, word_freqs, stopwords):
     if word not in stopwords:
         word_freqs[word] = word_freqs.get(word, 0) + 1
 
-def get_frequencies(word_list):    
-    word_freqs = {}  
-    for word in word_list:  
-        word_freqs[word] = word_freqs.get(word, 0) + 1    
+
+def get_frequencies(word_list):
+    word_freqs = {}
+    for word in word_list:
+        word_freqs[word] = word_freqs.get(word, 0) + 1
     return word_freqs
 
-def sort_dict (word_freq):
+
+def sort_dict(word_freq):
     return sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)
     # return sorted( word_freq, key=lambda x: x[1], reverse=True )
 
-def print_word_freqs( word_freqs, n = 10):
-    for (w, c) in word_freqs[ :n ]:
-        print( w, '-', c )
+
+def print_word_freqs(word_freqs, n=10):
+    for (w, c) in word_freqs[:n]:
+        print(w, '-', c)
 
 
 ################################################################################
 #  通用工具
 ################################################################################
 
+
 def timing_decorator(func):
+
     def wrapper(*args, **kwargs):
         start_time = time.time()  # 记录开始时间
         result = func(*args, **kwargs)  # 调用原始函数
@@ -87,7 +99,9 @@ def timing_decorator(func):
         run_time = end_time - start_time  # 计算运行时间
         print(f"{func.__name__} 运行时间: {run_time*1000:.2f} 秒")
         return result
+
     return wrapper
 
-def  test():
-    print( 'cppy welcome' )
\ No newline at end of file
+
+def test():
+    print('cppy welcome')
diff --git a/A 代码模式/cppy_/data/Prey.txt b/A 代码模式/cppy/data/Prey.txt
similarity index 100%
rename from A 代码模式/cppy_/data/Prey.txt
rename to A 代码模式/cppy/data/Prey.txt
diff --git a/A 代码模式/cppy_/data/pride-and-prejudice.txt b/A 代码模式/cppy/data/pride-and-prejudice.txt
similarity index 100%
rename from A 代码模式/cppy_/data/pride-and-prejudice.txt
rename to A 代码模式/cppy/data/pride-and-prejudice.txt
diff --git a/A 代码模式/cppy_/data/stop_words.txt b/A 代码模式/cppy/data/stop_words.txt
similarity index 100%
rename from A 代码模式/cppy_/data/stop_words.txt
rename to A 代码模式/cppy/data/stop_words.txt
diff --git a/A 代码模式/cppy_/data/test.txt b/A 代码模式/cppy/data/test.txt
similarity index 100%
rename from A 代码模式/cppy_/data/test.txt
rename to A 代码模式/cppy/data/test.txt
diff --git a/A 代码模式/cppy_/__pycache__/cp_util.cpython-38.pyc b/A 代码模式/cppy_/__pycache__/cp_util.cpython-38.pyc
deleted file mode 100644
index 6e5a57946fffc445f92a98fa3490fdd6133ea99f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3069
zcmcIm&2Jk;6rY)$wbzdG(KKz-FPawGV%h{$C@NK`5G`D&qC%;PmQ>buXOcQ`Y-e^%
z+Q>Pjh$2zGE<GTmL_$KjBd+`(a|J>32XM)W-y1t=oFZ<lHE(ui*KgkYz2AFpZ}s;V
z8Lpo${;gYmjQviF-B%8acQCR)AQV%)#^TP*WA4`?7Jd%mz|S(4eh%YMi8!YM42coO
zdCk_Mbru(N;TpR$6c<%a7w2>4x-Q0ji`O-uE3NhGK9yIIDy++RVE2yBVP{YkRo_Dv
z52=#s$GlGu%y2cJ1|RZxSPiLt*xRp$)qcz)YD68td_WykhcF*hqv|l`L+YeDs*d3w
zqw2VN0Xv7)2{nfKh~hU{`P3#nWmC(M)tQ~J_wDkbrl~4Dp48QN)9AUDs;1gMh;A7D
zz~6;-W<uR{`DT*NC$l=1X|wsjNtM`Sc_6cmW@V{iR7UTnob-3E0CNH3Dn|Aagk^1j
zti)sS&A<w8^EJNCR>W;~k$uQMVxO?KfQiA>R@kyL<CiwLi*!A0sA_$-ESzjKb=^s$
z6SWbzQZuplDz;IXsn)dPGlmY^V5Wp=>uh>?8Yaz?h0|9XO5aGV)uuj8hC@_ZwpyEj
z4+YZZRU9E3i4Tkq@L_HSaM$CEz=p)|&YZ%HWoumVb?zMzO0?O!SPS3~oLrOZ%)7Fj
z`*iwt<<7Y+K2{E$FxutQ#zK>94zku=7uFg}+PE-l)~dEFycaQm_U3wLI-(y3y2$8?
zfAIuXU1N)Uh?@gg_ZZuQ{AF+K4)P=0$gc?%!2BNk*8(eTIG0<6W4sd&Ic3Oi6GCk9
z^X0tB(=sA40^xG{zSVW*WUZQ6m&>d{z~aFqORLq2_b*K{ZHNhbJ?fv;wn7Z@1R{!L
zq?ckaN{&%fdK{xDkaIM;l(M|dR{1L8V8tr?l6}FDH6Yo;sLik8iI2l`O30=C+e&5;
zvOC8Jmq%<qZzOmMg^JCGMz;BPVo}~?8+?-v1u27?u5Sn@CSG%VX@kebXDP5NSj9<#
zgMxi!oX#P?a!jy@mDutbKxUHEdg^tvXR_8vlUjCV;@Ka|fe)=GqsIaap^D}Tgja<Q
z=$uG=G@c{F&ue~#j_**r#B<!7#`-xtc0e7&Q_52hDE}V;HAiuhIZlH3Y`hPYzTOe@
zBJM6=^a6K!+dX*xAGnDQZbYB5h$E^4pqVkmu)T=wdrsx6bZiH(hyXi-bxHtjUvy6d
zx{8sFL9~%7AkHzAG~A;qpdK6oiP~%u{RHF*rVLfaO%g;Q@V*>4NnvzFAIuq}7qhsq
zlMH)PrI<Eab?be814=SFpYezrVpmT%$vR3_ZyniQWuP`gR~lUr-a@g<LNUn}ktsKs
zIg3|K83MHPK%J+Jb0mCNc4FfODQ(H%7V2|VXP59!H*qCQI>^lgc6y8=$9Iex#ez6S
z_1~gsQW;gKMvns|E9%Y5c>Xa;elV{exV+JyX_M))L~UsptCb6@Rxe~OLPs!(Z5UUi
z4H$(YU*>TqJIKDhh8IXd1uf2A!4Vl%6UNO&$W6+t)NjxxE&_PRVuBHYMMysvhY8A&
zX|W<!0(b#1+retU*<y)8T^3VYd>k<j(OB!YEDgcvc-@@zbGp~2A^)Batj9Ea7w>h$
zJHpK*Ry|0PjTB-U*$~9Df=3msQ|_?6a}krt5#KdPIXV}k#`_2^wfWba-0(ZHE!$$0
zt%MeZa8CRHCb#jOGs0BxgH-ZIxk^oef)q}*2BB%}>2KdYUH|IOC%<k!`Q`Fh^(l;6
z4kj+lv<SVizkdFnSZ=Ee+PsAg7gZ|tWI<Od<(!j=nax~oeu-u(q6tDnuezMkS*vDa
zbbsY{;>fh>{tmdxnO40EqOW-QC6hEkgzGpUBaTv56fq)xj4z*rMzo`f(rLqU1?9C{
zXS@IN@jzjvbTQXRc%l<fosv4VSH%Z-)D2K6Z{EWy#pt;xNTkpkWPu|g*QLjDi9SQd
zmUIo>K^sb*U7YZTd&eYwqunAUqzycMAF|BNn=~DbWd_na(Z~#uz`RZ3Fp1Y8;u8It
zzKsa)xKG{nt%eppd+ER3&$%q_r&VQpY3Q^#x{IET&!&4$(wCN#bB$^}9@;KXq}lK9
zoI|TlqoS`7->rP7@zv88DPI(PvUyT_;Ji=&>U}i1&`_<KzCz|?R8J!MCI}_RPe?)W
YrRqG)hjURFg(WW|8Ra5bEJVeB0Bxe5b^rhX


From 36afa1d66950153147ce46d2b5a4e37ddaa403e0 Mon Sep 17 00:00:00 2001
From: Yao <1928814540@qq.com>
Date: Tue, 13 Aug 2024 16:19:57 +0800
Subject: [PATCH 3/3] =?UTF-8?q?refactor:=20=E4=BC=98=E5=8C=96=E4=BB=A3?=
 =?UTF-8?q?=E7=A0=81=EF=BC=8C=E6=8F=90=E9=AB=98=E5=8F=AF=E8=AF=BB=E6=80=A7?=
 =?UTF-8?q?=E5=92=8C=E6=95=88=E7=8E=87?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../11 基础结构/对象化/1 类对象.py  | 71 +++++++++++---
 .../对象化/2 字典对象.py               | 45 ++++++---
 A 代码模式/cppy/cp_util.py                | 95 ++++++++++++++++++-
 3 files changed, 181 insertions(+), 30 deletions(-)

diff --git a/A 代码模式/11 基础结构/对象化/1 类对象.py b/A 代码模式/11 基础结构/对象化/1 类对象.py
index 013e743..4d251b8 100644
--- a/A 代码模式/11 基础结构/对象化/1 类对象.py	
+++ b/A 代码模式/11 基础结构/对象化/1 类对象.py	
@@ -1,58 +1,101 @@
 from collections import Counter
 from cppy.cp_util import *
-    
+
 
 class DataStorageManager:
-    """ 数据模型 """    
-    def __init__(self, path_to_file):                
-        self._data = re_split( read_file(path_to_file) )
+    """
+    数据模型，读取文件内容，并将内容分割成单词。
+
+    Attributes:
+        _data: 单词列表。
+
+    Methods:
+        words (self): 返回分割后的单词列表。
+    """
+
+    def __init__(self, path_to_file):
+        self._data = re_split(read_file(path_to_file))
 
-    def words(self):        
+    def words(self):
+        """返回分割后的单词列表。"""
         return self._data
 
 
 class StopWordManager:
-    """ 停用词模型 """    
-    def __init__(self):        
+    """
+    停用词模型
+
+    Attributes:
+        _stop_words: 停用词列表
+
+    Methods:
+        is_stop_word (self, word): 判断给定单词是否为停用词。
+    """
+
+    def __init__(self):
         self._stop_words = get_stopwords()
 
     def is_stop_word(self, word):
+        """判断给定单词是否为停用词。"""
         return word in self._stop_words
 
 
 class WordFrequencyManager:
-    """ 词频模型 """    
+    """
+    词频模型，计算并管理单词的频率。
+
+    Attributes:
+        _word_freqs: 使用 Counter 存储单词及其出现次数。
+
+    Methods:
+        increment_count (self, word): 计算词频。
+        sorted(self): 返回按出现次数排序的单词列表。
+
+    """
+
     def __init__(self):
         self._word_freqs = Counter()
 
     def increment_count(self, word):
+        """计算词频。"""
         self._word_freqs[word] += 1
 
     def sorted(self):
+        """返回按出现次数排序的单词列表。"""
         return self._word_freqs.most_common()
 
 
 class WordFrequencyController:
+    """
+    控制器，控制整个流程，读取文件、处理停用词、计算词频并输出结果。
+
+    Attributes:
+        _storage_manager: DataStorageManager 实例，用于读取和处理文件内容。
+        _stop_word_manager: StopWordManager 实例，用于管理停用词。
+        _word_freq_manager: WordFrequencyManager 实例，用于计算和存储单词频率。
+
+    Methods:
+        run(self): 运行方法，遍历单词列表，过滤掉停用词，并计算每个单词的频率，最后输出结果。
+    """
+
     def __init__(self, path_to_file):
         self._storage_manager = DataStorageManager(path_to_file)
         self._stop_word_manager = StopWordManager()
         self._word_freq_manager = WordFrequencyManager()
 
     def run(self):
+        """运行方法，遍历单词列表，过滤掉停用词，并计算每个单词的频率，最后输出结果。"""
         for w in self._storage_manager.words():
             if not self._stop_word_manager.is_stop_word(w):
                 self._word_freq_manager.increment_count(w)
 
         word_freqs = self._word_freq_manager.sorted()
-        print_word_freqs(word_freqs)        
-
+        print_word_freqs(word_freqs)
 
 
-if __name__ == '__main__':    
+if __name__ == '__main__':
     WordFrequencyController(testfilepath).run()
-
-
 '''
 函数输入参数调用后，你的马上接住返回值
 类输入参数后实例化后，你可以需要的时候去访问你需要的数据（实例属性）
-'''    
\ No newline at end of file
+'''
diff --git a/A 代码模式/11 基础结构/对象化/2 字典对象.py b/A 代码模式/11 基础结构/对象化/2 字典对象.py
index 8fb9b14..9f3b1cc 100644
--- a/A 代码模式/11 基础结构/对象化/2 字典对象.py	
+++ b/A 代码模式/11 基础结构/对象化/2 字典对象.py	
@@ -1,29 +1,52 @@
 from cppy.cp_util import *
 
-def extract_words(obj, path_to_file):    
+
+def extract_words(obj, path_to_file):
+    """
+    从文件中提取单词并存储在对象的 'data' 字段中。
+
+    Args:
+        obj (dict): 存储数据的字典对象。
+        path_to_file (str): 文件路径。
+    """
     obj['data'] = extract_file_words(path_to_file)
 
+
 def increment_count(obj, w):
-    obj['freqs'][w] = 1 if w not in obj['freqs'] else obj['freqs'][w]+1
+    """
+    增加单词的计数。如果单词不存在，则将其计数设置为1。
 
+    Args:
+        obj (dict): 存储单词频率的字典对象。
+        w (str): 单词。
+    """
+    obj['freqs'][w] = 1 if w not in obj['freqs'] else obj['freqs'][w] + 1
+
+
+# 数据存储对象，包含初始化和获取单词的方法
 data_storage_obj = {
-    'data' : [],
-    'init' : lambda path_to_file : extract_words(data_storage_obj, path_to_file),
-    'words' : lambda : data_storage_obj['data']
+    'data': [],  # 存储单词列表
+    'init': lambda path_to_file: extract_words(data_storage_obj, path_to_file
+                                               ),  # 初始化方法，提取文件中的单词
+    'words': lambda: data_storage_obj['data']  # 获取单词列表的方法
 }
 
+# 单词频率对象，包含增加计数和排序的方法
 word_freqs_obj = {
-    'freqs' : {},
-    'increment_count' : lambda w : increment_count(word_freqs_obj, w),
-    'sorted' : lambda : sort_dict(word_freqs_obj['freqs']) 
+    'freqs': {},  # 存储单词频率的字典
+    'increment_count':
+    lambda w: increment_count(word_freqs_obj, w),  # 增加单词计数的方法
+    'sorted': lambda: sort_dict(word_freqs_obj['freqs'])  # 获取排序后的单词频率的方法
 }
 
-
 if __name__ == '__main__':
-    data_storage_obj['init']( testfilepath )    
+    # 初始化数据存储对象，提取文件中的单词
+    data_storage_obj['init'](testfilepath)
 
+    # 遍历单词列表，增加单词的计数
     for word in data_storage_obj['words']():
         word_freqs_obj['increment_count'](word)
 
+    # 获取排序后的单词频率并打印
     word_freqs = word_freqs_obj['sorted']()
-    print_word_freqs(word_freqs)    
\ No newline at end of file
+    print_word_freqs(word_freqs)
diff --git a/A 代码模式/cppy/cp_util.py b/A 代码模式/cppy/cp_util.py
index b85ec2a..13052b1 100644
--- a/A 代码模式/cppy/cp_util.py	
+++ b/A 代码模式/cppy/cp_util.py	
@@ -23,18 +23,45 @@ testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)
 #  项目函数
 ################################################################################
 def read_file(path_to_file):
+    """
+    读取指定文件的内容。
+
+    Args:
+        path_to_file (str): 文件路径。
+
+    Returns:
+        str: 文件内容。
+    """
     with open(path_to_file, encoding='utf-8') as f:
         data = f.read()
     return data
 
 
 def re_split(data):
+    """
+    使用正则表达式分割字符串，将非字母数字字符（含下划线）替换为空格，并将所有字符转换为小写。
+
+    Args:
+        data (str): 输入字符串。
+
+    Returns:
+        list: 分割后的单词列表。
+    """
     pattern = re.compile('[\W_]+')
     data = pattern.sub(' ', data).lower()
     return data.split()
 
 
 def get_stopwords(path_to_file=stopwordfilepath):
+    """
+    获取停用词列表。
+
+    Args:
+        path_to_file (str): 停用词文件路径，默认为 stopwordfilepath。
+
+    Returns:
+        list: 停用词列表。
+    """
     with open(path_to_file, encoding='utf-8') as f:
         data = f.read().split(',')
     data.extend(list(string.ascii_lowercase))
@@ -42,8 +69,16 @@ def get_stopwords(path_to_file=stopwordfilepath):
 
 
 def get_chunks(file_path=testfilepath, chunk_size=1000):
-    # 读取文件内容，分割文件内容为多个块，每个块由一个进程处理
-    # 可以根据实际情况调整块大小
+    """
+    将文件内容分割成多个块。
+
+    Args:
+        file_path (str): 文件路径，默认为 testfilepath。
+        chunk_size (int): 每个块的大小，默认为 1000。
+
+    Returns:
+        list: 分割后的块列表。
+    """
     content = re_split(read_file(file_path))
     chunks = [
         content[i:i + chunk_size] for i in range(0, len(content), chunk_size)
@@ -52,23 +87,58 @@ def get_chunks(file_path=testfilepath, chunk_size=1000):
 
 
 def extract_file_words(path_to_file):
+    """
+    提取文件中的单词，去除停用词和长度小于3的单词。
+
+    Args:
+        path_to_file (str): 文件路径。
+
+    Returns:
+        list: 提取后的单词列表。
+    """
     word_list = re_split(read_file(path_to_file))
     stop_words = get_stopwords()
-    return [w for w in word_list if (not w in stop_words) and len(w) >= 3]
+    return [w for w in word_list if (w not in stop_words) and len(w) >= 3]
 
 
 def extract_str_words(data_str):
+    """
+    提取字符串中的单词，去除停用词和长度小于3的单词。
+
+    Args:
+        data_str (str): 输入字符串。
+
+    Returns:
+        list: 提取后的单词列表。
+    """
     word_list = re_split(data_str)
     stop_words = get_stopwords()
-    return [w for w in word_list if (not w in stop_words) and len(w) >= 3]
+    return [w for w in word_list if (w not in stop_words) and len(w) >= 3]
 
 
 def count_word(word, word_freqs, stopwords):
+    """
+    统计单词频率。
+
+    Args:
+        word (str): 单词。
+        word_freqs (dict): 单词频率字典。
+        stopwords (list): 停用词列表。
+    """
     if word not in stopwords:
         word_freqs[word] = word_freqs.get(word, 0) + 1
 
 
 def get_frequencies(word_list):
+    """
+    获取单词频率。
+
+    Args:
+        word_list (list): 单词列表。
+
+    Returns:
+        dict: 单词频率字典。
+    """
     word_freqs = {}
     for word in word_list:
         word_freqs[word] = word_freqs.get(word, 0) + 1
@@ -76,11 +146,26 @@ def get_frequencies(word_list):
 
 
 def sort_dict(word_freq):
+    """
+    对字典进行排序。
+
+    Args:
+        word_freq (dict): 单词频率字典。
+
+    Returns:
+        list: 排序后的单词频率列表。
+    """
     return sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)
-    # return sorted( word_freq, key=lambda x: x[1], reverse=True )
 
 
 def print_word_freqs(word_freqs, n=10):
+    """
+    打印单词频率。
+
+    Args:
+        word_freqs (list): 单词频率列表。
+        n (int): 打印的单词数量，默认为 10。
+    """
     for (w, c) in word_freqs[:n]:
         print(w, '-', c)