Merge pull request 'dev' (#17 ) from pcz4qfnkl/CodePattern:dev into dev

refactor: 优化代码，提高可读性和效率
refactor(code): 优化代码，提高可读性和效率
20 changed files with 929 additions and 140 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,4 @@
+log.txt
+/test
+/.venv
+__pycache__
--- a/最基础的写法.py
+++ b/最基础的写法.py
@ -7,10 +7,9 @@ with open(stopwordfilepath, encoding='utf-8') as f:
 for letter in 'abcdefghijklmnopqrstuvwxyz':
    stop_words.append(letter)

-
 # 读文件，逐行扫描文本，发现词，确定不是停用词，计数
 word_freqs = []
-for line in open( testfilepath, encoding='utf-8' ):
+for line in open(testfilepath, encoding='utf-8'):
    start_char = None
    i = 0
    for c in line:
@ -42,10 +41,9 @@ for line in open( testfilepath, encoding='utf-8' ):
 # 使用冒泡排序对词频进行排序
 n = len(word_freqs)
 for i in range(n):
-    for j in range(0, n-i-1):
-        if word_freqs[j][1] < word_freqs[j+1][1]:
-            word_freqs[j], word_freqs[j+1] = word_freqs[j+1], word_freqs[j]            
-
+    for j in range(0, n - i - 1):
+        if word_freqs[j][1] < word_freqs[j + 1][1]:
+            word_freqs[j], word_freqs[j + 1] = word_freqs[j + 1], word_freqs[j]

 # 打印频率最高的前10个词
 for tf in word_freqs[:10]:
--- a/加入语言特性.py
+++ b/加入语言特性.py
@ -1,4 +1,4 @@
-from cppy.cp_util import stopwordfilepath,testfilepath
+from cppy.cp_util import stopwordfilepath, testfilepath
 import string
 from collections import Counter

@ -8,7 +8,7 @@ stop_words.update(list(string.ascii_lowercase))

 # 读取文件并计算单词频率
 word_freqs = Counter()
-with open(testfilepath,encoding = 'utf8') as f:
+with open(testfilepath, encoding='utf8') as f:
    for line_num, line in enumerate(f, 1):
        start_char = None
        for i, c in enumerate(line):
@ -23,7 +23,6 @@ with open(testfilepath,encoding = 'utf8') as f:
 # 打印前10个最常见的单词
 for word, freq in word_freqs.most_common(10):
    print(f"{word}-{freq}")
-
 '''
 相比 A01
 使用collections.Counter来计数单词频率，从而简化了代码并提高了效率。
--- a/代码模式/10
+++ b/代码模式/10
@ -1,11 +1,13 @@
-import re, collections
-from cppy.cp_util import stopwordfilepath,testfilepath
-
-stopwords = set(open( stopwordfilepath,encoding = 'utf8' ).read().split(','))
-words = re.findall('[a-z]{2,}', open( testfilepath,encoding = 'utf8').read().lower())
-counts = collections.Counter( w for w in words if w not in stopwords )
-for (w, c) in counts.most_common(10) :  print(w, '-', c)
+import re
+import collections
+from cppy.cp_util import stopwordfilepath, testfilepath

+stopwords = set(open(stopwordfilepath, encoding='utf8').read().split(','))
+words = re.findall('[a-z]{2,}',
+                   open(testfilepath, encoding='utf8').read().lower())
+counts = collections.Counter(w for w in words if w not in stopwords)
+for (w, c) in counts.most_common(10):
+    print(w, '-', c)
 '''
 熟练的软件工程师，会如此简单完成任务
 后面的例子，我们必须变的啰嗦一些，不能用这种太 hacker 的写法
--- a/基础结构/对象化/1
+++ b/基础结构/对象化/1
@ -3,42 +3,88 @@ from cppy.cp_util import *


 class DataStorageManager:
-    """ 数据模型 """    
+    """
+    数据模型，读取文件内容，并将内容分割成单词。
+
+    Attributes:
+        _data: 单词列表。
+
+    Methods:
+        words (self): 返回分割后的单词列表。
+    """
+
    def __init__(self, path_to_file):
-        self._data = re_split( read_file(path_to_file) )
+        self._data = re_split(read_file(path_to_file))

    def words(self):
+        """返回分割后的单词列表。"""
        return self._data


 class StopWordManager:
-    """ 停用词模型 """    
+    """
+    停用词模型
+
+    Attributes:
+        _stop_words: 停用词列表
+
+    Methods:
+        is_stop_word (self, word): 判断给定单词是否为停用词。
+    """
+
    def __init__(self):
        self._stop_words = get_stopwords()

    def is_stop_word(self, word):
+        """判断给定单词是否为停用词。"""
        return word in self._stop_words


 class WordFrequencyManager:
-    """ 词频模型 """    
+    """
+    词频模型，计算并管理单词的频率。
+
+    Attributes:
+        _word_freqs: 使用 Counter 存储单词及其出现次数。
+
+    Methods:
+        increment_count (self, word): 计算词频。
+        sorted(self): 返回按出现次数排序的单词列表。
+
+    """
+
    def __init__(self):
        self._word_freqs = Counter()

    def increment_count(self, word):
+        """计算词频。"""
        self._word_freqs[word] += 1

    def sorted(self):
+        """返回按出现次数排序的单词列表。"""
        return self._word_freqs.most_common()


 class WordFrequencyController:
+    """
+    控制器，控制整个流程，读取文件、处理停用词、计算词频并输出结果。
+
+    Attributes:
+        _storage_manager: DataStorageManager 实例，用于读取和处理文件内容。
+        _stop_word_manager: StopWordManager 实例，用于管理停用词。
+        _word_freq_manager: WordFrequencyManager 实例，用于计算和存储单词频率。
+
+    Methods:
+        run(self): 运行方法，遍历单词列表，过滤掉停用词，并计算每个单词的频率，最后输出结果。
+    """
+
    def __init__(self, path_to_file):
        self._storage_manager = DataStorageManager(path_to_file)
        self._stop_word_manager = StopWordManager()
        self._word_freq_manager = WordFrequencyManager()

    def run(self):
+        """运行方法，遍历单词列表，过滤掉停用词，并计算每个单词的频率，最后输出结果。"""
        for w in self._storage_manager.words():
            if not self._stop_word_manager.is_stop_word(w):
                self._word_freq_manager.increment_count(w)
@ -47,11 +93,8 @@ class WordFrequencyController:
        print_word_freqs(word_freqs)


-
 if __name__ == '__main__':
    WordFrequencyController(testfilepath).run()
-
-
 '''
 函数输入参数调用后，你的马上接住返回值
 类输入参数后实例化后，你可以需要的时候去访问你需要的数据（实例属性）
--- a/基础结构/对象化/2
+++ b/基础结构/对象化/2
@ -1,29 +1,52 @@
 from cppy.cp_util import *

+
 def extract_words(obj, path_to_file):
+    """
+    从文件中提取单词并存储在对象的 'data' 字段中。
+
+    Args:
+        obj (dict): 存储数据的字典对象。
+        path_to_file (str): 文件路径。
+    """
    obj['data'] = extract_file_words(path_to_file)

+
 def increment_count(obj, w):
-    obj['freqs'][w] = 1 if w not in obj['freqs'] else obj['freqs'][w]+1
+    """
+    增加单词的计数。如果单词不存在，则将其计数设置为1。

+    Args:
+        obj (dict): 存储单词频率的字典对象。
+        w (str): 单词。
+    """
+    obj['freqs'][w] = 1 if w not in obj['freqs'] else obj['freqs'][w] + 1
+
+
+# 数据存储对象，包含初始化和获取单词的方法
 data_storage_obj = {
-    'data' : [],
-    'init' : lambda path_to_file : extract_words(data_storage_obj, path_to_file),
-    'words' : lambda : data_storage_obj['data']
+    'data': [],  # 存储单词列表
+    'init': lambda path_to_file: extract_words(data_storage_obj, path_to_file
+                                               ),  # 初始化方法，提取文件中的单词
+    'words': lambda: data_storage_obj['data']  # 获取单词列表的方法
 }

+# 单词频率对象，包含增加计数和排序的方法
 word_freqs_obj = {
-    'freqs' : {},
-    'increment_count' : lambda w : increment_count(word_freqs_obj, w),
-    'sorted' : lambda : sort_dict(word_freqs_obj['freqs']) 
+    'freqs': {},  # 存储单词频率的字典
+    'increment_count':
+    lambda w: increment_count(word_freqs_obj, w),  # 增加单词计数的方法
+    'sorted': lambda: sort_dict(word_freqs_obj['freqs'])  # 获取排序后的单词频率的方法
 }

-
 if __name__ == '__main__':
-    data_storage_obj['init']( testfilepath )    
+    # 初始化数据存储对象，提取文件中的单词
+    data_storage_obj['init'](testfilepath)

+    # 遍历单词列表，增加单词的计数
    for word in data_storage_obj['words']():
        word_freqs_obj['increment_count'](word)

+    # 获取排序后的单词频率并打印
    word_freqs = word_freqs_obj['sorted']()
    print_word_freqs(word_freqs)
--- a/代码模式/cppy/cp_util.py
+++ b/代码模式/cppy/cp_util.py
@ -0,0 +1,192 @@
+import site
+import os, re, time
+import string, operator
+
+################################################################################
+#  变量
+################################################################################
+# Name of the sample text inside the package data directory. Each assignment
+# overrides the previous one, so only the last value ('Prey.txt') is used.
+testfilename = 'test.txt'
+testfilename = 'pride-and-prejudice.txt'
+testfilename = 'Prey.txt'
+
+# Database file name; not used in the visible part of this module.
+db_filename = "tf.db"
+
+# Use the last site-packages entry whose path contains 'package' as the base.
+# NOTE(review): if no entry matches, basePath is never bound and the
+# os.path.join calls below raise NameError -- confirm this cannot happen.
+site_packages = site.getsitepackages()
+for package in site_packages:
+    if 'package' in package:
+        basePath = package
+stopwordfilepath = os.path.join(basePath, 'cppy', 'data', 'stop_words.txt')
+testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)
+
+
+################################################################################
+#  项目函数
+################################################################################
+def read_file(path_to_file):
+    """
+    Return the whole content of a UTF-8 encoded text file.
+
+    Args:
+        path_to_file (str): Path of the file to read.
+
+    Returns:
+        str: The complete file content.
+    """
+    with open(path_to_file, encoding='utf-8') as source:
+        return source.read()
+
+
+def re_split(data):
+    """
+    Split a string into lowercase words.
+
+    Every run of non-word characters and underscores is replaced by a single
+    space, the text is lowercased, and the result is split on whitespace.
+
+    Args:
+        data (str): Input string.
+
+    Returns:
+        list: The lowercase words, in order of appearance.
+    """
+    # Raw string: '\W' in a normal literal is an invalid escape sequence and
+    # triggers a warning on recent Python versions.
+    pattern = re.compile(r'[\W_]+')
+    data = pattern.sub(' ', data).lower()
+    return data.split()
+
+
+def get_stopwords(path_to_file=stopwordfilepath):
+    """
+    Load the stop words and add the single lowercase ASCII letters.
+
+    Args:
+        path_to_file (str): Path of the comma-separated stop-word file.
+            Defaults to stopwordfilepath.
+
+    Returns:
+        list: Stop words from the file followed by 'a' through 'z'.
+    """
+    with open(path_to_file, encoding='utf-8') as source:
+        words = source.read().split(',')
+    return words + list(string.ascii_lowercase)
+
+
+def get_chunks(file_path=testfilepath, chunk_size=1000):
+    """
+    Split the words of a file into consecutive chunks.
+
+    Args:
+        file_path (str): Path of the file to read. Defaults to testfilepath.
+        chunk_size (int): Maximum number of words per chunk. Defaults to 1000.
+
+    Returns:
+        list: Word lists; every chunk except possibly the last one holds
+            exactly chunk_size words.
+    """
+    words = re_split(read_file(file_path))
+    chunks = []
+    for start in range(0, len(words), chunk_size):
+        chunks.append(words[start:start + chunk_size])
+    return chunks
+
+
+def extract_file_words(path_to_file):
+    """
+    Read a file and return its words without stop words and short words.
+
+    Args:
+        path_to_file (str): Path of the file to read.
+
+    Returns:
+        list: Lowercase words of length >= 3 that are not stop words, in
+            order of appearance.
+    """
+    word_list = re_split(read_file(path_to_file))
+    # A set gives O(1) membership tests; the list returned by get_stopwords()
+    # would otherwise be scanned once for every word of the text.
+    stop_words = set(get_stopwords())
+    return [w for w in word_list if len(w) >= 3 and w not in stop_words]
+
+
+def extract_str_words(data_str):
+    """
+    Return the words of a string without stop words and short words.
+
+    Args:
+        data_str (str): Input string.
+
+    Returns:
+        list: Lowercase words of length >= 3 that are not stop words, in
+            order of appearance.
+    """
+    word_list = re_split(data_str)
+    # A set gives O(1) membership tests; the list returned by get_stopwords()
+    # would otherwise be scanned once for every word of the text.
+    stop_words = set(get_stopwords())
+    return [w for w in word_list if len(w) >= 3 and w not in stop_words]
+
+
+def count_word(word, word_freqs, stopwords):
+    """
+    Add one occurrence of a word to a frequency table, skipping stop words.
+
+    Args:
+        word (str): The word to count.
+        word_freqs (dict): Word -> count mapping, updated in place.
+        stopwords (list): Words that must not be counted.
+    """
+    if word in stopwords:
+        return
+    previous = word_freqs.get(word, 0)
+    word_freqs[word] = previous + 1
+
+
+def get_frequencies(word_list):
+    """
+    Count how often each word occurs.
+
+    Args:
+        word_list (list): Words to count.
+
+    Returns:
+        dict: Word -> number of occurrences, keyed in first-seen order.
+    """
+    counts = {}
+    for token in word_list:
+        if token in counts:
+            counts[token] += 1
+        else:
+            counts[token] = 1
+    return counts
+
+
+def sort_dict(word_freq):
+    """
+    Return the items of a frequency dictionary, most frequent first.
+
+    Args:
+        word_freq (dict): Word -> count mapping.
+
+    Returns:
+        list: (word, count) tuples ordered by descending count; entries with
+            equal counts keep their dictionary order.
+    """
+    pairs = list(word_freq.items())
+    pairs.sort(key=lambda pair: pair[1], reverse=True)
+    return pairs
+
+
+def print_word_freqs(word_freqs, n=10):
+    """
+    Print the first n entries of a word-frequency list as "word - count".
+
+    Args:
+        word_freqs (list): (word, count) pairs.
+        n (int): Number of entries to print. Defaults to 10.
+    """
+    for entry in word_freqs[:n]:
+        word, count = entry
+        print(word, '-', count)
+
+
+################################################################################
+#  通用工具
+################################################################################
+
+
+def timing_decorator(func):
+    """
+    Decorate a function so that every call prints its running time.
+
+    Args:
+        func: The function to time.
+
+    Returns:
+        A wrapper that calls func, prints the elapsed time in milliseconds
+        and returns func's result.
+    """
+    import functools  # local import: the module does not import functools
+
+    @functools.wraps(func)  # keep func's name and docstring on the wrapper
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        run_time = time.time() - start_time
+        # The value is multiplied by 1000, so the unit is milliseconds
+        # ("毫秒"), not seconds ("秒") as the old message claimed.
+        print(f"{func.__name__} 运行时间: {run_time*1000:.2f} 毫秒")
+        return result
+
+    return wrapper
+
+
+def test():
+    """Print a greeting; a quick check that the cppy package is importable."""
+    greeting = 'cppy welcome'
+    print(greeting)
--- a/代码模式/cppy_/data/Prey.txt
+++ b/代码模式/cppy_/data/Prey.txt
--- a/代码模式/cppy_/data/pride-and-prejudice.txt
+++ b/代码模式/cppy_/data/pride-and-prejudice.txt
--- a/代码模式/cppy_/data/stop_words.txt
+++ b/代码模式/cppy_/data/stop_words.txt
--- a/代码模式/cppy_/data/test.txt
+++ b/代码模式/cppy_/data/test.txt
--- a/代码模式/cppy_/pycache/cp_util.cpython-38.pyc
+++ b/代码模式/cppy_/pycache/cp_util.cpython-38.pyc
--- a/代码模式/cppy_/cp_util.py
+++ b/代码模式/cppy_/cp_util.py
@ -1,93 +0,0 @@
-
-import site
-import os,re,time
-import string,operator
-
-################################################################################
-#  变量
-################################################################################
-testfilename = 'test.txt'
-testfilename = 'pride-and-prejudice.txt'
-testfilename = 'Prey.txt'
-
-db_filename = "tf.db"  
-
-site_packages = site.getsitepackages()
-for package in site_packages:
-    if 'package' in  package:
-        basePath = package
-stopwordfilepath = os.path.join(basePath, 'cppy','data','stop_words.txt')
-testfilepath = os.path.join(basePath, 'cppy','data',testfilename )
-
-
-################################################################################
-#  项目函数
-################################################################################
-def read_file(path_to_file):    
-    with open(path_to_file,encoding='utf-8') as f:
-        data = f.read()
-    return data
-
-def re_split( data ):
-    pattern = re.compile('[\W_]+')
-    data = pattern.sub(' ', data).lower()
-    return data.split()
-
-def get_stopwords( path_to_file = stopwordfilepath ):
-    with open(path_to_file,encoding='utf-8') as f:
-        data = f.read().split(',')        
-    data.extend(list(string.ascii_lowercase))
-    return data
-
-def get_chunks( file_path = testfilepath, chunk_size = 1000):
-    # 读取文件内容，分割文件内容为多个块，每个块由一个进程处理
-    # 可以根据实际情况调整块大小
-    content = re_split(read_file(file_path))         
-    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
-    return chunks
-
-def extract_file_words(path_to_file):
-    word_list = re_split( read_file(path_to_file) )
-    stop_words = get_stopwords()
-    return [ w for w in word_list if ( not w in stop_words ) and len(w) >= 3 ]
-
-def extract_str_words(data_str):
-    word_list = re_split( data_str )
-    stop_words = get_stopwords()
-    return [ w for w in word_list if ( not w in stop_words ) and len(w) >= 3 ]
-
-def count_word(word, word_freqs, stopwords):
-    if word not in stopwords:
-        word_freqs[word] = word_freqs.get(word, 0) + 1
-
-def get_frequencies(word_list):    
-    word_freqs = {}  
-    for word in word_list:  
-        word_freqs[word] = word_freqs.get(word, 0) + 1    
-    return word_freqs
-
-def sort_dict (word_freq):
-    return sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)
-    # return sorted( word_freq, key=lambda x: x[1], reverse=True )
-
-def print_word_freqs( word_freqs, n = 10):
-    for (w, c) in word_freqs[ :n ]:
-        print( w, '-', c )
-
-
-################################################################################
-#  通用工具
-################################################################################
-
-def timing_decorator(func):
-    def wrapper(*args, **kwargs):
-        start_time = time.time()  # 记录开始时间
-        result = func(*args, **kwargs)  # 调用原始函数
-        end_time = time.time()  # 记录结束时间
-        run_time = end_time - start_time  # 计算运行时间
-        print(f"{func.__name__} 运行时间: {run_time*1000:.2f} 秒")
-        return result
-    return wrapper
-
-def  test():
-    print( 'cppy welcome' )
--- a/高性能模式/000
+++ b/高性能模式/000
@ -0,0 +1,74 @@
+"""
+根据提供的关键词列表，爬取天水市人民政府网站上指定日期内与关键词相关的新闻的标题，并将其存储至数据库中。
+
+考虑到相关因素，因此本代码只爬取前10页的新闻内容，即最多100条新闻作为测试。
+
+此方法为普通做法，即使用requests库通过Post请求爬取网页内容，再使用json提取新闻内容。
+
+注意：本代码中的关键词列表默认为['灾害']，日期范围默认为2018年1月1日至2018年12月31日。
+
+Args:
+    keywords: 用于搜索新闻的关键词列表
+    begin_date: 开始日期，用于搜索
+    end_date: 结束日期，用于搜索
+    size: 一次请求返回的新闻或政策的最大数量
+
+Examples:
+    ```
+    main(keywords=['灾害'],
+         begin_date='2018-01-01',
+         end_date='2018-12-31',
+         size=10)
+    ```
+"""
+import util
+import logging
+from typing import List
+
+import tqdm
+
+
+@util.timeit
+def main(keywords: List[str], begin_date: str, end_date: str, size: int = 10):
+    """
+    Crawl the news titles matching the given keywords, one request at a time.
+
+    Args:
+        keywords: Keywords to search for.
+        begin_date: Start date of the search range.
+        end_date: End date of the search range.
+        size: Maximum number of news items returned by one request.
+    """
+    logging.basicConfig(level=logging.INFO,
+                        format="%(asctime)s - %(levelname)s - %(message)s",
+                        datefmt="%Y-%m-%d %H:%M:%S",
+                        filename='log.txt',
+                        encoding='utf-8')
+
+    logging.info("开始运行普通爬取")
+
+    spider = util.Spider(keywords=keywords,
+                         begin_date=begin_date,
+                         end_date=end_date,
+                         size=size)
+
+    # Only the first 10 result pages of every keyword are crawled.
+    page_count = 10
+    title_list = []
+    # The total covers every keyword, so the bar cannot overflow when more
+    # than one keyword is given; the with-block closes it even on errors.
+    with tqdm.tqdm(total=size * page_count * len(keywords),
+                   desc='普通爬取进度',
+                   unit='条',
+                   ncols=80) as pbar:
+        for keyword in keywords:
+            for current in range(1, page_count + 1):
+                logging.info('keyword: %s, current: %s', keyword, current)
+                config = spider.get_config(keyword, current)
+                data = spider.fetch(config)
+                title_list += spider.parse(data)
+                pbar.update(size)
+
+    spider.save(title_list)
+    logging.info("爬取完成")
+
+
+if __name__ == "__main__":
+    main(keywords=['灾害'],
+         begin_date='2018-01-01',
+         end_date='2018-12-31',
+         size=10)
--- a/高性能模式/010
+++ b/高性能模式/010
@ -0,0 +1,86 @@
+"""
+根据提供的关键词列表，爬取天水市人民政府网站上指定日期内与关键词相关的新闻的标题，并将其存储至数据库中。
+
+考虑到相关因素，因此本代码只爬取前10页的新闻内容，即最多100条新闻作为测试。
+
+此方法为多进程做法，即使用多进程并发爬取网页内容，再使用json提取新闻内容。
+
+注意：本代码中的关键词列表默认为['灾害']，日期范围默认为2018年1月1日至2018年12月31日。
+
+Args:
+    keywords: 用于搜索新闻的关键词列表
+    begin_date: 开始日期，用于搜索
+    end_date: 结束日期，用于搜索
+    size: 一次请求返回的新闻或政策的最大数量
+
+Examples:
+    ```
+    main(keywords=['灾害'],
+         begin_date='2018-01-01',
+         end_date='2018-12-31',
+         size=10)
+    ```
+"""
+import util
+import logging
+from typing import List
+import multiprocessing
+
+import tqdm
+
+
+@util.timeit
+def main(keywords: List[str], begin_date: str, end_date: str, size: int = 10):
+    """
+    Crawl the news titles matching the given keywords with a process pool.
+
+    Args:
+        keywords: Keywords to search for.
+        begin_date: Start date of the search range.
+        end_date: End date of the search range.
+        size: Maximum number of news items returned by one request.
+    """
+    logging.basicConfig(level=logging.INFO,
+                        format="%(asctime)s - %(levelname)s - %(message)s",
+                        datefmt="%Y-%m-%d %H:%M:%S",
+                        filename='log.txt',
+                        encoding='utf-8')
+
+    # The old message said "普通做法" (plain approach), copied from the
+    # sequential script.
+    logging.info("开始运行多进程爬取")
+
+    spider = util.Spider(keywords=keywords,
+                         begin_date=begin_date,
+                         end_date=end_date,
+                         size=size)
+
+    # Only the first 10 result pages of every keyword are crawled.
+    page_count = 10
+    title_list = []
+    # The total covers every keyword; the with-block closes the bar on errors.
+    with tqdm.tqdm(total=size * page_count * len(keywords),
+                   desc='多进程爬取进度',
+                   unit='条',
+                   ncols=80) as pbar:
+        with multiprocessing.Pool(processes=5) as pool:
+            results = []
+            for keyword in keywords:
+                for current in range(1, page_count + 1):
+                    logging.info('keyword: %s, current: %s', keyword, current)
+                    config = spider.get_config(keyword, current)
+                    results.append(pool.apply_async(spider.fetch, (config, )))
+
+            # Results are collected in the parent process only, so the bar
+            # is never updated concurrently and needs no lock.
+            for result in results:
+                title_list += spider.parse(result.get())
+                pbar.update(size)
+
+    spider.save(title_list)
+    logging.info("爬取完成")
+
+
+if __name__ == "__main__":
+    main(keywords=['灾害'],
+         begin_date='2018-01-01',
+         end_date='2018-12-31',
+         size=10)
--- a/高性能模式/020
+++ b/高性能模式/020
@ -0,0 +1,89 @@
+"""
+根据提供的关键词列表，爬取天水市人民政府网站上指定日期内与关键词相关的新闻的标题，并将其存储至数据库中。
+
+考虑到相关因素，因此本代码只爬取前10页的新闻内容，即最多100条新闻作为测试。
+
+此方法为多线程做法，即使用多线程并行爬取网页内容，再使用json提取新闻内容。
+
+注意：本代码中的关键词列表默认为['灾害']，日期范围默认为2018年1月1日至2018年12月31日。
+
+Args:
+    keywords: 用于搜索新闻的关键词列表
+    begin_date: 开始日期，用于搜索
+    end_date: 结束日期，用于搜索
+    size: 一次请求返回的新闻或政策的最大数量
+
+Examples:
+    ```
+    main(keywords=['灾害'],
+         begin_date='2018-01-01',
+         end_date='2018-12-31',
+         size=10)
+    ```
+"""
+
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+import util
+import logging
+from typing import List
+
+import tqdm
+
+
+@util.timeit
+def main(keywords: List[str], begin_date: str, end_date: str, size: int = 10):
+    """
+    Crawl the news titles matching the given keywords with a thread pool.
+
+    Args:
+        keywords: Keywords to search for.
+        begin_date: Start date of the search range.
+        end_date: End date of the search range.
+        size: Maximum number of news items returned by one request.
+    """
+    logging.basicConfig(level=logging.INFO,
+                        format="%(asctime)s - %(levelname)s - %(message)s",
+                        datefmt="%Y-%m-%d %H:%M:%S",
+                        filename='log.txt',
+                        encoding='utf-8')
+
+    logging.info("开始运行多线程爬取")
+
+    spider = util.Spider(keywords=keywords,
+                         begin_date=begin_date,
+                         end_date=end_date,
+                         size=size)
+
+    # Only the first 10 result pages of every keyword are crawled.
+    page_count = 10
+    title_list = []
+    tasks = []
+    # The total covers every keyword; the with-block closes the bar on errors.
+    with tqdm.tqdm(total=size * page_count * len(keywords),
+                   desc='多线程爬取进度',
+                   unit='条',
+                   ncols=80) as pbar:
+        with ThreadPoolExecutor(max_workers=5) as executor:
+            for keyword in keywords:
+                for current in range(1, page_count + 1):
+                    logging.info('keyword: %s, current: %s', keyword, current)
+                    config = spider.get_config(keyword, current)
+                    tasks.append(executor.submit(spider.fetch, config))
+
+            # Advance the bar when a request has finished, not when it is
+            # submitted; otherwise the bar is full before any page arrives.
+            # Only this thread touches the bar, so no lock is required.
+            for future in as_completed(tasks):
+                title_list += spider.parse(future.result())
+                pbar.update(size)
+
+    spider.save(title_list)
+    logging.info("爬取完成")
+
+
+if __name__ == "__main__":
+    main(keywords=['灾害'],
+         begin_date='2018-01-01',
+         end_date='2018-12-31',
+         size=10)
--- a/高性能模式/030
+++ b/高性能模式/030
@ -0,0 +1,89 @@
+"""
+根据提供的关键词列表，爬取天水市人民政府网站上指定日期内与关键词相关的新闻的标题，并将其存储至数据库中。
+
+考虑到相关因素，因此本代码只爬取前10页的新闻内容，即最多100条新闻作为测试。
+
+此方法为协程做法，即使用gevent库通过协程并发爬取网页内容，再使用json提取新闻内容。
+
+注意：本代码中的关键词列表默认为['灾害']，日期范围默认为2018年1月1日至2018年12月31日。
+
+Args:
+    keywords: 用于搜索新闻的关键词列表
+    begin_date: 开始日期，用于搜索
+    end_date: 结束日期，用于搜索
+    size: 一次请求返回的新闻或政策的最大数量
+
+Examples:
+    ```
+    main(keywords=['灾害'],
+         begin_date='2018-01-01',
+         end_date='2018-12-31',
+         size=10)
+    ```
+"""
+
+import gevent
+from gevent import monkey
+
+# 打补丁，使标准库能够与gevent协同工作
+monkey.patch_all()
+
+import util
+import logging
+from typing import List
+
+import tqdm
+
+
+@util.timeit
+def main(keywords: List[str], begin_date: str, end_date: str, size: int = 10):
+    """
+    Crawl the news titles matching the given keywords with gevent greenlets.
+
+    Args:
+        keywords: Keywords to search for.
+        begin_date: Start date of the search range.
+        end_date: End date of the search range.
+        size: Maximum number of news items returned by one request.
+    """
+    logging.basicConfig(level=logging.INFO,
+                        format="%(asctime)s - %(levelname)s - %(message)s",
+                        datefmt="%Y-%m-%d %H:%M:%S",
+                        filename='log.txt',
+                        encoding='utf-8')
+
+    logging.info("开始运行协程爬取")
+
+    spider = util.Spider(keywords=keywords,
+                         begin_date=begin_date,
+                         end_date=end_date,
+                         size=size)
+
+    # Only the first 10 result pages of every keyword are crawled.
+    page_count = 10
+    title_list = []
+    # The total covers every keyword; the with-block closes the bar on errors.
+    with tqdm.tqdm(total=size * page_count * len(keywords),
+                   desc='协程爬取进度',
+                   unit='条',
+                   ncols=80) as pbar:
+
+        def fetch_and_parse(keyword, current):
+            """Fetch one result page, collect its titles, advance the bar."""
+            logging.info('keyword: %s, current: %s', keyword, current)
+            config = spider.get_config(keyword, current)
+            data = spider.fetch(config)
+            title_list.extend(spider.parse(data))
+            pbar.update(size)
+
+        # One greenlet per (keyword, page); all of them are started at once.
+        jobs = [
+            gevent.spawn(fetch_and_parse, keyword, current)
+            for keyword in keywords
+            for current in range(1, page_count + 1)
+        ]
+        gevent.joinall(jobs)
+
+    spider.save(title_list)
+    logging.info("爬取完成")
+
+
+if __name__ == "__main__":
+    main(keywords=['灾害'],
+         begin_date='2018-01-01',
+         end_date='2018-12-31',
+         size=10)
--- a/高性能模式/040
+++ b/高性能模式/040
@ -0,0 +1,85 @@
+"""
+根据提供的关键词列表，爬取天水市人民政府网站上指定日期内与关键词相关的新闻的标题，并将其存储至数据库中。
+
+考虑到相关因素，因此本代码只爬取前10页的新闻内容，即最多100条新闻作为测试。
+
+此方法为异步做法，即使用asyncio异步并发爬取网页内容，再使用json提取新闻内容。
+
+注意：本代码中的关键词列表默认为['灾害']，日期范围默认为2018年1月1日至2018年12月31日。
+
+Args:
+    keywords: 用于搜索新闻的关键词列表
+    begin_date: 开始日期，用于搜索
+    end_date: 结束日期，用于搜索
+    size: 一次请求返回的新闻或政策的最大数量
+
+Examples:
+    ```
+    asyncio.run(
+        main_async(keywords=['灾害'],
+                   begin_date='2018-01-01',
+                   end_date='2018-12-31',
+                   size=10))
+    ```
+"""
+
+import asyncio
+import util
+import logging
+from typing import List
+import tqdm
+
+
+@util.timeit_async
+async def main_async(keywords: List[str],
+                     begin_date: str,
+                     end_date: str,
+                     size: int = 10):
+    """
+    Crawl the news titles matching the given keywords with asyncio tasks.
+
+    Args:
+        keywords: Keywords to search for.
+        begin_date: Start date of the search range.
+        end_date: End date of the search range.
+        size: Maximum number of news items returned by one request.
+    """
+    logging.basicConfig(level=logging.INFO,
+                        format="%(asctime)s - %(levelname)s - %(message)s",
+                        datefmt="%Y-%m-%d %H:%M:%S",
+                        filename='log.txt',
+                        encoding='utf-8')
+
+    logging.info("开始运行异步爬取")
+
+    spider = util.Spider(keywords=keywords,
+                         begin_date=begin_date,
+                         end_date=end_date,
+                         size=size)
+
+    # Only the first 10 result pages of every keyword are crawled.
+    page_count = 10
+    title_list = []
+    tasks = []
+    # The total covers every keyword; the with-block closes the bar on errors.
+    with tqdm.tqdm(total=size * page_count * len(keywords),
+                   desc='异步爬取进度',
+                   unit='条',
+                   ncols=80) as pbar:
+        for keyword in keywords:
+            for current in range(1, page_count + 1):
+                logging.info('keyword: %s, current: %s', keyword, current)
+                config = spider.get_config(keyword, current)
+                tasks.append(asyncio.create_task(spider.fetch_async(config)))
+
+        # Handle the pages in completion order.
+        for task in asyncio.as_completed(tasks):
+            data = await task
+            title_list += spider.parse(data)
+            pbar.update(size)
+
+    spider.save(title_list)
+    logging.info("爬取完成")
+
+
+if __name__ == "__main__":
+    asyncio.run(
+        main_async(keywords=['灾害'],
+                   begin_date='2018-01-01',
+                   end_date='2018-12-31',
+                   size=10))
--- a/高性能模式/readme.md
+++ b/高性能模式/readme.md
@ -9,3 +9,17 @@


 # 讨论分析
+普通做法连续进行了五次测试，时间分别为34.231s、34.091s、34.164s、34.226s、33.958s，平均时间为34.134s
+多进程（进程数=5）连续进行了五次测试，时间分别为7.719s、7.716s、7.690s、7.730s、7.711s，平均时间为7.7132s
+多线程（线程数=5）连续进行了五次测试，时间分别为7.185s、7.964s、6.983s、6.969s、7.035s，平均时间为7.2272s
+协程连续进行了五次测试，时间分别为3.775s、3.807s、3.733s、3.824s、3.744s，平均时间为3.7766s
+异步连续进行了五次测试，时间分别为6.975s、7.675s、7.018s、7.032s、7.049s，平均时间为7.1498s
+注：为保证公平性，每一次Post请求后休眠3秒
+
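+可以看出，协程的性能最好，普通做法的性能最差，多线程、多进程和异步的性能介于两者之间。
+多进程和多线程都只开了5个工作进程/线程，10个请求需要分两批完成，每批都要等待3秒的休眠；而协程一次性启动了全部10个任务，所有休眠相互重叠，所以协程的耗时最短。
+异步的耗时与多线程、多进程接近，而没有达到协程的水平，可能是因为本次测试所用的 fetch_async 直接调用了阻塞的 requests.post，请求本身仍是串行执行的，只有 asyncio.sleep 的3秒休眠能够并发。
+总的来说，协程的性能最好，多线程、多进程和异步的性能介于两者之间，普通做法的性能最差。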
+
+# 总结
+协程的性能最好，多线程、多进程和异步的性能介于协程与普通做法之间，普通做法的性能最差。
--- a/高性能模式/util.py
+++ b/高性能模式/util.py
@ -1,4 +1,188 @@
+"""

-################################################################################
-#  本主题通用代码
-################################################################################
+"""
+import re
+import time
+import functools
+import json
+import asyncio
+import requests
+from typing import Any, Dict, List
+
+
+class Spider:
+    """
+    Crawler for the search API of the Tianshui municipal government site.
+
+    Args:
+        keywords (List[str]): Keywords to search for.
+        begin_date (str): Start date of the search range.
+        end_date (str): End date of the search range.
+        size (int): Maximum number of news items returned by one request.
+
+    Attributes:
+        URL (str): Address of the search endpoint.
+    """
+    # Tianshui municipal government website.
+    # NOTE(review): the concatenation yields '.../aop_component//webber/...'
+    # with a double slash -- confirm this is intended.
+    URL = ('https://www.tianshui.gov.cn/aop_component/'
+           '/webber/search/search/search/queryPage')
+
+    def __init__(self, keywords: List[str], begin_date: str, end_date: str,
+                 size: int):
+        self.keywords = keywords
+        self.begin_date = begin_date
+        self.end_date = end_date
+        self.size = size
+
+    def get_config(self, keyword: str, current: int) -> Dict[str, Any]:
+        """
+        Build the JSON payload of one search request.
+
+        Args:
+            keyword (str): Keyword to search for.
+            current (int): Number of the result page to request.
+
+        Returns:
+            Dict[str, Any]: Request payload.
+        """
+
+        return {
+            "aliasName": "article_data,open_data,mailbox_data,article_file",
+            "keyWord": keyword,
+            "lastkeyWord": keyword,
+            "searchKeyWord": False,
+            "orderType": "score",
+            "searchType": "text",
+            "searchScope": "3",
+            "searchOperator": 0,
+            "searchDateType": "custom",
+            "searchDateName": f"{self.begin_date}-{self.end_date}",
+            "beginDate": self.begin_date,
+            "endDate": self.end_date,
+            "showId": "c2ee13065aae85d7a998b8a3cd645961",
+            "auditing": ["1"],
+            "owner": "1912126876",
+            "token": "tourist",
+            "urlPrefix": "/aop_component/",
+            "page": {
+                "current": current,
+                "size": self.size,
+                "pageSizes": [2, 5, 10, 20, 50, 100],
+                "total": 0,
+                "totalPage": 0,
+                "indexs": []
+            },
+            "advance": False,
+            "advanceKeyWord": "",
+            "lang": "i18n_zh_CN"
+        }
+
+    def generate_headers(self) -> dict:
+        """
+        Build the HTTP headers sent with every request.
+
+        Returns:
+            dict: Request headers.
+        """
+        return {
+            'Authorization':
+            'tourist',
+            'User-Agent':
+            ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit'
+             '/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari'
+             '/537.36 Edg/124.0.0.0')
+        }
+
+    def _post(self, config: Dict[str, Any]) -> str:
+        """Send one blocking POST request and return the response text."""
+        return requests.post(self.URL,
+                             headers=self.generate_headers(),
+                             json=config).text
+
+    def fetch(self, config: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Fetch one result page synchronously.
+
+        Sends the POST request, sleeps for 3 seconds and returns the decoded
+        JSON response.
+
+        Args:
+            config (Dict[str, Any]): Request payload.
+
+        Returns:
+            Dict[str, Any]: Decoded response.
+        """
+        response = self._post(config)
+        time.sleep(3)
+        return json.loads(response)
+
+    async def fetch_async(self, config: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Fetch one result page without blocking the event loop.
+
+        Sends the POST request, sleeps for 3 seconds and returns the decoded
+        JSON response.
+
+        Args:
+            config (Dict[str, Any]): Request payload.
+
+        Returns:
+            Dict[str, Any]: Decoded response.
+        """
+        # requests.post is a blocking call; calling it directly inside the
+        # coroutine stalls the event loop, so the requests of all tasks run
+        # one after another. Run it in the default executor instead.
+        loop = asyncio.get_running_loop()
+        response = await loop.run_in_executor(None, self._post, config)
+        await asyncio.sleep(3)
+        return json.loads(response)
+
+    def parse(self, data: Dict[str, Any]) -> List[str]:
+        """
+        Extract the news titles from a decoded response.
+
+        Args:
+            data (Dict[str, Any]): Decoded response.
+
+        Returns:
+            List[str]: Titles with HTML tags removed, at most self.size items.
+        """
+        records = data['data']['page']['records']
+        # Slice instead of indexing range(self.size): a page with fewer than
+        # self.size records (e.g. the last one) used to raise IndexError.
+        return [
+            re.sub('<[^>]*>', '', record['title'])  # strip HTML tags
+            for record in records[:self.size]
+        ]
+
+    def save(self, title_list: List[str]):
+        """
+        Save the titles. Not implemented yet; currently does nothing.
+        """
+        pass
+
+
+# Timing decorator
+def timeit(func):
+    """
+    Decorate a function so that every call prints its running time.
+
+    Args:
+        func: The function to time.
+
+    Returns:
+        A wrapper that calls func, prints "<name> cost: <seconds>" and
+        returns func's result.
+    """
+
+    @functools.wraps(func)  # keep func's name and docstring, as timeit_async does
+    def wrapper(*args, **kwargs):
+        start = time.time()
+        result = func(*args, **kwargs)
+
+        print(f'{func.__name__} cost: {time.time() - start}')
+        return result
+
+    return wrapper
+
+
+def timeit_async(func):
+    """
+    Decorate a coroutine function so that every call prints its running time.
+
+    Args:
+        func: The coroutine function to time.
+
+    Returns:
+        A coroutine function that awaits func, prints
+        "<name> cost: <seconds>" and returns func's result.
+    """
+
+    @functools.wraps(func)
+    async def timed(*call_args, **call_kwargs):
+        began = time.time()
+        outcome = await func(*call_args, **call_kwargs)
+
+        elapsed = time.time() - began
+        print(f'{func.__name__} cost: {elapsed}')
+        return outcome
+
+    return timed
Author	SHA1	Message	Date
p46318075	3d0220d49b	Merge pull request 'dev' (#17 ) from pcz4qfnkl/CodePattern:dev into dev	1 year ago
Yao	36afa1d669	refactor: 优化代码，提高可读性和效率	1 year ago
Yao	15736d7393	refactor(code): 优化代码，提高可读性和效率	1 year ago
Yao	f170c936d8	feat: 添加了根据关键词爬取天水市人民政府网站上指定日期内新闻标题的功能，并提供了多线程、多进程、协程和异步四种实现方式。	1 year ago