From 36afa1d66950153147ce46d2b5a4e37ddaa403e0 Mon Sep 17 00:00:00 2001
From: Yao <1928814540@qq.com>
Date: Tue, 13 Aug 2024 16:19:57 +0800
Subject: [PATCH] =?UTF-8?q?refactor:=20=E4=BC=98=E5=8C=96=E4=BB=A3?=
 =?UTF-8?q?=E7=A0=81=EF=BC=8C=E6=8F=90=E9=AB=98=E5=8F=AF=E8=AF=BB=E6=80=A7?=
 =?UTF-8?q?=E5=92=8C=E6=95=88=E7=8E=87?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../11 基础结构/对象化/1 类对象.py  | 71 +++++++++++---
 .../对象化/2 字典对象.py               | 45 ++++++---
 A 代码模式/cppy/cp_util.py                | 95 ++++++++++++++++++-
 3 files changed, 181 insertions(+), 30 deletions(-)

diff --git a/A 代码模式/11 基础结构/对象化/1 类对象.py b/A 代码模式/11 基础结构/对象化/1 类对象.py
index 013e743..4d251b8 100644
--- a/A 代码模式/11 基础结构/对象化/1 类对象.py	
+++ b/A 代码模式/11 基础结构/对象化/1 类对象.py	
@@ -1,58 +1,101 @@
 from collections import Counter
 from cppy.cp_util import *
-    
+
 
 class DataStorageManager:
-    """ 数据模型 """    
-    def __init__(self, path_to_file):                
-        self._data = re_split( read_file(path_to_file) )
+    """
+    数据模型，读取文件内容，并将内容分割成单词。
+
+    Attributes:
+        _data: 单词列表。
+
+    Methods:
+        words (self): 返回分割后的单词列表。
+    """
+
+    def __init__(self, path_to_file):
+        self._data = re_split(read_file(path_to_file))
 
-    def words(self):        
+    def words(self):
+        """返回分割后的单词列表。"""
         return self._data
 
 
 class StopWordManager:
-    """ 停用词模型 """    
-    def __init__(self):        
+    """
+    停用词模型
+
+    Attributes:
+        _stop_words: 停用词列表
+
+    Methods:
+        is_stop_word (self, word): 判断给定单词是否为停用词。
+    """
+
+    def __init__(self):
         self._stop_words = get_stopwords()
 
     def is_stop_word(self, word):
+        """判断给定单词是否为停用词。"""
         return word in self._stop_words
 
 
 class WordFrequencyManager:
-    """ 词频模型 """    
+    """
+    词频模型，计算并管理单词的频率。
+
+    Attributes:
+        _word_freqs: 使用 Counter 存储单词及其出现次数。
+
+    Methods:
+        increment_count (self, word): 计算词频。
+        sorted(self): 返回按出现次数排序的单词列表。
+
+    """
+
     def __init__(self):
         self._word_freqs = Counter()
 
     def increment_count(self, word):
+        """计算词频。"""
         self._word_freqs[word] += 1
 
     def sorted(self):
+        """返回按出现次数排序的单词列表。"""
         return self._word_freqs.most_common()
 
 
 class WordFrequencyController:
+    """
+    控制器，控制整个流程，读取文件、处理停用词、计算词频并输出结果。
+
+    Attributes:
+        _storage_manager: DataStorageManager 实例，用于读取和处理文件内容。
+        _stop_word_manager: StopWordManager 实例，用于管理停用词。
+        _word_freq_manager: WordFrequencyManager 实例，用于计算和存储单词频率。
+
+    Methods:
+        run(self): 运行方法，遍历单词列表，过滤掉停用词，并计算每个单词的频率，最后输出结果。
+    """
+
     def __init__(self, path_to_file):
         self._storage_manager = DataStorageManager(path_to_file)
         self._stop_word_manager = StopWordManager()
         self._word_freq_manager = WordFrequencyManager()
 
     def run(self):
+        """运行方法，遍历单词列表，过滤掉停用词，并计算每个单词的频率，最后输出结果。"""
         for w in self._storage_manager.words():
             if not self._stop_word_manager.is_stop_word(w):
                 self._word_freq_manager.increment_count(w)
 
         word_freqs = self._word_freq_manager.sorted()
-        print_word_freqs(word_freqs)        
-
+        print_word_freqs(word_freqs)
 
 
-if __name__ == '__main__':    
+if __name__ == '__main__':
     WordFrequencyController(testfilepath).run()
-
-
 '''
 函数输入参数调用后，你的马上接住返回值
 类输入参数后实例化后，你可以需要的时候去访问你需要的数据（实例属性）
-'''    
\ No newline at end of file
+'''
diff --git a/A 代码模式/11 基础结构/对象化/2 字典对象.py b/A 代码模式/11 基础结构/对象化/2 字典对象.py
index 8fb9b14..9f3b1cc 100644
--- a/A 代码模式/11 基础结构/对象化/2 字典对象.py	
+++ b/A 代码模式/11 基础结构/对象化/2 字典对象.py	
@@ -1,29 +1,52 @@
 from cppy.cp_util import *
 
-def extract_words(obj, path_to_file):    
+
+def extract_words(obj, path_to_file):
+    """
+    从文件中提取单词并存储在对象的 'data' 字段中。
+
+    Args:
+        obj (dict): 存储数据的字典对象。
+        path_to_file (str): 文件路径。
+    """
     obj['data'] = extract_file_words(path_to_file)
 
+
 def increment_count(obj, w):
-    obj['freqs'][w] = 1 if w not in obj['freqs'] else obj['freqs'][w]+1
+    """
+    增加单词的计数。如果单词不存在，则将其计数设置为1。
 
+    Args:
+        obj (dict): 存储单词频率的字典对象。
+        w (str): 单词。
+    """
+    obj['freqs'][w] = 1 if w not in obj['freqs'] else obj['freqs'][w] + 1
+
+
+# 数据存储对象，包含初始化和获取单词的方法
 data_storage_obj = {
-    'data' : [],
-    'init' : lambda path_to_file : extract_words(data_storage_obj, path_to_file),
-    'words' : lambda : data_storage_obj['data']
+    'data': [],  # 存储单词列表
+    'init': lambda path_to_file: extract_words(data_storage_obj, path_to_file
+                                               ),  # 初始化方法，提取文件中的单词
+    'words': lambda: data_storage_obj['data']  # 获取单词列表的方法
 }
 
+# 单词频率对象，包含增加计数和排序的方法
 word_freqs_obj = {
-    'freqs' : {},
-    'increment_count' : lambda w : increment_count(word_freqs_obj, w),
-    'sorted' : lambda : sort_dict(word_freqs_obj['freqs']) 
+    'freqs': {},  # 存储单词频率的字典
+    'increment_count':
+    lambda w: increment_count(word_freqs_obj, w),  # 增加单词计数的方法
+    'sorted': lambda: sort_dict(word_freqs_obj['freqs'])  # 获取排序后的单词频率的方法
 }
 
-
 if __name__ == '__main__':
-    data_storage_obj['init']( testfilepath )    
+    # 初始化数据存储对象，提取文件中的单词
+    data_storage_obj['init'](testfilepath)
 
+    # 遍历单词列表，增加单词的计数
     for word in data_storage_obj['words']():
         word_freqs_obj['increment_count'](word)
 
+    # 获取排序后的单词频率并打印
     word_freqs = word_freqs_obj['sorted']()
-    print_word_freqs(word_freqs)    
\ No newline at end of file
+    print_word_freqs(word_freqs)
diff --git a/A 代码模式/cppy/cp_util.py b/A 代码模式/cppy/cp_util.py
index b85ec2a..13052b1 100644
--- a/A 代码模式/cppy/cp_util.py	
+++ b/A 代码模式/cppy/cp_util.py	
@@ -23,18 +23,45 @@ testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)
 #  项目函数
 ################################################################################
 def read_file(path_to_file):
+    """
+    读取指定文件的内容。
+
+    Args:
+        path_to_file (str): 文件路径。
+
+    Returns:
+        str: 文件内容。
+    """
     with open(path_to_file, encoding='utf-8') as f:
         data = f.read()
     return data
 
 
 def re_split(data):
+    """
+    使用正则表达式将非字母数字字符（含下划线）替换为空格，转换为小写后按空白分割成单词。
+
+    Args:
+        data (str): 输入字符串。
+
+    Returns:
+        list: 分割后的单词列表。
+    """
     pattern = re.compile('[\W_]+')
     data = pattern.sub(' ', data).lower()
     return data.split()
 
 
 def get_stopwords(path_to_file=stopwordfilepath):
+    """
+    获取停用词列表。
+
+    Args:
+        path_to_file (str): 停用词文件路径，默认为 stopwordfilepath。
+
+    Returns:
+        list: 停用词列表。
+    """
     with open(path_to_file, encoding='utf-8') as f:
         data = f.read().split(',')
     data.extend(list(string.ascii_lowercase))
@@ -42,8 +69,16 @@ def get_stopwords(path_to_file=stopwordfilepath):
 
 
 def get_chunks(file_path=testfilepath, chunk_size=1000):
-    # 读取文件内容，分割文件内容为多个块，每个块由一个进程处理
-    # 可以根据实际情况调整块大小
+    """
+    将文件内容分割成多个块。
+
+    Args:
+        file_path (str): 文件路径，默认为 testfilepath。
+        chunk_size (int): 每个块的大小，默认为 1000。
+
+    Returns:
+        list: 分割后的块列表。
+    """
     content = re_split(read_file(file_path))
     chunks = [
         content[i:i + chunk_size] for i in range(0, len(content), chunk_size)
@@ -52,23 +87,58 @@ def get_chunks(file_path=testfilepath, chunk_size=1000):
 
 
 def extract_file_words(path_to_file):
+    """
+    提取文件中的单词，去除停用词和长度小于3的单词。
+
+    Args:
+        path_to_file (str): 文件路径。
+
+    Returns:
+        list: 提取后的单词列表。
+    """
     word_list = re_split(read_file(path_to_file))
     stop_words = get_stopwords()
-    return [w for w in word_list if (not w in stop_words) and len(w) >= 3]
+    return [w for w in word_list if (w not in stop_words) and len(w) >= 3]
 
 
 def extract_str_words(data_str):
+    """
+    提取字符串中的单词，去除停用词和长度小于3的单词。
+
+    Args:
+        data_str (str): 输入字符串。
+
+    Returns:
+        list: 提取后的单词列表。
+    """
     word_list = re_split(data_str)
     stop_words = get_stopwords()
-    return [w for w in word_list if (not w in stop_words) and len(w) >= 3]
+    return [w for w in word_list if (w not in stop_words) and len(w) >= 3]
 
 
 def count_word(word, word_freqs, stopwords):
+    """
+    统计单词频率。
+
+    Args:
+        word (str): 单词。
+        word_freqs (dict): 单词频率字典。
+        stopwords (list): 停用词列表。
+    """
     if word not in stopwords:
         word_freqs[word] = word_freqs.get(word, 0) + 1
 
 
 def get_frequencies(word_list):
+    """
+    获取单词频率。
+
+    Args:
+        word_list (list): 单词列表。
+
+    Returns:
+        dict: 单词频率字典。
+    """
     word_freqs = {}
     for word in word_list:
         word_freqs[word] = word_freqs.get(word, 0) + 1
@@ -76,11 +146,26 @@ def get_frequencies(word_list):
 
 
 def sort_dict(word_freq):
+    """
+    对字典进行排序。
+
+    Args:
+        word_freq (dict): 单词频率字典。
+
+    Returns:
+        list: 排序后的单词频率列表。
+    """
     return sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)
-    # return sorted( word_freq, key=lambda x: x[1], reverse=True )
 
 
 def print_word_freqs(word_freqs, n=10):
+    """
+    打印单词频率。
+
+    Args:
+        word_freqs (list): 单词频率列表。
+        n (int): 打印的单词数量，默认为 10。
+    """
     for (w, c) in word_freqs[:n]:
         print(w, '-', c)