refactor: 优化代码，提高可读性和效率

1 year ago · 36afa1d669
parent 15736d7393
commit 36afa1d669
3 changed files with 181 additions and 30 deletions
--- a/基础结构/对象化/1
+++ b/基础结构/对象化/1
@@ -1,58 +1,101 @@
 from collections import Counter
 from cppy.cp_util import *
-    
+

 class DataStorageManager:
-    """ 数据模型 """    
-    def __init__(self, path_to_file):                
-        self._data = re_split( read_file(path_to_file) )
+    """
+    数据模型，读取文件内容，并将内容分割成单词。
+
+    Attributes:
+        _data: 单词列表。
+
+    Methods:
+        words (self): 返回分割后的单词列表。
+    """
+
+    def __init__(self, path_to_file):
+        self._data = re_split(read_file(path_to_file))

-    def words(self):        
+    def words(self):
+        """返回分割后的单词列表。"""
        return self._data


 class StopWordManager:
-    """ 停用词模型 """    
-    def __init__(self):        
+    """
+    停用词模型
+
+    Attributes:
+        _stop_words: 停用词列表
+
+    Methods:
+        is_stop_word (self, word): 判断给定单词是否为停用词。
+    """
+
+    def __init__(self):
        self._stop_words = get_stopwords()

    def is_stop_word(self, word):
+        """判断给定单词是否为停用词。"""
        return word in self._stop_words


 class WordFrequencyManager:
-    """ 词频模型 """    
+    """
+    词频模型，计算并管理单词的频率。
+
+    Attributes:
+        _word_freqs: 使用 Counter 存储单词及其出现次数。
+
+    Methods:
+        increment_count (self, word): 将给定单词的出现次数加 1。
+        sorted (self): 返回按出现次数从高到低排序的 (单词, 次数) 列表。
+
+    """
+
    def __init__(self):
        self._word_freqs = Counter()

    def increment_count(self, word):
+        """计算词频。"""
        self._word_freqs[word] += 1

    def sorted(self):
+        """返回按出现次数排序的单词列表。"""
        return self._word_freqs.most_common()


 class WordFrequencyController:
+    """
+    控制器，控制整个流程，读取文件、处理停用词、计算词频并输出结果。
+
+    Attributes:
+        _storage_manager: DataStorageManager 实例，用于读取和处理文件内容。
+        _stop_word_manager: StopWordManager 实例，用于管理停用词。
+        _word_freq_manager: WordFrequencyManager 实例，用于计算和存储单词频率。
+
+    Methods:
+        run(self): 运行方法，遍历单词列表，过滤掉停用词，并计算每个单词的频率，最后输出结果。
+    """
+
    def __init__(self, path_to_file):
        self._storage_manager = DataStorageManager(path_to_file)
        self._stop_word_manager = StopWordManager()
        self._word_freq_manager = WordFrequencyManager()

    def run(self):
+        """运行方法，遍历单词列表，过滤掉停用词，并计算每个单词的频率，最后输出结果。"""
        for w in self._storage_manager.words():
            if not self._stop_word_manager.is_stop_word(w):
                self._word_freq_manager.increment_count(w)

        word_freqs = self._word_freq_manager.sorted()
-        print_word_freqs(word_freqs)        
-
+        print_word_freqs(word_freqs)


-if __name__ == '__main__':    
+if __name__ == '__main__':
    WordFrequencyController(testfilepath).run()
-
-
 '''
 函数输入参数调用后，你得马上接住返回值
 类输入参数后实例化后，你可以需要的时候去访问你需要的数据（实例属性）
-'''    
+'''
--- a/基础结构/对象化/2
+++ b/基础结构/对象化/2
@@ -1,29 +1,52 @@
 from cppy.cp_util import *

-def extract_words(obj, path_to_file):    
+
+def extract_words(obj, path_to_file):
+    """
+    从文件中提取单词并存储在对象的 'data' 字段中。
+
+    Args:
+        obj (dict): 存储数据的字典对象。
+        path_to_file (str): 文件路径。
+    """
    obj['data'] = extract_file_words(path_to_file)

+
 def increment_count(obj, w):
-    obj['freqs'][w] = 1 if w not in obj['freqs'] else obj['freqs'][w]+1
+    """
+    增加单词的计数。如果单词不存在，则将其计数设置为1。

+    Args:
+        obj (dict): 存储单词频率的字典对象。
+        w (str): 单词。
+    """
+    obj['freqs'][w] = 1 if w not in obj['freqs'] else obj['freqs'][w] + 1
+
+
+# 数据存储对象，包含初始化和获取单词的方法
 data_storage_obj = {
-    'data' : [],
-    'init' : lambda path_to_file : extract_words(data_storage_obj, path_to_file),
-    'words' : lambda : data_storage_obj['data']
+    'data': [],  # 存储单词列表
+    'init': lambda path_to_file: extract_words(data_storage_obj, path_to_file
+                                               ),  # 初始化方法，提取文件中的单词
+    'words': lambda: data_storage_obj['data']  # 获取单词列表的方法
 }

+# 单词频率对象，包含增加计数和排序的方法
 word_freqs_obj = {
-    'freqs' : {},
-    'increment_count' : lambda w : increment_count(word_freqs_obj, w),
-    'sorted' : lambda : sort_dict(word_freqs_obj['freqs']) 
+    'freqs': {},  # 存储单词频率的字典
+    'increment_count':
+    lambda w: increment_count(word_freqs_obj, w),  # 增加单词计数的方法
+    'sorted': lambda: sort_dict(word_freqs_obj['freqs'])  # 获取排序后的单词频率的方法
 }

-
 if __name__ == '__main__':
-    data_storage_obj['init']( testfilepath )    
+    # 初始化数据存储对象，提取文件中的单词
+    data_storage_obj['init'](testfilepath)

+    # 遍历单词列表，增加单词的计数
    for word in data_storage_obj['words']():
        word_freqs_obj['increment_count'](word)

+    # 获取排序后的单词频率并打印
    word_freqs = word_freqs_obj['sorted']()
-    print_word_freqs(word_freqs)    
+    print_word_freqs(word_freqs)
--- a/代码模式/cppy/cp_util.py
+++ b/代码模式/cppy/cp_util.py
@@ -23,18 +23,45 @@ testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)
 #  项目函数
 ################################################################################
 def read_file(path_to_file):
+    """
+    读取指定文件的内容。
+
+    Args:
+        path_to_file (str): 文件路径。
+
+    Returns:
+        str: 文件内容。
+    """
    with open(path_to_file, encoding='utf-8') as f:
        data = f.read()
    return data


 def re_split(data):
+    """
+    使用正则表达式分割字符串：将连续的非字母数字字符（含下划线）替换为空格，转换为小写后按空白切分。
+
+    Args:
+        data (str): 输入字符串。
+
+    Returns:
+        list: 分割后的单词列表。
+    """
    pattern = re.compile('[\W_]+')
    data = pattern.sub(' ', data).lower()
    return data.split()


 def get_stopwords(path_to_file=stopwordfilepath):
+    """
+    获取停用词列表。
+
+    Args:
+        path_to_file (str): 停用词文件路径，默认为 stopwordfilepath。
+
+    Returns:
+        list: 停用词列表。
+    """
    with open(path_to_file, encoding='utf-8') as f:
        data = f.read().split(',')
    data.extend(list(string.ascii_lowercase))
@@ -42,8 +69,16 @@ def get_stopwords(path_to_file=stopwordfilepath):


 def get_chunks(file_path=testfilepath, chunk_size=1000):
-    # 读取文件内容，分割文件内容为多个块，每个块由一个进程处理
-    # 可以根据实际情况调整块大小
+    """
+    将文件内容分割成多个块。
+
+    Args:
+        file_path (str): 文件路径，默认为 testfilepath。
+        chunk_size (int): 每个块最多包含的单词数，默认为 1000。
+
+    Returns:
+        list: 分割后的块列表。
+    """
    content = re_split(read_file(file_path))
    chunks = [
        content[i:i + chunk_size] for i in range(0, len(content), chunk_size)
@@ -52,23 +87,58 @@ def get_chunks(file_path=testfilepath, chunk_size=1000):


 def extract_file_words(path_to_file):
+    """
+    提取文件中的单词，去除停用词和长度小于3的单词。
+
+    Args:
+        path_to_file (str): 文件路径。
+
+    Returns:
+        list: 提取后的单词列表。
+    """
    word_list = re_split(read_file(path_to_file))
    stop_words = get_stopwords()
-    return [w for w in word_list if (not w in stop_words) and len(w) >= 3]
+    return [w for w in word_list if (w not in stop_words) and len(w) >= 3]


 def extract_str_words(data_str):
+    """
+    提取字符串中的单词，去除停用词和长度小于3的单词。
+
+    Args:
+        data_str (str): 输入字符串。
+
+    Returns:
+        list: 提取后的单词列表。
+    """
    word_list = re_split(data_str)
    stop_words = get_stopwords()
-    return [w for w in word_list if (not w in stop_words) and len(w) >= 3]
+    return [w for w in word_list if (w not in stop_words) and len(w) >= 3]


 def count_word(word, word_freqs, stopwords):
+    """
+    若单词不在停用词中，则将其在 word_freqs 中的计数加 1（就地修改字典）。
+
+    Args:
+        word (str): 单词。
+        word_freqs (dict): 单词频率字典。
+        stopwords (list): 停用词列表。
+    """
    if word not in stopwords:
        word_freqs[word] = word_freqs.get(word, 0) + 1


 def get_frequencies(word_list):
+    """
+    获取单词频率。
+
+    Args:
+        word_list (list): 单词列表。
+
+    Returns:
+        dict: 单词频率字典。
+    """
    word_freqs = {}
    for word in word_list:
        word_freqs[word] = word_freqs.get(word, 0) + 1
@@ -76,11 +146,26 @@ def get_frequencies(word_list):


 def sort_dict(word_freq):
+    """
+    按值（出现次数）从高到低对字典的键值对排序。
+
+    Args:
+        word_freq (dict): 单词频率字典。
+
+    Returns:
+        list: 排序后的单词频率列表。
+    """
    return sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)
-    # return sorted( word_freq, key=lambda x: x[1], reverse=True )


 def print_word_freqs(word_freqs, n=10):
+    """
+    打印单词频率。
+
+    Args:
+        word_freqs (list): 单词频率列表。
+        n (int): 打印的单词数量，默认为 10。
+    """
    for (w, c) in word_freqs[:n]:
        print(w, '-', c)