refactor: optimize code to improve readability and efficiency

dev^2
Yao 3 months ago
parent 15736d7393
commit 36afa1d669

@@ -1,58 +1,101 @@
from collections import Counter
from cppy.cp_util import *


class DataStorageManager:
    """
    Data model: reads the file content and splits it into words.

    Attributes:
        _data: the list of words
    Methods:
        words(self): returns the list of split words
    """

    def __init__(self, path_to_file):
        self._data = re_split(read_file(path_to_file))

    def words(self):
        """Return the list of split words."""
        return self._data


class StopWordManager:
    """
    Stop-word model.

    Attributes:
        _stop_words: the list of stop words
    Methods:
        is_stop_word(self, word): checks whether a given word is a stop word
    """

    def __init__(self):
        self._stop_words = get_stopwords()

    def is_stop_word(self, word):
        """Check whether the given word is a stop word."""
        return word in self._stop_words


class WordFrequencyManager:
    """
    Word-frequency model: counts and manages word frequencies.

    Attributes:
        _word_freqs: a Counter mapping each word to its occurrence count
    Methods:
        increment_count(self, word): increments a word's count
        sorted(self): returns the words sorted by occurrence count
    """

    def __init__(self):
        self._word_freqs = Counter()

    def increment_count(self, word):
        """Increment the word's count."""
        self._word_freqs[word] += 1

    def sorted(self):
        """Return the words sorted by occurrence count."""
        return self._word_freqs.most_common()


class WordFrequencyController:
    """
    Controller: drives the whole pipeline (read the file, filter out stop
    words, count word frequencies, print the result).

    Attributes:
        _storage_manager: DataStorageManager instance that reads and processes the file content
        _stop_word_manager: StopWordManager instance that manages the stop words
        _word_freq_manager: WordFrequencyManager instance that counts and stores word frequencies
    Methods:
        run(self): iterates over the words, filters out stop words, counts each word's frequency, and prints the result
    """

    def __init__(self, path_to_file):
        self._storage_manager = DataStorageManager(path_to_file)
        self._stop_word_manager = StopWordManager()
        self._word_freq_manager = WordFrequencyManager()

    def run(self):
        """Iterate over the words, filter out stop words, count each word's frequency, and print the result."""
        for w in self._storage_manager.words():
            if not self._stop_word_manager.is_stop_word(w):
                self._word_freq_manager.increment_count(w)
        word_freqs = self._word_freq_manager.sorted()
        print_word_freqs(word_freqs)


if __name__ == '__main__':
    WordFrequencyController(testfilepath).run()

'''
With a function, you pass in the arguments and must catch the return value
right after the call; with a class, you pass in the arguments at
instantiation and can then access the data you need (the instance
attributes) whenever you need it.
'''
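A minimal sketch of the contrast that comment describes (the `parse` function and `Parser` class here are hypothetical, not part of this repo):

# Function style: the return value must be caught at the call site, or it is gone.
def parse(path):
    return path.upper()  # stand-in for real work

result = parse('data.txt')  # catch the result immediately

# Class style: pass the input once at instantiation; the derived data lives
# on the instance and can be read whenever it is needed.
class Parser:
    def __init__(self, path):
        self.result = path.upper()  # computed once, stored as an instance attribute

p = Parser('data.txt')
print(p.result)  # access the stored result later, on demand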

@@ -1,29 +1,52 @@
from cppy.cp_util import *


def extract_words(obj, path_to_file):
    """
    Extract the words from a file and store them in the object's 'data' field.

    Args:
        obj (dict): dictionary object that holds the data
        path_to_file (str): path to the file
    """
    obj['data'] = extract_file_words(path_to_file)


def increment_count(obj, w):
    """
    Increment the word's count; if the word is not present yet, set its count to 1.

    Args:
        obj (dict): dictionary object that holds the word frequencies
        w (str): the word
    """
    obj['freqs'][w] = 1 if w not in obj['freqs'] else obj['freqs'][w] + 1


# Data-storage object, with methods to initialize itself and to fetch the words
data_storage_obj = {
    'data': [],  # the list of words
    'init': lambda path_to_file: extract_words(data_storage_obj, path_to_file),  # initializer: extract the words from the file
    'words': lambda: data_storage_obj['data']  # accessor for the word list
}

# Word-frequency object, with methods to increment a count and to sort
word_freqs_obj = {
    'freqs': {},  # the dictionary of word frequencies
    'increment_count': lambda w: increment_count(word_freqs_obj, w),  # increment a word's count
    'sorted': lambda: sort_dict(word_freqs_obj['freqs'])  # return the word frequencies, sorted
}

if __name__ == '__main__':
    # Initialize the data-storage object by extracting the words from the file
    data_storage_obj['init'](testfilepath)

    # Iterate over the words and increment each word's count
    for word in data_storage_obj['words']():
        word_freqs_obj['increment_count'](word)

    # Fetch the sorted word frequencies and print them
    word_freqs = word_freqs_obj['sorted']()
    print_word_freqs(word_freqs)

@@ -23,18 +23,45 @@ testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)
# Project functions
################################################################################


def read_file(path_to_file):
    """
    Read the content of the given file.

    Args:
        path_to_file (str): path to the file
    Returns:
        str: the file content
    """
    with open(path_to_file, encoding='utf-8') as f:
        data = f.read()
    return data


def re_split(data):
    """
    Split the string with a regular expression: replace every non-letter
    character with a space and lowercase everything.

    Args:
        data (str): the input string
    Returns:
        list: the list of split words
    """
    pattern = re.compile(r'[\W_]+')  # raw string avoids an invalid-escape warning
    data = pattern.sub(' ', data).lower()
    return data.split()


def get_stopwords(path_to_file=stopwordfilepath):
    """
    Get the list of stop words.

    Args:
        path_to_file (str): path to the stop-word file, defaults to stopwordfilepath
    Returns:
        list: the list of stop words
    """
    with open(path_to_file, encoding='utf-8') as f:
        data = f.read().split(',')
    data.extend(list(string.ascii_lowercase))
@@ -42,8 +69,16 @@ def get_stopwords(path_to_file=stopwordfilepath):
def get_chunks(file_path=testfilepath, chunk_size=1000):
    """
    Split the file content into chunks, e.g. so that each chunk can be
    handled by its own worker process; tune chunk_size as needed.

    Args:
        file_path (str): path to the file, defaults to testfilepath
        chunk_size (int): size of each chunk, defaults to 1000
    Returns:
        list: the list of chunks
    """
    content = re_split(read_file(file_path))
    chunks = [
        content[i:i + chunk_size] for i in range(0, len(content), chunk_size)
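As an aside, a minimal sketch of how these chunks might be fanned out to worker processes, as the docstring suggests; the multiprocessing wiring below is illustrative, not part of this diff, and assumes only cp_util's get_chunks:

from collections import Counter
from multiprocessing import Pool

from cppy.cp_util import get_chunks


def count_chunk(chunk):
    # count the words in one chunk; runs inside a worker process
    return Counter(chunk)


if __name__ == '__main__':
    with Pool() as pool:
        partial_counts = pool.map(count_chunk, get_chunks())
    total = sum(partial_counts, Counter())  # merge the per-chunk counts
    print(total.most_common(10))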
@@ -52,23 +87,58 @@ def get_chunks(file_path=testfilepath, chunk_size=1000):
def extract_file_words(path_to_file):
    """
    Extract the words from a file, dropping stop words and words shorter
    than three characters.

    Args:
        path_to_file (str): path to the file
    Returns:
        list: the extracted words
    """
    word_list = re_split(read_file(path_to_file))
    stop_words = get_stopwords()
    return [w for w in word_list if (w not in stop_words) and len(w) >= 3]


def extract_str_words(data_str):
    """
    Extract the words from a string, dropping stop words and words shorter
    than three characters.

    Args:
        data_str (str): the input string
    Returns:
        list: the extracted words
    """
    word_list = re_split(data_str)
    stop_words = get_stopwords()
    return [w for w in word_list if (w not in stop_words) and len(w) >= 3]


def count_word(word, word_freqs, stopwords):
    """
    Add one occurrence of a word to the frequency dictionary, unless it is a stop word.

    Args:
        word (str): the word
        word_freqs (dict): the dictionary of word frequencies
        stopwords (list): the list of stop words
    """
    if word not in stopwords:
        word_freqs[word] = word_freqs.get(word, 0) + 1


def get_frequencies(word_list):
    """
    Compute the word frequencies.

    Args:
        word_list (list): the list of words
    Returns:
        dict: the dictionary of word frequencies
    """
    word_freqs = {}
    for word in word_list:
        word_freqs[word] = word_freqs.get(word, 0) + 1
@@ -76,11 +146,26 @@ def get_frequencies(word_list):
def sort_dict(word_freq):
    """
    Sort the frequency dictionary by count, descending.

    Args:
        word_freq (dict): the dictionary of word frequencies
    Returns:
        list: the sorted (word, count) pairs
    """
    return sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)
    # return sorted( word_freq, key=lambda x: x[1], reverse=True )


def print_word_freqs(word_freqs, n=10):
    """
    Print the first n word frequencies.

    Args:
        word_freqs (list): the list of (word, count) pairs
        n (int): number of words to print, defaults to 10
    """
    for (w, c) in word_freqs[:n]:
        print(w, '-', c)
