refactor(code): 优化代码，提高可读性和效率

2 years ago · 15736d7393
parent f170c936d8
commit 15736d7393
10 changed files with 62 additions and 49 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
 log.txt
 /test
 /.venv
-*/__pycache__
+__pycache__
--- a/最基础的写法.py
+++ b/最基础的写法.py
@@ -7,10 +7,9 @@ with open(stopwordfilepath, encoding='utf-8') as f:
 for letter in 'abcdefghijklmnopqrstuvwxyz':
    stop_words.append(letter)

-
 # 读文件，逐行扫描文本，发现词，确定不是停用词，计数
 word_freqs = []
-for line in open( testfilepath, encoding='utf-8' ):
+for line in open(testfilepath, encoding='utf-8'):
    start_char = None
    i = 0
    for c in line:
@@ -42,10 +41,9 @@ for line in open( testfilepath, encoding='utf-8' ):
 # 使用冒泡排序对词频进行排序
 n = len(word_freqs)
 for i in range(n):
-    for j in range(0, n-i-1):
-        if word_freqs[j][1] < word_freqs[j+1][1]:
-            word_freqs[j], word_freqs[j+1] = word_freqs[j+1], word_freqs[j]            
-
+    for j in range(0, n - i - 1):
+        if word_freqs[j][1] < word_freqs[j + 1][1]:
+            word_freqs[j], word_freqs[j + 1] = word_freqs[j + 1], word_freqs[j]

 # 打印频率最高的前10个词
 for tf in word_freqs[:10]:
--- a/加入语言特性.py
+++ b/加入语言特性.py
@@ -1,4 +1,4 @@
-from cppy.cp_util import stopwordfilepath,testfilepath
+from cppy.cp_util import stopwordfilepath, testfilepath
 import string
 from collections import Counter

@@ -8,7 +8,7 @@ stop_words.update(list(string.ascii_lowercase))

 # 读取文件并计算单词频率
 word_freqs = Counter()
-with open(testfilepath,encoding = 'utf8') as f:
+with open(testfilepath, encoding='utf8') as f:
    for line_num, line in enumerate(f, 1):
        start_char = None
        for i, c in enumerate(line):
@@ -23,10 +23,9 @@ with open(testfilepath,encoding = 'utf8') as f:
 # 打印前10个最常见的单词
 for word, freq in word_freqs.most_common(10):
    print(f"{word}-{freq}")
-
 '''
 相比 A01
 使用collections.Counter来计数单词频率，从而简化了代码并提高了效率。
 使用enumerate来获取行号和行内容，使用set来存储停用词，都有助于提高代码的性能和可读性。
 使用most_common方法来获取最常见的单词，使输出更为简洁。
-'''
+'''
--- a/代码模式/10
+++ b/代码模式/10
@@ -1,12 +1,14 @@
-import re, collections
-from cppy.cp_util import stopwordfilepath,testfilepath
-
-stopwords = set(open( stopwordfilepath,encoding = 'utf8' ).read().split(','))
-words = re.findall('[a-z]{2,}', open( testfilepath,encoding = 'utf8').read().lower())
-counts = collections.Counter( w for w in words if w not in stopwords )
-for (w, c) in counts.most_common(10) :  print(w, '-', c)
+import re
+import collections
+from cppy.cp_util import stopwordfilepath, testfilepath

+stopwords = set(open(stopwordfilepath, encoding='utf8').read().split(','))
+words = re.findall('[a-z]{2,}',
+                   open(testfilepath, encoding='utf8').read().lower())
+counts = collections.Counter(w for w in words if w not in stopwords)
+for (w, c) in counts.most_common(10):
+    print(w, '-', c)
 '''
 熟练的软件工程师，会如此简单完成任务
 后面的例子，我们必须变的啰嗦一些，不能用这种太 hacker 的写法
-'''
+'''
--- a/代码模式/cppy_/cp_util.py
+++ b/代码模式/cppy_/cp_util.py
@@ -1,7 +1,6 @@
-
 import site
-import os,re,time
-import string,operator
+import os, re, time
+import string, operator

 ################################################################################
 #  变量
@@ -10,76 +9,89 @@ testfilename = 'test.txt'
 testfilename = 'pride-and-prejudice.txt'
 testfilename = 'Prey.txt'

-db_filename = "tf.db"  
+db_filename = "tf.db"

 site_packages = site.getsitepackages()
 for package in site_packages:
-    if 'package' in  package:
+    if 'package' in package:
        basePath = package
-stopwordfilepath = os.path.join(basePath, 'cppy','data','stop_words.txt')
-testfilepath = os.path.join(basePath, 'cppy','data',testfilename )
+stopwordfilepath = os.path.join(basePath, 'cppy', 'data', 'stop_words.txt')
+testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)


 ################################################################################
 #  项目函数
 ################################################################################
-def read_file(path_to_file):    
-    with open(path_to_file,encoding='utf-8') as f:
+def read_file(path_to_file):
+    with open(path_to_file, encoding='utf-8') as f:
        data = f.read()
    return data

-def re_split( data ):
+
+def re_split(data):
    pattern = re.compile('[\W_]+')
    data = pattern.sub(' ', data).lower()
    return data.split()

-def get_stopwords( path_to_file = stopwordfilepath ):
-    with open(path_to_file,encoding='utf-8') as f:
-        data = f.read().split(',')        
+
+def get_stopwords(path_to_file=stopwordfilepath):
+    with open(path_to_file, encoding='utf-8') as f:
+        data = f.read().split(',')
    data.extend(list(string.ascii_lowercase))
    return data

-def get_chunks( file_path = testfilepath, chunk_size = 1000):
+
+def get_chunks(file_path=testfilepath, chunk_size=1000):
    # 读取文件内容，分割文件内容为多个块，每个块由一个进程处理
    # 可以根据实际情况调整块大小
-    content = re_split(read_file(file_path))         
-    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
+    content = re_split(read_file(file_path))
+    chunks = [
+        content[i:i + chunk_size] for i in range(0, len(content), chunk_size)
+    ]
    return chunks

+
 def extract_file_words(path_to_file):
-    word_list = re_split( read_file(path_to_file) )
+    word_list = re_split(read_file(path_to_file))
    stop_words = get_stopwords()
-    return [ w for w in word_list if ( not w in stop_words ) and len(w) >= 3 ]
+    return [w for w in word_list if (not w in stop_words) and len(w) >= 3]
+

 def extract_str_words(data_str):
-    word_list = re_split( data_str )
+    word_list = re_split(data_str)
    stop_words = get_stopwords()
-    return [ w for w in word_list if ( not w in stop_words ) and len(w) >= 3 ]
+    return [w for w in word_list if (not w in stop_words) and len(w) >= 3]
+

 def count_word(word, word_freqs, stopwords):
    if word not in stopwords:
        word_freqs[word] = word_freqs.get(word, 0) + 1

-def get_frequencies(word_list):    
-    word_freqs = {}  
-    for word in word_list:  
-        word_freqs[word] = word_freqs.get(word, 0) + 1    
+
+def get_frequencies(word_list):
+    word_freqs = {}
+    for word in word_list:
+        word_freqs[word] = word_freqs.get(word, 0) + 1
    return word_freqs

-def sort_dict (word_freq):
+
+def sort_dict(word_freq):
    return sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)
    # return sorted( word_freq, key=lambda x: x[1], reverse=True )

-def print_word_freqs( word_freqs, n = 10):
-    for (w, c) in word_freqs[ :n ]:
-        print( w, '-', c )
+
+def print_word_freqs(word_freqs, n=10):
+    for (w, c) in word_freqs[:n]:
+        print(w, '-', c)


 ################################################################################
 #  通用工具
 ################################################################################

+
 def timing_decorator(func):
+
    def wrapper(*args, **kwargs):
        start_time = time.time()  # 记录开始时间
        result = func(*args, **kwargs)  # 调用原始函数
@@ -87,7 +99,9 @@ def timing_decorator(func):
        run_time = end_time - start_time  # 计算运行时间
        print(f"{func.__name__} 运行时间: {run_time*1000:.2f} 秒")
        return result
+
    return wrapper

-def  test():
-    print( 'cppy welcome' )
+
+def test():
+    print('cppy welcome')
--- a/代码模式/cppy_/data/Prey.txt
+++ b/代码模式/cppy_/data/Prey.txt
--- a/代码模式/cppy_/data/pride-and-prejudice.txt
+++ b/代码模式/cppy_/data/pride-and-prejudice.txt
--- a/代码模式/cppy_/data/stop_words.txt
+++ b/代码模式/cppy_/data/stop_words.txt
--- a/代码模式/cppy_/data/test.txt
+++ b/代码模式/cppy_/data/test.txt
--- a/代码模式/cppy_/__pycache__/cp_util.cpython-38.pyc
+++ b/代码模式/cppy_/__pycache__/cp_util.cpython-38.pyc