refactor(code): 优化代码,提高可读性和效率

pull/17/head
Yao 3 months ago
parent f170c936d8
commit 15736d7393

2
.gitignore vendored

@ -1,4 +1,4 @@
log.txt log.txt
/test /test
/.venv /.venv
*/__pycache__ __pycache__

@ -7,10 +7,9 @@ with open(stopwordfilepath, encoding='utf-8') as f:
for letter in 'abcdefghijklmnopqrstuvwxyz': for letter in 'abcdefghijklmnopqrstuvwxyz':
stop_words.append(letter) stop_words.append(letter)
# 读文件,逐行扫描文本,发现词,确定不是停用词,计数 # 读文件,逐行扫描文本,发现词,确定不是停用词,计数
word_freqs = [] word_freqs = []
for line in open( testfilepath, encoding='utf-8' ): for line in open(testfilepath, encoding='utf-8'):
start_char = None start_char = None
i = 0 i = 0
for c in line: for c in line:
@ -42,10 +41,9 @@ for line in open( testfilepath, encoding='utf-8' ):
# 使用冒泡排序对词频进行排序 # 使用冒泡排序对词频进行排序
n = len(word_freqs) n = len(word_freqs)
for i in range(n): for i in range(n):
for j in range(0, n-i-1): for j in range(0, n - i - 1):
if word_freqs[j][1] < word_freqs[j+1][1]: if word_freqs[j][1] < word_freqs[j + 1][1]:
word_freqs[j], word_freqs[j+1] = word_freqs[j+1], word_freqs[j] word_freqs[j], word_freqs[j + 1] = word_freqs[j + 1], word_freqs[j]
# 打印频率最高的前10个词 # 打印频率最高的前10个词
for tf in word_freqs[:10]: for tf in word_freqs[:10]:

@ -1,4 +1,4 @@
from cppy.cp_util import stopwordfilepath,testfilepath from cppy.cp_util import stopwordfilepath, testfilepath
import string import string
from collections import Counter from collections import Counter
@ -8,7 +8,7 @@ stop_words.update(list(string.ascii_lowercase))
# 读取文件并计算单词频率 # 读取文件并计算单词频率
word_freqs = Counter() word_freqs = Counter()
with open(testfilepath,encoding = 'utf8') as f: with open(testfilepath, encoding='utf8') as f:
for line_num, line in enumerate(f, 1): for line_num, line in enumerate(f, 1):
start_char = None start_char = None
for i, c in enumerate(line): for i, c in enumerate(line):
@ -23,10 +23,9 @@ with open(testfilepath,encoding = 'utf8') as f:
# 打印前10个最常见的单词 # 打印前10个最常见的单词
for word, freq in word_freqs.most_common(10): for word, freq in word_freqs.most_common(10):
print(f"{word}-{freq}") print(f"{word}-{freq}")
''' '''
相比 A01 相比 A01
使用collections.Counter来计数单词频率从而简化了代码并提高了效率 使用collections.Counter来计数单词频率从而简化了代码并提高了效率
使用enumerate来获取行号和行内容使用set来存储停用词都有助于提高代码的性能和可读性 使用enumerate来获取行号和行内容使用set来存储停用词都有助于提高代码的性能和可读性
使用most_common方法来获取最常见的单词使输出更为简洁 使用most_common方法来获取最常见的单词使输出更为简洁
''' '''

@ -1,12 +1,14 @@
import re, collections import re
from cppy.cp_util import stopwordfilepath,testfilepath import collections
from cppy.cp_util import stopwordfilepath, testfilepath
stopwords = set(open( stopwordfilepath,encoding = 'utf8' ).read().split(','))
words = re.findall('[a-z]{2,}', open( testfilepath,encoding = 'utf8').read().lower())
counts = collections.Counter( w for w in words if w not in stopwords )
for (w, c) in counts.most_common(10) : print(w, '-', c)
stopwords = set(open(stopwordfilepath, encoding='utf8').read().split(','))
words = re.findall('[a-z]{2,}',
open(testfilepath, encoding='utf8').read().lower())
counts = collections.Counter(w for w in words if w not in stopwords)
for (w, c) in counts.most_common(10):
print(w, '-', c)
''' '''
熟练的软件工程师会如此简单完成任务 熟练的软件工程师会如此简单完成任务
后面的例子我们必须变的啰嗦一些不能用这种太 hacker 的写法 后面的例子我们必须变的啰嗦一些不能用这种太 hacker 的写法
''' '''

@ -1,7 +1,6 @@
import site import site
import os,re,time import os, re, time
import string,operator import string, operator
################################################################################ ################################################################################
# 变量 # 变量
@ -10,76 +9,89 @@ testfilename = 'test.txt'
testfilename = 'pride-and-prejudice.txt' testfilename = 'pride-and-prejudice.txt'
testfilename = 'Prey.txt' testfilename = 'Prey.txt'
db_filename = "tf.db" db_filename = "tf.db"
site_packages = site.getsitepackages() site_packages = site.getsitepackages()
for package in site_packages: for package in site_packages:
if 'package' in package: if 'package' in package:
basePath = package basePath = package
stopwordfilepath = os.path.join(basePath, 'cppy','data','stop_words.txt') stopwordfilepath = os.path.join(basePath, 'cppy', 'data', 'stop_words.txt')
testfilepath = os.path.join(basePath, 'cppy','data',testfilename ) testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)
################################################################################ ################################################################################
# 项目函数 # 项目函数
################################################################################ ################################################################################
def read_file(path_to_file): def read_file(path_to_file):
with open(path_to_file,encoding='utf-8') as f: with open(path_to_file, encoding='utf-8') as f:
data = f.read() data = f.read()
return data return data
def re_split( data ):
def re_split(data):
pattern = re.compile('[\W_]+') pattern = re.compile('[\W_]+')
data = pattern.sub(' ', data).lower() data = pattern.sub(' ', data).lower()
return data.split() return data.split()
def get_stopwords( path_to_file = stopwordfilepath ):
with open(path_to_file,encoding='utf-8') as f: def get_stopwords(path_to_file=stopwordfilepath):
data = f.read().split(',') with open(path_to_file, encoding='utf-8') as f:
data = f.read().split(',')
data.extend(list(string.ascii_lowercase)) data.extend(list(string.ascii_lowercase))
return data return data
def get_chunks( file_path = testfilepath, chunk_size = 1000):
def get_chunks(file_path=testfilepath, chunk_size=1000):
# 读取文件内容,分割文件内容为多个块,每个块由一个进程处理 # 读取文件内容,分割文件内容为多个块,每个块由一个进程处理
# 可以根据实际情况调整块大小 # 可以根据实际情况调整块大小
content = re_split(read_file(file_path)) content = re_split(read_file(file_path))
chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)] chunks = [
content[i:i + chunk_size] for i in range(0, len(content), chunk_size)
]
return chunks return chunks
def extract_file_words(path_to_file): def extract_file_words(path_to_file):
word_list = re_split( read_file(path_to_file) ) word_list = re_split(read_file(path_to_file))
stop_words = get_stopwords() stop_words = get_stopwords()
return [ w for w in word_list if ( not w in stop_words ) and len(w) >= 3 ] return [w for w in word_list if (not w in stop_words) and len(w) >= 3]
def extract_str_words(data_str): def extract_str_words(data_str):
word_list = re_split( data_str ) word_list = re_split(data_str)
stop_words = get_stopwords() stop_words = get_stopwords()
return [ w for w in word_list if ( not w in stop_words ) and len(w) >= 3 ] return [w for w in word_list if (not w in stop_words) and len(w) >= 3]
def count_word(word, word_freqs, stopwords): def count_word(word, word_freqs, stopwords):
if word not in stopwords: if word not in stopwords:
word_freqs[word] = word_freqs.get(word, 0) + 1 word_freqs[word] = word_freqs.get(word, 0) + 1
def get_frequencies(word_list):
word_freqs = {} def get_frequencies(word_list):
for word in word_list: word_freqs = {}
word_freqs[word] = word_freqs.get(word, 0) + 1 for word in word_list:
word_freqs[word] = word_freqs.get(word, 0) + 1
return word_freqs return word_freqs
def sort_dict (word_freq):
def sort_dict(word_freq):
return sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True) return sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)
# return sorted( word_freq, key=lambda x: x[1], reverse=True ) # return sorted( word_freq, key=lambda x: x[1], reverse=True )
def print_word_freqs( word_freqs, n = 10):
for (w, c) in word_freqs[ :n ]: def print_word_freqs(word_freqs, n=10):
print( w, '-', c ) for (w, c) in word_freqs[:n]:
print(w, '-', c)
################################################################################ ################################################################################
# 通用工具 # 通用工具
################################################################################ ################################################################################
def timing_decorator(func): def timing_decorator(func):
def wrapper(*args, **kwargs): def wrapper(*args, **kwargs):
start_time = time.time() # 记录开始时间 start_time = time.time() # 记录开始时间
result = func(*args, **kwargs) # 调用原始函数 result = func(*args, **kwargs) # 调用原始函数
@ -87,7 +99,9 @@ def timing_decorator(func):
run_time = end_time - start_time # 计算运行时间 run_time = end_time - start_time # 计算运行时间
print(f"{func.__name__} 运行时间: {run_time*1000:.2f}") print(f"{func.__name__} 运行时间: {run_time*1000:.2f}")
return result return result
return wrapper return wrapper
def test():
print( 'cppy welcome' ) def test():
print('cppy welcome')
Loading…
Cancel
Save