refactor: optimize code to improve readability and efficiency

dev^2
Yao 3 months ago
parent 15736d7393
commit 36afa1d669

@@ -1,58 +1,101 @@
from collections import Counter
from cppy.cp_util import *


class DataStorageManager:
    """
    Data model: reads the file content and splits it into words.

    Attributes:
        _data: the list of words
    Methods:
        words(self): returns the list of split words
    """

    def __init__(self, path_to_file):
        self._data = re_split(read_file(path_to_file))

    def words(self):
        """Return the list of split words."""
        return self._data


class StopWordManager:
    """
    Stop-word model.

    Attributes:
        _stop_words: the list of stop words
    Methods:
        is_stop_word(self, word): checks whether a given word is a stop word
    """

    def __init__(self):
        self._stop_words = get_stopwords()

    def is_stop_word(self, word):
        """Check whether the given word is a stop word."""
        return word in self._stop_words


class WordFrequencyManager:
    """
    Word-frequency model: counts and manages word frequencies.

    Attributes:
        _word_freqs: a Counter mapping each word to its occurrence count
    Methods:
        increment_count(self, word): increments a word's count
        sorted(self): returns the words sorted by occurrence count
    """

    def __init__(self):
        self._word_freqs = Counter()

    def increment_count(self, word):
        """Increment the word's count."""
        self._word_freqs[word] += 1

    def sorted(self):
        """Return the words sorted by occurrence count."""
        return self._word_freqs.most_common()


class WordFrequencyController:
    """
    Controller: drives the whole pipeline (read the file, filter out stop
    words, count word frequencies, print the result).

    Attributes:
        _storage_manager: DataStorageManager instance that reads and processes the file content
        _stop_word_manager: StopWordManager instance that manages the stop words
        _word_freq_manager: WordFrequencyManager instance that counts and stores word frequencies
    Methods:
        run(self): iterates over the words, filters out stop words, counts each word's frequency, and prints the result
    """

    def __init__(self, path_to_file):
        self._storage_manager = DataStorageManager(path_to_file)
        self._stop_word_manager = StopWordManager()
        self._word_freq_manager = WordFrequencyManager()

    def run(self):
        """Iterate over the words, filter out stop words, count each word's frequency, and print the result."""
        for w in self._storage_manager.words():
            if not self._stop_word_manager.is_stop_word(w):
                self._word_freq_manager.increment_count(w)
        word_freqs = self._word_freq_manager.sorted()
        print_word_freqs(word_freqs)


if __name__ == '__main__':
    WordFrequencyController(testfilepath).run()

'''
With a function, you pass in the arguments and must catch the return value
right after the call; with a class, you pass in the arguments at
instantiation and can then access the data you need (the instance
attributes) whenever you need it.
'''
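A minimal sketch of the contrast that comment describes (the `parse` function and `Parser` class here are hypothetical, not part of this repo):

# Function style: the return value must be caught at the call site, or it is gone.
def parse(path):
    return path.upper()  # stand-in for real work

result = parse('data.txt')  # catch the result immediately

# Class style: pass the input once at instantiation; the derived data lives
# on the instance and can be read whenever it is needed.
class Parser:
    def __init__(self, path):
        self.result = path.upper()  # computed once, stored as an instance attribute

p = Parser('data.txt')
print(p.result)  # access the stored result later, on demand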

@@ -1,29 +1,52 @@
from cppy.cp_util import *


def extract_words(obj, path_to_file):
    """
    Extract the words from a file and store them in the object's 'data' field.

    Args:
        obj (dict): dictionary object that holds the data
        path_to_file (str): path to the file
    """
    obj['data'] = extract_file_words(path_to_file)


def increment_count(obj, w):
    """
    Increment the word's count; if the word is not present yet, set its count to 1.

    Args:
        obj (dict): dictionary object that holds the word frequencies
        w (str): the word
    """
    obj['freqs'][w] = 1 if w not in obj['freqs'] else obj['freqs'][w] + 1


# Data-storage object, with methods to initialize itself and to fetch the words
data_storage_obj = {
    'data': [],  # the list of words
    'init': lambda path_to_file: extract_words(data_storage_obj, path_to_file),  # initializer: extract the words from the file
    'words': lambda: data_storage_obj['data']  # accessor for the word list
}

# Word-frequency object, with methods to increment a count and to sort
word_freqs_obj = {
    'freqs': {},  # the dictionary of word frequencies
    'increment_count': lambda w: increment_count(word_freqs_obj, w),  # increment a word's count
    'sorted': lambda: sort_dict(word_freqs_obj['freqs'])  # return the word frequencies, sorted
}

if __name__ == '__main__':
    # Initialize the data-storage object by extracting the words from the file
    data_storage_obj['init'](testfilepath)

    # Iterate over the words and increment each word's count
    for word in data_storage_obj['words']():
        word_freqs_obj['increment_count'](word)

    # Fetch the sorted word frequencies and print them
    word_freqs = word_freqs_obj['sorted']()
    print_word_freqs(word_freqs)

@@ -23,18 +23,45 @@ testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)
# Project functions
################################################################################


def read_file(path_to_file):
    """
    Read the content of the given file.

    Args:
        path_to_file (str): path to the file
    Returns:
        str: the file content
    """
    with open(path_to_file, encoding='utf-8') as f:
        data = f.read()
    return data


def re_split(data):
    """
    Split the string with a regular expression: replace every non-letter
    character with a space and lowercase everything.

    Args:
        data (str): the input string
    Returns:
        list: the list of split words
    """
    pattern = re.compile(r'[\W_]+')  # raw string avoids an invalid-escape warning
    data = pattern.sub(' ', data).lower()
    return data.split()


def get_stopwords(path_to_file=stopwordfilepath):
    """
    Get the list of stop words.

    Args:
        path_to_file (str): path to the stop-word file, defaults to stopwordfilepath
    Returns:
        list: the list of stop words
    """
    with open(path_to_file, encoding='utf-8') as f:
        data = f.read().split(',')
    data.extend(list(string.ascii_lowercase))
@@ -42,8 +69,16 @@ def get_stopwords(path_to_file=stopwordfilepath):
def get_chunks(file_path=testfilepath, chunk_size=1000):
    """
    Split the file content into chunks, e.g. so that each chunk can be
    handled by its own worker process; tune chunk_size as needed.

    Args:
        file_path (str): path to the file, defaults to testfilepath
        chunk_size (int): size of each chunk, defaults to 1000
    Returns:
        list: the list of chunks
    """
    content = re_split(read_file(file_path))
    chunks = [
        content[i:i + chunk_size] for i in range(0, len(content), chunk_size)
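As an aside, a minimal sketch of how these chunks might be fanned out to worker processes, as the docstring suggests; the multiprocessing wiring below is illustrative, not part of this diff, and assumes only cp_util's get_chunks:

from collections import Counter
from multiprocessing import Pool

from cppy.cp_util import get_chunks


def count_chunk(chunk):
    # count the words in one chunk; runs inside a worker process
    return Counter(chunk)


if __name__ == '__main__':
    with Pool() as pool:
        partial_counts = pool.map(count_chunk, get_chunks())
    total = sum(partial_counts, Counter())  # merge the per-chunk counts
    print(total.most_common(10))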
@@ -52,23 +87,58 @@ def get_chunks(file_path=testfilepath, chunk_size=1000):
def extract_file_words(path_to_file):
    """
    Extract the words from a file, dropping stop words and words shorter
    than three characters.

    Args:
        path_to_file (str): path to the file
    Returns:
        list: the extracted words
    """
    word_list = re_split(read_file(path_to_file))
    stop_words = get_stopwords()
    return [w for w in word_list if (w not in stop_words) and len(w) >= 3]


def extract_str_words(data_str):
    """
    Extract the words from a string, dropping stop words and words shorter
    than three characters.

    Args:
        data_str (str): the input string
    Returns:
        list: the extracted words
    """
    word_list = re_split(data_str)
    stop_words = get_stopwords()
    return [w for w in word_list if (w not in stop_words) and len(w) >= 3]


def count_word(word, word_freqs, stopwords):
    """
    Add one occurrence of a word to the frequency dictionary, unless it is a stop word.

    Args:
        word (str): the word
        word_freqs (dict): the dictionary of word frequencies
        stopwords (list): the list of stop words
    """
    if word not in stopwords:
        word_freqs[word] = word_freqs.get(word, 0) + 1


def get_frequencies(word_list):
    """
    Compute the word frequencies.

    Args:
        word_list (list): the list of words
    Returns:
        dict: the dictionary of word frequencies
    """
    word_freqs = {}
    for word in word_list:
        word_freqs[word] = word_freqs.get(word, 0) + 1
@@ -76,11 +146,26 @@ def get_frequencies(word_list):
def sort_dict(word_freq):
    """
    Sort the frequency dictionary by count, descending.

    Args:
        word_freq (dict): the dictionary of word frequencies
    Returns:
        list: the sorted (word, count) pairs
    """
    return sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)
    # return sorted( word_freq, key=lambda x: x[1], reverse=True )


def print_word_freqs(word_freqs, n=10):
    """
    Print the first n word frequencies.

    Args:
        word_freqs (list): the list of (word, count) pairs
        n (int): number of words to print, defaults to 10
    """
    for (w, c) in word_freqs[:n]:
        print(w, '-', c)
