feat: 修改了代码以实现新的功能或API;

fix: 修复了代码中的一个错误;
refactor: 重写/重构代码,但没有改变任何API行为;
style: 添加了空格、格式化、缺失的分号等;
test: 添加了缺失的测试或修正了现有的测试;
docs: 更新了如readme等的文档;
build: 更新了依赖项、项目版本;
ops: 影响了操作性组件,如基础设施、部署、备份、恢复;
chore: 修改了.gitignore等;
dev
Yao 2 months ago
parent 36afa1d669
commit e2eab49065

@ -9,15 +9,19 @@ data = ''
words = [] words = []
word_freqs = [] word_freqs = []
################################ ################################
# procedures # procedures
################################ ################################
def read_file(path_to_file):
    """Read the whole file at path_to_file (UTF-8) into the global `data`."""
    global data
    with open(path_to_file, encoding='utf-8') as f:
        data = f.read()
def extractwords(): def extractwords():
"""提取data中的单词并赋值给全局变量words"""
global data global data
global words global words
words = data.lower().split() words = data.lower().split()
@ -26,18 +30,22 @@ def extractwords():
stop_words.update(string.ascii_lowercase) stop_words.update(string.ascii_lowercase)
words = [word for word in words if word not in stop_words] words = [word for word in words if word not in stop_words]
def frequencies():
    """Expand the global `words` list into (word, 1) pairs in `word_freqs`.

    NOTE(review): counts are not aggregated here; sort() later rebuilds the
    real frequencies from `words` with Counter — confirm this is intended.
    """
    global words
    global word_freqs
    word_freqs.extend([(word, 1) for word in words])
def sort():
    """Replace global `word_freqs` with (word, count) pairs, most common first.

    NOTE(review): this recomputes from the global `words` list and discards
    the pairs frequencies() stored in word_freqs — confirm intended.
    """
    global word_freqs
    word_freqs = Counter(words).most_common()
if __name__ == "__main__":
    # Pipeline over shared globals: read, tokenize, pair up, then sort.
    read_file(testfilepath)
    extractwords()
    frequencies()
    sort()

@ -3,25 +3,30 @@ from cppy.cp_util import *
def extractwords(str_data):
    """Split str_data into lowercase words, dropping stop words.

    Runs of non-word characters and underscores act as separators.
    """
    # Fix: raw string for the regex — '[\W_]+' is an invalid escape sequence
    # in a plain string literal (DeprecationWarning, error in future Python).
    pattern = re.compile(r'[\W_]+')
    word_list = pattern.sub(' ', str_data).lower().split()
    stop_words = get_stopwords()
    return [w for w in word_list if w not in stop_words]
def frequencies(word_list):
    """Return a dict mapping each word in word_list to its occurrence count."""
    word_freqs = {}
    for word in word_list:
        word_freqs[word] = word_freqs.get(word, 0) + 1
    return word_freqs
def sort(word_freq):
    """Return (word, count) pairs sorted by count, descending.

    Stable: words with equal counts keep their original relative order.
    """
    return sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
if __name__ == "__main__":
    txtcontent = read_file(testfilepath)
    word_list = extractwords(txtcontent)
    word_freqs = frequencies(word_list)
    word_sorts = sort(word_freqs)
    # Print the ten most frequent words
    for word, freq in word_sorts[:10]:
        print(word, '-', freq)

@ -3,7 +3,7 @@ from collections import Counter
from cppy.cp_util import * from cppy.cp_util import *
# Read the file, lower-casing the whole text in one pass
with open(testfilepath, encoding='utf-8') as f:
    data = f.read().lower()

# Filter out non-letter characters
@ -20,6 +20,8 @@ words = [word for word in words if word not in stop_words]
word_freqs = Counter(words)

# Sort by frequency (descending) and print
sorted_word_freqs = sorted(word_freqs.items(),
                           key=lambda x: x[1],
                           reverse=True)
print_word_freqs(sorted_word_freqs)

@ -3,18 +3,22 @@ from collections import Counter
stop_words = get_stopwords()


def process_chunk(chunk):
    """Count the words in one chunk, dropping stop words and words of fewer than 3 characters."""
    words = [w for w in chunk if w not in stop_words and len(w) >= 3]
    return Counter(words)
def process_chunks(chunks, word_freqs, x, max):
    """Recursively merge the word counts of chunks[x:max] into word_freqs.

    Relies on Counter's in-place `+=` so the caller's word_freqs accumulates
    the result.

    NOTE(review): recursion depth grows with len(chunks); very many chunks
    could hit the recursion limit — an iterative loop would be safer.
    `max` shadows the builtin but is kept for interface compatibility.
    """
    nxt = x + 1  # renamed from `next` to avoid shadowing the builtin
    if nxt < max:
        process_chunks(chunks, word_freqs, nxt, max)
    chunk_counts = process_chunk(chunks[x])
    word_freqs += Counter(chunk_counts)
# Read the data split into chunks of 2000 words each
# (fix: the old comment said 1000, but the code uses 2000)
chunks = get_chunks(testfilepath, 2000)
word_freqs = Counter()
process_chunks(chunks, word_freqs, 0, len(chunks))
print_word_freqs(word_freqs.most_common(10))

@ -1,3 +1,3 @@
from cppy.cp_util import *

# One-liner pipeline: extract words -> count -> sort -> print
print_word_freqs(sort_dict(get_frequencies(extract_file_words(testfilepath))))

@ -2,11 +2,13 @@ from cppy.cp_util import *
# 如果有连续的对数据加工操作,而且总是把共同加工数据对象当第一个参数,可以用本文件夹方法提升阅读体验 # 如果有连续的对数据加工操作,而且总是把共同加工数据对象当第一个参数,可以用本文件夹方法提升阅读体验
# 框架类 # 框架类
class FunBind:
    """Chains function applications, threading the result through self.data.

    The first bind() call has no accumulated data yet, so the function is
    called with only the explicit arguments; later calls prepend self.data
    as the first positional argument.
    """

    def bind(self, func, *args, **kwargs):
        """Apply func (to self.data when present), store the result, return self."""
        try:
            current = self.data
        except AttributeError:
            # First call in the chain: no accumulated data yet.
            self.data = func(*args, **kwargs)
        else:
            # Fix: the original bare `except:` also swallowed errors raised
            # *inside* func and silently retried without the data argument;
            # only the missing-attribute case should fall back.
            self.data = func(current, *args, **kwargs)
        return self
@ -19,7 +21,6 @@ data = FunBind()\
.bind(print_word_freqs,10)\ .bind(print_word_freqs,10)\
.data .data
print(data) print(data)
''' '''
函数是自由函数,还是正常的函数写法 函数是自由函数,还是正常的函数写法
使用 使用

@ -1,5 +1,4 @@
from cppy.cp_util import * from cppy.cp_util import *
''' '''
函数是自由函数,还是正常的函数写法 函数是自由函数,还是正常的函数写法
使用 使用
@ -7,7 +6,9 @@ from cppy.cp_util import *
- 调用 data 得到最后数据 - 调用 data 得到最后数据
''' '''
class FunPipe: class FunPipe:
def __init__(self, func, *args, **kwargs): def __init__(self, func, *args, **kwargs):
self.func = func self.func = func
self.args = args self.args = args
@ -15,7 +16,7 @@ class FunPipe:
def __or__(self, other): def __or__(self, other):
_data = self.func(*self.args, **self.kwargs) _data = self.func(*self.args, **self.kwargs)
return FunPipe( other.func,_data,*other.args,**other.kwargs) return FunPipe(other.func, _data, *other.args, **other.kwargs)
@property @property
def data(self): def data(self):
@ -23,6 +24,8 @@ class FunPipe:
# Emulate a shell-style pipeline with the | operator
pipe = (FunPipe(extract_file_words, testfilepath)
        | FunPipe(get_frequencies)
        | FunPipe(sort_dict)
        | FunPipe(print_word_freqs, 10))
pipe.data  # triggers evaluation of the final stage

@ -1,6 +1,8 @@
from cppy.cp_util import * from cppy.cp_util import *
class Flow: class Flow:
def extract_file_words(self, filepath): def extract_file_words(self, filepath):
self.data = extract_file_words(filepath) self.data = extract_file_words(filepath)
return self return self
@ -19,8 +21,8 @@ class Flow:
# Sequential fluent calls: each method stores its result and returns self
Flow().extract_file_words(
    testfilepath).get_frequencies().sort_dict().print_word_freqs(10)
''' '''
连续方法调用看起来比较舒服 连续方法调用看起来比较舒服
但是需要假设 但是需要假设

@ -1,15 +1,18 @@
from cppy.cp_util import * from cppy.cp_util import *
# 装饰器改写类 # 装饰器改写类
# - 找到以f_开头的方法 # - 找到以f_开头的方法
# - 将方法函数的返回值赋值给对象的data属性 # - 将方法函数的返回值赋值给对象的data属性
# - 返回对象自身 # - 返回对象自身
def return_self_decorator(cls): def return_self_decorator(cls):
def return_self(func): def return_self(func):
# 定义一个闭包函数,用于接收参数 # 定义一个闭包函数,用于接收参数
def wrapper(self, *args, **kwargs): def wrapper(self, *args, **kwargs):
self.data = func(self, *args, **kwargs) self.data = func(self, *args, **kwargs)
return self # 返回类自身 return self # 返回类自身
return wrapper return wrapper
for name, method in cls.__dict__.items(): for name, method in cls.__dict__.items():
@ -19,8 +22,10 @@ def return_self_decorator(cls):
setattr(cls, name, return_self(method)) setattr(cls, name, return_self(method))
return cls return cls
@return_self_decorator @return_self_decorator
class Flow(): class Flow():
def test(self): def test(self):
return 'test' return 'test'
@ -38,8 +43,8 @@ class Flow():
# Sequential fluent calls through the decorated 'f_' methods
Flow().f_extract_file_words(
    testfilepath).f_get_frequencies().f_sort_dict().f_print_word_freqs(10)
''' '''
改写后参与 function flow 功能的方法 改写后参与 function flow 功能的方法
- 需要以 'f_' 开头 - 需要以 'f_' 开头

@ -1,26 +1,30 @@
from cppy.cp_util import * from cppy.cp_util import *
from collections import Counter from collections import Counter
# 定义一个带计数器的元类 # 定义一个带计数器的元类
class CounterMetaclass(type):
    """Metaclass that injects a fresh Counter as `_counter` into every class it creates."""

    def __new__(mcs, name, bases, attrs):
        attrs['_counter'] = Counter()
        return super().__new__(mcs, name, bases, attrs)
# 基于元类创建类 # 基于元类创建类
class Word(metaclass=CounterMetaclass):
    """Each instantiation records one occurrence of `word` in the shared class counter."""

    def __init__(self, word):
        self.word = word
        # `_counter` is injected by CounterMetaclass and shared class-wide
        self._counter[self.word] += 1

    @classmethod
    def get_word_freqs(cls, n) -> list:
        """Return the n most common (word, count) pairs.

        Fix: the annotation said `-> Counter`, but Counter.most_common()
        returns a list of tuples.
        """
        return cls._counter.most_common(n)
# Instantiating Word once per word accumulates counts in the class counter
for word in extract_file_words(testfilepath):
    Word(word)

print_word_freqs(Word.get_word_freqs(10))
'''
常用于将依赖项如服务或配置自动注入到类中
'''
Loading…
Cancel
Save