feat: 修改了代码以实现新的功能或API;

fix: 修复了代码中的一个错误;
refactor: 重写/重构代码,但没有改变任何API行为;
style: 添加了空格、格式化、缺失的分号等;
test: 添加了缺失的测试或修正了现有的测试;
docs: 更新了如readme等的文档;
build: 更新了依赖项、项目版本;
ops: 影响了操作性组件,如基础设施、部署、备份、恢复;
chore: 修改了.gitignore等;
dev
Yao 2 months ago
parent 36afa1d669
commit e2eab49065

@ -9,38 +9,46 @@ data = ''
words = [] words = []
word_freqs = [] word_freqs = []
################################ ################################
# procedures # procedures
################################ ################################
def read_file(path_to_file): def read_file(path_to_file):
"""读取文件内容并赋值给全局变量data"""
global data global data
with open(path_to_file,encoding='utf-8') as f: with open(path_to_file, encoding='utf-8') as f:
data = f.read() data = f.read()
def extractwords():
def extractwords():
"""提取data中的单词并赋值给全局变量words"""
global data global data
global words global words
words = data.lower().split() words = data.lower().split()
with open(stopwordfilepath) as f: with open(stopwordfilepath) as f:
stop_words = set(f.read().split(',')) stop_words = set(f.read().split(','))
stop_words.update(string.ascii_lowercase) stop_words.update(string.ascii_lowercase)
words = [word for word in words if word not in stop_words] words = [word for word in words if word not in stop_words]
def frequencies():
def frequencies():
"""统计words中单词的频率并赋值给全局变量word_freqs"""
global words global words
global word_freqs global word_freqs
word_freqs.extend([(word, 1) for word in words]) word_freqs.extend([(word, 1) for word in words])
def sort():
global word_freqs def sort():
"""对word_freqs按照频率进行排序"""
global word_freqs
word_freqs = Counter(words).most_common() word_freqs = Counter(words).most_common()
if __name__ == "__main__": if __name__ == "__main__":
read_file( testfilepath ) read_file(testfilepath)
extractwords() extractwords()
frequencies() frequencies()
sort() sort()
for tf in word_freqs[:10]: for tf in word_freqs[:10]:
print(tf[0], '-', tf[1]) print(tf[0], '-', tf[1])

@ -3,25 +3,30 @@ from cppy.cp_util import *
def extractwords(str_data): def extractwords(str_data):
"""提取单词"""
pattern = re.compile('[\W_]+') pattern = re.compile('[\W_]+')
word_list = pattern.sub(' ', str_data).lower().split() word_list = pattern.sub(' ', str_data).lower().split()
stop_words = get_stopwords() stop_words = get_stopwords()
return [w for w in word_list if not w in stop_words] return [w for w in word_list if w not in stop_words]
def frequencies(word_list):
word_freqs = {} def frequencies(word_list):
for word in word_list: """统计单词频率"""
word_freqs[word] = word_freqs.get(word, 0) + 1 word_freqs = {}
for word in word_list:
word_freqs[word] = word_freqs.get(word, 0) + 1
return word_freqs return word_freqs
def sort(word_freq):
return sorted( word_freq.items(), key=lambda x: x[1], reverse=True ) def sort(word_freq):
"""对单词频率进行排序"""
return sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
if __name__ == "__main__": if __name__ == "__main__":
txtcontent = read_file( testfilepath ) txtcontent = read_file(testfilepath)
word_list = extractwords( txtcontent ) word_list = extractwords(txtcontent)
word_freqs = frequencies( word_list ) word_freqs = frequencies(word_list)
word_sorts = sort ( word_freqs ) word_sorts = sort(word_freqs)
for tf in word_sorts[:10]: for tf in word_sorts[:10]:
print(tf[0], '-', tf[1]) print(tf[0], '-', tf[1])

@ -3,7 +3,7 @@ from collections import Counter
from cppy.cp_util import * from cppy.cp_util import *
# 读取文件 # 读取文件
with open(testfilepath,encoding='utf-8') as f: with open(testfilepath, encoding='utf-8') as f:
data = f.read().lower() # 直接转换为小写 data = f.read().lower() # 直接转换为小写
# 过滤非字母字符 # 过滤非字母字符
@ -20,6 +20,8 @@ words = [word for word in words if word not in stop_words]
word_freqs = Counter(words) word_freqs = Counter(words)
# 排序并打印 # 排序并打印
sorted_word_freqs = sorted(word_freqs.items(), key=lambda x: x[1], reverse=True) sorted_word_freqs = sorted(word_freqs.items(),
key=lambda x: x[1],
reverse=True)
print_word_freqs(sorted_word_freqs) print_word_freqs(sorted_word_freqs)

@ -3,28 +3,31 @@ from collections import Counter
stop_words = get_stopwords() stop_words = get_stopwords()
def process_chunk(chunk): def process_chunk(chunk):
# 过滤停用词 # 过滤停用词
words = [ w for w in chunk if ( not w in stop_words ) and len(w) >= 3 ] words = [w for w in chunk if (w not in stop_words) and len(w) >= 3]
return Counter(words) return Counter(words)
def process_chunks( chunks,word_freqs,x,max ):
next = x + 1 def process_chunks(chunks, word_freqs, x, max):
"""递归处理分片"""
next = x + 1
if next < max: if next < max:
process_chunks(chunks,word_freqs,next,max) process_chunks(chunks, word_freqs, next, max)
word_list = process_chunk(chunks[x]) word_list = process_chunk(chunks[x])
word_freqs += Counter(word_list) word_freqs += Counter(word_list)
# def process_chunks( chunks,word_freqs,x,max ): # def process_chunks( chunks,word_freqs,x,max ):
# word_list = process_chunk(chunks[x]) # word_list = process_chunk(chunks[x])
# word_freqs += Counter(word_list) # word_freqs += Counter(word_list)
# next = x + 1 # next = x + 1
# if next < max: # if next < max:
# process_chunks(chunks,word_freqs,next,max) # process_chunks(chunks,word_freqs,next,max)
# 读数据按1000个词一组分片 # 读数据按1000个词一组分片
chunks = get_chunks(testfilepath,2000) chunks = get_chunks(testfilepath, 2000)
word_freqs = Counter() word_freqs = Counter()
process_chunks( chunks,word_freqs,0,len(chunks) ) process_chunks(chunks, word_freqs, 0, len(chunks))
print_word_freqs( word_freqs.most_common(10) ) print_word_freqs(word_freqs.most_common(10))

@ -1,3 +1,3 @@
from cppy.cp_util import * from cppy.cp_util import *
print_word_freqs( sort_dict ( get_frequencies ( extract_file_words(testfilepath) ))) print_word_freqs(sort_dict(get_frequencies(extract_file_words(testfilepath))))

@ -2,14 +2,16 @@ from cppy.cp_util import *
# 如果有连续的对数据加工操作,而且总是把共同加工数据对象当第一个参数,可以用本文件夹方法提升阅读体验 # 如果有连续的对数据加工操作,而且总是把共同加工数据对象当第一个参数,可以用本文件夹方法提升阅读体验
# 框架类 # 框架类
class FunBind: class FunBind:
def bind(self, func,*args, **kwargs):
def bind(self, func, *args, **kwargs):
try: try:
self.data = func(self.data,*args, **kwargs) self.data = func(self.data, *args, **kwargs)
except: except:
self.data = func(*args, **kwargs) self.data = func(*args, **kwargs)
return self return self
data = FunBind()\ data = FunBind()\
@ -19,10 +21,9 @@ data = FunBind()\
.bind(print_word_freqs,10)\ .bind(print_word_freqs,10)\
.data .data
print(data) print(data)
''' '''
函数是自由函数,还是正常的函数写法 函数是自由函数,还是正常的函数写法
使用 使用
- 列举函数名首部参数外的其它参数 - 列举函数名首部参数外的其它参数
- 调用 data 得到最后数据 - 调用 data 得到最后数据
''' '''

@ -1,28 +1,31 @@
from cppy.cp_util import * from cppy.cp_util import *
''' '''
函数是自由函数,还是正常的函数写法 函数是自由函数,还是正常的函数写法
使用 使用
- 列举函数名首部参数外的其它参数 - 列举函数名首部参数外的其它参数
- 调用 data 得到最后数据 - 调用 data 得到最后数据
''' '''
class FunPipe: class FunPipe:
def __init__(self, func, *args, **kwargs):
def __init__(self, func, *args, **kwargs):
self.func = func self.func = func
self.args = args self.args = args
self.kwargs = kwargs self.kwargs = kwargs
def __or__(self, other): def __or__(self, other):
_data = self.func(*self.args, **self.kwargs) _data = self.func(*self.args, **self.kwargs)
return FunPipe( other.func,_data,*other.args,**other.kwargs) return FunPipe(other.func, _data, *other.args, **other.kwargs)
@property @property
def data(self): def data(self):
return self.func(*self.args, **self.kwargs) return self.func(*self.args, **self.kwargs)
# 模仿管道 # 模仿管道
pipe = FunPipe(extract_file_words,testfilepath) | FunPipe(get_frequencies) | FunPipe(sort_dict) | FunPipe(print_word_freqs, 10) pipe = FunPipe(
extract_file_words,
testfilepath) | FunPipe(get_frequencies) | FunPipe(sort_dict) | FunPipe(
print_word_freqs, 10)
pipe.data pipe.data

@ -1,17 +1,19 @@
from cppy.cp_util import * from cppy.cp_util import *
class Flow: class Flow:
def extract_file_words(self, filepath): def extract_file_words(self, filepath):
self.data = extract_file_words(filepath) self.data = extract_file_words(filepath)
return self return self
def get_frequencies(self): def get_frequencies(self):
self.data = get_frequencies(self.data) self.data = get_frequencies(self.data)
return self return self
def sort_dict(self): def sort_dict(self):
self.data = sort_dict(self.data) self.data = sort_dict(self.data)
return self return self
def print_word_freqs(self, n): def print_word_freqs(self, n):
print_word_freqs(self.data, n) print_word_freqs(self.data, n)
@ -19,11 +21,11 @@ class Flow:
# 顺序调用 # 顺序调用
Flow().extract_file_words(testfilepath).get_frequencies().sort_dict().print_word_freqs(10) Flow().extract_file_words(
testfilepath).get_frequencies().sort_dict().print_word_freqs(10)
''' '''
连续方法调用看起来比较舒服 连续方法调用看起来比较舒服
但是需要假设 但是需要假设
- 每一个类方法返回 self 否则没法连续 - 每一个类方法返回 self 否则没法连续
- 类方法默认不写第一个参数数据都在 .data 里面 - 类方法默认不写第一个参数数据都在 .data 里面
''' '''

@ -1,45 +1,50 @@
from cppy.cp_util import * from cppy.cp_util import *
# 装饰器改写类 # 装饰器改写类
# - 找到以f_开头的方法 # - 找到以f_开头的方法
# - 将方法函数的返回值赋值给对象的data属性 # - 将方法函数的返回值赋值给对象的data属性
# - 返回对象自身 # - 返回对象自身
def return_self_decorator(cls): def return_self_decorator(cls):
def return_self(func): def return_self(func):
# 定义一个闭包函数,用于接收参数 # 定义一个闭包函数,用于接收参数
def wrapper(self, *args, **kwargs): def wrapper(self, *args, **kwargs):
self.data = func(self, *args, **kwargs) self.data = func(self, *args, **kwargs)
return self # 返回类自身 return self # 返回类自身
return wrapper return wrapper
for name, method in cls.__dict__.items(): for name, method in cls.__dict__.items():
# 判断属性是否可调用且属性名以f_开头 # 判断属性是否可调用且属性名以f_开头
if callable(method) and name.startswith('f_'): if callable(method) and name.startswith('f_'):
# 为类改写属性,将封装后的函数赋值 # 为类改写属性,将封装后的函数赋值
setattr(cls, name, return_self(method)) setattr(cls, name, return_self(method))
return cls return cls
@return_self_decorator @return_self_decorator
class Flow(): class Flow():
def test(self): def test(self):
return 'test' return 'test'
def f_extract_file_words(self, filepath): def f_extract_file_words(self, filepath):
return extract_file_words(filepath) return extract_file_words(filepath)
def f_get_frequencies(self): def f_get_frequencies(self):
return get_frequencies(self.data) return get_frequencies(self.data)
def f_sort_dict(self): def f_sort_dict(self):
return sort_dict(self.data) return sort_dict(self.data)
def f_print_word_freqs(self, n): def f_print_word_freqs(self, n):
print_word_freqs(self.data, n) print_word_freqs(self.data, n)
# 顺序调用 # 顺序调用
Flow().f_extract_file_words(testfilepath).f_get_frequencies().f_sort_dict().f_print_word_freqs(10) Flow().f_extract_file_words(
testfilepath).f_get_frequencies().f_sort_dict().f_print_word_freqs(10)
''' '''
改写后参与 function flow 功能的方法 改写后参与 function flow 功能的方法
- 需要以 'f_' 开头 - 需要以 'f_' 开头
@ -47,4 +52,4 @@ Flow().f_extract_file_words(testfilepath).f_get_frequencies().f_sort_dict().f_pr
仍旧需要特殊的方法写法 仍旧需要特殊的方法写法
所以还是 12种方法比较自然 所以还是 12种方法比较自然
''' '''

@ -1,26 +1,30 @@
from cppy.cp_util import * from cppy.cp_util import *
from collections import Counter from collections import Counter
# 定义一个带计数器的元类 # 定义一个带计数器的元类
class CounterMetaclass(type): class CounterMetaclass(type):
def __new__(mcs, name, bases, attrs):
attrs['_counter'] = Counter() def __new__(mcs, name, bases, attrs):
attrs['_counter'] = Counter()
return super().__new__(mcs, name, bases, attrs) return super().__new__(mcs, name, bases, attrs)
# 基于元类创建类 # 基于元类创建类
class Word( metaclass=CounterMetaclass ): class Word(metaclass=CounterMetaclass):
def __init__(self, word): def __init__(self, word):
self.word = word self.word = word
self._counter[self.word] += 1 self._counter[self.word] += 1
@classmethod @classmethod
def get_word_freqs(cls,n) -> Counter: def get_word_freqs(cls, n) -> Counter:
return cls._counter.most_common(n) return cls._counter.most_common(n)
for word in extract_file_words ( testfilepath ) : Word(word)
print_word_freqs(Word.get_word_freqs(10))
for word in extract_file_words(testfilepath):
Word(word)
print_word_freqs(Word.get_word_freqs(10))
''' '''
常用于将依赖项如服务或配置自动注入到类中 常用于将依赖项如服务或配置自动注入到类中
''' '''

Loading…
Cancel
Save