feat: 修改了代码以实现新的功能或API;

fix: 修复了代码中的一个错误;
refactor: 重写/重构代码,但没有改变任何API行为;
style: 添加了空格、格式化、缺失的分号等;
test: 添加了缺失的测试或修正了现有的测试;
docs: 更新了如readme等的文档;
build: 更新了依赖项、项目版本;
ops: 影响了操作性组件,如基础设施、部署、备份、恢复;
chore: 修改了.gitignore等;
dev
Yao 2 months ago
parent 36afa1d669
commit e2eab49065

@ -9,38 +9,46 @@ data = ''
words = []
word_freqs = []
################################
# procedures
################################
def read_file(path_to_file):
"""读取文件内容并赋值给全局变量data"""
global data
with open(path_to_file,encoding='utf-8') as f:
with open(path_to_file, encoding='utf-8') as f:
data = f.read()
def extractwords():
    """Split the global ``data`` into lowercase words and drop stop words.

    The result is stored in the global ``words``. Stop words are read from
    the comma-separated file at ``stopwordfilepath``; every single ASCII
    lowercase letter is treated as a stop word as well.
    """
    global words
    words = data.lower().split()
    # Decode explicitly as UTF-8, as read_file() does, so the result does not
    # depend on the platform's default encoding.
    with open(stopwordfilepath, encoding='utf-8') as f:
        stop_words = set(f.read().split(','))
    stop_words.update(string.ascii_lowercase)
    words = [word for word in words if word not in stop_words]
def frequencies():
def frequencies():
"""统计words中单词的频率并赋值给全局变量word_freqs"""
global words
global word_freqs
word_freqs.extend([(word, 1) for word in words])
def sort():
global word_freqs
def sort():
"""对word_freqs按照频率进行排序"""
global word_freqs
word_freqs = Counter(words).most_common()
if __name__ == "__main__":
read_file( testfilepath )
extractwords()
read_file(testfilepath)
extractwords()
frequencies()
sort()
for tf in word_freqs[:10]:
print(tf[0], '-', tf[1])
print(tf[0], '-', tf[1])

@ -3,25 +3,30 @@ from cppy.cp_util import *
def extractwords(str_data):
    """Return the lowercase words of ``str_data`` that are not stop words.

    Every run of non-word characters and/or underscores is replaced by a
    single space before the text is lowercased and split on whitespace.
    The stop-word collection is obtained from ``get_stopwords()``.
    """
    # Raw string: in a normal literal '\W' is an invalid escape sequence,
    # which newer Python versions report with a warning at compile time.
    pattern = re.compile(r'[\W_]+')
    word_list = pattern.sub(' ', str_data).lower().split()
    stop_words = get_stopwords()
    return [w for w in word_list if w not in stop_words]
def frequencies(word_list):
    """Count how often each word occurs in ``word_list``.

    Returns:
        A plain ``dict`` mapping each word to its number of occurrences,
        with keys in order of first appearance.
    """
    # Local import so the function does not rely on the module's import list.
    from collections import Counter

    # Counter does the tallying; convert back so callers still get a dict.
    return dict(Counter(word_list))
def sort(word_freq):
    """Return the (word, count) pairs of ``word_freq``, highest count first."""

    def count_of(pair):
        return pair[1]

    pairs = list(word_freq.items())
    # list.sort is stable, so pairs with equal counts keep the mapping's order.
    pairs.sort(key=count_of, reverse=True)
    return pairs
if __name__ == "__main__":
txtcontent = read_file( testfilepath )
word_list = extractwords( txtcontent )
word_freqs = frequencies( word_list )
word_sorts = sort ( word_freqs )
txtcontent = read_file(testfilepath)
word_list = extractwords(txtcontent)
word_freqs = frequencies(word_list)
word_sorts = sort(word_freqs)
for tf in word_sorts[:10]:
print(tf[0], '-', tf[1])
print(tf[0], '-', tf[1])

@ -3,7 +3,7 @@ from collections import Counter
from cppy.cp_util import *
# 读取文件
with open(testfilepath,encoding='utf-8') as f:
with open(testfilepath, encoding='utf-8') as f:
data = f.read().lower() # 直接转换为小写
# 过滤非字母字符
@ -20,6 +20,8 @@ words = [word for word in words if word not in stop_words]
word_freqs = Counter(words)
# 排序并打印
sorted_word_freqs = sorted(word_freqs.items(), key=lambda x: x[1], reverse=True)
sorted_word_freqs = sorted(word_freqs.items(),
key=lambda x: x[1],
reverse=True)
print_word_freqs(sorted_word_freqs)
print_word_freqs(sorted_word_freqs)

@ -3,28 +3,31 @@ from collections import Counter
stop_words = get_stopwords()
def process_chunk(chunk):
# 过滤停用词
words = [ w for w in chunk if ( not w in stop_words ) and len(w) >= 3 ]
words = [w for w in chunk if (w not in stop_words) and len(w) >= 3]
return Counter(words)
def process_chunks(chunks, word_freqs, x, max):
    """Recursively add the word counts of ``chunks[x:max]`` to ``word_freqs``.

    The recursion descends to the last chunk first, so chunks are counted
    from the highest index back down to ``x``. One stack frame is used per
    chunk.

    Args:
        chunks: Indexable collection of chunks accepted by ``process_chunk``.
        word_freqs: ``Counter`` that is updated in place.
        x: Index of the chunk handled by this call.
        max: Exclusive upper bound of the indices to process.
    """
    if x >= max:
        # Nothing to do; this also covers an empty chunk list, which would
        # otherwise fail on chunks[x].
        return
    following = x + 1
    if following < max:
        process_chunks(chunks, word_freqs, following, max)
    # In-place Counter addition, so the caller's object receives the counts.
    word_freqs += process_chunk(chunks[x])
# def process_chunks( chunks,word_freqs,x,max ):
# word_list = process_chunk(chunks[x])
# word_list = process_chunk(chunks[x])
# word_freqs += Counter(word_list)
# next = x + 1
# if next < max:
# process_chunks(chunks,word_freqs,next,max)
# process_chunks(chunks,word_freqs,next,max)
# 读数据按2000个词一组分片
chunks = get_chunks(testfilepath,2000)
chunks = get_chunks(testfilepath, 2000)
word_freqs = Counter()
process_chunks( chunks,word_freqs,0,len(chunks) )
print_word_freqs( word_freqs.most_common(10) )
process_chunks(chunks, word_freqs, 0, len(chunks))
print_word_freqs(word_freqs.most_common(10))

@ -1,3 +1,3 @@
from cppy.cp_util import *
print_word_freqs( sort_dict ( get_frequencies ( extract_file_words(testfilepath) )))
print_word_freqs(sort_dict(get_frequencies(extract_file_words(testfilepath))))

@ -2,14 +2,16 @@ from cppy.cp_util import *
# 如果有连续的对数据加工操作,而且总是把共同加工数据对象当第一个参数,可以用本文件夹方法提升阅读体验
# 框架类
class FunBind:
    """Chain free functions by feeding each result into the next call.

    The running value lives in ``self.data``; the attribute does not exist
    until the first ``bind`` call has produced it.
    """

    def bind(self, func, *args, **kwargs):
        """Apply ``func`` and store its result in ``self.data``.

        If a value is already stored, it is passed to ``func`` as the first
        positional argument ahead of ``args``/``kwargs``. On the first call,
        when nothing is stored yet, ``func`` receives only ``args``/``kwargs``.

        Returns:
            ``self``, so calls can be chained.
        """
        # Guard only the attribute lookup. Wrapping the call itself in a
        # catch-all would hide errors raised by func and run it a second
        # time without the stored data.
        try:
            current = self.data
        except AttributeError:
            self.data = func(*args, **kwargs)
        else:
            self.data = func(current, *args, **kwargs)
        return self
data = FunBind()\
@ -19,10 +21,9 @@ data = FunBind()\
.bind(print_word_freqs,10)\
.data
print(data)
'''
函数是自由函数,还是正常的函数写法
使用
- 列举函数名首部参数外的其它参数
- 调用 data 得到最后数据
'''
'''

@ -1,28 +1,31 @@
from cppy.cp_util import *
'''
函数是自由函数,还是正常的函数写法
使用
- 列举函数名首部参数外的其它参数
- 调用 data 得到最后数据
'''
class FunPipe:
    """One stage of a ``|`` pipeline: a function plus its pending arguments."""

    def __init__(self, func, *args, **kwargs):
        """Remember ``func`` and the arguments it will later be called with."""
        self.func = func
        self.args = args
        self.kwargs = kwargs

    def __or__(self, other):
        """Run this stage and hand its result to ``other`` as first argument.

        Returns:
            A new ``FunPipe`` wrapping ``other.func`` with this stage's result
            placed ahead of ``other``'s own arguments, or ``NotImplemented``
            when ``other`` does not look like a pipeline stage.
        """
        # Inspect the right-hand operand before doing any work, so an invalid
        # operand gives the usual TypeError without running this stage.
        try:
            next_func, next_args, next_kwargs = (other.func, other.args,
                                                 other.kwargs)
        except AttributeError:
            return NotImplemented
        _data = self.func(*self.args, **self.kwargs)
        return FunPipe(next_func, _data, *next_args, **next_kwargs)

    @property
    def data(self):
        """Call the stored function with its arguments and return the result."""
        return self.func(*self.args, **self.kwargs)
# 模仿管道
pipe = FunPipe(extract_file_words,testfilepath) | FunPipe(get_frequencies) | FunPipe(sort_dict) | FunPipe(print_word_freqs, 10)
pipe = FunPipe(
extract_file_words,
testfilepath) | FunPipe(get_frequencies) | FunPipe(sort_dict) | FunPipe(
print_word_freqs, 10)
pipe.data

@ -1,17 +1,19 @@
from cppy.cp_util import *
class Flow:
def extract_file_words(self, filepath):
self.data = extract_file_words(filepath)
return self
def get_frequencies(self):
self.data = get_frequencies(self.data)
return self
def sort_dict(self):
self.data = sort_dict(self.data)
return self
return self
def print_word_freqs(self, n):
print_word_freqs(self.data, n)
@ -19,11 +21,11 @@ class Flow:
# 顺序调用
Flow().extract_file_words(testfilepath).get_frequencies().sort_dict().print_word_freqs(10)
Flow().extract_file_words(
testfilepath).get_frequencies().sort_dict().print_word_freqs(10)
'''
连续方法调用看起来比较舒服
但是需要假设
- 每一个类方法返回 self 否则没法连续
- 类方法默认不写第一个参数数据都在 .data 里面
'''
'''

@ -1,45 +1,50 @@
from cppy.cp_util import *
def return_self_decorator(cls):
    """Class decorator that makes every ``f_``-prefixed method chainable.

    Each callable class attribute whose name starts with ``f_`` is replaced
    by a wrapper that stores the method's return value in ``self.data`` and
    then returns ``self``. All other attributes are left untouched.

    Returns:
        The same class object, modified in place.
    """
    from functools import wraps

    def return_self(func):
        @wraps(func)  # keep the wrapped method's __name__ and __doc__
        def wrapper(self, *args, **kwargs):
            self.data = func(self, *args, **kwargs)
            return self  # hand back the instance so calls can be chained

        return wrapper

    # Walk a snapshot: setattr() below writes to the namespace being iterated.
    for name, method in list(cls.__dict__.items()):
        if callable(method) and name.startswith('f_'):
            setattr(cls, name, return_self(method))
    return cls
@return_self_decorator
class Flow():
class Flow():
def test(self):
return 'test'
def f_extract_file_words(self, filepath):
return extract_file_words(filepath)
return extract_file_words(filepath)
def f_get_frequencies(self):
return get_frequencies(self.data)
return get_frequencies(self.data)
def f_sort_dict(self):
return sort_dict(self.data)
return sort_dict(self.data)
def f_print_word_freqs(self, n):
print_word_freqs(self.data, n)
print_word_freqs(self.data, n)
# 顺序调用
Flow().f_extract_file_words(testfilepath).f_get_frequencies().f_sort_dict().f_print_word_freqs(10)
Flow().f_extract_file_words(
testfilepath).f_get_frequencies().f_sort_dict().f_print_word_freqs(10)
'''
改写后参与 function flow 功能的方法
- 需要以 'f_' 开头
@ -47,4 +52,4 @@ Flow().f_extract_file_words(testfilepath).f_get_frequencies().f_sort_dict().f_pr
仍旧需要特殊的方法写法
所以还是 12种方法比较自然
'''
'''

@ -1,26 +1,30 @@
from cppy.cp_util import *
from collections import Counter
# Metaclass that equips each class it creates with a counter
class CounterMetaclass(type):
    """Metaclass that adds a fresh ``Counter`` as the ``_counter`` attribute."""

    def __new__(mcs, name, bases, attrs):
        attrs['_counter'] = Counter()
        return super().__new__(mcs, name, bases, attrs)


# Class created through the metaclass
class Word(metaclass=CounterMetaclass):
    """A word whose construction is tallied in the class-level ``_counter``."""

    def __init__(self, word):
        self.word = word
        # _counter is the class attribute injected by CounterMetaclass, so
        # all instances of this class share the same tally.
        self._counter[self.word] += 1

    @classmethod
    def get_word_freqs(cls, n) -> list:
        """Return the ``n`` most common words as a list of (word, count) pairs."""
        return cls._counter.most_common(n)
for word in extract_file_words ( testfilepath ) : Word(word)
print_word_freqs(Word.get_word_freqs(10))
for word in extract_file_words(testfilepath):
Word(word)
print_word_freqs(Word.get_word_freqs(10))
'''
常用于将依赖项如服务或配置自动注入到类中
'''
'''

Loading…
Cancel
Save