zj3D 8 months ago
parent 2518a5cd85
commit 950cb41e08

@ -0,0 +1,65 @@
from cppy.cp_util import *
from collections import Counter
from heapq import nlargest
import re
class Pipeline:
    """Base class for a pipeline stage; stages are chained with ``|``.

    Subclasses override :meth:`process`. ``a | b`` builds a new stage whose
    ``process`` feeds the result of ``a.process`` into ``b.process``.
    """

    def __init__(self):
        pass

    def __or__(self, other):
        """Return a stage that runs ``self`` first and ``other`` second."""
        return _PipelineComposition(self, other)

    def process(self, data=None):
        """Transform ``data`` and return the result.

        ``data`` is optional so that a pipeline starting with a source stage
        (one that produces data instead of consuming it) can be started with
        a bare ``pipeline.process()``.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError


class _PipelineComposition(Pipeline):
    """Stage produced by ``first | second``."""

    def __init__(self, first, second):
        super().__init__()
        self.first = first
        self.second = second

    def process(self, data=None):
        """Run ``first`` on ``data`` and hand its result to ``second``."""
        return self.second.process(self.first.process(data))


class FileReader(Pipeline):
    """Source stage that returns the whole content of a UTF-8 text file."""

    def __init__(self, filename):
        super().__init__()
        self.filename = filename

    def process(self, data=None):
        """Read and return the file content.

        ``data`` is accepted only so this stage can sit at the head of a
        composed pipeline, which always passes its input along; it is ignored.
        """
        with open(self.filename, 'r', encoding='utf-8') as file:
            return file.read()
class WordFrequencyCounter(Pipeline):
    """Pipeline stage that counts how often each word occurs in a text."""

    def process(self, text):
        """Return a ``Counter`` of the lowercased word-character runs in ``text``."""
        lowered = text.lower()
        return Counter(re.findall(r'\w+', lowered))
class TopNFilter(Pipeline):
    """Pipeline stage that keeps only the ``n`` most frequent entries."""

    def __init__(self, n):
        super().__init__()
        self.n = n

    def process(self, word_freq):
        """Return the ``n`` (word, count) pairs of ``word_freq`` with the largest counts."""
        def count_of(pair):
            # Each item is a (word, count) pair; rank by the count.
            return pair[1]

        return nlargest(self.n, word_freq.items(), key=count_of)
# Assume a text file (for example "text.txt") whose content is the text to analyse.
filename = testfilepath
n = 5  # keep the 5 highest word frequencies
# Build the pipeline.
pipeline = (
    FileReader(filename)
    | WordFrequencyCounter()
    | TopNFilter(n)
)
# Run the pipeline.
top_n_words = pipeline.process()
# Print the result.
for word, freq in top_n_words:
    print(f"{word}: {freq}")

@ -14,7 +14,7 @@ class TFFlowcls:
print(self._value)
def top10_freqs(word_freqs):
def top_freqs(word_freqs):
top10 = "\n".join(f"{word} - {count}" for word, count in word_freqs[:10])
return top10
@ -23,5 +23,5 @@ if __name__ == "__main__":
.bind(extract_file_words)\
.bind(get_frequencies)\
.bind(sort_dict)\
.bind(top10_freqs)\
.bind(top_freqs)\
.over()

@ -3,7 +3,7 @@ from cppy.cp_util import *
#
# 框架类
#
class TFFlowAll:
class TFFlowcls:
def __init__(self, func):
self._funcs = [func]
@ -11,20 +11,15 @@ class TFFlowAll:
self._funcs.append(func)
return self
def execute(self):
def is_callable(obj):
"""Check if an object is callable."""
return hasattr(obj, '__call__')
def execute(self):
def call_if_possible(obj):
"""Call the object if it's callable, otherwise return it as is."""
return obj() if is_callable(obj) else obj
return obj() if hasattr(obj, '__call__') else obj
# Initialize the value to a no-op lambda function
value = lambda: None
for func in self._funcs:
value = call_if_possible(func(value))
print(call_if_possible(value))
value = call_if_possible(func(value))
#
# 工具函数
@ -51,12 +46,12 @@ def sort(word_freq):
def top10_freqs(word_freqs):
def _f():
return '\n'.join(f"{word} - {freq}" for word, freq in word_freqs[:10])
return print_word_freqs( word_freqs )
return _f
if __name__ == "__main__":
TFFlowAll(get_input)\
TFFlowcls(get_input)\
.bind(extractwords)\
.bind(frequencies)\
.bind(sort)\

@ -0,0 +1,35 @@
from cppy.cp_util import *
from collections import Counter
class Pipe:
    """A callable pipeline stage; stages are composed left to right with ``|``.

    Args:
        func: callable invoked as ``func(data, *args, **kwargs)``.
        *args: extra positional arguments appended after ``data``.
        kwargs: optional mapping of keyword arguments forwarded to ``func``.
    """

    def __init__(self, func, *args, kwargs=None):
        self.func = func
        # Always set both attributes so __call__ never hits a missing one
        # when no extra arguments were supplied.
        self.args = args
        self.kwargs = dict(kwargs) if kwargs else {}

    def __or__(self, other):
        """Return a new ``Pipe`` that feeds this stage's result into ``other``."""
        if not callable(other):
            return NotImplemented
        return Pipe(lambda data: other(self(data)))

    def __call__(self, data):
        """Run the stage on ``data``; the result is returned and kept in ``_value``."""
        self._value = self.func(data, *self.args, **self.kwargs)
        return self._value
def read_file(filename):
    """Return the full content of the text file ``filename``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read()
def split_words(text):
    """Return the lowercased words of ``text`` in order of appearance."""
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]
def count_words(words):
    """Return a ``Counter`` holding the number of occurrences of each word."""
    tally = Counter()
    tally.update(words)
    return tally
def top_n_words(word_counts, n):
    """Return the ``n`` most common (word, count) pairs of ``word_counts``."""
    ranked = word_counts.most_common(n)
    return ranked
# Use the pipeline.
pipe = (
    Pipe(extract_file_words)
    | Pipe(get_frequencies)
    | Pipe(sort_dict)
    | Pipe(print_word_freqs, 5)
)
result = pipe(testfilepath)
print(result)

@ -0,0 +1,49 @@
import re
from collections import Counter
from functools import reduce
from cppy.cp_util import *
class Pipeline:
    """Wrap a one-argument callable so that stages can be chained with ``|``."""

    def __init__(self, function):
        self.function = function

    def __or__(self, other):
        """Return a pipeline that applies this stage and then ``other``.

        ``other`` may be another ``Pipeline`` or any plain callable taking one
        argument. The result of this stage is passed to ``other`` as input.

        Raises:
            TypeError: if ``other`` is neither a ``Pipeline`` nor callable.
        """
        if isinstance(other, Pipeline):
            following = other.function
        elif callable(other):
            following = other
        else:
            raise TypeError("The argument must be a Pipeline instance or a callable")
        preceding = self.function
        # Compose the stages; a boolean `or` here would skip the second stage
        # whenever the first one returned something truthy.
        return Pipeline(lambda x: following(preceding(x)))

    def process(self, data):
        """Run the wrapped callable on ``data`` and return its result."""
        return self.function(data)
# Processing functions.
def read_file(path):
    """Return the entire content of the UTF-8 text file at ``path``."""
    with open(path, encoding='utf-8') as handle:
        content = handle.read()
    return content
def clean_text(text):
    """Strip every character that is neither a word character nor whitespace, then lowercase."""
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower()
def tokenize(text):
    """Return the list of word tokens found in ``text``, in order."""
    return [match.group(0) for match in re.finditer(r'\b\w+\b', text)]
def remove_stop_words(tokens, stop_words):
    """Return a list of the tokens that are not in ``stop_words``, order preserved."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept
def count_frequencies(tokens):
    """Return a ``Counter`` mapping each token to its number of occurrences."""
    frequencies = Counter()
    frequencies.update(tokens)
    return frequencies
def get_top_n_frequencies(counter, n):
    """Return the ``n`` most common (token, count) pairs of ``counter``."""
    top_entries = counter.most_common(n)
    return top_entries
# Stop-word list.
stop_words = {
    'the', 'and', 'a', 'to', 'of', 'in', 'for', 'on', 'is', 'it',
    'with', 'that', 'as', 'by', 'this', 'at', 'be', 'which', 'from', 'or',
    'are', 'an', 'but', 'not', 'you', 'have', 'your', 'can', 'will', 'all',
    'any', 'if', 'their', 'would', 'what', 'there', 'when',
    'who', 'whom', 'whose', 'where', 'why',
}
# Build the pipeline.
# Every stage is wrapped in Pipeline so that `|` always combines two Pipeline
# objects. The two-argument helpers are bound to their extra argument here,
# because each stage is called with a single value.
pipeline = (
    Pipeline(read_file)
    | Pipeline(clean_text)
    | Pipeline(tokenize)
    | Pipeline(lambda tokens: remove_stop_words(tokens, stop_words))
    | Pipeline(count_frequencies)
    | Pipeline(lambda counter: get_top_n_frequencies(counter, n=10))
)
# Run the pipeline and print the result.
top_n_word_frequencies = pipeline.process(testfilepath)
print(top_n_word_frequencies)

@ -1,45 +1,20 @@
from cppy.cp_util import *
###########################################
#
# 生成器
###########################################
def characters(filename):  # yields one character at a time
    """Yield every character of the UTF-8 text file ``filename``, reading it line by line.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the generator is exhausted or closed, instead of being left to the
    garbage collector.
    """
    with open(filename, encoding='utf-8') as handle:
        for line in handle:
            yield from line
def all_words(filename):  # yields one word at a time
    """Yield each lowercased alphanumeric run produced by ``characters(filename)``.

    A word is a maximal run of characters for which ``str.isalnum`` is true;
    any other character ends the current word.
    """
    word = ""
    for c in characters(filename):
        if c.isalnum():
            # Inside a word (or starting one): accumulate the character.
            word += c.lower()
        elif word:
            # A separator right after a word: emit the word and reset.
            yield word
            word = ""
    if word:
        # The input ended in the middle of a word (no trailing separator);
        # emit it instead of silently dropping it.
        yield word
def non_stop_words(filename, stopwords):
    """Yield the words of ``all_words(filename)`` that are not in ``stopwords``."""
    for candidate in all_words(filename):
        if candidate in stopwords:
            continue
        yield candidate  # emit a word that passed the stop-word check
if __name__ == "__main__":
#
def non_stop_words(testfilepath):
    """Yield every word of the file at ``testfilepath`` that is not a stop word.

    Relies on ``get_stopwords``, ``read_file`` and ``re_split``, which are
    defined outside this block (presumably in ``cppy.cp_util`` - confirm).
    """
    excluded = get_stopwords()
    candidates = re_split(read_file(testfilepath))
    for candidate in candidates:
        if candidate in excluded:
            continue
        yield candidate  # emit one non-stop word
freqs = {}
for word in non_stop_words(testfilepath,stopwords):
freqs[word] = freqs.get(word, 0) + 1
freqs = {}
for word in non_stop_words(testfilepath):
freqs[word] = freqs.get(word, 0) + 1
data = sort_dict(freqs)
print_word_freqs(data)
data = sort_dict(freqs)
print_word_freqs(data)
Loading…
Cancel
Save