dev
parent
bfcaab3439
commit
239c0188d0
@ -1,65 +0,0 @@
|
||||
from cppy.cp_util import *
|
||||
from collections import Counter
|
||||
from heapq import nlargest
|
||||
import re
|
||||
|
||||
|
||||
class Pipeline:
    """Base class for a processing stage that can be chained with ``|``.

    Subclasses override :meth:`process`. ``a | b`` builds a new stage whose
    ``process`` feeds the output of ``a.process`` into ``b.process``.
    """

    def __init__(self):
        pass

    def __or__(self, other):
        """Return a stage that runs ``self`` first and ``other`` second."""
        return _PipelineComposition(self, other)

    def process(self, data=None):
        """Transform ``data``; must be overridden by subclasses.

        ``data`` is optional so that a pipeline whose first stage is a
        source (it produces data instead of consuming it) can be started
        with a bare ``process()`` call.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError


class _PipelineComposition(Pipeline):
    """Two stages run back to back; created by ``Pipeline.__or__``."""

    def __init__(self, first, second):
        super().__init__()
        self.first = first
        self.second = second

    def process(self, data=None):
        """Run ``first`` on ``data`` and hand its result to ``second``."""
        return self.second.process(self.first.process(data))


class FileReader(Pipeline):
    """Source stage that yields the full text of a file."""

    def __init__(self, filename):
        super().__init__()
        self.filename = filename

    def process(self, data=None):
        """Return the content of ``self.filename`` decoded as UTF-8.

        ``data`` is accepted only so this stage can sit inside a composed
        pipeline, which always passes its input along; the value is ignored.
        """
        with open(self.filename, 'r', encoding='utf-8') as file:
            return file.read()
|
||||
|
||||
|
||||
class WordFrequencyCounter(Pipeline):
    """Stage that turns a text into a ``Counter`` of its lower-cased words."""

    def process(self, text):
        """Return how often each ``\\w+`` run occurs in ``text``, ignoring case."""
        lowered = text.lower()
        return Counter(match.group() for match in re.finditer(r'\w+', lowered))
|
||||
|
||||
|
||||
class TopNFilter(Pipeline):
    """Stage that keeps the ``n`` entries with the highest counts."""

    def __init__(self, n):
        super().__init__()
        self.n = n

    def process(self, word_freq):
        """Return the ``self.n`` ``(word, count)`` pairs with the largest counts.

        ``word_freq`` must provide ``items()`` yielding ``(word, count)`` pairs.
        """

        def count_of(pair):
            # Rank entries by their count, the second element of each pair.
            return pair[1]

        return nlargest(self.n, word_freq.items(), key=count_of)
|
||||
|
||||
|
||||
# Assume there is a text file "text.txt" whose content is the text to analyse
filename = testfilepath
n = 5  # report the 5 most frequent words

# Build the pipeline
pipeline = (
    FileReader(filename)
    | WordFrequencyCounter()
    | TopNFilter(n)
)

# Run the pipeline
top_n_words = pipeline.process()

# Print the results
for word, freq in top_n_words:
    print(f"{word}: {freq}")
|
@ -1,49 +1,25 @@
|
||||
import re
|
||||
from collections import Counter
|
||||
from functools import reduce
|
||||
from cppy.cp_util import *
|
||||
|
||||
class Pipeline:
    """Wrap a one-argument callable so that stages can be chained with ``|``."""

    def __init__(self, function):
        self.function = function

    def __or__(self, other):
        """Return a new ``Pipeline`` that runs ``self`` and then ``other``.

        Args:
            other: the next stage, either another ``Pipeline`` or a plain
                one-argument callable.

        Raises:
            TypeError: if ``other`` is neither a ``Pipeline`` nor callable.
        """
        # Compose rather than ``or``: the next stage must always receive the
        # previous stage's output, whatever its truthiness.
        if isinstance(other, Pipeline):
            return Pipeline(lambda x: other.function(self.function(x)))
        if callable(other):
            return Pipeline(lambda x: other(self.function(x)))
        raise TypeError(
            "The argument must be an instance of Pipeline or a callable"
        )


# Chained method calls like the ones below also read quite nicely.
# Every method returns self; for the last method in a chain, check whether
# it needs to return a value.
|
||||
class Flow:
    """Fluent steps: a step stores its result on ``self.data`` and returns ``self``."""

    def extract_file_words(self, filepath):
        """Store the result of the module-level ``extract_file_words(filepath)``.

        The helper is not defined in this file; presumably it comes from the
        ``cppy.cp_util`` star import. Returns ``self`` so calls can be chained.
        """
        extracted = extract_file_words(filepath)
        self.data = extracted
        return self
|
||||
|
||||
def process(self, data):
    """Apply the wrapped ``self.function`` to ``data`` and return its result."""
    stage = self.function
    return stage(data)
|
||||
def get_frequencies(self):
    """Replace ``self.data`` with ``get_frequencies(self.data)`` and return ``self``.

    The helper called here is the module-level ``get_frequencies``, which is
    not defined in this file (presumably from ``cppy.cp_util``).
    """
    counted = get_frequencies(self.data)
    self.data = counted
    return self
|
||||
|
||||
# Define the processing functions
|
||||
def read_file(path):
    """Return the whole content of the file at ``path``, decoded as UTF-8."""
    with open(path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
|
||||
def sort_dict(self):
    """Replace ``self.data`` with ``sort_dict(self.data)`` and return ``self``.

    The helper called here is the module-level ``sort_dict``, which is not
    defined in this file (presumably from ``cppy.cp_util``).
    """
    ordered = sort_dict(self.data)
    self.data = ordered
    return self
|
||||
|
||||
def clean_text(text):
    """Drop every character that is neither a word character nor whitespace, then lower-case."""
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower()
|
||||
def print_word_freqs(self, n):
    """Call the module-level ``print_word_freqs(self.data, n)`` and return ``self``.

    The helper is not defined in this file (presumably from ``cppy.cp_util``);
    ``self.data`` is left untouched by this step.
    """
    current = self.data
    print_word_freqs(current, n)
    return self
|
||||
|
||||
def tokenize(text):
    """Return the list of word tokens (``\\b\\w+\\b`` matches) in ``text``, in order."""
    return [match.group() for match in re.finditer(r'\b\w+\b', text)]
|
||||
|
||||
def remove_stop_words(tokens, stop_words):
    """Return the tokens, in order, that are not contained in ``stop_words``."""
    kept = []
    for word in tokens:
        if word in stop_words:
            continue
        kept.append(word)
    return kept
|
||||
|
||||
def count_frequencies(tokens):
    """Return a ``Counter`` with the number of occurrences of each token."""
    frequencies = Counter()
    frequencies.update(tokens)
    return frequencies
|
||||
|
||||
def get_top_n_frequencies(counter, n):
    """Return the ``n`` most common ``(item, count)`` pairs of ``counter``."""
    top_entries = counter.most_common(n)
    return top_entries
|
||||
|
||||
# Define the stop-word list
stop_words = {
    'the', 'and', 'a', 'to', 'of', 'in', 'for', 'on', 'is', 'it', 'with',
    'that', 'as', 'by', 'this', 'at', 'be', 'which', 'from', 'or', 'are',
    'an', 'but', 'not', 'you', 'have', 'your', 'can', 'will', 'all', 'any',
    'if', 'their', 'would', 'what', 'there', 'when', 'who', 'whom', 'whose',
    'where', 'why',
}

# Build the pipeline. Every stage is wrapped in a Pipeline, and stages that
# need extra arguments are bound through a lambda so that each stage takes
# exactly one input: the previous stage's output.
pipeline = (
    Pipeline(read_file)
    | Pipeline(clean_text)
    | Pipeline(tokenize)
    | Pipeline(lambda tokens: remove_stop_words(tokens, stop_words))
    | Pipeline(count_frequencies)
    | Pipeline(lambda counter: get_top_n_frequencies(counter, n=10))
)

# Run the pipeline and print the result
top_n_word_frequencies = pipeline.process(testfilepath)
print(top_n_word_frequencies)
|
||||
# Sequential (chained) calls
(
    Flow()
    .extract_file_words(testfilepath)
    .get_frequencies()
    .sort_dict()
    .print_word_freqs(10)
)
|
||||
|
Loading…
Reference in new issue