forked from p46318075/CodePattern
dev
parent
bfcaab3439
commit
239c0188d0
@ -1,65 +0,0 @@
|
|||||||
from cppy.cp_util import *
|
|
||||||
from collections import Counter
|
|
||||||
from heapq import nlargest
|
|
||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
# Sentinel meaning "no input was supplied". It is distinct from None so that
# None can still travel through a pipeline as ordinary data.
_NO_INPUT = object()


class Pipeline:
    """Base class for a processing stage that can be chained with ``|``.

    Subclasses override :meth:`process`. ``a | b`` builds a new stage that
    feeds the output of ``a`` into ``b``.
    """

    def __init__(self):
        pass

    def __or__(self, other):
        """Return a stage that runs ``self`` first and then ``other``."""
        return _PipelineComposition(self, other)

    def process(self, data=None):
        """Transform ``data``; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError


class _PipelineComposition(Pipeline):
    """Two stages run back to back; created by ``Pipeline.__or__``."""

    def __init__(self, first, second):
        super().__init__()
        self.first = first
        self.second = second

    def process(self, data=_NO_INPUT):
        """Run ``first`` then ``second`` and return the final result.

        When called without an argument, ``first`` is invoked without one as
        well, so a pipeline may begin with a source stage that takes no input.
        """
        if data is _NO_INPUT:
            intermediate = self.first.process()
        else:
            intermediate = self.first.process(data)
        return self.second.process(intermediate)
|
|
||||||
|
|
||||||
|
|
||||||
class FileReader(Pipeline):
    """Source stage that returns the full text of a UTF-8 encoded file."""

    def __init__(self, filename):
        super().__init__()
        self.filename = filename

    def process(self, data=None):
        """Read the file named at construction and return its content.

        ``data`` is accepted and ignored so that this stage can be called
        with the same ``process(data)`` signature as every other stage.
        """
        with open(self.filename, 'r', encoding='utf-8') as file:
            return file.read()
|
|
||||||
|
|
||||||
|
|
||||||
class WordFrequencyCounter(Pipeline):
    """Stage that turns a text into a ``Counter`` of its lower-cased words."""

    def process(self, text):
        """Return how often each ``\\w+`` run occurs in the lower-cased text."""
        lowered = text.lower()
        return Counter(re.findall(r'\w+', lowered))
|
|
||||||
|
|
||||||
|
|
||||||
class TopNFilter(Pipeline):
    """Stage that keeps the ``n`` entries with the highest counts."""

    def __init__(self, n):
        super().__init__()
        self.n = n

    def process(self, word_freq):
        """Return the ``n`` largest ``(word, count)`` pairs, ranked by count."""

        def count_of(pair):
            return pair[1]

        pairs = word_freq.items()
        return nlargest(self.n, pairs, key=count_of)
|
|
||||||
|
|
||||||
|
|
||||||
# Assume there is a text file "text.txt" whose content is to be analysed
filename = testfilepath
n = 5  # report the 5 most frequent words

# Build the pipeline
pipeline = (
    FileReader(filename)
    | WordFrequencyCounter()
    | TopNFilter(n)
)

# Run the pipeline; the first stage reads the file, so no input is passed
top_n_words = pipeline.process()

# Print the results
for word, freq in top_n_words:
    print(f"{word}: {freq}")
|
|
@ -1,49 +1,25 @@
|
|||||||
import re
|
|
||||||
from collections import Counter
|
|
||||||
from functools import reduce
|
|
||||||
from cppy.cp_util import *
|
from cppy.cp_util import *
|
||||||
|
|
||||||
class Pipeline:
    """Wrap a one-argument function so that stages can be chained with ``|``."""

    def __init__(self, function):
        self.function = function

    def __or__(self, other):
        """Return a pipeline that applies ``self`` and then ``other``.

        ``other`` may be another ``Pipeline`` or any plain callable taking one
        argument.

        Raises:
            TypeError: if ``other`` is neither a ``Pipeline`` nor callable.
        """
        if isinstance(other, Pipeline):
            following = other.function
        elif callable(other):
            following = other
        else:
            raise TypeError(
                "The argument must be an instance of Pipeline or a callable")
        preceding = self.function
        # Feed the first stage's output into the second. A boolean ``or``
        # here would skip the second stage whenever the first result is truthy.
        return Pipeline(lambda x: following(preceding(x)))

    def process(self, data):
        """Run the wrapped function on ``data`` and return its result."""
        return self.function(data)


# This style of chained method calls also reads quite comfortably.
# Every method returns self; take care whether the last method in a chain
# needs to return a value.
class Flow:
    """Fluent wrapper: each step stores its result in ``self.data``.

    Every method delegates to the module-level function of the same name
    (presumably provided by ``cppy.cp_util`` -- confirm) and returns ``self``
    so that calls can be chained.
    """

    def extract_file_words(self, filepath):
        """Store the result of ``extract_file_words(filepath)``."""
        self.data = extract_file_words(filepath)
        return self

    def get_frequencies(self):
        """Replace ``self.data`` with ``get_frequencies(self.data)``."""
        self.data = get_frequencies(self.data)
        return self

    def sort_dict(self):
        """Replace ``self.data`` with ``sort_dict(self.data)``."""
        self.data = sort_dict(self.data)
        return self

    def print_word_freqs(self, n):
        """Call ``print_word_freqs(self.data, n)``; ``self.data`` is unchanged."""
        print_word_freqs(self.data, n)
        return self


# Processing functions
def read_file(path):
    """Return the whole content of the UTF-8 file at ``path``."""
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()


def clean_text(text):
    """Drop every character that is neither ``\\w`` nor whitespace, then lower-case."""
    return re.sub(r'[^\w\s]', '', text).lower()
|
||||||
|
|
||||||
def tokenize(text):
    """Split ``text`` into its word tokens, in order of appearance."""
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(text)
|
|
||||||
|
|
||||||
def remove_stop_words(tokens, stop_words):
    """Return the tokens that are not in ``stop_words``, keeping their order."""
    kept = []
    for word in tokens:
        if word in stop_words:
            continue
        kept.append(word)
    return kept


# Call the steps one after another
|
(
    Flow()
    .extract_file_words(testfilepath)
    .get_frequencies()
    .sort_dict()
    .print_word_freqs(10)
)
|
||||||
|
|
||||||
def count_frequencies(tokens):
    """Return a ``Counter`` mapping each token to its number of occurrences."""
    tally = Counter()
    tally.update(tokens)
    return tally
|
|
||||||
|
|
||||||
def get_top_n_frequencies(counter, n):
    """Return the ``n`` most common ``(item, count)`` pairs of ``counter``."""
    ranked = counter.most_common(n)
    return ranked
|
|
||||||
|
|
||||||
# Stop words to exclude from the frequency count
stop_words = {
    'the', 'and', 'a', 'to', 'of', 'in', 'for', 'on', 'is', 'it', 'with',
    'that', 'as', 'by', 'this', 'at', 'be', 'which', 'from', 'or', 'are',
    'an', 'but', 'not', 'you', 'have', 'your', 'can', 'will', 'all', 'any',
    'if', 'their', 'would', 'what', 'there', 'when', 'who', 'whom', 'whose',
    'where', 'why',
}

# Build the pipeline. Every stage is wrapped in a Pipeline, and stages that
# need an extra argument bind it in a lambda so that each one takes exactly
# one input when the pipeline runs.
pipeline = (
    Pipeline(read_file)
    | Pipeline(clean_text)
    | Pipeline(tokenize)
    | Pipeline(lambda tokens: remove_stop_words(tokens, stop_words))
    | Pipeline(count_frequencies)
    | Pipeline(lambda counter: get_top_n_frequencies(counter, n=10))
)

# Run the pipeline and print the result
top_n_word_frequencies = pipeline.process(testfilepath)
print(top_n_word_frequencies)
|
|
||||||
|
Loading…
Reference in new issue