You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

65 lines
1.7 KiB

from cppy.cp_util import *
from collections import Counter
from heapq import nlargest
import re
class Pipeline:
    """Base class for a processing stage that can be chained with ``|``.

    Subclasses override :meth:`process`. ``a | b`` builds a new stage whose
    ``process`` runs ``a`` and hands its result to ``b``.
    """

    def __init__(self):
        pass

    def __or__(self, other):
        """Return a stage that runs this stage first and ``other`` second.

        Returns ``NotImplemented`` when ``other`` has no callable ``process``
        attribute, so a bad operand fails at composition time with the
        interpreter's usual ``TypeError`` instead of later, mid-run.
        """
        if not callable(getattr(other, "process", None)):
            return NotImplemented
        return _PipelineComposition(self, other)

    def process(self, data):
        """Transform ``data``; the base implementation always raises.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError


class _PipelineComposition(Pipeline):
    """Two stages run back to back; built by ``Pipeline.__or__``."""

    def __init__(self, first, second):
        super().__init__()
        self.first = first
        self.second = second

    def process(self, *args, **kwargs):
        """Run ``first`` with the given arguments, then feed ``second``.

        The arguments are forwarded untouched so that a chain whose first
        stage needs no input can be started with a bare ``process()`` call,
        while ``process(data)`` and ``process(data=...)`` keep working for
        chains whose first stage transforms data.
        """
        return self.second.process(self.first.process(*args, **kwargs))
class FileReader(Pipeline):
    """Stage that produces the full text of a file."""

    def __init__(self, filename):
        """Remember ``filename``; the file is not opened until ``process``."""
        super().__init__()
        self.filename = filename

    def process(self, data=None):
        """Read the file as UTF-8 and return its content as one string.

        Args:
            data: Ignored. Accepted only so this stage can be called through
                the same ``process(data)`` signature the ``Pipeline`` base
                class declares; calling ``process()`` with no argument
                still works.

        Raises:
            OSError: if the file cannot be opened.
            UnicodeDecodeError: if the content is not valid UTF-8.
        """
        with open(self.filename, 'r', encoding='utf-8') as file:
            return file.read()
class WordFrequencyCounter(Pipeline):
    """Stage that turns text into a ``Counter`` of its lower-cased words."""

    def process(self, text):
        """Return how often each ``\\w+`` run occurs in ``text.lower()``."""
        lowered = text.lower()
        return Counter(hit.group() for hit in re.finditer(r'\w+', lowered))
class TopNFilter(Pipeline):
    """Stage that keeps the ``n`` entries with the highest counts."""

    def __init__(self, n):
        super().__init__()
        self.n = n

    def process(self, word_freq):
        """Return up to ``n`` ``(word, count)`` pairs, highest count first.

        ``word_freq`` must provide ``items()`` yielding ``(word, count)``
        pairs.
        """
        def count_of(pair):
            # Rank each (word, count) pair by its count.
            return pair[1]

        pairs = word_freq.items()
        return nlargest(self.n, pairs, key=count_of)
# Assumes `testfilepath` (expected to come from the star import of
# cppy.cp_util) is the path of the text file to analyse -- TODO confirm.
filename = testfilepath
n = 5  # report the five most frequent words

# Build the stages. FileReader.process() needs no input, so the reader is
# invoked directly and its text is handed to the composed counting and
# filtering stages.
reader = FileReader(filename)
pipeline = WordFrequencyCounter() | TopNFilter(n)

# Run: read the text, count the words, keep the n most frequent.
top_n_words = pipeline.process(reader.process())

# Print the result, one "word: count" line per entry.
for word, freq in top_n_words:
    print(f"{word}: {freq}")