You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
65 lines
1.7 KiB
65 lines
1.7 KiB
9 months ago
|
from cppy.cp_util import *
|
||
|
from collections import Counter
|
||
|
from heapq import nlargest
|
||
|
import re
|
||
|
|
||
|
|
||
|
class Pipeline:
    """Base class for a processing stage that can be chained with ``|``.

    ``a | b`` returns a composite stage whose ``process`` passes the result
    of ``a.process(...)`` to ``b.process(...)``. Concrete stages override
    ``process``.
    """

    # Marks "process() was called without any input". None cannot play this
    # role because None is a legitimate value to push through a pipeline.
    _NO_INPUT = object()

    def __init__(self):
        pass

    def __or__(self, other):
        """Return a new stage that runs ``self`` first and then ``other``."""
        no_input = Pipeline._NO_INPUT

        class PipelineComposition(Pipeline):
            """Two stages executed back to back."""

            def __init__(self, first, second):
                super().__init__()
                self.first = first
                self.second = second

            def process(self, data=no_input):
                """Run ``first`` and feed its result to ``second``.

                When called without ``data``, ``first.process()`` is also
                called without arguments, so a chain may begin with a
                source stage that takes no input. Otherwise ``data`` is
                handed to ``first`` unchanged.
                """
                if data is no_input:
                    intermediate = self.first.process()
                else:
                    intermediate = self.first.process(data)
                return self.second.process(intermediate)

        return PipelineComposition(self, other)

    def process(self, data):
        """Transform ``data``; subclasses must override this method.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError
|
||
|
|
||
|
|
||
|
class FileReader(Pipeline):
    """Source stage that produces the full text of a file."""

    def __init__(self, filename):
        """Remember ``filename``; the file is not opened until ``process``."""
        super().__init__()
        self.filename = filename

    def process(self, data=None):
        """Return the entire contents of ``self.filename`` decoded as UTF-8.

        ``data`` is ignored. It is accepted only so that this stage can be
        called with the same signature as ``Pipeline.process``; calling
        ``process()`` with no argument keeps working.

        Raises:
            OSError: if the file cannot be opened.
            UnicodeDecodeError: if the file is not valid UTF-8.
        """
        with open(self.filename, 'r', encoding='utf-8') as file:
            return file.read()
|
||
|
|
||
|
|
||
|
class WordFrequencyCounter(Pipeline):
    """Stage that turns a text into a word -> occurrence-count mapping."""

    def process(self, text):
        """Return a ``Counter`` of the lower-cased ``\\w+`` runs in ``text``."""
        lowered = text.lower()
        # Stream the matches straight into the Counter instead of building
        # an intermediate list of words first.
        return Counter(match.group() for match in re.finditer(r'\w+', lowered))
|
||
|
|
||
|
|
||
|
class TopNFilter(Pipeline):
    """Stage that keeps only the ``n`` entries with the highest counts."""

    def __init__(self, n):
        super().__init__()
        self.n = n

    def process(self, word_freq):
        """Return up to ``self.n`` ``(word, count)`` pairs, largest count first.

        ``word_freq`` must provide ``items()`` yielding ``(word, count)``
        pairs, as a ``dict`` or ``Counter`` does.
        """
        def count_of(pair):
            # Rank each (word, count) pair by its count.
            return pair[1]

        pairs = word_freq.items()
        return nlargest(self.n, pairs, key=count_of)
|
||
|
|
||
|
|
||
|
# Assume there is a text file "text.txt" whose content is the text to analyse.
# NOTE(review): testfilepath is presumably provided by the star import from
# cppy.cp_util -- confirm.
filename = testfilepath
n = 5  # report the 5 most frequent words

# Build the pipeline step by step: read the file and count its words,
# then keep only the n most frequent ones.
pipeline = FileReader(filename) | WordFrequencyCounter()
pipeline = pipeline | TopNFilter(n)

# Run the pipeline
top_n_words = pipeline.process()

# Print the result, one "word: count" line per entry
for word, freq in top_n_words:
    print(f"{word}: {freq}")
|