import re
from collections import Counter
from functools import partial
from cppy.cp_util import *


class Pipeline:
    """Wraps a single-argument function so stages can be chained with `|`."""

    def __init__(self, function):
        self.function = function

    def __or__(self, other):
        # Accept either another Pipeline or a plain callable, and compose:
        # the output of this stage becomes the input of the next.
        if isinstance(other, Pipeline):
            return Pipeline(lambda x: other.function(self.function(x)))
        elif callable(other):
            return Pipeline(lambda x: other(self.function(x)))
        else:
            raise TypeError("The right operand must be a Pipeline or a callable")

    def process(self, data):
        return self.function(data)


# Processing functions
def read_file(path):
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()

def clean_text(text):
    # Strip punctuation and normalize case.
    return re.sub(r'[^\w\s]', '', text).lower()

def tokenize(text):
    return re.findall(r'\b\w+\b', text)

def remove_stop_words(tokens, stop_words):
    return [token for token in tokens if token not in stop_words]

def count_frequencies(tokens):
    return Counter(tokens)

def get_top_n_frequencies(counter, n):
    return counter.most_common(n)

# Stop-word list
stop_words = {
    'the', 'and', 'a', 'to', 'of', 'in', 'for', 'on', 'is', 'it',
    'with', 'that', 'as', 'by', 'this', 'at', 'be', 'which', 'from',
    'or', 'are', 'an', 'but', 'not', 'you', 'have', 'your', 'can',
    'will', 'all', 'any', 'if', 'their', 'would', 'what', 'there',
    'when', 'who', 'whom', 'whose', 'where', 'why',
}

# Build the pipeline. Multi-argument stages are bound with
# functools.partial so that every stage takes exactly one input.
pipeline = (Pipeline(read_file)
            | clean_text
            | tokenize
            | partial(remove_stop_words, stop_words=stop_words)
            | count_frequencies
            | partial(get_top_n_frequencies, n=10))

# Run the pipeline and print the result
top_n_word_frequencies = pipeline.process(testfilepath)
print(top_n_word_frequencies)