import re
from collections import Counter
from functools import partial
from cppy.cp_util import *


class Pipeline:
    """Wraps a single-argument function so stages can be chained with `|`."""

    def __init__(self, function):
        self.function = function

    def __or__(self, other):
        # Accept either another Pipeline or a plain callable, and compose:
        # the output of this stage becomes the input of the next.
        if isinstance(other, Pipeline):
            return Pipeline(lambda x: other.function(self.function(x)))
        elif callable(other):
            return Pipeline(lambda x: other(self.function(x)))
        else:
            raise TypeError("The right operand must be a Pipeline or a callable")

    def process(self, data):
        return self.function(data)


# Processing functions
def read_file(path):
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()

def clean_text(text):
    # Strip punctuation and normalize case.
    return re.sub(r'[^\w\s]', '', text).lower()

def tokenize(text):
    return re.findall(r'\b\w+\b', text)

def remove_stop_words(tokens, stop_words):
    return [token for token in tokens if token not in stop_words]

def count_frequencies(tokens):
    return Counter(tokens)

def get_top_n_frequencies(counter, n):
    return counter.most_common(n)

# Stop-word list
stop_words = {
    'the', 'and', 'a', 'to', 'of', 'in', 'for', 'on', 'is', 'it',
    'with', 'that', 'as', 'by', 'this', 'at', 'be', 'which', 'from',
    'or', 'are', 'an', 'but', 'not', 'you', 'have', 'your', 'can',
    'will', 'all', 'any', 'if', 'their', 'would', 'what', 'there',
    'when', 'who', 'whom', 'whose', 'where', 'why',
}

# Build the pipeline. Multi-argument stages are bound with
# functools.partial so that every stage takes exactly one input.
pipeline = (Pipeline(read_file)
            | clean_text
            | tokenize
            | partial(remove_stop_words, stop_words=stop_words)
            | count_frequencies
            | partial(get_top_n_frequencies, n=10))

# Run the pipeline and print the result
top_n_word_frequencies = pipeline.process(testfilepath)
print(top_n_word_frequencies)