# CodePattern/语言特性/尾调用_类方法/tf-26B.py

import re
from collections import Counter
from functools import reduce
from cppy.cp_util import *

class Pipeline:
    """Wrap a one-argument callable so that stages can be chained with ``|``.

    ``Pipeline(f) | Pipeline(g)`` produces a new pipeline whose
    ``process(x)`` returns ``g(f(x))``.
    """

    def __init__(self, function):
        """Store the callable this stage applies to its input."""
        self.function = function

    def __or__(self, other):
        """Return a new pipeline that runs this stage and then ``other``.

        Args:
            other: The next stage, either a ``Pipeline`` or a plain
                one-argument callable.

        Returns:
            A ``Pipeline`` that feeds this stage's output into ``other``.

        Raises:
            TypeError: If ``other`` is neither a ``Pipeline`` nor callable.
        """
        # Stages are composed, not combined with the boolean ``or``: a
        # short-circuiting ``or`` would return the first truthy result and
        # never pass it on to the following stage.
        if isinstance(other, Pipeline):
            return Pipeline(lambda x: other.function(self.function(x)))
        if callable(other):
            return Pipeline(lambda x: other(self.function(x)))
        raise TypeError(
            "The argument must be an instance of Pipeline or a callable"
        )

    def process(self, data):
        """Run ``data`` through the wrapped callable and return the result."""
        return self.function(data)

# Processing functions used as pipeline stages
def read_file(path):
    """Return the entire contents of the UTF-8 text file at ``path``."""
    with open(path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

def clean_text(text):
    """Drop characters that are neither word characters nor whitespace, then lowercase."""
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower()

def tokenize(text):
    """Return every maximal run of word characters in ``text``, in order."""
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(text)

def remove_stop_words(tokens, stop_words):
    """Return a list of the tokens, in order, that are not in ``stop_words``."""
    return list(filter(lambda word: word not in stop_words, tokens))

def count_frequencies(tokens):
    """Return a ``Counter`` mapping each token to its number of occurrences."""
    tally = Counter()
    tally.update(tokens)
    return tally

def get_top_n_frequencies(counter, n):
    """Return the result of ``counter.most_common(n)``."""
    ranked = counter.most_common(n)
    return ranked

# Stop words excluded from the frequency count.
stop_words = {
    'the', 'and', 'a', 'to', 'of', 'in', 'for', 'on', 'is', 'it', 'with',
    'that', 'as', 'by', 'this', 'at', 'be', 'which', 'from', 'or', 'are',
    'an', 'but', 'not', 'you', 'have', 'your', 'can', 'will', 'all', 'any',
    'if', 'their', 'would', 'what', 'there', 'when', 'who', 'whom', 'whose',
    'where', 'why',
}

# Build the pipeline. Every stage is wrapped in Pipeline explicitly, and the
# stages that need a second argument bind it through a lambda so each stage
# is a one-argument callable. Calling get_top_n_frequencies(n=10) directly
# here would raise TypeError (missing ``counter``) before anything runs.
pipeline = (Pipeline(read_file) | Pipeline(clean_text) | Pipeline(tokenize)
            | Pipeline(lambda tokens: remove_stop_words(tokens, stop_words))
            | Pipeline(count_frequencies)
            | Pipeline(lambda counter: get_top_n_frequencies(counter, n=10)))

# Run the pipeline and print the result.
# NOTE(review): testfilepath is not defined in this file; presumably it comes
# from the star import of cppy.cp_util — confirm.
top_n_word_frequencies = pipeline.process(testfilepath)
print(top_n_word_frequencies)
debug 9 months ago			`import re`
			`from collections import Counter`
			`from functools import reduce`
			`from cppy.cp_util import *`

			`class Pipeline:`
			`def __init__(self, function):`
			`self.function = function`

			`def __or__(self, other):`
			`if isinstance(other, Pipeline):`
			`return Pipeline(lambda x: self.function(x) or other.function(x))`
			`else:`
			`raise TypeError("The argument must be an instance of Pipeline")`

			`def process(self, data):`
			`return self.function(data)`

			`# 定义处理函数`
			`def read_file(path):`
			`with open(path, 'r', encoding='utf-8') as file:`
			`return file.read()`

			`def clean_text(text):`
			`return re.sub(r'[^\w\s]', '', text).lower()`

			`def tokenize(text):`
			`return re.findall(r'\b\w+\b', text)`

			`def remove_stop_words(tokens, stop_words):`
			`return [token for token in tokens if token not in stop_words]`

			`def count_frequencies(tokens):`
			`return Counter(tokens)`

			`def get_top_n_frequencies(counter, n):`
			`return counter.most_common(n)`

			`# 定义停用词列表`
			`stop_words = set(['the', 'and', 'a', 'to', 'of', 'in', 'for', 'on', 'is', 'it', 'with', 'that', 'as', 'by', 'this', 'at', 'be', 'which', 'from', 'or', 'are', 'an', 'but', 'not', 'you', 'have', 'your', 'can', 'will', 'all', 'any', 'if', 'their', 'would', 'what', 'there', 'when', 'which', 'who', 'whom', 'whose', 'where', 'why'])`

			`# 创建管道`
			`pipeline = (Pipeline(read_file) \| clean_text \| tokenize`
			`\| remove_stop_words \| count_frequencies`
			`\| get_top_n_frequencies(n=10))`

			`# 执行管道并打印结果`
			`top_n_word_frequencies = pipeline.process( testfilepath )`
			`print(top_n_word_frequencies)`