You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

49 lines
1.6 KiB

import re
from collections import Counter
from functools import reduce
from cppy.cp_util import *
class Pipeline:
    """Composable wrapper around a one-argument callable.

    Stages are chained with ``|``. The combined pipeline passes the output of
    the left-hand stage to the right-hand stage as its input.
    """

    def __init__(self, function):
        """Store *function*, the callable this pipeline applies to its input."""
        self.function = function

    def __or__(self, other):
        """Return a new Pipeline that runs this stage and then *other*.

        Args:
            other: A Pipeline, or any callable taking a single argument.

        Returns:
            A Pipeline computing ``other(self(x))``.

        Raises:
            TypeError: If *other* is neither a Pipeline nor a callable.
        """
        if isinstance(other, Pipeline):
            # Chain the stages: a boolean `or` here would skip the second
            # stage whenever the first one returned a truthy value.
            return Pipeline(lambda x: other.function(self.function(x)))
        if callable(other):
            return Pipeline(lambda x: other(self.function(x)))
        raise TypeError("The argument must be a Pipeline or a callable")

    def process(self, data):
        """Run the pipeline on *data* and return the result."""
        return self.function(data)
# Define the processing functions
def read_file(path):
    """Return the full contents of the UTF-8 text file at *path*."""
    with open(path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
def clean_text(text):
    """Remove every character that is neither a word character nor
    whitespace from *text*, then lower-case the result."""
    punctuation = re.compile(r'[^\w\s]')
    without_punctuation = punctuation.sub('', text)
    return without_punctuation.lower()
def tokenize(text):
    """Return the list of word tokens found in *text*, in order of appearance."""
    matches = re.finditer(r'\b\w+\b', text)
    return [match.group(0) for match in matches]
def remove_stop_words(tokens, stop_words):
    """Return a list of the items in *tokens* that are not in *stop_words*.

    The relative order of the remaining tokens is preserved.
    """
    return list(filter(lambda word: word not in stop_words, tokens))
def count_frequencies(tokens):
    """Return a Counter mapping each item in *tokens* to its occurrence count."""
    frequencies = Counter()
    frequencies.update(tokens)
    return frequencies
def get_top_n_frequencies(counter, n):
    """Return ``counter.most_common(n)``: the *n* highest-count entries of
    *counter* as ``(item, count)`` pairs, most frequent first."""
    ranked = counter.most_common(n)
    return ranked
# Stop words that are dropped before frequencies are counted.
stop_words = {
    'the', 'and', 'a', 'to', 'of', 'in', 'for', 'on', 'is', 'it', 'with',
    'that', 'as', 'by', 'this', 'at', 'be', 'which', 'from', 'or', 'are',
    'an', 'but', 'not', 'you', 'have', 'your', 'can', 'will', 'all', 'any',
    'if', 'their', 'would', 'what', 'there', 'when', 'who', 'whom', 'whose',
    'where', 'why',
}

# Build the pipeline. Every stage is wrapped in Pipeline so that `|` accepts
# it, and stages that need extra arguments are adapted to one-argument
# callables instead of being called while the pipeline is being built.
pipeline = (
    Pipeline(read_file)
    | Pipeline(clean_text)
    | Pipeline(tokenize)
    | Pipeline(lambda tokens: remove_stop_words(tokens, stop_words))
    | Pipeline(count_frequencies)
    | Pipeline(lambda counter: get_top_n_frequencies(counter, n=10))
)

# Run the pipeline and print the result.
# NOTE(review): testfilepath is not defined in this file; presumably it comes
# from the star import of cppy.cp_util -- confirm.
top_n_word_frequencies = pipeline.process(testfilepath)
print(top_n_word_frequencies)