forked from p46318075/CodePattern
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
49 lines
1.6 KiB
49 lines
1.6 KiB
9 months ago
|
import re
from collections import Counter
from functools import reduce
from typing import Any, Callable, Container, Iterable, List, Tuple, Union

from cppy.cp_util import *
|
||
|
|
||
|
class Pipeline:
    """Composable wrapper around a single-argument callable.

    Stages are chained left to right with ``|``:
    ``(Pipeline(f) | g).process(x)`` evaluates ``g(f(x))``.
    """

    def __init__(self, function: Callable[[Any], Any]) -> None:
        """Store *function* as the processing step of this pipeline."""
        self.function = function

    def __or__(self, other: Union["Pipeline", Callable[[Any], Any]]) -> "Pipeline":
        """Return a new pipeline that runs this stage, then *other*.

        Args:
            other: The next stage, either another ``Pipeline`` or any
                plain callable that takes one argument.

        Returns:
            A new ``Pipeline`` that feeds the output of this stage into
            *other*.

        Raises:
            TypeError: If *other* is neither a ``Pipeline`` nor callable.
        """
        # Compose the stages; a boolean ``or`` here would skip the second
        # stage whenever the first one returned a truthy value.
        if isinstance(other, Pipeline):
            return Pipeline(lambda x: other.function(self.function(x)))
        if callable(other):
            return Pipeline(lambda x: other(self.function(x)))
        raise TypeError(
            "The argument must be an instance of Pipeline or a callable"
        )

    def process(self, data: Any) -> Any:
        """Run *data* through the wrapped function and return the result."""
        return self.function(data)
|
||
|
|
||
|
# Define the processing functions
|
||
|
def read_file(path) -> str:
    """Return the entire contents of the file at *path* as text.

    Args:
        path: Location of the file; passed unchanged to ``open()``.

    Returns:
        The file contents decoded as UTF-8.
    """
    # The context manager closes the handle even if reading fails.
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()
|
||
|
|
||
|
def clean_text(text: str) -> str:
    """Remove punctuation from *text* and convert it to lower case.

    Every character that is neither a regex word character nor
    whitespace is deleted; underscores count as word characters and are
    therefore kept.

    Args:
        text: The raw input string.

    Returns:
        The lower-cased string with those characters removed.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower()
|
||
|
|
||
|
def tokenize(text: str) -> List[str]:
    """Split *text* into word tokens.

    Args:
        text: The string to tokenize.

    Returns:
        Every maximal run of regex word characters found in *text*, in
        order of appearance.
    """
    return re.findall(r'\b\w+\b', text)
|
||
|
|
||
|
def remove_stop_words(tokens: Iterable[str], stop_words: Container[str]) -> List[str]:
    """Return the tokens that are not stop words, preserving their order.

    Args:
        tokens: The tokens to filter.
        stop_words: Collection of words to drop; membership is tested
            with ``in``.

    Returns:
        A new list holding every token not found in *stop_words*.
    """
    return [token for token in tokens if token not in stop_words]
|
||
|
|
||
|
def count_frequencies(tokens: Iterable[Any]) -> Counter:
    """Count how many times each token occurs.

    Args:
        tokens: The tokens to tally.

    Returns:
        A ``Counter`` built from *tokens*.
    """
    return Counter(tokens)
|
||
|
|
||
|
def get_top_n_frequencies(counter: Counter, n: int) -> List[Tuple[Any, int]]:
    """Return the *n* most common entries of *counter*.

    Args:
        counter: The frequency table to query.
        n: How many entries to return.

    Returns:
        Whatever ``counter.most_common(n)`` yields: a list of
        ``(item, count)`` pairs, most frequent first.
    """
    return counter.most_common(n)
|
||
|
|
||
|
# Define the stop-word list (a set, so membership tests are O(1)).
stop_words = {
    'the', 'and', 'a', 'to', 'of', 'in', 'for', 'on', 'is', 'it',
    'with', 'that', 'as', 'by', 'this', 'at', 'be', 'which', 'from', 'or',
    'are', 'an', 'but', 'not', 'you', 'have', 'your', 'can', 'will', 'all',
    'any', 'if', 'their', 'would', 'what', 'there', 'when', 'who', 'whom',
    'whose', 'where', 'why',
}
|
||
|
|
||
|
# Create the pipeline: read -> clean -> tokenize -> drop stop words
# -> count -> keep the 10 most common words.
# Each stage is wrapped in Pipeline explicitly, and stages that need more
# than one argument are bound with a lambda so that every stage takes
# exactly one input.
pipeline = (
    Pipeline(read_file)
    | Pipeline(clean_text)
    | Pipeline(tokenize)
    | Pipeline(lambda tokens: remove_stop_words(tokens, stop_words))
    | Pipeline(count_frequencies)
    | Pipeline(lambda counter: get_top_n_frequencies(counter, 10))
)
|
||
|
|
||
|
# Run the pipeline and print the result.
# NOTE(review): testfilepath is not defined in this file; presumably it is
# supplied by the star import from cppy.cp_util - confirm.
top_n_word_frequencies = pipeline.process(testfilepath)
print(top_n_word_frequencies)
|