parent
2518a5cd85
commit
950cb41e08
@ -0,0 +1,65 @@
|
||||
from cppy.cp_util import *
|
||||
from collections import Counter
|
||||
from heapq import nlargest
|
||||
import re
|
||||
|
||||
|
||||
class Pipeline:
    """Base class for a processing stage that can be chained with ``|``.

    Subclasses override :meth:`process`. ``a | b`` returns a new stage whose
    ``process`` runs ``a`` first and hands its result to ``b``.
    """

    def __init__(self):
        pass

    def __or__(self, other):
        """Return a stage that runs ``self`` and then ``other``."""
        return _PipelineComposition(self, other)

    def process(self, data):
        """Transform ``data``; concrete stages must override this.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError


class _PipelineComposition(Pipeline):
    """Two stages run back to back (the result of ``first | second``)."""

    def __init__(self, first, second):
        super().__init__()
        self.first = first
        self.second = second

    def process(self, *args):
        """Run ``first`` with the caller's arguments, then ``second`` on its result."""
        # Forward the arguments unchanged instead of demanding exactly one:
        # a chain that starts with a source stage whose ``process`` takes no
        # input can then be started with ``pipeline.process()``, while
        # ``pipeline.process(data)`` keeps working for ordinary stages.
        return self.second.process(self.first.process(*args))
|
||||
|
||||
|
||||
class FileReader(Pipeline):
    """Source stage that reads a whole text file decoded as UTF-8."""

    def __init__(self, filename):
        super().__init__()
        self.filename = filename

    def process(self, data=None):
        """Return the full contents of ``self.filename``.

        Args:
            data: Ignored. Accepted only so the signature is compatible with
                ``Pipeline.process(data)``; calling with no argument still
                works.

        Returns:
            The file contents as a single string.
        """
        with open(self.filename, 'r', encoding='utf-8') as file:
            return file.read()
|
||||
|
||||
|
||||
class WordFrequencyCounter(Pipeline):
    """Stage that turns text into a ``Counter`` of its lower-cased words."""

    def process(self, text):
        """Count every run of word characters in ``text``, ignoring case."""
        lowered = text.lower()
        return Counter(re.findall(r'\w+', lowered))
|
||||
|
||||
|
||||
class TopNFilter(Pipeline):
    """Stage that keeps the ``n`` entries with the largest counts."""

    def __init__(self, n):
        super().__init__()
        self.n = n

    def process(self, word_freq):
        """Return up to ``self.n`` ``(word, count)`` pairs, largest count first."""
        def count_of(pair):
            return pair[1]

        pairs = word_freq.items()
        return nlargest(self.n, pairs, key=count_of)
|
||||
|
||||
|
||||
# The input is a text file whose content is to be analysed.
# NOTE(review): `testfilepath` is not defined in this file; presumably it is
# provided by the `cppy.cp_util` star import -- TODO confirm.
filename = testfilepath
n = 5 # report the 5 most frequent words

# Build the pipeline: read the file, count words, keep the top n.
pipeline = FileReader(filename) | WordFrequencyCounter() | TopNFilter(n)

# Run the pipeline.
top_n_words = pipeline.process()

# Print one "word: count" line per result.
for word, freq in top_n_words:
    print(f"{word}: {freq}")
|
@ -0,0 +1,35 @@
|
||||
from cppy.cp_util import *
|
||||
from collections import Counter
|
||||
|
||||
class Pipe:
    """Wrap a function so that calls can be chained left-to-right with ``|``.

    Calling ``Pipe(f, *args, kwargs={...})`` with ``data`` evaluates
    ``f(data, *args, **kwargs)``. ``a | b`` builds a new ``Pipe`` that feeds
    the result of ``a`` into ``b``; nothing runs until that pipe is called.
    """

    def __init__(self, func, *args, kwargs=None):
        self.func = func
        # Always set both attributes (possibly empty) so __call__ can rely on
        # them; leaving them unset raised AttributeError for plain Pipe(f).
        self.args = args
        self.kwargs = dict(kwargs) if kwargs else {}

    def __or__(self, other):
        """Return a ``Pipe`` that runs ``self`` and passes the result to ``other``."""
        # Compose lazily: at this point no input exists yet, so there is no
        # stored value to hand over.
        def chained(data):
            return other(self(data))

        return Pipe(chained)

    def __call__(self, data):
        """Apply the wrapped function to ``data`` and return its result."""
        # The last result is also kept on the instance as ``_value``.
        self._value = self.func(data, *self.args, **self.kwargs)
        return self._value
|
||||
|
||||
def read_file(filename):
    """Return the entire contents of ``filename`` decoded as UTF-8."""
    # Pin the encoding: without it, open() uses the platform's locale
    # encoding, so the same file can decode differently or fail elsewhere.
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read()
|
||||
|
||||
def split_words(text):
    """Return the lower-cased words of ``text`` in order of appearance."""
    lowered = text.lower()
    return re.findall(r'\b\w+\b', lowered)
|
||||
|
||||
def count_words(words):
    """Tally how many times each item of ``words`` occurs."""
    tally = Counter()
    tally.update(words)
    return tally
|
||||
|
||||
def top_n_words(word_counts, n):
    """Return the ``n`` highest-count ``(word, count)`` pairs of a counter."""
    ranked = word_counts.most_common(n)
    return ranked
|
||||
|
||||
|
||||
# Use the pipe: chain the stages, run them on the test file, print the result.
# NOTE(review): extract_file_words, get_frequencies, sort_dict,
# print_word_freqs and testfilepath are not defined in this file; presumably
# they come from the `cppy.cp_util` star import -- TODO confirm.
pipe = Pipe(extract_file_words) | Pipe(get_frequencies) | Pipe(sort_dict) | Pipe(print_word_freqs, 5)
result = pipe(testfilepath)
print(result)
|
@ -0,0 +1,49 @@
|
||||
import re
|
||||
from collections import Counter
|
||||
from functools import reduce
|
||||
from cppy.cp_util import *
|
||||
|
||||
class Pipeline:
    """Wrap a one-argument function so stages can be chained with ``|``."""

    def __init__(self, function):
        self.function = function

    def __or__(self, other):
        """Return a pipeline that applies ``self`` and then ``other``.

        Args:
            other: Another ``Pipeline`` or any plain one-argument callable.

        Raises:
            TypeError: if ``other`` is neither a ``Pipeline`` nor callable.
        """
        # Feed the left stage's result into the right stage. A boolean `or`
        # here would return the left result whenever it is truthy and never
        # run the right stage at all.
        if isinstance(other, Pipeline):
            return Pipeline(lambda x: other.function(self.function(x)))
        if callable(other):
            return Pipeline(lambda x: other(self.function(x)))
        raise TypeError("The argument must be a Pipeline or a callable")

    def process(self, data):
        """Run the wrapped function on ``data`` and return its result."""
        return self.function(data)
|
||||
|
||||
# Processing functions used as pipeline stages
|
||||
def read_file(path):
    """Read the file at ``path`` as UTF-8 text and return all of it."""
    with open(path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
|
||||
|
||||
def clean_text(text):
    """Remove characters that are neither word characters nor whitespace, then lower-case."""
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower()
|
||||
|
||||
def tokenize(text):
    """Return the runs of word characters found in ``text``, in order."""
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(text)
|
||||
|
||||
def remove_stop_words(tokens, stop_words):
    """Return a list of the tokens that do not appear in ``stop_words``."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept
|
||||
|
||||
def count_frequencies(tokens):
    """Build a ``Counter`` mapping each token to its number of occurrences."""
    frequencies = Counter()
    frequencies.update(tokens)
    return frequencies
|
||||
|
||||
def get_top_n_frequencies(counter, n):
    """Return the ``n`` most common ``(item, count)`` pairs of ``counter``."""
    most_frequent = counter.most_common(n)
    return most_frequent
|
||||
|
||||
# Stop words to exclude from the frequency count.
stop_words = {
    'the', 'and', 'a', 'to', 'of', 'in', 'for', 'on', 'is', 'it',
    'with', 'that', 'as', 'by', 'this', 'at', 'be', 'which', 'from', 'or',
    'are', 'an', 'but', 'not', 'you', 'have', 'your', 'can', 'will', 'all',
    'any', 'if', 'their', 'would', 'what', 'there', 'when', 'who', 'whom',
    'whose', 'where', 'why',
}
|
||||
|
||||
# Build the pipeline.
# Every stage is wrapped in Pipeline and must be a one-argument callable, so
# the functions that need extra arguments are bound with a lambda instead of
# being called here (calling get_top_n_frequencies(n=10) directly raised
# TypeError while the pipeline was still being assembled).
pipeline = (Pipeline(read_file) | Pipeline(clean_text) | Pipeline(tokenize)
            | Pipeline(lambda tokens: remove_stop_words(tokens, stop_words))
            | Pipeline(count_frequencies)
            | Pipeline(lambda counter: get_top_n_frequencies(counter, n=10)))

# Run the pipeline and print the result.
# NOTE(review): `testfilepath` is not defined in this file; presumably it is
# provided by the `cppy.cp_util` star import -- TODO confirm.
top_n_word_frequencies = pipeline.process( testfilepath )
print(top_n_word_frequencies)
|
@ -1,45 +1,20 @@
|
||||
from cppy.cp_util import *
|
||||
|
||||
###########################################
|
||||
#
|
||||
# Generators
|
||||
###########################################
|
||||
def characters(filename):
    """Yield the characters of ``filename`` (decoded as UTF-8) one at a time."""
    # `with` closes the file when iteration finishes or the generator is
    # closed early; a bare open() in the for-header left that to the
    # garbage collector.
    with open(filename, encoding='utf-8') as source:
        for line in source:
            yield from line
|
||||
|
||||
|
||||
def all_words(filename):
    """Yield each lower-cased word of ``filename``, one at a time.

    A word is a maximal run of characters for which ``str.isalnum()`` is
    true; every other character acts as a separator.
    """
    letters = []
    for c in characters(filename):
        if c.isalnum():
            letters.append(c.lower())
        elif letters:
            # A separator ends the current word: emit it and start afresh.
            yield "".join(letters)
            letters = []
    # Emit the last word when the input ends without a trailing separator
    # (e.g. no final newline); otherwise it would be dropped.
    if letters:
        yield "".join(letters)
|
||||
|
||||
|
||||
def non_stop_words(filename, stopwords):
    """Yield each word from ``all_words(filename)`` that is not in ``stopwords``."""
    yield from (word for word in all_words(filename) if word not in stopwords)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
#
|
||||
def non_stop_words(testfilepath):
    """Yield, one at a time, the words of ``testfilepath`` that are not stop words.

    NOTE(review): get_stopwords, read_file and re_split are not defined in
    this file; presumably they come from the `cppy.cp_util` star import --
    TODO confirm.
    """
    stopwords = get_stopwords()
    data_str = read_file(testfilepath)
    wordlist = re_split( data_str )
    yield from (word for word in wordlist if word not in stopwords)
|
||||
|
||||
|
||||
freqs = {}
|
||||
for word in non_stop_words(testfilepath,stopwords):
|
||||
for word in non_stop_words(testfilepath):
|
||||
freqs[word] = freqs.get(word, 0) + 1
|
||||
|
||||
data = sort_dict(freqs)
|
||||
print_word_freqs(data)
|
||||
|
Loading…
Reference in new issue