parent
2518a5cd85
commit
950cb41e08
@ -0,0 +1,65 @@
|
|||||||
|
from cppy.cp_util import *
|
||||||
|
from collections import Counter
|
||||||
|
from heapq import nlargest
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
class Pipeline:
    """Base class for a processing stage that can be chained with ``|``.

    Subclasses override :meth:`process`. The expression ``a | b`` builds a
    composite stage that runs ``a`` and feeds its result to ``b``.
    """

    def __init__(self):
        pass

    def __or__(self, other):
        """Return a composite stage that runs ``self`` and then ``other``."""
        return PipelineComposition(self, other)

    def process(self, data):
        """Transform ``data``; concrete stages must override this method.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError


class PipelineComposition(Pipeline):
    """Stage made of two stages run back to back.

    Defined once at module level rather than inside ``Pipeline.__or__`` so
    that a new class object is not created for every ``|`` expression.
    """

    # Sentinel meaning "process() was called without input". ``None`` is not
    # used because it could be a legitimate value passed between stages.
    _NO_INPUT = object()

    def __init__(self, first, second):
        super().__init__()
        self.first = first
        self.second = second

    def process(self, data=_NO_INPUT):
        """Run ``first`` and pass its result to ``second``.

        Args:
            data: Input for the first stage. When omitted, the first stage is
                called with no argument, so a chain may begin with a source
                stage whose ``process`` takes no input.

        Returns:
            Whatever ``second.process`` returns.
        """
        if data is self._NO_INPUT:
            intermediate = self.first.process()
        else:
            intermediate = self.first.process(data)
        return self.second.process(intermediate)
|
||||||
|
|
||||||
|
|
||||||
|
class FileReader(Pipeline):
    """Source stage that returns the whole content of a text file."""

    def __init__(self, filename):
        super().__init__()
        self.filename = filename

    def process(self, data=None):
        """Read and return the file's text, decoded as UTF-8.

        Args:
            data: Ignored. Accepted only so the signature is compatible with
                ``Pipeline.process(data)`` when a composite stage forwards
                an input to this stage.

        Returns:
            The full content of ``self.filename`` as a string.
        """
        with open(self.filename, 'r', encoding='utf-8') as file:
            return file.read()
|
||||||
|
|
||||||
|
|
||||||
|
class WordFrequencyCounter(Pipeline):
    """Stage that counts how often each word occurs in a text."""

    def process(self, text):
        """Return a ``Counter`` of the lower-cased words found in ``text``.

        A word is any maximal run of regex word characters.
        """
        lowered = text.lower()
        return Counter(re.findall(r'\w+', lowered))
|
||||||
|
|
||||||
|
|
||||||
|
class TopNFilter(Pipeline):
    """Stage that keeps only the ``n`` most frequent entries."""

    def __init__(self, n):
        super().__init__()
        self.n = n

    def process(self, word_freq):
        """Return the ``n`` ``(word, count)`` pairs with the largest counts.

        Args:
            word_freq: Mapping from word to count; only ``items()`` is used.

        Returns:
            A list of ``(word, count)`` tuples, highest count first.
        """
        def count_of(pair):
            return pair[1]

        pairs = word_freq.items()
        return nlargest(self.n, pairs, key=count_of)
|
||||||
|
|
||||||
|
|
||||||
|
# Assumes a text file (e.g. "text.txt") whose content is the text to analyse.
filename = testfilepath
n = 5  # report the 5 most frequent words

# Build the pipeline stage by stage, then chain the stages with "|".
reader = FileReader(filename)
counter = WordFrequencyCounter()
top_filter = TopNFilter(n)
pipeline = reader | counter | top_filter

# Run the pipeline.
# NOTE(review): process() is called without input here, so the composite
# stage must accept a call with no argument -- confirm against Pipeline.
top_n_words = pipeline.process()

# Print the results, one "word: count" pair per line.
for word, freq in top_n_words:
    print(f"{word}: {freq}")
|
@ -0,0 +1,35 @@
|
|||||||
|
from cppy.cp_util import *
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
class Pipe:
    """Wrap a function as a pipeline stage that can be chained with ``|``.

    ``Pipe(f, *args, kwargs={...})`` is a callable stage: calling it with
    ``data`` evaluates ``f(data, *args, **kwargs)``. ``a | b`` yields a new
    ``Pipe`` that, when called, runs ``a`` and passes its result to ``b``.
    """

    def __init__(self, func, *args, kwargs=None):
        """Store the function and the extra arguments to call it with.

        Args:
            func: Callable taking the piped value as its first argument.
            *args: Extra positional arguments appended after the piped value.
            kwargs: Optional dict of keyword arguments for ``func``.
        """
        self.func = func
        # Always set both attributes so __call__ never hits AttributeError
        # when no extra arguments were supplied.
        self.args = args
        self.kwargs = dict(kwargs) if kwargs else {}

    def __or__(self, other):
        """Return a stage that runs ``self`` and then ``other``.

        Composition is lazy: nothing runs until the resulting stage is
        called. ``other`` may be a ``Pipe`` or any one-argument callable.
        """
        return Pipe(lambda data: other(self(data)))

    def __call__(self, data):
        """Apply the wrapped function to ``data`` and return the result.

        The result is also kept in ``self._value``.
        """
        self._value = self.func(data, *self.args, **self.kwargs)
        return self._value
|
||||||
|
|
||||||
|
def read_file(filename):
    """Return the whole content of the text file ``filename``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read()
|
||||||
|
|
||||||
|
def split_words(text):
    """Return the list of lower-cased words found in ``text``."""
    lowered = text.lower()
    return re.findall(r'\b\w+\b', lowered)
|
||||||
|
|
||||||
|
def count_words(words):
    """Return a ``Counter`` with the number of occurrences of each word."""
    tally = Counter()
    tally.update(words)
    return tally
|
||||||
|
|
||||||
|
def top_n_words(word_counts, n):
    """Return the ``n`` most common ``(word, count)`` pairs of ``word_counts``.

    ``word_counts`` must provide ``most_common`` (e.g. a ``Counter``).
    """
    ranked = word_counts.most_common(n)
    return ranked
|
||||||
|
|
||||||
|
|
||||||
|
# Use the pipeline: chain the stages one at a time, then run it on the file.
# NOTE(review): extract_file_words, get_frequencies, sort_dict,
# print_word_freqs and testfilepath are not defined in this file; presumably
# they come from the cppy.cp_util star import -- confirm.
pipe = Pipe(extract_file_words)
pipe = pipe | Pipe(get_frequencies)
pipe = pipe | Pipe(sort_dict)
pipe = pipe | Pipe(print_word_freqs, 5)
result = pipe(testfilepath)
print(result)
|
@ -0,0 +1,49 @@
|
|||||||
|
import re
|
||||||
|
from collections import Counter
|
||||||
|
from functools import reduce
|
||||||
|
from cppy.cp_util import *
|
||||||
|
|
||||||
|
class Pipeline:
    """Wrap a one-argument function so stages can be chained with ``|``."""

    def __init__(self, function):
        self.function = function

    def __or__(self, other):
        """Return a pipeline that applies ``self`` and then ``other``.

        Args:
            other: A ``Pipeline`` or any callable taking one argument.

        Returns:
            A new ``Pipeline`` computing ``other(self(x))``.

        Raises:
            TypeError: if ``other`` is neither a ``Pipeline`` nor callable.
        """
        if isinstance(other, Pipeline):
            next_function = other.function
        elif callable(other):
            next_function = other
        else:
            raise TypeError(
                "The argument must be an instance of Pipeline or a callable"
            )
        first_function = self.function
        # Compose the two functions. A boolean ``or`` here would skip the
        # second stage whenever the first returned a truthy value, and would
        # never feed the first stage's result into the second.
        return Pipeline(lambda x: next_function(first_function(x)))

    def process(self, data):
        """Run the wrapped function on ``data`` and return its result."""
        return self.function(data)
|
||||||
|
|
||||||
|
# 定义处理函数
|
||||||
|
def read_file(path):
    """Return the entire content of the UTF-8 text file at ``path``."""
    with open(path, mode='r', encoding='utf-8') as handle:
        text = handle.read()
    return text
|
||||||
|
|
||||||
|
def clean_text(text):
    """Drop every character that is neither a word character nor whitespace,
    then lower-case the result."""
    stripped = re.sub(r'[^\w\s]', '', text)
    return stripped.lower()
|
||||||
|
|
||||||
|
def tokenize(text):
    """Return the words of ``text`` in order of appearance."""
    return [match.group() for match in re.finditer(r'\b\w+\b', text)]
|
||||||
|
|
||||||
|
def remove_stop_words(tokens, stop_words):
    """Return the tokens that are not in ``stop_words``, in original order."""
    kept = []
    for candidate in tokens:
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept
|
||||||
|
|
||||||
|
def count_frequencies(tokens):
    """Return a ``Counter`` mapping each token to its number of occurrences."""
    frequencies = Counter()
    frequencies.update(tokens)
    return frequencies
|
||||||
|
|
||||||
|
def get_top_n_frequencies(counter, n):
    """Return the ``n`` most common ``(token, count)`` pairs of ``counter``.

    ``counter`` must provide ``most_common`` (e.g. a ``Counter``).
    """
    most_frequent = counter.most_common(n)
    return most_frequent
|
||||||
|
|
||||||
|
# Stop words excluded from the frequency count.
stop_words = {
    'the', 'and', 'a', 'to', 'of', 'in', 'for', 'on', 'is', 'it', 'with',
    'that', 'as', 'by', 'this', 'at', 'be', 'which', 'from', 'or', 'are',
    'an', 'but', 'not', 'you', 'have', 'your', 'can', 'will', 'all', 'any',
    'if', 'their', 'would', 'what', 'there', 'when', 'who', 'whom', 'whose',
    'where', 'why',
}

# Build the pipeline. Every stage is wrapped in Pipeline, and stages that
# need an extra argument bind it in a lambda: calling
# get_top_n_frequencies(n=10) directly would run it immediately without its
# counter argument, and remove_stop_words would never receive stop_words.
pipeline = (
    Pipeline(read_file)
    | Pipeline(clean_text)
    | Pipeline(tokenize)
    | Pipeline(lambda tokens: remove_stop_words(tokens, stop_words))
    | Pipeline(count_frequencies)
    | Pipeline(lambda counter: get_top_n_frequencies(counter, n=10))
)

# Run the pipeline and print the result.
top_n_word_frequencies = pipeline.process(testfilepath)
print(top_n_word_frequencies)
|
@ -1,45 +1,20 @@
|
|||||||
from cppy.cp_util import *
|
from cppy.cp_util import *
|
||||||
|
|
||||||
###########################################
|
#
|
||||||
# 生成器
|
# 生成器
|
||||||
###########################################
|
#
|
||||||
def non_stop_words(testfilepath):
    """Yield, in order, every word of the file that is not a stop word.

    NOTE(review): get_stopwords, read_file and re_split are not defined in
    this file; presumably they come from the cppy.cp_util star import --
    confirm their exact behaviour there.
    """
    excluded = get_stopwords()
    text = read_file(testfilepath)
    for candidate in re_split(text):
        if candidate in excluded:
            continue
        yield candidate  # emit one word that is not a stop word
|
||||||
|
|
||||||
|
|
||||||
# Tally how many times each non-stop word is produced by the generator.
freqs = {}
for word in non_stop_words(testfilepath):
    if word in freqs:
        freqs[word] += 1
    else:
        freqs[word] = 1

# Sort the tallies and print them using the shared helpers.
data = sort_dict(freqs)
print_word_freqs(data)
|
||||||
|
|
Loading…
Reference in new issue