forked from p46318075/CodePattern

parent 2518a5cd85
commit 950cb41e08
@@ -0,0 +1,65 @@
from cppy.cp_util import *
from collections import Counter
from heapq import nlargest
import re


class Pipeline:
    def __init__(self):
        pass

    def __or__(self, other):
        class PipelineComposition(Pipeline):
            def __init__(self, first, second):
                self.first = first
                self.second = second

            def process(self, data=None):
                return self.second.process(self.first.process(data))

        return PipelineComposition(self, other)

    def process(self, data):
        raise NotImplementedError


class FileReader(Pipeline):
    def __init__(self, filename):
        super().__init__()
        self.filename = filename

    def process(self, data=None):  # source stage: ignores upstream data
        with open(self.filename, 'r', encoding='utf-8') as file:
            content = file.read()
        return content


class WordFrequencyCounter(Pipeline):
    def process(self, text):
        words = re.findall(r'\w+', text.lower())
        word_freq = Counter(words)
        return word_freq


class TopNFilter(Pipeline):
    def __init__(self, n):
        super().__init__()
        self.n = n

    def process(self, word_freq):
        return nlargest(self.n, word_freq.items(), key=lambda item: item[1])


# Assume a text file whose content is the text to analyze
filename = testfilepath
n = 5  # take the 5 most frequent words

# Build the pipeline
pipeline = FileReader(filename) | WordFrequencyCounter() | TopNFilter(n)

# Run the pipeline
top_n_words = pipeline.process()

# Print the results
for word, freq in top_n_words:
    print(f"{word}: {freq}")
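# A minimal sketch of what the `|` operator builds: `a | b | c` evaluates left
# to right as `(a | b) | c`, so the result is a nested PipelineComposition.
# 'demo.txt' is a hypothetical filename used only for illustration; merely
# constructing a FileReader does not open the file.
stages = FileReader('demo.txt') | WordFrequencyCounter() | TopNFilter(3)
assert isinstance(stages, Pipeline)        # the composition is itself a Pipeline
assert isinstance(stages.first, Pipeline)  # the nested (FileReader | WordFrequencyCounter) pair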
@@ -0,0 +1,35 @@
from cppy.cp_util import *
from collections import Counter
import re


class Pipe:
    def __init__(self, func, *args, **kwargs):
        self.func = func
        self.args = args
        self.kwargs = kwargs

    def __or__(self, other):
        # Compose lazily: the new Pipe feeds this stage's output into the next
        return Pipe(lambda data: other(self(data)))

    def __call__(self, data):
        return self.func(data, *self.args, **self.kwargs)


def read_file(filename):
    with open(filename, 'r') as f:
        return f.read()


def split_words(text):
    return re.findall(r'\b\w+\b', text.lower())


def count_words(words):
    return Counter(words)


def top_n_words(word_counts, n):
    return word_counts.most_common(n)


# Build and run the pipeline
pipe = Pipe(extract_file_words) | Pipe(get_frequencies) | Pipe(sort_dict) | Pipe(print_word_freqs, 5)
result = pipe(testfilepath)
print(result)
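# A minimal sketch wiring the locally defined helpers above through the same
# Pipe class; it reuses testfilepath from cppy.cp_util, as in the example above.
local_pipe = Pipe(read_file) | Pipe(split_words) | Pipe(count_words) | Pipe(top_n_words, 5)
print(local_pipe(testfilepath))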
@@ -0,0 +1,49 @@
import re
from collections import Counter
from functools import partial
from cppy.cp_util import *


class Pipeline:
    def __init__(self, function):
        self.function = function

    def __or__(self, other):
        if isinstance(other, Pipeline):
            return Pipeline(lambda x: other.function(self.function(x)))
        elif callable(other):
            return Pipeline(lambda x: other(self.function(x)))
        else:
            raise TypeError("The argument must be callable or a Pipeline instance")

    def process(self, data):
        return self.function(data)


# Processing functions
def read_file(path):
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()

def clean_text(text):
    return re.sub(r'[^\w\s]', '', text).lower()

def tokenize(text):
    return re.findall(r'\b\w+\b', text)

def remove_stop_words(tokens, stop_words):
    return [token for token in tokens if token not in stop_words]

def count_frequencies(tokens):
    return Counter(tokens)

def get_top_n_frequencies(counter, n):
    return counter.most_common(n)


# Stop-word list
stop_words = set(['the', 'and', 'a', 'to', 'of', 'in', 'for', 'on', 'is', 'it', 'with', 'that', 'as', 'by', 'this', 'at', 'be', 'which', 'from', 'or', 'are', 'an', 'but', 'not', 'you', 'have', 'your', 'can', 'will', 'all', 'any', 'if', 'their', 'would', 'what', 'there', 'when', 'who', 'whom', 'whose', 'where', 'why'])

# Build the pipeline; multi-argument stages are bound with functools.partial
pipeline = (Pipeline(read_file) | clean_text | tokenize
            | partial(remove_stop_words, stop_words=stop_words)
            | count_frequencies
            | partial(get_top_n_frequencies, n=10))

# Run the pipeline and print the result
top_n_word_frequencies = pipeline.process(testfilepath)
print(top_n_word_frequencies)
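# A quick sanity check of the composition semantics defined above:
# (Pipeline(f) | g).process(x) should equal g(f(x)).
double = Pipeline(lambda x: x * 2)
assert (double | (lambda x: x + 1)).process(3) == 7  # (3 * 2) + 1 == 7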
@@ -1,45 +1,20 @@
 from cppy.cp_util import *
 
-###########################################
-#
-# Generators
-###########################################
-def characters(filename):  # yield one character at a time
-    for line in open(filename, encoding='utf-8'):
-        for c in line:
-            yield c
-
-
-def all_words(filename):  # yield one word at a time
-    start_char = True
-    for c in characters(filename):
-        if start_char == True:
-            word = ""
-            if c.isalnum():  # start of a word
-                word = c.lower()
-                start_char = False
-            else:
-                pass
-        else:
-            if c.isalnum():
-                word += c.lower()
-            else:  # end of word, emit it
-                start_char = True
-                yield word
-
-
-def non_stop_words(filename, stopwords):
-    for w in all_words(filename):
-        if w not in stopwords:
-            yield w    # yield a vetted word
-
-
-if __name__ == "__main__":
+#
+def non_stop_words(testfilepath):
+    stopwords = get_stopwords()
+    data_str = read_file(testfilepath)
+    wordlist = re_split(data_str)
+    for word in wordlist:
+        if word not in stopwords:
+            yield word   # yield a non-stop word
 
-    freqs = {}
-    for word in non_stop_words(testfilepath, stopwords):
-        freqs[word] = freqs.get(word, 0) + 1
+
+freqs = {}
+for word in non_stop_words(testfilepath):
+    freqs[word] = freqs.get(word, 0) + 1
 
-    data = sort_dict(freqs)
-    print_word_freqs(data)
+data = sort_dict(freqs)
+print_word_freqs(data)
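# A sketch of the same filtering as a single generator expression, assuming
# the cppy.cp_util helpers used above (get_stopwords, read_file, re_split):
#   stops = get_stopwords()
#   non_stop = (w for w in re_split(read_file(testfilepath)) if w not in stops)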