From 3c439ef8d7e1b3024f598995e4795023a33cac39 Mon Sep 17 00:00:00 2001
From: zj3D
Date: Sat, 16 Mar 2024 13:59:29 +0800
Subject: [PATCH] update

---
 交互/Agent/80.py                         | 10 +-
 .../{享元模式 => 共享对象}/tf-38.py      | 98 +++++++++----------
 .../函数缓存}/84.py                      |  0
 .../map-reduce}/tf_91.py                 | 42 +++++---
 计算设备/map-reduce/tf_92.py             | 21 ++--
 5 files changed, 90 insertions(+), 81 deletions(-)
 rename 基本结构/{享元模式 => 共享对象}/tf-38.py (97%)
 rename {计算设备/缓存 => 基本结构/函数缓存}/84.py (100%)
 rename {语言特性/异步 => 计算设备/map-reduce}/tf_91.py (50%)

diff --git a/交互/Agent/80.py b/交互/Agent/80.py
index ab6d22a..80ae836 100644
--- a/交互/Agent/80.py
+++ b/交互/Agent/80.py
@@ -27,12 +27,7 @@ def compute_all_word_frequencies(agents):
         future_to_agent = {executor.submit(agent.compute_word_frequency): agent for agent in agents}
         for future in concurrent.futures.as_completed(future_to_agent):
             agent = future_to_agent[future]
-            try:
-                # Get the computed result; exceptions are not handled further
-                data = future.result()
-            except Exception as exc:
-                print(f'Error generating word frequencies for {agent.text_chunk[:10]}...: {exc}')
-            # The word frequencies are already stored in the agent
+            data = future.result()   # the word frequencies are stored in the agent


 # After all agents have finished, merge their word-frequency results
@@ -48,5 +43,4 @@ if __name__ == '__main__':
     agents = create_agents(words)    # create the agents
     compute_all_word_frequencies(agents)  # compute
     merged_word_freq = merge_word_frequencies(agents)  # merge the results
-    for (w, c) in merged_word_freq.most_common(10):    # print in sorted order
-        print(w, '-', c)
\ No newline at end of file
+    util.print_word_freqs(merged_word_freq.most_common(10))   # print in sorted order
\ No newline at end of file
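For readers skimming the patch: the hunk above collapses the per-future try/except, so any exception raised inside compute_word_frequency now propagates out of future.result() instead of being printed and swallowed per agent. The following minimal, self-contained sketch of this fan-out/fan-in pattern is not part of the patch; the Agent class, the sample chunks, and merge_word_frequencies are hypothetical stand-ins for the repo's own helpers, and only the standard library is assumed.

import concurrent.futures
from collections import Counter

class Agent:
    # Hypothetical stand-in for the repo's Agent: it counts words in its own chunk.
    def __init__(self, text_chunk):
        self.text_chunk = text_chunk
        self.word_freq = Counter()

    def compute_word_frequency(self):
        self.word_freq = Counter(self.text_chunk.lower().split())

def compute_all_word_frequencies(agents):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_agent = {executor.submit(a.compute_word_frequency): a for a in agents}
        for future in concurrent.futures.as_completed(future_to_agent):
            future.result()  # re-raises any worker exception; results live on the agents

def merge_word_frequencies(agents):
    total = Counter()
    for agent in agents:
        total += agent.word_freq
    return total

if __name__ == '__main__':
    chunks = ['the quick brown fox', 'the lazy dog', 'the quick dog']
    agents = [Agent(chunk) for chunk in chunks]
    compute_all_word_frequencies(agents)
    print(merge_word_frequencies(agents).most_common(3))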
diff --git a/基本结构/享元模式/tf-38.py b/基本结构/共享对象/tf-38.py
similarity index 97%
rename from 基本结构/享元模式/tf-38.py
rename to 基本结构/共享对象/tf-38.py
index 79b8c7d..359d4b7 100644
--- a/基本结构/享元模式/tf-38.py
+++ b/基本结构/共享对象/tf-38.py
@@ -1,50 +1,50 @@
-'''
-In the Flyweight pattern, objects are designed to be shareable and used by many contexts, instead of creating a new object in every context.
-If we have many different word-frequency analysis needs (sometimes the top 10 words, sometimes the top 20, sometimes with a limit on word length), we would need several word-frequency counters, each created independently and storing its own internal state. In that case the Flyweight pattern shares counter objects of the same type: a single shared instance is created and then personalized per use through its parameters, and sharing the same internal state lowers the cost of object creation and memory.
+For example, if I need the top ten words for 3 files and the top twenty words for another 3 files, I only need to create 2 counter objects, each storing the same internal state: one object
+returns the top ten words and the other the top twenty, instead of creating 6 objects.
+'''
+
+from cppy.cp_util import *
+
+
+# The flyweight interface
+class WordFrequencyController():
+    def print_word_freqs(self, number):
+        pass
+
+# The concrete flyweight
+class ConcreteWordFrequencyController(WordFrequencyController):
+    def __init__(self, controllertype, filepath):
+        self.word_list = extract_file_words(filepath)
+        self.word_freq = get_frequencies(self.word_list)
+        self.word_freq = sort_dict(self.word_freq)
+    def print_word_freqs(self, number):
+        print_word_freqs(self.word_freq, number)
+
+# The flyweight factory
+class WordFrequencyControllerFactory():
+    def __init__(self):
+        self.types = {}
+
+    def get_WordFrequencyController(self, controller_type, testfilepath):
+        if controller_type not in self.types:
+            # Create a new flyweight object
+            self.types[controller_type] = ConcreteWordFrequencyController(controller_type, testfilepath)
+        return self.types[controller_type]  # reuse the existing flyweight object
+
+def process_command(factory: WordFrequencyControllerFactory, number: str):
+    controller_type = number
+    controller = factory.get_WordFrequencyController(controller_type, testfilepath)
+    controller.print_word_freqs(int(number))
+
+
+if __name__ == "__main__":
+    factory = WordFrequencyControllerFactory()
+    while True:
+        try:
+            number = input("How many of the most frequent words should be shown? ")
+            process_command(factory, number)
+        except EOFError:
+            break
\ No newline at end of file
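The docstring above explains Flyweight in prose; the following minimal sketch, not part of the patch, shows the same factory-plus-shared-instance mechanics in isolation. The inline word list replaces the repo's cppy.cp_util helpers (extract_file_words and friends), and the simplified class and method names are assumptions for illustration.

from collections import Counter

class WordFrequencyController:
    def __init__(self, words):
        # Intrinsic, shared state: the word counts, computed once.
        self.word_freq = Counter(words)

    def print_word_freqs(self, number):
        # Extrinsic state: how many entries to show is passed in per call.
        for word, count in self.word_freq.most_common(number):
            print(word, '-', count)

class WordFrequencyControllerFactory:
    def __init__(self):
        self.controllers = {}

    def get_controller(self, key, words):
        # Reuse the existing flyweight if one was already built for this key.
        if key not in self.controllers:
            self.controllers[key] = WordFrequencyController(words)
        return self.controllers[key]

if __name__ == '__main__':
    words = 'the quick brown fox jumps over the lazy dog the fox'.split()
    factory = WordFrequencyControllerFactory()
    a = factory.get_controller('demo', words)
    b = factory.get_controller('demo', words)
    assert a is b          # the same shared instance serves both requests
    a.print_word_freqs(2)  # top 2 entries
    b.print_word_freqs(3)  # top 3 entries, same object, different parameter

The assert captures the property the diff's factory also relies on: asking again with the same key returns the very same object, while the number of entries to display stays extrinsic, passed in per call.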
diff --git a/计算设备/缓存/84.py b/基本结构/函数缓存/84.py
similarity index 100%
rename from 计算设备/缓存/84.py
rename to 基本结构/函数缓存/84.py
diff --git a/语言特性/异步/tf_91.py b/计算设备/map-reduce/tf_91.py
similarity index 50%
rename from 语言特性/异步/tf_91.py
rename to 计算设备/map-reduce/tf_91.py
index f0a0475..df5add8 100644
--- a/语言特性/异步/tf_91.py
+++ b/计算设备/map-reduce/tf_91.py
@@ -2,16 +2,23 @@ import threading
 from collections import Counter
 from cppy.cp_util import *

-stop_words = get_stopwords()
-
-# A function that computes the word frequencies for one thread
-def count_words(start, end, text, result_index, results):
-    words = re_split( text[start:end] )
-    words = [w for w in words if not w in stop_words]
-    result = Counter(words)
-    results[result_index] = result
-
-if __name__ == '__main__':
+#
+# Multi-threaded
+#
+def process_chunk(start, end, text, result_index, results):
+    # Tokenize and filter out stop words
+    words = extract_str_words( text[start:end] )
+    results[result_index] = Counter(words)
+
+def merge_counts(counts_list):
+    # Merge multiple Counter objects
+    total_counts = Counter()
+    for counts in counts_list:
+        total_counts += counts
+    return total_counts
+
+@timing_decorator
+def main():
     # Read the file contents
     text = read_file(testfilepath)

@@ -29,16 +36,19 @@
         start = i * chunk_size
         # Make sure the last thread reads to the end of the file
         end = text_length if i == num_threads - 1 else (i + 1) * chunk_size
-        t = threading.Thread(target=count_words, args=(start, end, text, i, results))
+        t = threading.Thread(target=process_chunk, args=(start, end, text, i, results))
         threads.append(t)
         t.start()

     # Wait for all threads to finish
     for t in threads:
         t.join()

-    # Merge the results
-    total_count = Counter()
-    for result in results: total_count += result
+    # Merge the counts
+    total_counts = merge_counts(results)

-    # Print the 10 most frequent words
-    print_word_freqs( total_count.most_common(10) )
\ No newline at end of file
+    # Print the top n most frequent words
+    print_word_freqs( total_counts.most_common(10) )
+
+
+if __name__ == '__main__':
+    main()
diff --git a/计算设备/map-reduce/tf_92.py b/计算设备/map-reduce/tf_92.py
index e9a63a8..525181e 100644
--- a/计算设备/map-reduce/tf_92.py
+++ b/计算设备/map-reduce/tf_92.py
@@ -1,10 +1,10 @@
-import re
 import multiprocessing
 from collections import Counter
 from cppy.cp_util import *

-stopwords = get_stopwords()
-
+#
+# Multi-process
+#
 def process_chunk(chunk):
     # Tokenize and filter out stop words
     words = extract_str_words( chunk.lower() )
@@ -16,8 +16,9 @@ def merge_counts(counts_list):
     for counts in counts_list:
         total_counts += counts
     return total_counts
-
-if __name__ == '__main__':
+
+@timing_decorator
+def main():
     # Read the file contents
     content = read_file(testfilepath)

@@ -34,6 +35,10 @@
     # Merge the counts
     total_counts = merge_counts(counts_list)

-    # Print the top n most frequent words
-    for word, count in total_counts.most_common(10):
-        print(f"{word}-- {count}")
\ No newline at end of file
+    # Print the top n most frequent words
+    print_word_freqs( total_counts.most_common(10) )
+
+
+if __name__ == '__main__':
+    main()
+
\ No newline at end of file
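Since the tf_92.py hunks show only fragments, here is a minimal end-to-end sketch of the same map-reduce shape, assuming nothing beyond the standard library: split the input into chunks, map process_chunk across a process pool, then reduce with merge_counts. The inline sample text and the naive whitespace tokenizer stand in for the repo's read_file, testfilepath, and extract_str_words helpers, which are not shown in the diff.

import multiprocessing
from collections import Counter

def process_chunk(chunk):
    # Map step: count the words in one chunk (naive whitespace tokenizer).
    return Counter(chunk.lower().split())

def merge_counts(counts_list):
    # Reduce step: merge the per-chunk Counters into one.
    total_counts = Counter()
    for counts in counts_list:
        total_counts += counts
    return total_counts

def main():
    content = 'the quick brown fox jumps over the lazy dog ' * 1000
    num_chunks = 4
    chunk_size = len(content) // num_chunks
    chunks = []
    for i in range(num_chunks):
        start = i * chunk_size
        # Make sure the last chunk reaches the end of the text
        end = len(content) if i == num_chunks - 1 else (i + 1) * chunk_size
        chunks.append(content[start:end])

    with multiprocessing.Pool(processes=num_chunks) as pool:
        counts_list = pool.map(process_chunk, chunks)

    total_counts = merge_counts(counts_list)
    print(total_counts.most_common(10))

if __name__ == '__main__':
    main()

One caveat this sketch shares with the threaded tf_91.py: slicing by character offset can split a word at a chunk boundary, a simplification the repo's versions also appear to accept.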