zj3D 8 months ago
parent 445088fde8
commit 3c439ef8d7

@ -27,12 +27,7 @@ def compute_all_word_frequencies(agents):
future_to_agent = {executor.submit(agent.compute_word_frequency): agent for agent in agents} future_to_agent = {executor.submit(agent.compute_word_frequency): agent for agent in agents}
for future in concurrent.futures.as_completed(future_to_agent): for future in concurrent.futures.as_completed(future_to_agent):
agent = future_to_agent[future] agent = future_to_agent[future]
try: data = future.result() # 词频被保存在agent中
# 获取计算结果,但不处理异常
data = future.result()
except Exception as exc:
print(f'生成 {agent.text_chunk[:10]}... 的词频时出错: {exc}')
# 词频已经被保存在agent中
# 所有Agent计算完成后合并它们的词频结果 # 所有Agent计算完成后合并它们的词频结果
@ -48,5 +43,4 @@ if __name__ == '__main__':
agents = create_agents(words) # 创建代理 agents = create_agents(words) # 创建代理
compute_all_word_frequencies(agents) # 计算 compute_all_word_frequencies(agents) # 计算
merged_word_freq = merge_word_frequencies(agents) # 合并结果 merged_word_freq = merge_word_frequencies(agents) # 合并结果
for (w, c) in merged_word_freq.most_common(10): # 排序输出 util.print_word_freqs(merged_word_freq.most_common(10)) # 排序输出
print(w, '-', c)

@ -1,50 +1,50 @@
''' '''
享元模式中对象被设计为可共享的被多个上下文使用而不必在每个上下文中都创建新的对象 享元模式中对象被设计为可共享的被多个上下文使用而不必在每个上下文中都创建新的对象
如果我们有大量不同的词频分析需求有时需要词频前10的单词有时需要词频前20的单词有时还需要限定词汇的长度那就需要创建多个词频统计器每个词频统 如果我们有大量不同的词频分析需求有时需要词频前10的单词有时需要词频前20的单词有时还需要限定词汇的长度那就需要创建多个词频统计器每个词频统
计器都独立创建并存储其内部状态在这种情况下享元模式共享相同类型的词频统计器对象只需创建一个共享实例然后通过设置不同的参数个性化每个对象通过共享相同的内部状态降低了对象的创建和内存占用成本 计器都独立创建并存储其内部状态在这种情况下享元模式共享相同类型的词频统计器对象只需创建一个共享实例然后通过设置不同的参数个性化每个对象通过共享相同的内部状态降低了对象的创建和内存占用成本
例如我需要对3个文件获取词频前十的单词对另外3个文件获取词频前二十的单词那么我只需要创建2个词频统计器对象每个对象存储相同的内部状态一个对象 例如我需要对3个文件获取词频前十的单词对另外3个文件获取词频前二十的单词那么我只需要创建2个词频统计器对象每个对象存储相同的内部状态一个对象
获取前十的单词一个对象获取前二十的单词而不用创建6个对象 获取前十的单词一个对象获取前二十的单词而不用创建6个对象
''' '''
from cppy.cp_util import * from cppy.cp_util import *
#定义享元接口 #定义享元接口
# Flyweight interface
class WordFrequencyController():
    """Flyweight interface: declares the shared word-frequency printing operation."""

    def print_word_freqs(self, number):
        """Print the top *number* most frequent words; concrete flyweights override this."""
        pass
#定义具体的享元类 #定义具体的享元类
# Concrete flyweight
class ConcreteWordFrequencyController(WordFrequencyController):
    """Concrete flyweight: reads a file once and serves its ranked word frequencies.

    The word list and sorted frequency table are the intrinsic state shared by
    every context that reuses this instance.
    """

    def __init__(self, controllertype, filepath):
        # Build the shared state up front; cp_util helpers do the heavy lifting.
        self.word_list = extract_file_words(filepath)
        raw_freq = get_frequencies(self.word_list)
        self.word_freq = sort_dict(raw_freq)

    def print_word_freqs(self, number):
        # Delegates to the module-level helper of the same name
        # (presumably star-imported from cppy.cp_util — verify).
        print_word_freqs(self.word_freq, number)
#定义享元工厂 #定义享元工厂
# Flyweight factory
class WordFrequencyControllerFactory():
    """Flyweight factory: caches one controller per controller type.

    Repeated requests for the same type return the single shared instance
    instead of constructing a new one.
    """

    def __init__(self):
        # Pool of shared flyweight objects, keyed by controller type.
        self.types = {}

    def get_WordFrequencyController(self, controller_type, testfilepath):
        """Return the cached flyweight for *controller_type*, creating it on first use."""
        try:
            # Reuse an existing flyweight object.
            return self.types[controller_type]
        except KeyError:
            # First request for this type: create and remember the shared instance.
            controller = ConcreteWordFrequencyController(controller_type, testfilepath)
            self.types[controller_type] = controller
            return controller
def process_command(factory: WordFrequencyControllerFactory, number: str):
    """Fetch (or create) the flyweight keyed by *number* and print that many top words.

    *number* serves double duty: it is the flyweight cache key and, converted
    to int, the count of words to display. Relies on the module-level
    ``testfilepath`` (star-imported from cppy.cp_util — TODO confirm).

    Raises:
        ValueError: if *number* is not a valid integer string.
    """
    controller_type = number
    # NOTE(review): the original bound this result to the name
    # `WordFrequencyController`, shadowing the flyweight interface class
    # defined above; renamed to `controller` to remove the shadowing.
    controller = factory.get_WordFrequencyController(controller_type, testfilepath)
    controller.print_word_freqs(int(number))
if __name__ == "__main__":
    factory = WordFrequencyControllerFactory()
    # Interactive loop: each distinct count reuses one shared flyweight object.
    while True:
        try:
            number = input("请输入需要显示词频前几的单词: ")
            process_command(factory, number)
        except ValueError:
            # Fix: a non-integer entry used to raise out of the loop and kill
            # the program; report it and keep prompting instead.
            print('请输入一个整数')
        except EOFError:
            # End of input stream (Ctrl-D / piped input exhausted): exit cleanly.
            break

@ -2,16 +2,23 @@ import threading
from collections import Counter from collections import Counter
from cppy.cp_util import * from cppy.cp_util import *
stop_words = get_stopwords() #
# 多线程
# 定义一个函数来计算每个线程的词频 #
def process_chunk(start, end, text, result_index, results):
    """Thread worker: tally word frequencies of text[start:end] into results[result_index].

    Writes its Counter into the shared *results* list at its own slot, so no
    locking is needed between workers.
    """
    # cp_util helper tokenizes and filters stopwords in one step.
    chunk_words = extract_str_words(text[start:end])
    results[result_index] = Counter(chunk_words)
results[result_index] = result
def merge_counts(counts_list):
    """Combine an iterable of Counter objects into one aggregate Counter.

    Uses Counter addition, which (like the `+=` it replaces) keeps only
    positive counts in the result.
    """
    return sum(counts_list, Counter())
@timing_decorator
def main():
# 读取文件内容 # 读取文件内容
text = read_file(testfilepath) text = read_file(testfilepath)
@ -29,16 +36,19 @@ if __name__ == '__main__':
start = i * chunk_size start = i * chunk_size
# 确保最后一个线程能够读取文件的末尾 # 确保最后一个线程能够读取文件的末尾
end = text_length if i == num_threads - 1 else (i + 1) * chunk_size end = text_length if i == num_threads - 1 else (i + 1) * chunk_size
t = threading.Thread(target=count_words, args=(start, end, text, i, results)) t = threading.Thread(target=process_chunk, args=(start, end, text, i, results))
threads.append(t) threads.append(t)
t.start() t.start()
# 等待所有线程完成 # 等待所有线程完成
for t in threads: t.join() for t in threads: t.join()
# 合并结果 # 合并计数
total_count = Counter() total_counts = merge_counts(results)
for result in results: total_count += result
# 打印词频最高的10个单词 # 输出最高频的n个词
print_word_freqs( total_count.most_common(10) ) print_word_freqs( total_counts.most_common(10) )
if __name__ == '__main__':
main()

@ -1,10 +1,10 @@
import re
import multiprocessing import multiprocessing
from collections import Counter from collections import Counter
from cppy.cp_util import * from cppy.cp_util import *
stopwords = get_stopwords() #
# 多进程
#
def process_chunk(chunk): def process_chunk(chunk):
# 切词并过滤停用词 # 切词并过滤停用词
words = extract_str_words( chunk.lower() ) words = extract_str_words( chunk.lower() )
@ -16,8 +16,9 @@ def merge_counts(counts_list):
for counts in counts_list: for counts in counts_list:
total_counts += counts total_counts += counts
return total_counts return total_counts
if __name__ == '__main__': @timing_decorator
def main():
# 读取文件内容 # 读取文件内容
content = read_file(testfilepath) content = read_file(testfilepath)
@ -34,6 +35,10 @@ if __name__ == '__main__':
# 合并计数 # 合并计数
total_counts = merge_counts(counts_list) total_counts = merge_counts(counts_list)
# 输出最高频的n个词 # 输出最高频的n个词
for word, count in total_counts.most_common(10): print_word_freqs( total_counts.most_common(10) )
print(f"{word}-- {count}")
if __name__ == '__main__':
main()
Loading…
Cancel
Save