zj3D 8 months ago
parent 445088fde8
commit 3c439ef8d7

@ -27,12 +27,7 @@ def compute_all_word_frequencies(agents):
future_to_agent = {executor.submit(agent.compute_word_frequency): agent for agent in agents}
for future in concurrent.futures.as_completed(future_to_agent):
agent = future_to_agent[future]
try:
# 获取计算结果,但不处理异常
data = future.result()
except Exception as exc:
print(f'生成 {agent.text_chunk[:10]}... 的词频时出错: {exc}')
# 词频已经被保存在agent中
data = future.result() # 词频被保存在agent中
# 所有Agent计算完成后合并它们的词频结果
@ -48,5 +43,4 @@ if __name__ == '__main__':
agents = create_agents(words) # 创建代理
compute_all_word_frequencies(agents) # 计算
merged_word_freq = merge_word_frequencies(agents) # 合并结果
for (w, c) in merged_word_freq.most_common(10): # 排序输出
print(w, '-', c)
util.print_word_freqs(merged_word_freq.most_common(10)) # 排序输出

@ -1,50 +1,50 @@
'''
享元模式中对象被设计为可共享的被多个上下文使用而不必在每个上下文中都创建新的对象
如果我们有大量不同的词频分析需求有时需要词频前10的单词有时需要词频前20的单词有时还需要限定词汇的长度那就需要创建多个词频统计器每个词频统
计器都独立创建并存储其内部状态在这种情况下享元模式共享相同类型的词频统计器对象只需创建一个共享实例然后通过设置不同的参数个性化每个对象通过共享相同的内部状态降低了对象的创建和内存占用成本
例如我需要对3个文件获取词频前十的单词对另外3个文件获取词频前二十的单词那么我只需要创建2个词频统计器对象每个对象存储相同的内部状态一个对象
获取前十的单词一个对象获取前二十的单词而不用创建6个对象
'''
from cppy.cp_util import *
#定义享元接口
class WordFrequencyController():
    """Flyweight interface: declares the shared print-top-N operation."""

    def print_word_freqs(self, number):
        # Concrete flyweights override this; the base implementation is a no-op.
        pass
#定义具体的享元类
class ConcreteWordFrequencyController(WordFrequencyController):
    """Concrete flyweight holding the (shared) sorted word-frequency state of one file."""

    def __init__(self, controllertype, filepath):
        # The expensive intrinsic state is computed once and then shared
        # by every context that obtains this flyweight from the factory.
        self.word_list = extract_file_words(filepath)
        self.word_freq = sort_dict(get_frequencies(self.word_list))

    def print_word_freqs(self, number):
        # Delegates to the module-level helper of the same name (from cp_util).
        print_word_freqs(self.word_freq, number)
#定义享元工厂
class WordFrequencyControllerFactory():
    """Flyweight factory: hands out one shared controller per controller type."""

    def __init__(self):
        # Pool of already-built flyweights, keyed by controller type.
        self.types = {}

    def get_WordFrequencyController(self, controller_type, testfilepath):
        # Build the flyweight on the first request for this type ...
        if controller_type not in self.types:
            self.types[controller_type] = ConcreteWordFrequencyController(
                controller_type, testfilepath)
        # ... and reuse the cached instance on every later request.
        # NOTE(review): the cache keys on controller_type only, so a later call
        # with the same type but a different file path gets the old object —
        # confirm this is the intended sharing granularity.
        return self.types[controller_type]
def process_command(factory: WordFrequencyControllerFactory, number: str):
    """Fetch (or create) the flyweight keyed by *number* and print the top-N words.

    The user-supplied string doubles as the flyweight key, so each distinct N
    is served by exactly one shared controller instance.

    Raises ValueError if *number* is not a valid integer string.
    """
    # Fix: the original bound the result to a local named
    # `WordFrequencyController`, shadowing the class of the same name.
    controller = factory.get_WordFrequencyController(number, testfilepath)
    controller.print_word_freqs(int(number))
if __name__ == "__main__":
factory = WordFrequencyControllerFactory()
while True:
try:
number = input("请输入需要显示词频前几的单词: ")
process_command(factory, number)
except EOFError:
'''
享元模式中对象被设计为可共享的被多个上下文使用而不必在每个上下文中都创建新的对象
如果我们有大量不同的词频分析需求有时需要词频前10的单词有时需要词频前20的单词有时还需要限定词汇的长度那就需要创建多个词频统计器每个词频统
计器都独立创建并存储其内部状态在这种情况下享元模式共享相同类型的词频统计器对象只需创建一个共享实例然后通过设置不同的参数个性化每个对象通过共享相同的内部状态降低了对象的创建和内存占用成本
例如我需要对3个文件获取词频前十的单词对另外3个文件获取词频前二十的单词那么我只需要创建2个词频统计器对象每个对象存储相同的内部状态一个对象
获取前十的单词一个对象获取前二十的单词而不用创建6个对象
'''
from cppy.cp_util import *
#定义享元接口
class WordFrequencyController():
    """Flyweight interface shared by all word-frequency controllers."""

    def print_word_freqs(self, number):
        # Intentionally empty — subclasses provide the behavior.
        pass
#定义具体的享元类
class ConcreteWordFrequencyController(WordFrequencyController):
    """Concrete flyweight: word list and sorted frequencies for a single file."""

    def __init__(self, controllertype, filepath):
        # Heavy intrinsic state is built once here and shared afterwards.
        self.word_list = extract_file_words(filepath)
        self.word_freq = sort_dict(get_frequencies(self.word_list))

    def print_word_freqs(self, number):
        # Calls the global cp_util helper; the method merely forwards.
        print_word_freqs(self.word_freq, number)
#定义享元工厂
class WordFrequencyControllerFactory():
    """Flyweight factory: one cached controller instance per controller type."""

    def __init__(self):
        # Maps controller_type -> shared ConcreteWordFrequencyController.
        self.types = {}

    def get_WordFrequencyController(self, controller_type, testfilepath):
        # Lazily create the flyweight the first time this type is requested.
        if controller_type not in self.types:
            self.types[controller_type] = ConcreteWordFrequencyController(
                controller_type, testfilepath)
        # Subsequent requests for the same type reuse the cached object.
        # NOTE(review): testfilepath is ignored on cache hits — a second call
        # with the same type and a different file returns the old state;
        # verify this matches the intended flyweight contract.
        return self.types[controller_type]
def process_command(factory: WordFrequencyControllerFactory, number: str):
    """Look up (or create) the flyweight for *number* and print the top-N words.

    *number* serves both as the flyweight key and, converted to int, as the
    count of top-frequency words to display.

    Raises ValueError if *number* cannot be parsed as an integer.
    """
    # Fix: the original stored the controller in a local variable named
    # `WordFrequencyController`, shadowing the class of the same name.
    controller = factory.get_WordFrequencyController(number, testfilepath)
    controller.print_word_freqs(int(number))
if __name__ == "__main__":
    factory = WordFrequencyControllerFactory()
    # Serve requests until the input stream is closed (Ctrl-D / Ctrl-Z).
    while True:
        try:
            number = input("请输入需要显示词频前几的单词: ")
            process_command(factory, number)
        except ValueError:
            # Fix: non-numeric input previously raised out of the loop and
            # killed the program; report it and keep prompting instead.
            print("Invalid input: please enter a positive integer.")
        except EOFError:
            break

@ -2,16 +2,23 @@ import threading
from collections import Counter
from cppy.cp_util import *
stop_words = get_stopwords()
# 定义一个函数来计算每个线程的词频
def count_words(start, end, text, result_index, results):
    """Tokenize text[start:end], drop stop words, and store a Counter in results[result_index].

    Designed as a thread target: each worker writes only to its own slot of
    the shared *results* list, so no locking is needed.
    """
    tokens = re_split(text[start:end])
    kept = [t for t in tokens if t not in stop_words]
    results[result_index] = Counter(kept)
if __name__ == '__main__':
#
# 多线程
#
def process_chunk(start, end, text, result_index, results):
    """Word-count one slice of *text* and publish the Counter into results[result_index].

    Thread target: each worker owns exactly one slot of *results*, so the
    writes do not race.
    """
    # extract_str_words (from cp_util) tokenizes and filters stop words.
    tokens = extract_str_words(text[start:end])
    results[result_index] = Counter(tokens)
def merge_counts(counts_list):
    """Combine an iterable of Counter objects into a single Counter.

    Returns an empty Counter for an empty input.
    """
    # sum() with a Counter start value performs exactly the same repeated
    # Counter addition as the original accumulation loop.
    return sum(counts_list, Counter())
@timing_decorator
def main():
# 读取文件内容
text = read_file(testfilepath)
@ -29,16 +36,19 @@ if __name__ == '__main__':
start = i * chunk_size
# 确保最后一个线程能够读取文件的末尾
end = text_length if i == num_threads - 1 else (i + 1) * chunk_size
t = threading.Thread(target=count_words, args=(start, end, text, i, results))
t = threading.Thread(target=process_chunk, args=(start, end, text, i, results))
threads.append(t)
t.start()
# 等待所有线程完成
for t in threads: t.join()
# 合并结果
total_count = Counter()
for result in results: total_count += result
# 合并计数
total_counts = merge_counts(results)
# 打印词频最高的10个单词
print_word_freqs( total_count.most_common(10) )
# 输出最高频的n个词
print_word_freqs( total_counts.most_common(10) )
if __name__ == '__main__':
main()

@ -1,10 +1,10 @@
import re
import multiprocessing
from collections import Counter
from cppy.cp_util import *
stopwords = get_stopwords()
#
# 多进程
#
def process_chunk(chunk):
# 切词并过滤停用词
words = extract_str_words( chunk.lower() )
@ -16,8 +16,9 @@ def merge_counts(counts_list):
for counts in counts_list:
total_counts += counts
return total_counts
if __name__ == '__main__':
@timing_decorator
def main():
# 读取文件内容
content = read_file(testfilepath)
@ -34,6 +35,10 @@ if __name__ == '__main__':
# 合并计数
total_counts = merge_counts(counts_list)
# 输出最高频的n个词
for word, count in total_counts.most_common(10):
print(f"{word}-- {count}")
# 输出最高频的n个词
print_word_freqs( total_counts.most_common(10) )
if __name__ == '__main__':
main()
Loading…
Cancel
Save