You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

52 lines
2.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import concurrent.futures
from collections import Counter
import cppy.cp_util as util
'''
concurrent.futures模块为Python中的并发编程提供了一个统一接口,
这个模块隐藏了低层次的线程和进程创建、同步和清理的细节,提供了一个更高层次的API来处理并发任务。
当前版本推荐它与asyncio模块结合使用完成Python中的各种异步编程任务。
'''
stop_words = util.get_stopwords()
class WordFrequencyAgent:
def __init__(self, words):
self.words = words
def compute_word_frequency(self):
words = [ w for w in self.words if ( not w in stop_words ) and len(w) >= 3 ]
self.word_freq = Counter( words)
def get_word_frequency(self):
return self.word_freq
# 将文本分割成多个部分并为每个部分创建一个Agent
def create_agents( words ):
return [ WordFrequencyAgent(chunk) for chunk in words ]
def compute_all_word_frequencies(agents):
with concurrent.futures.ThreadPoolExecutor() as executor:
# 使用线程池来并行计算词频
future_to_agent = {executor.submit(agent.compute_word_frequency): agent for agent in agents}
for future in concurrent.futures.as_completed(future_to_agent):
agent = future_to_agent[future]
data = future.result() # 词频被保存在agent中
# 所有Agent计算完成后合并它们的词频结果
def merge_word_frequencies(agents):
merged_freq = Counter()
for agent in agents:
merged_freq.update(agent.get_word_frequency())
return merged_freq
@util.timing_decorator
def main():
words = util.get_chunks(util.testfilepath)
agents = create_agents(words) # 创建代理
compute_all_word_frequencies(agents) # 计算
merged_word_freq = merge_word_frequencies(agents) # 合并结果
util.print_word_freqs(merged_word_freq.most_common(10)) # 排序输出
if __name__ == '__main__':
main()