You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

52 lines
2.0 KiB

9 months ago
import concurrent.futures
from collections import Counter
import cppy.cp_util as util
8 months ago
'''
concurrent.futures模块为Python中的并发编程提供了一个统一接口,
这个模块隐藏了低层次的线程和进程创建同步和清理的细节,提供了一个更高层次的API来处理并发任务
当前版本推荐它与asyncio模块结合使用完成Python中的各种异步编程任务
'''
8 months ago
stop_words = util.get_stopwords()
9 months ago
class WordFrequencyAgent:
def __init__(self, words):
self.words = words
8 months ago
def compute_word_frequency(self):
words = [ w for w in self.words if ( not w in stop_words ) and len(w) >= 3 ]
self.word_freq = Counter( words)
9 months ago
def get_word_frequency(self):
return self.word_freq
# 将文本分割成多个部分并为每个部分创建一个Agent
8 months ago
def create_agents( words ):
return [ WordFrequencyAgent(chunk) for chunk in words ]
9 months ago
def compute_all_word_frequencies(agents):
with concurrent.futures.ThreadPoolExecutor() as executor:
# 使用线程池来并行计算词频
future_to_agent = {executor.submit(agent.compute_word_frequency): agent for agent in agents}
for future in concurrent.futures.as_completed(future_to_agent):
agent = future_to_agent[future]
9 months ago
data = future.result() # 词频被保存在agent中
9 months ago
# 所有Agent计算完成后合并它们的词频结果
def merge_word_frequencies(agents):
merged_freq = Counter()
for agent in agents:
merged_freq.update(agent.get_word_frequency())
return merged_freq
8 months ago
@util.timing_decorator
def main():
words = util.get_chunks(util.testfilepath)
9 months ago
agents = create_agents(words) # 创建代理
compute_all_word_frequencies(agents) # 计算
merged_word_freq = merge_word_frequencies(agents) # 合并结果
8 months ago
util.print_word_freqs(merged_word_freq.most_common(10)) # 排序输出
if __name__ == '__main__':
main()