|
|
@ -7,24 +7,23 @@ concurrent.futures模块为Python中的并发编程提供了一个统一接口,
|
|
|
|
这个模块隐藏了低层次的线程和进程创建、同步和清理的细节,提供了一个更高层次的API来处理并发任务。
|
|
|
|
这个模块隐藏了低层次的线程和进程创建、同步和清理的细节,提供了一个更高层次的API来处理并发任务。
|
|
|
|
当前版本推荐它与asyncio模块结合使用完成Python中的各种异步编程任务。
|
|
|
|
当前版本推荐它与asyncio模块结合使用完成Python中的各种异步编程任务。
|
|
|
|
'''
|
|
|
|
'''
|
|
|
|
|
|
|
|
stop_words = util.get_stopwords()
|
|
|
|
|
|
|
|
|
|
|
|
class WordFrequencyAgent:
|
|
|
|
class WordFrequencyAgent:
|
|
|
|
def __init__(self, words):
|
|
|
|
def __init__(self, words):
|
|
|
|
self.words = words
|
|
|
|
self.words = words
|
|
|
|
|
|
|
|
|
|
|
|
def compute_word_frequency(self):
|
|
|
|
def compute_word_frequency(self):
|
|
|
|
self.word_freq = Counter(self.words)
|
|
|
|
words = [ w for w in self.words if ( not w in stop_words ) and len(w) >= 3 ]
|
|
|
|
|
|
|
|
self.word_freq = Counter( words)
|
|
|
|
|
|
|
|
|
|
|
|
def get_word_frequency(self):
|
|
|
|
def get_word_frequency(self):
|
|
|
|
return self.word_freq
|
|
|
|
return self.word_freq
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 将文本分割成多个部分,并为每个部分创建一个Agent
|
|
|
|
# 将文本分割成多个部分,并为每个部分创建一个Agent
|
|
|
|
def create_agents(words, num_agents = 4 ):
|
|
|
|
def create_agents( words ):
|
|
|
|
text_chunks = [ words[i::num_agents] for i in range(num_agents) ]
|
|
|
|
return [ WordFrequencyAgent(chunk) for chunk in words ]
|
|
|
|
agents = [ WordFrequencyAgent(chunk) for chunk in text_chunks ]
|
|
|
|
|
|
|
|
return agents
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def compute_all_word_frequencies(agents):
|
|
|
|
def compute_all_word_frequencies(agents):
|
|
|
|
with concurrent.futures.ThreadPoolExecutor() as executor:
|
|
|
|
with concurrent.futures.ThreadPoolExecutor() as executor:
|
|
|
@ -34,7 +33,6 @@ def compute_all_word_frequencies(agents):
|
|
|
|
agent = future_to_agent[future]
|
|
|
|
agent = future_to_agent[future]
|
|
|
|
data = future.result() # 词频被保存在agent中
|
|
|
|
data = future.result() # 词频被保存在agent中
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 所有Agent计算完成后,合并它们的词频结果
|
|
|
|
# 所有Agent计算完成后,合并它们的词频结果
|
|
|
|
def merge_word_frequencies(agents):
|
|
|
|
def merge_word_frequencies(agents):
|
|
|
|
merged_freq = Counter()
|
|
|
|
merged_freq = Counter()
|
|
|
@ -42,10 +40,13 @@ def merge_word_frequencies(agents):
|
|
|
|
merged_freq.update(agent.get_word_frequency())
|
|
|
|
merged_freq.update(agent.get_word_frequency())
|
|
|
|
return merged_freq
|
|
|
|
return merged_freq
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@util.timing_decorator
|
|
|
|
if __name__ == '__main__':
|
|
|
|
def main():
|
|
|
|
words = util.extract_file_words(util.testfilepath) # 从文本抽词
|
|
|
|
words = util.get_chunks(util.testfilepath)
|
|
|
|
agents = create_agents(words) # 创建代理
|
|
|
|
agents = create_agents(words) # 创建代理
|
|
|
|
compute_all_word_frequencies(agents) # 计算
|
|
|
|
compute_all_word_frequencies(agents) # 计算
|
|
|
|
merged_word_freq = merge_word_frequencies(agents) # 合并结果
|
|
|
|
merged_word_freq = merge_word_frequencies(agents) # 合并结果
|
|
|
|
util.print_word_freqs(merged_word_freq.most_common(10)) # 排序输出
|
|
|
|
util.print_word_freqs(merged_word_freq.most_common(10)) # 排序输出
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
|
|
|
|
main()
|