pull/12/head
zj3D 8 months ago
parent 445088fde8
commit 3c439ef8d7

@ -27,12 +27,7 @@ def compute_all_word_frequencies(agents):
future_to_agent = {executor.submit(agent.compute_word_frequency): agent for agent in agents} future_to_agent = {executor.submit(agent.compute_word_frequency): agent for agent in agents}
for future in concurrent.futures.as_completed(future_to_agent): for future in concurrent.futures.as_completed(future_to_agent):
agent = future_to_agent[future] agent = future_to_agent[future]
try: data = future.result() # 词频被保存在agent中
# 获取计算结果,但不处理异常
data = future.result()
except Exception as exc:
print(f'生成 {agent.text_chunk[:10]}... 的词频时出错: {exc}')
# 词频已经被保存在agent中
# 所有Agent计算完成后合并它们的词频结果 # 所有Agent计算完成后合并它们的词频结果
@ -48,5 +43,4 @@ if __name__ == '__main__':
agents = create_agents(words) # 创建代理 agents = create_agents(words) # 创建代理
compute_all_word_frequencies(agents) # 计算 compute_all_word_frequencies(agents) # 计算
merged_word_freq = merge_word_frequencies(agents) # 合并结果 merged_word_freq = merge_word_frequencies(agents) # 合并结果
for (w, c) in merged_word_freq.most_common(10): # 排序输出 util.print_word_freqs(merged_word_freq.most_common(10)) # 排序输出
print(w, '-', c)

@ -2,16 +2,23 @@ import threading
from collections import Counter from collections import Counter
from cppy.cp_util import * from cppy.cp_util import *
stop_words = get_stopwords() #
# 多线程
# 定义一个函数来计算每个线程的词频 #
def count_words(start, end, text, result_index, results): def process_chunk(start, end, text, result_index, results):
words = re_split( text[start:end] ) # 切词并过滤停用词
words = [w for w in words if not w in stop_words] words = extract_str_words( text[start:end] )
result = Counter(words) results[result_index] = Counter(words)
results[result_index] = result
def merge_counts(counts_list):
if __name__ == '__main__': # 合并多个Counter对象
total_counts = Counter()
for counts in counts_list:
total_counts += counts
return total_counts
@timing_decorator
def main():
# 读取文件内容 # 读取文件内容
text = read_file(testfilepath) text = read_file(testfilepath)
@ -29,16 +36,19 @@ if __name__ == '__main__':
start = i * chunk_size start = i * chunk_size
# 确保最后一个线程能够读取文件的末尾 # 确保最后一个线程能够读取文件的末尾
end = text_length if i == num_threads - 1 else (i + 1) * chunk_size end = text_length if i == num_threads - 1 else (i + 1) * chunk_size
t = threading.Thread(target=count_words, args=(start, end, text, i, results)) t = threading.Thread(target=process_chunk, args=(start, end, text, i, results))
threads.append(t) threads.append(t)
t.start() t.start()
# 等待所有线程完成 # 等待所有线程完成
for t in threads: t.join() for t in threads: t.join()
# 合并结果 # 合并计数
total_count = Counter() total_counts = merge_counts(results)
for result in results: total_count += result
# 打印词频最高的10个单词 # 输出最高频的n个词
print_word_freqs( total_count.most_common(10) ) print_word_freqs( total_counts.most_common(10) )
if __name__ == '__main__':
main()

@ -1,10 +1,10 @@
import re
import multiprocessing import multiprocessing
from collections import Counter from collections import Counter
from cppy.cp_util import * from cppy.cp_util import *
stopwords = get_stopwords() #
# 多进程
#
def process_chunk(chunk): def process_chunk(chunk):
# 切词并过滤停用词 # 切词并过滤停用词
words = extract_str_words( chunk.lower() ) words = extract_str_words( chunk.lower() )
@ -17,7 +17,8 @@ def merge_counts(counts_list):
total_counts += counts total_counts += counts
return total_counts return total_counts
if __name__ == '__main__': @timing_decorator
def main():
# 读取文件内容 # 读取文件内容
content = read_file(testfilepath) content = read_file(testfilepath)
@ -35,5 +36,9 @@ if __name__ == '__main__':
total_counts = merge_counts(counts_list) total_counts = merge_counts(counts_list)
# 输出最高频的n个词 # 输出最高频的n个词
for word, count in total_counts.most_common(10): print_word_freqs( total_counts.most_common(10) )
print(f"{word}-- {count}")
if __name__ == '__main__':
main()
Loading…
Cancel
Save