# -*- coding: utf-8 -*-
"""Word-frequency count implemented with multiprocessing.

NOTE: per the original author's remark, process-creation overhead far
exceeds the per-chunk computation here, so this variant is the slowest
of the series.
"""
import multiprocessing
from collections import Counter

from cppy.cp_util import *

# Stop-word list, loaded once at import time so every worker process
# inherits it (forked or re-imported, depending on the start method).
stop_words = get_stopwords()


def process_chunk(chunk):
    """Count the words in one chunk.

    Filters out stop words and words shorter than 3 characters, then
    returns a Counter mapping word -> occurrences within this chunk.
    """
    words = [w for w in chunk if w not in stop_words and len(w) >= 3]
    return Counter(words)


def merge_counts(counts_list):
    """Merge an iterable of Counter objects into a single total Counter."""
    # sum() with a Counter start value folds the list exactly like the
    # manual `total += counts` loop it replaces.
    return sum(counts_list, Counter())


@timing_decorator
def main():
    """Read the test file, count words across worker processes, print top 10."""
    # Split the file content into chunks of 1000 items; each chunk is
    # handed to one worker process.
    chunks = get_chunks(testfilepath, 1000)
    # The context manager guarantees the pool is torn down even if
    # map() raises; map() itself blocks until all results are back.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        counts_list = pool.map(process_chunk, chunks)
    total_counts = merge_counts(counts_list)
    # Report the n most frequent words (n = 10).
    print_word_freqs(total_counts.most_common(10))


if __name__ == '__main__':
    main()