# -*- coding: utf-8 -*-
from collections import Counter
from cppy.cp_util import *
from multiprocessing.pool import ThreadPool

#
# Multithreaded word-frequency count
#

# Upper bound on worker threads. One thread per chunk is unbounded for large
# inputs; the pool simply queues the remaining chunks.
_MAX_THREADS = 32

stop_words = get_stopwords()


def process_chunk(chunk):
    """Count the words of one chunk.

    Words found in ``stop_words`` and words shorter than three characters
    are skipped.

    Args:
        chunk: Iterable of word strings.

    Returns:
        A ``Counter`` mapping each kept word to its occurrence count.
    """
    return Counter(
        w for w in chunk
        if w not in stop_words and len(w) >= 3
    )


def merge_counts(counts_list):
    """Merge several ``Counter`` objects into a single total.

    Args:
        counts_list: Iterable of ``Counter`` objects.

    Returns:
        A new ``Counter`` holding the summed counts. As with ``Counter``
        addition, only entries with a positive count are kept.
    """
    total = Counter()
    for counts in counts_list:
        # In-place addition keeps the semantics of ``Counter + Counter``
        # without building a fresh Counter for every chunk.
        total += counts
    return total


def thread_function(chunk, counts_list):
    """Count the words of ``chunk`` and append the result to ``counts_list``.

    Not used by ``main`` (which relies on ``ThreadPool.map``); kept so that
    existing callers keep working.

    Args:
        chunk: Iterable of word strings.
        counts_list: List that receives the chunk's ``Counter``.
    """
    counts_list.append(process_chunk(chunk))


@timing_decorator
def main():
    """Count word frequencies of the test file and print the top 10 words."""
    # Read the data, split into groups of 1000 words.
    chunks = get_chunks(testfilepath, 1000)

    # ThreadPool rejects a size of 0, so keep at least one worker even when
    # there are no chunks; never start more than _MAX_THREADS workers.
    n_threads = max(1, min(len(chunks), _MAX_THREADS))

    # The context manager releases the pool even if a worker raises;
    # ``map`` blocks until every chunk has been processed.
    with ThreadPool(n_threads) as pool:
        counts_list = pool.map(process_chunk, chunks)

    # Merge the per-chunk counts.
    total_counts = merge_counts(counts_list)

    # Print the n most frequent words.
    print_word_freqs(total_counts.most_common(10))


if __name__ == '__main__':
    main()