# -*- coding: utf-8 -*-
import multiprocessing
from collections import Counter

from cppy.cp_util import *


#
# Multiprocessing word-frequency counter
#
def process_chunk(chunk):
    """Count word frequencies in one chunk of words.

    Filters out stop words and words shorter than 3 characters,
    then returns a Counter of the remaining words.
    """
    # get_stopwords() comes from cppy.cp_util; presumably returns a
    # set/list of stop words — fetched once per chunk, which is fine
    # since each call may run in a separate worker process.
    stop_words = get_stopwords()
    words = [w for w in chunk if w not in stop_words and len(w) >= 3]
    return Counter(words)


def merge_counts(counts_list):
    """Merge multiple Counter objects into a single total Counter."""
    # sum() with a Counter start value folds the per-chunk counts together;
    # equivalent to repeated `total += counts` but more concise.
    return sum(counts_list, Counter())


@timing_decorator
def main():
    """Read the test file, count word frequencies in parallel, print top 10."""
    # Read and tokenize the file content (helpers from cppy.cp_util).
    content = re_split(read_file(testfilepath))

    # Split the word list into chunks; each chunk is handled by one worker.
    chunk_size = 1000  # tune to the input size if needed
    chunks = [content[i:i + chunk_size]
              for i in range(0, len(content), chunk_size)]

    # Context manager guarantees the pool is cleaned up even if map() raises
    # (the original close/join sequence leaked workers on exceptions).
    # Pool() defaults to os.cpu_count() workers, so no explicit count needed.
    with multiprocessing.Pool() as pool:
        counts_list = pool.map(process_chunk, chunks)

    # Combine the per-chunk counts and report the n most common words.
    total_counts = merge_counts(counts_list)
    print_word_freqs(total_counts.most_common(10))


if __name__ == '__main__':
    main()