import multiprocessing
from collections import Counter

from cppy.cp_util import *


#
# Multiprocessing word count
#
def process_chunk(chunk):
    """Tokenize one text chunk and return its word counts.

    `extract_str_words` (from cppy.cp_util) is presumed to tokenize and
    drop stop words — TODO confirm against that helper's definition.
    """
    words = extract_str_words(chunk.lower())
    return Counter(words)


def merge_counts(counts_list):
    """Merge a list of Counter objects into one total Counter."""
    total_counts = Counter()
    for counts in counts_list:
        total_counts += counts
    return total_counts


def _split_on_word_boundaries(content, chunk_size):
    """Split *content* into ~chunk_size pieces without cutting words.

    BUG FIX: the original fixed-size slicing (content[i:i+chunk_size])
    could cut a word in half at every chunk boundary, so each split word
    was counted as two bogus fragments. Here each cut is pushed forward
    to the next whitespace character (or end of text) before slicing.
    """
    chunks = []
    start = 0
    n = len(content)
    while start < n:
        end = min(start + chunk_size, n)
        # Advance the cut point until we land on whitespace, so no word
        # straddles two chunks.
        while end < n and not content[end].isspace():
            end += 1
        chunks.append(content[start:end])
        start = end
    return chunks


@timing_decorator
def main():
    """Count word frequencies in the test file with a process pool."""
    # Read the whole file into memory.
    content = read_file(testfilepath)

    # Split the text into chunks, one chunk per map task.
    chunk_size = 1000  # tune to the input size if needed
    chunks = _split_on_word_boundaries(content, chunk_size)

    # Fan the chunks out across all CPU cores; the with-block guarantees
    # the pool is torn down even if a worker raises. pool.map blocks
    # until every chunk's Counter has been returned.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        counts_list = pool.map(process_chunk, chunks)

    # Reduce the per-chunk counts into one total.
    total_counts = merge_counts(counts_list)

    # Print the top-10 most frequent words.
    print_word_freqs(total_counts.most_common(10))


if __name__ == '__main__':
    main()