# -*- coding: utf-8 -*-
"""Multithreaded word-frequency counting.

Splits the input word list into fixed-size chunks, counts each chunk's
words in a thread-pool worker, merges the per-chunk Counters, and
prints the top-10 most frequent words.
"""
from collections import Counter
from multiprocessing.pool import ThreadPool

from cppy.cp_util import *


def process_chunk(chunk):
    """Return a Counter of the words in *chunk* worth counting.

    Filters out stop words (from get_stopwords()) and any word shorter
    than 3 characters.
    """
    stop_words = get_stopwords()
    words = [w for w in chunk if w not in stop_words and len(w) >= 3]
    return Counter(words)


def merge_counts(counts_list):
    """Merge an iterable of Counter objects into a single total Counter."""
    # Counter supports '+', so a sum with an empty Counter as the start
    # value folds the whole list in one expression.
    return sum(counts_list, Counter())


def thread_function(chunk, counts_list):
    """Count one chunk and append the resulting Counter to *counts_list*.

    NOTE(review): unused by main() (which uses pool.map instead); kept
    for backward compatibility with any external callers.
    """
    counts_list.append(process_chunk(chunk))


@timing_decorator
def main():
    """Read the test file, count word frequencies concurrently, print top 10."""
    content = re_split(read_file(testfilepath))

    chunk_size = 1000  # tune to the input size if needed
    chunks = [content[i:i + chunk_size]
              for i in range(0, len(content), chunk_size)]

    # One worker per chunk (at least 1, so empty input doesn't raise).
    # The context manager guarantees close()/join() even if a worker
    # raises, unlike the manual close/join sequence.
    with ThreadPool(max(1, len(chunks))) as pool:
        counts_list = pool.map(process_chunk, chunks)

    total_counts = merge_counts(counts_list)

    # Print the n most frequent words.
    print_word_freqs(total_counts.most_common(10))


if __name__ == '__main__':
    main()