# -*- coding: utf-8 -*-
"""Map-reduce style word-frequency count over a text file.

The word list is cut into fixed-size chunks, each chunk is mapped to a
``Counter`` of its qualifying words, and the per-chunk counters are then
reduced into a single total.
"""
from collections import Counter
from cppy.cp_util import *
from functools import partial, reduce


# map - reduce
def process_chunk(chunk, stop_words=None):
    """Count the qualifying words of one chunk (the "map" step).

    A word qualifies when it is not a stop word and is at least three
    characters long.

    Args:
        chunk: Iterable of words to count.
        stop_words: Optional container of words to ignore. When omitted,
            the stop words are loaded with ``get_stopwords()``; callers
            that process many chunks can pass them in to avoid reloading
            them for every chunk.

    Returns:
        A ``Counter`` mapping each qualifying word to its number of
        occurrences in ``chunk``.
    """
    if stop_words is None:
        stop_words = get_stopwords()
    # Filter out stop words and very short words.
    words = [w for w in chunk if w not in stop_words and len(w) >= 3]
    return Counter(words)


def merge_counts(count1, count2):
    """Combine two word counters into a new one (the "reduce" step).

    Args:
        count1: First ``Counter``.
        count2: Second ``Counter``.

    Returns:
        A new ``Counter`` holding the summed counts of both arguments.
    """
    return count1 + count2


@timing_decorator
def main():
    """Count the words of the test file and print the ten most frequent."""
    # Read the file and split it into words.
    # NOTE(review): assumes re_split returns a sliceable sequence of words.
    content = re_split(read_file(testfilepath))

    # Partition the words into chunks that can be processed independently.
    chunk_size = 1000  # Adjust to the workload as needed.
    chunks = [
        content[i:i + chunk_size]
        for i in range(0, len(content), chunk_size)
    ]

    # Load the stop words once instead of once per chunk.
    stop_words = get_stopwords()

    # Map: count the words of every chunk.
    counts_list = list(
        map(partial(process_chunk, stop_words=stop_words), chunks)
    )

    # Reduce: merge the per-chunk counts. The empty Counter initializer
    # keeps reduce() from raising TypeError when there are no chunks.
    total_counts = reduce(merge_counts, counts_list, Counter())

    # Print the n most frequent words.
    print_word_freqs(total_counts.most_common(10))


if __name__ == '__main__':
    main()