# -*- coding: utf-8 -*-
from collections import Counter
from cppy.cp_util import *
from functools import reduce

stop_words = get_stopwords()


# map - reduce
def process_chunk(chunk):
    # Filter out stop words and words shorter than 3 characters
    words = [w for w in chunk if w not in stop_words and len(w) >= 3]
    return Counter(words)


def merge_counts(count1, count2):
    # Counter supports "+": counts for the same word are added together
    sum_counts = count1 + count2
    return sum_counts


@timing_decorator
def main():
    # Read the data and split it into chunks of 1000 words each
    chunks = get_chunks(testfilepath, 1000)

    # map: process every chunk with process_chunk to get per-chunk counts
    counts_list = list(map(process_chunk, chunks))

    # reduce: merge the per-chunk counts with merge_counts into the total word frequencies
    total_counts = reduce(merge_counts, counts_list)

    # Print the n most frequent words
    print_word_freqs(total_counts.most_common(10))


if __name__ == '__main__':
    main()
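
# A minimal sketch, not part of the original exercise: because process_chunk
# handles each chunk independently, the map step can also run in parallel.
# main_parallel is a hypothetical variant that reuses the helpers above and
# only swaps the built-in map for the standard library's multiprocessing.Pool.
from multiprocessing import Pool


@timing_decorator
def main_parallel():
    chunks = get_chunks(testfilepath, 1000)
    # Parallel map: each worker process builds a Counter for its chunks
    with Pool() as pool:
        counts_list = pool.map(process_chunk, chunks)
    # The reduce step is unchanged
    total_counts = reduce(merge_counts, counts_list)
    print_word_freqs(total_counts.most_common(10))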