from cppy.cp_util import *
from collections import Counter

stop_words = get_stopwords()


def process_chunk(chunk):
    # Filter out stop words and words shorter than three characters,
    # then count what remains.
    words = [w for w in chunk if w not in stop_words and len(w) >= 3]
    return Counter(words)


def process_chunks(chunks, word_freqs, x, max_index):
    # Recurse to the end of the chunk list first, then fold each chunk's
    # counts into word_freqs on the way back out. Counter.__iadd__ updates
    # the Counter in place, so the caller's word_freqs accumulates the totals.
    next_index = x + 1
    if next_index < max_index:
        process_chunks(chunks, word_freqs, next_index, max_index)
    word_freqs += process_chunk(chunks[x])


# Equivalent variant that counts chunk x before recursing, so the chunks
# are processed in forward order rather than in reverse:
# def process_chunks(chunks, word_freqs, x, max_index):
#     word_freqs += process_chunk(chunks[x])
#     next_index = x + 1
#     if next_index < max_index:
#         process_chunks(chunks, word_freqs, next_index, max_index)


# Read the data and split it into chunks of 2,000 words each.
chunks = get_chunks(testfilepath, 2000)
word_freqs = Counter()
process_chunks(chunks, word_freqs, 0, len(chunks))
print_word_freqs(word_freqs.most_common(10))
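
# A practical caveat with this recursive style: CPython's default recursion
# limit is about 1000 frames, so an input long enough to produce on the order
# of a thousand chunks would make process_chunks raise RecursionError. A
# minimal sketch of one workaround, placed before the recursive call -- the
# head-room of 50 extra frames is an illustrative assumption, not part of the
# original exercise:
#
#     import sys
#     sys.setrecursionlimit(len(chunks) + 50)
#
# The same fold can also be written iteratively when the input size is
# unbounded, at the cost of abandoning the recursive style:
#
#     word_freqs = Counter()
#     for chunk in chunks:
#         word_freqs += process_chunk(chunk)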