from cppy.cp_util import *
from collections import Counter

# Value passed to get_chunks as the chunk size.
# NOTE(review): presumably the number of words per chunk (an earlier comment
# said 1000 while the code used 2000) -- confirm against cp_util.get_chunks.
CHUNK_SIZE = 2000

stop_words = get_stopwords()


def process_chunk(chunk):
    """Count the words of one chunk that survive filtering.

    A word is kept when it is not in the module-level ``stop_words`` and
    is at least three characters long.

    Args:
        chunk: Iterable of words.

    Returns:
        A ``Counter`` mapping each kept word to its number of occurrences
        in ``chunk``.
    """
    # Filter out stop words and very short words.
    words = [w for w in chunk if (w not in stop_words) and len(w) >= 3]
    return Counter(words)


def process_chunks(chunks, word_freqs, x, max):
    """Recursively merge the word counts of ``chunks[x:max]`` into ``word_freqs``.

    The recursive call is made *before* the current chunk is merged, so
    chunks are folded in from the last one back to ``chunks[x]``.  That
    order decides how ``Counter.most_common`` ranks words with equal
    counts (first-encountered wins); merging before recursing would give
    front-to-back order instead.

    The recursion goes one frame deep per chunk, so very long chunk lists
    can exceed the interpreter's recursion limit.

    Args:
        chunks: Indexable sequence of word chunks.
        word_freqs: ``Counter`` that is updated in place with the totals.
        x: Index of the first chunk to process.
        max: Exclusive upper bound on chunk indices (the name shadows the
            builtin but is kept because it is part of the signature).

    Returns:
        None. The result is accumulated in ``word_freqs``.
    """
    # Empty or exhausted range: nothing to merge.  Without this guard an
    # empty chunk list raised IndexError on chunks[0].
    if x >= max:
        return

    following = x + 1
    if following < max:
        process_chunks(chunks, word_freqs, following, max)

    # Counter's in-place addition mutates the caller's object, which is how
    # the totals reach the caller despite the rebinding of the local name.
    word_freqs += process_chunk(chunks[x])


# Load the input and split it into chunks of CHUNK_SIZE.
chunks = get_chunks(testfilepath, CHUNK_SIZE)
word_freqs = Counter()
process_chunks(chunks, word_freqs, 0, len(chunks))

print_word_freqs(word_freqs.most_common(10))