from cppy.cp_util import *
from collections import Counter

stop_words = get_stopwords()


def process_chunk(chunk):
    # Filter out stop words and words shorter than three characters,
    # then count what remains.
    words = [w for w in chunk if w not in stop_words and len(w) >= 3]
    return Counter(words)


def process_chunks(chunks, word_freqs, x, max_index):
    # Recurse to the end of the chunk list first, then fold each chunk's
    # counts into word_freqs on the way back out. Counter.__iadd__ updates
    # the Counter in place, so the caller's word_freqs accumulates the totals.
    next_index = x + 1
    if next_index < max_index:
        process_chunks(chunks, word_freqs, next_index, max_index)
    word_freqs += process_chunk(chunks[x])


# Equivalent variant that counts chunk x before recursing, so the chunks
# are processed in forward order rather than in reverse:
# def process_chunks(chunks, word_freqs, x, max_index):
#     word_freqs += process_chunk(chunks[x])
#     next_index = x + 1
#     if next_index < max_index:
#         process_chunks(chunks, word_freqs, next_index, max_index)


# Read the data and split it into chunks of 2,000 words each.
chunks = get_chunks(testfilepath, 2000)
word_freqs = Counter()
process_chunks(chunks, word_freqs, 0, len(chunks))
print_word_freqs(word_freqs.most_common(10))
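
# A practical caveat with this recursive style: CPython's default recursion
# limit is about 1000 frames, so an input long enough to produce on the order
# of a thousand chunks would make process_chunks raise RecursionError. A
# minimal sketch of one workaround, placed before the recursive call -- the
# head-room of 50 extra frames is an illustrative assumption, not part of the
# original exercise:
#
#     import sys
#     sys.setrecursionlimit(len(chunks) + 50)
#
# The same fold can also be written iteratively when the input size is
# unbounded, at the cost of abandoning the recursive style:
#
#     word_freqs = Counter()
#     for chunk in chunks:
#         word_freqs += process_chunk(chunk)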