from cppy.cp_util import *
from collections import Counter

# Stop words consulted by process_chunk; loaded once when the module is imported.
# NOTE(review): get_stopwords presumably comes from the cppy.cp_util star
# import above — confirm, along with the container type it returns.
stop_words = get_stopwords()


def process_chunk(chunk):
    """Count the words of ``chunk`` that survive the stop-word/length filter.

    A word is kept when it is not contained in the module-level
    ``stop_words`` collection and is at least three characters long.

    Args:
        chunk: Iterable of words; each word must support ``len``.

    Returns:
        A ``Counter`` mapping every kept word to its number of
        occurrences in ``chunk``.
    """
    tally = Counter()
    for word in chunk:
        # Skip stop words first, then anything shorter than three characters.
        if word in stop_words or len(word) < 3:
            continue
        tally[word] += 1
    return tally


def process_chunks(chunks, word_freqs, x, max):
    """Recursively fold the word counts of ``chunks[x:max]`` into ``word_freqs``.

    The recursion descends to the last chunk first, so chunks are counted
    from index ``max - 1`` back down to ``x``; this determines the order in
    which new words are inserted into ``word_freqs``.

    One stack frame is used per chunk, so the number of chunks that can be
    processed is bounded by the interpreter's recursion limit.

    Args:
        chunks: Indexable sequence of chunks, each accepted by
            ``process_chunk``.
        word_freqs: ``Counter`` that receives the totals. It is updated in
            place via ``+=``; nothing is returned.
        x: Index of the first chunk to process.
        max: Exclusive upper bound on the chunk index. (The name shadows
            the builtin but is kept because it is part of the signature.)
    """
    # Nothing to do for an empty range (e.g. no chunks at all); without this
    # guard chunks[x] would raise IndexError.
    if x >= max:
        return
    following = x + 1
    if following < max:
        process_chunks(chunks, word_freqs, following, max)
    # process_chunk already returns a Counter, so it can be added directly.
    word_freqs += process_chunk(chunks[x])


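# Alternative ordering, kept for comparison: count the current chunk first,
# then recurse into the next one.
# def process_chunks( chunks,word_freqs,x,max ):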
#     word_list = process_chunk(chunks[x])
#     word_freqs += Counter(word_list)
#     next  = x + 1
#     if next < max:
#         process_chunks(chunks,word_freqs,next,max)

# Read the data and split it into chunks.
# NOTE(review): the original comment said 1000 words per chunk, but the call
# passes 2000 — presumably the chunk size in words; confirm against get_chunks.
chunks = get_chunks(testfilepath, 2000)
word_freqs = Counter()
# Accumulate the counts of every chunk into word_freqs (updated in place).
process_chunks(chunks, word_freqs, 0, len(chunks))
# Hand the ten most common (word, count) pairs to print_word_freqs.
print_word_freqs(word_freqs.most_common(10))