You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

34 lines
911 B

8 months ago
from cppy.cp_util import *
from collections import Counter
# Stop-word collection consulted by process_chunk; loaded once at import time.
stop_words = get_stopwords()


def process_chunk(chunk):
    """Count the words of one chunk, skipping stop words and short words.

    Args:
        chunk: Iterable of words.

    Returns:
        A Counter mapping each kept word to its number of occurrences
        in ``chunk``.
    """
    # Keep a word only if it is not a stop word and has at least 3 characters.
    return Counter(
        w for w in chunk if w not in stop_words and len(w) >= 3
    )
def process_chunks(chunks, word_freqs, x, max):
    """Merge the word counts of ``chunks[x:max]`` into ``word_freqs``.

    Args:
        chunks: Indexable sequence of chunks, each one suitable as the
            argument of ``process_chunk``.
        word_freqs: Counter that is updated in place with the merged counts.
        x: Index of the first chunk to process.
        max: Exclusive upper bound on the chunk indices. The name shadows
            the builtin but is kept so existing callers keep working.

    Returns:
        None. The result is delivered through ``word_freqs``.

    An empty range (``x >= max``, e.g. an empty ``chunks``) is a no-op.
    """
    # Implemented as a loop rather than recursion so the number of chunks is
    # not limited by the interpreter's recursion depth.
    # The chunks are merged last-to-first: the earlier recursive version
    # recursed before counting, and keeping that order keeps the insertion
    # order of the Counter (and therefore tie-breaking in most_common) the same.
    for index in reversed(range(x, max)):
        # process_chunk already returns a Counter, so it can be added directly;
        # Counter's += mutates the caller's object in place.
        word_freqs += process_chunk(chunks[index])
# Alternative kept for reference: count the current chunk first, then recurse.
# def process_chunks(chunks, word_freqs, x, max):
#     word_list = process_chunk(chunks[x])
#     word_freqs += Counter(word_list)
#     next = x + 1
#     if next < max:
#         process_chunks(chunks, word_freqs, next, max)
# Read the data and split it into chunks of 2000 words each.
chunks = get_chunks(testfilepath, 2000)

# Accumulate the per-chunk counts into one Counter, then report the
# 10 most frequent words.
word_freqs = Counter()
process_chunks(chunks, word_freqs, 0, len(chunks))
print_word_freqs(word_freqs.most_common(10))