You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
|
from cppy.cp_util import *
|
|
|
|
|
from collections import Counter
|
|
|
|
|
|
|
|
|
|
# Stop-word collection, loaded once at import time; process_chunk() tests
# each word for membership in it.
# NOTE(review): get_stopwords is presumably supplied by the cppy.cp_util
# star import, and its return type is not visible here — confirm.
stop_words = get_stopwords()
|
|
|
|
|
|
|
|
|
|
def process_chunk(chunk):
    """Count the words of one chunk that pass the filter.

    A word is kept when it is not in the module-level ``stop_words``
    collection and is at least three characters long.

    Args:
        chunk: Iterable of words.

    Returns:
        A ``Counter`` mapping each kept word to the number of times it
        occurs in ``chunk``.
    """
    tally = Counter()
    for word in chunk:
        # Filter out stop words and anything shorter than three characters.
        if word in stop_words or len(word) < 3:
            continue
        tally[word] += 1
    return tally
|
|
|
|
|
|
|
|
|
|
def process_chunks(chunks, word_freqs, x, max):
    """Recursively add the word counts of ``chunks[x:max]`` to ``word_freqs``.

    The recursion descends to the last chunk first, so chunks are counted
    in reverse order (index ``max - 1`` down to ``x``) while the call stack
    unwinds.

    Args:
        chunks: Indexable sequence of chunks, each accepted by
            ``process_chunk``.
        word_freqs: ``Counter`` that is updated in place with the
            accumulated counts.
        x: Index of the first chunk to count.
        max: One past the index of the last chunk to count (normally
            ``len(chunks)``). The name shadows the builtin but is kept
            because it is part of the call signature.

    Returns:
        None. The result is delivered through ``word_freqs``.

    Note:
        One stack frame is used per chunk, so a very large number of
        chunks can exceed Python's recursion limit.
    """
    # Base case: an empty range (for example, no chunks at all) has nothing
    # to count. Without this guard, chunks[x] below raised IndexError.
    if x >= max:
        return

    following = x + 1  # named to avoid shadowing the builtin ``next``
    if following < max:
        process_chunks(chunks, word_freqs, following, max)

    # process_chunk already returns a Counter, so it is merged directly;
    # ``+=`` on a Counter mutates the caller's object in place.
    word_freqs += process_chunk(chunks[x])
|
|
|
|
|
|
|
|
|
|
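# Disabled alternative ordering of the same recursion: count the current
# chunk first, then recurse on the next one.
# def process_chunks( chunks,word_freqs,x,max ):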
|
|
|
|
|
# word_list = process_chunk(chunks[x])
|
|
|
|
|
# word_freqs += Counter(word_list)
|
|
|
|
|
# next = x + 1
|
|
|
|
|
# if next < max:
|
|
|
|
|
# process_chunks(chunks,word_freqs,next,max)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Read the input and split it into chunks. The original note here said
# 1000 words per chunk, but the size actually passed is 2000.
chunks = get_chunks(testfilepath, 2000)

# Accumulate the counts of every chunk into a single Counter, then report
# the ten most frequent words.
word_freqs = Counter()
process_chunks(chunks, word_freqs, 0, len(chunks))
print_word_freqs(word_freqs.most_common(10))
|