|
|
@ -3,18 +3,22 @@ from collections import Counter
|
|
|
|
|
|
|
|
|
|
|
|
# Stop-word collection consulted by process_chunk; built once here so the
# filter does not call get_stopwords() again for every chunk.
stop_words = get_stopwords()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_chunk(chunk):
    """Count the words of one chunk that survive filtering.

    A word is kept when it is not in the module-level ``stop_words``
    collection and is at least 3 characters long.

    Args:
        chunk: Iterable of words (anything supporting ``len`` and usable
            as a ``Counter`` key).

    Returns:
        A ``Counter`` mapping each kept word to its number of occurrences
        in ``chunk``.
    """
    # Filter out stop words and very short words; feed the generator
    # straight into Counter so no intermediate list is built.
    return Counter(
        w for w in chunk if w not in stop_words and len(w) >= 3
    )
|
|
|
|
|
|
|
|
|
|
|
|
def process_chunks(chunks, word_freqs, x, max):
    """Recursively fold the word counts of ``chunks[x:max]`` into ``word_freqs``.

    The recursive call is made before the current chunk is counted, so
    chunks are merged from index ``max - 1`` down to ``x``. One stack frame
    is used per chunk, so a very long chunk list can hit Python's recursion
    limit.

    Args:
        chunks: Sequence of chunks, indexed by integer position.
        word_freqs: ``Counter`` that accumulates the totals; it is updated
            in place and nothing is returned.
        x: Index of the chunk handled by this call.
        max: Exclusive upper bound on the chunk index. The name shadows
            the builtin but is kept so existing callers keep working.
    """
    # Nothing to do for an empty range (e.g. an empty chunk list); without
    # this guard chunks[x] below would raise IndexError.
    if x >= max:
        return

    following = x + 1
    if following < max:
        process_chunks(chunks, word_freqs, following, max)

    # process_chunk already returns a Counter, so it can be added directly.
    # Counter's in-place addition mutates the caller's object, which is how
    # the result is handed back.
    word_freqs += process_chunk(chunks[x])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# def process_chunks( chunks,word_freqs,x,max ):
|
|
|
|
# def process_chunks( chunks,word_freqs,x,max ):
|
|
|
|
# word_list = process_chunk(chunks[x])
|
|
|
|
# word_list = process_chunk(chunks[x])
|
|
|
|
# word_freqs += Counter(word_list)
|
|
|
|
# word_freqs += Counter(word_list)
|
|
|
@ -22,9 +26,8 @@ def process_chunks( chunks,word_freqs,x,max ):
|
|
|
|
# if next < max:
|
|
|
|
# if next < max:
|
|
|
|
# process_chunks(chunks,word_freqs,next,max)
|
|
|
|
# process_chunks(chunks,word_freqs,next,max)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Read the data and split it into chunks. The second argument is presumably
# the number of words per chunk; the earlier comment said 1000 while the code
# passes 2000 -- TODO confirm against get_chunks.
chunks = get_chunks(testfilepath, 2000)

# Accumulate the per-chunk counts into a single Counter, then report the
# ten most common words.
word_freqs = Counter()
process_chunks(chunks, word_freqs, 0, len(chunks))
print_word_freqs(word_freqs.most_common(10))
|
|
|
|