|
|
@ -6,14 +6,12 @@ from functools import reduce
|
|
|
|
stop_words = get_stopwords()
|
|
|
|
stop_words = get_stopwords()
|
|
|
|
|
|
|
|
|
|
|
|
# map - reduce
|
|
|
|
# map - reduce
|
|
|
|
def process_chunk(chunk):
|
|
|
|
def process_chunk(chunk): # 过滤停用词
|
|
|
|
# 过滤停用词
|
|
|
|
|
|
|
|
words = [ w for w in chunk if ( not w in stop_words ) and len(w) >= 3 ]
|
|
|
|
words = [ w for w in chunk if ( not w in stop_words ) and len(w) >= 3 ]
|
|
|
|
return Counter(words)
|
|
|
|
return Counter(words)
|
|
|
|
|
|
|
|
|
|
|
|
def merge_counts(count1,count2):
|
|
|
|
def merge_counts(count1,count2):
|
|
|
|
sum_counts = count1 + count2
|
|
|
|
return count1 + count2
|
|
|
|
return sum_counts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@timing_decorator
|
|
|
|
@timing_decorator
|
|
|
|