forked from p46318075/CodePattern
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
27 lines
847 B
27 lines
847 B
9 months ago
|
from functools import reduce
|
||
|
from cppy.cp_util import *
|
||
|
from collections import Counter
|
||
|
|
||
|
def partition(data_str, nlines):
    """Split *data_str* into chunks of at most *nlines* lines each.

    The text is split on newline characters; every run of ``nlines``
    consecutive lines is re-joined with a newline and yielded as one string.
    The last chunk may contain fewer lines. An empty input yields a single
    empty chunk.

    Args:
        data_str: Text to split.
        nlines: Maximum number of lines per chunk; must be at least 1.

    Yields:
        Successive chunks of the text, in their original order.

    Raises:
        ValueError: If ``nlines`` is smaller than 1. Because this is a
            generator, the error is raised when iteration starts.
    """
    # A negative step would make range() empty and silently drop all input,
    # and a zero step fails with an unrelated-looking range() error, so
    # reject both up front with a clear message.
    if nlines < 1:
        raise ValueError(f"nlines must be a positive integer, got {nlines!r}")

    lines = data_str.split('\n')
    for start in range(0, len(lines), nlines):
        yield '\n'.join(lines[start:start + nlines])
|
||
|
|
||
|
def split_words(data_str):
    """Count the words found in one chunk of text.

    Args:
        data_str: The text chunk to process.

    Returns:
        A ``Counter`` built from whatever ``extract_str_words`` returns for
        ``data_str``.
    """
    # NOTE(review): extract_str_words presumably comes from the star import
    # of cppy.cp_util; its exact tokenisation rules are not visible here.
    return Counter(extract_str_words(data_str))
|
||
|
|
||
|
def count_words(pairs_list_1, pairs_list_2):
    """Merge two partial word-count results with the ``+`` operator.

    In this script both arguments are ``Counter`` objects, so the result is a
    new ``Counter`` holding the summed counts; neither argument is modified.

    Args:
        pairs_list_1: Accumulated result so far.
        pairs_list_2: Result of the next partition.

    Returns:
        ``pairs_list_1 + pairs_list_2``.
    """
    merged = pairs_list_1 + pairs_list_2
    return merged
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
    data = read_file(testfilepath)

    # Map step: apply split_words to every 200-line partition of the text.
    # The map object is lazy, so partitions are produced and counted one at
    # a time as the reduce step below consumes them.
    splits = map(split_words, partition(data, 200))

    # Reduce step: fold the per-partition counters into one overall tally.
    # reduce() accepts any iterable, so no intermediate list is needed; the
    # empty Counter() seed keeps the result well-defined for empty input.
    word_freqs = sort_dict(reduce(count_words, splits, Counter()))
    print_word_freqs(word_freqs)
|