You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

37 lines
1.1 KiB

9 months ago
from functools import reduce
from cppy.cp_util import *
#################################################
# Functions for map reduce
#################################################
def partition(data_str, nlines):
    """Lazily cut *data_str* into chunks of at most *nlines* lines each.

    The text is split on '\n'; every yielded chunk is a run of consecutive
    lines re-joined with '\n'. The final chunk may hold fewer than *nlines*
    lines. An empty input string yields a single empty chunk.
    """
    all_lines = data_str.split('\n')
    chunk_starts = range(0, len(all_lines), nlines)
    yield from (
        '\n'.join(all_lines[start:start + nlines])
        for start in chunk_starts
    )
def split_words(data_str):
    """Map step: emit a ``(word, 1)`` pair for every word in *data_str*.

    Word extraction is delegated to ``extract_str_words`` (not defined in
    this file; presumably provided by the ``cppy.cp_util`` star import).
    Returns a list with one pair per extracted word, in extraction order.
    """
    return [(word, 1) for word in extract_str_words(data_str)]
def regroup(pairs_list):
    """Shuffle step: group ``(word, count)`` pairs by their word.

    Args:
        pairs_list: an iterable of iterables of pairs; each pair's first
            element is used as the grouping key.

    Returns:
        A dict mapping each key to the list of pairs that carry it, in the
        order in which they were encountered.
    """
    mapping = {}
    for pairs in pairs_list:
        for pair in pairs:
            # Append in place; rebuilding the list with `+` on every pair
            # made grouping quadratic for frequently occurring words.
            mapping.setdefault(pair[0], []).append(pair)
    return mapping
def count_words(mapping):
    """Reduce step: collapse one ``(word, pairs)`` entry to ``(word, total)``.

    *mapping* is a two-element sequence: the word, followed by the list of
    ``(word, count)`` pairs collected for it. The counts are folded together
    with ``+``.
    """
    word = mapping[0]
    counts = (entry[1] for entry in mapping[1])
    total = reduce(lambda acc, n: acc + n, counts)
    return (word, total)
def sort(word_freq):
    """Return the ``(word, count)`` pairs ordered by count, highest first.

    Args:
        word_freq: any iterable of pairs whose second element is the count.

    Returns:
        A new list; pairs with equal counts keep their input order.
    """
    # Imported here because this file never imports `operator` itself; the
    # name was only reachable if the star import above happened to leak it.
    from operator import itemgetter

    return sorted(word_freq, key=itemgetter(1), reverse=True)
if __name__ == '__main__':
    # read_file, testfilepath and print_word_freqs are not defined in this
    # file; presumably they come from the cppy.cp_util star import.
    text = read_file(testfilepath)

    # Map: lazily turn each 200-line chunk into a list of (word, 1) pairs.
    chunks = partition(text, 200)
    mapped = map(split_words, chunks)

    # Shuffle: gather all pairs belonging to the same word.
    grouped = regroup(mapped)

    # Reduce: total the pairs per word, then order by descending count.
    counted = map(count_words, grouped.items())
    print_word_freqs(sort(counted))