import operator
from functools import reduce

from cppy.cp_util import *


#################################################
# Functions for map reduce
#################################################
def partition(data_str, nlines):
    """Yield successive chunks of ``data_str``, each holding up to ``nlines`` lines.

    The text is split on ``'\\n'`` and every chunk is re-joined with
    ``'\\n'``, so concatenating the chunks with newlines restores the input.

    Args:
        data_str: The full text to split.
        nlines: Maximum number of lines per chunk. Must be positive:
            ``0`` makes ``range`` raise ``ValueError`` and a negative value
            produces no chunks at all.

    Yields:
        One string per chunk, in input order.
    """
    lines = data_str.split('\n')
    for start in range(0, len(lines), nlines):
        yield '\n'.join(lines[start:start + nlines])


def split_words(data_str):
    """Map step: turn a chunk of text into a list of ``(word, 1)`` pairs.

    Word extraction is delegated to ``extract_str_words`` (presumably
    provided by ``cppy.cp_util`` -- confirm what normalisation and
    stop-word filtering it applies).
    """
    words = extract_str_words(data_str)
    return [(w, 1) for w in words]


def regroup(pairs_list):
    """Shuffle step: group the pairs of all splits by their word.

    Args:
        pairs_list: Iterable of iterables of ``(word, count)`` pairs, one
            inner iterable per split.

    Returns:
        A dict mapping each word to the list of all pairs seen for it, in
        encounter order.
    """
    mapping = {}
    for pairs in pairs_list:
        for pair in pairs:
            # Append in place: re-creating the list with ``+`` for every
            # pair copies it each time and is quadratic per word.
            mapping.setdefault(pair[0], []).append(pair)
    return mapping


def count_words(mapping):
    """Reduce step: collapse one ``(word, pairs)`` item into ``(word, total)``.

    Args:
        mapping: A two-element sequence ``(word, pairs)``, as produced by
            iterating ``regroup(...).items()``; ``pairs`` must be non-empty
            because ``reduce`` is called without an initial value.

    Returns:
        A tuple ``(word, total)`` where ``total`` combines the second
        element of every pair with ``+``.
    """
    return (
        mapping[0],
        reduce(operator.add, (pair[1] for pair in mapping[1])),
    )


def sort(word_freq):
    """Return the ``(word, count)`` pairs ordered by descending count.

    ``sorted`` is stable, so pairs with equal counts keep their relative
    input order.
    """
    return sorted(word_freq, key=operator.itemgetter(1), reverse=True)


if __name__ == '__main__':
    # read_file, testfilepath and print_word_freqs are expected to come from
    # the star import of cppy.cp_util above.
    data = read_file(testfilepath)
    splits = map(split_words, partition(data, 200))
    splits_per_word = regroup(splits)
    word_freqs = sort(map(count_words, splits_per_word.items()))
    print_word_freqs(word_freqs)