parent
fb95636bb1
commit
e993c23ed1
@ -1,37 +0,0 @@
|
|||||||
from functools import reduce
|
|
||||||
from cppy.cp_util import *
|
|
||||||
|
|
||||||
#################################################
|
|
||||||
# Functions for map reduce
|
|
||||||
#################################################
|
|
||||||
def partition(data_str, nlines):
    """Lazily yield *data_str* in chunks of at most *nlines* lines each.

    The input is split on '\n'; every yielded chunk is a run of
    consecutive lines re-joined with '\n'. The last chunk may hold
    fewer than *nlines* lines.
    """
    all_lines = data_str.split('\n')
    total = len(all_lines)
    for start in range(0, total, nlines):
        stop = start + nlines
        chunk = all_lines[start:stop]
        yield '\n'.join(chunk)
|
|
||||||
|
|
||||||
def split_words(data_str):
    """Map step: turn a chunk of text into a list of (word, 1) pairs.

    The words come from ``extract_str_words`` (provided by cppy.cp_util);
    whatever normalisation or filtering that helper applies is inherited
    here -- confirm against its definition.
    """
    return [(word, 1) for word in extract_str_words(data_str)]
|
|
||||||
|
|
||||||
def regroup(pairs_list):
    """Group pairs from all splits by their first element.

    Args:
        pairs_list: an iterable of iterables of pairs; each pair is
            indexed as ``pair[0]`` (the grouping key).

    Returns:
        A dict mapping each key to the list of complete pairs that
        carried it, in the order they were encountered.
    """
    mapping = {}
    for pairs in pairs_list:
        for pair in pairs:
            # Append in place: rebuilding the list with `+ [pair]` copies
            # the whole group on every pair, which is quadratic per key.
            mapping.setdefault(pair[0], []).append(pair)
    return mapping
|
|
||||||
|
|
||||||
def count_words(mapping):
    """Reduce step: collapse one (key, pairs) entry into (key, total).

    *mapping* is indexed as ``mapping[0]`` (the key) and ``mapping[1]``
    (an iterable of pairs); the total is the fold, with ``+``, of the
    second element of every pair.
    """
    key = mapping[0]
    counts = (entry[1] for entry in mapping[1])
    total = reduce(lambda running, count: running + count, counts)
    return (key, total)
|
|
||||||
|
|
||||||
def sort(word_freq):
    """Return the items of *word_freq* as a new list, ordered by the
    element at index 1 of each item, largest first.

    Items with equal values keep their original relative order.
    """
    ordered = list(word_freq)
    ordered.sort(key=lambda item: item[1], reverse=True)
    return ordered
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Load the input text (read_file / testfilepath come from cppy.cp_util).
    text = read_file(testfilepath)

    # Map: cut the text into 200-line chunks and turn each chunk into
    # a list of (word, 1) pairs.
    chunks = partition(text, 200)
    mapped = map(split_words, chunks)

    # Shuffle: gather all pairs belonging to the same word.
    grouped = regroup(mapped)

    # Reduce: sum the pairs per word, then order by descending count.
    counted = map(count_words, grouped.items())
    ranked = sort(counted)

    print_word_freqs(ranked)
|
|
Loading…
Reference in new issue