"""Print the most frequent non-stop-words of a text file.

The helpers form a small pipeline: normalise and filter the raw text, count
the remaining words, order them by count, and print the top entries.
"""
import re

from cppy.cp_util import *

# Runs of non-word characters and underscores are treated as word separators.
# Written as a raw string: '\W' inside a plain literal is an invalid escape
# sequence, which newer Python versions warn about.
_SEPARATORS = re.compile(r'[\W_]+')


def filter_chars_and_normalize(str_data):
    """Split text into lower-case words and drop the stop words.

    Every run of non-word characters or underscores is replaced by a single
    space, the result is lower-cased and split on whitespace, and any word
    contained in ``get_stopwords()`` is removed.

    Args:
        str_data: The raw text to process.

    Returns:
        A list of the remaining words, in their original order.
    """
    word_list = _SEPARATORS.sub(' ', str_data).lower().split()
    # Build the lookup set once so each membership test is O(1).
    # NOTE(review): assumes get_stopwords() returns an iterable of strings
    # (e.g. a list or set) -- confirm against cppy.cp_util.
    stop_words = set(get_stopwords())
    return [w for w in word_list if w not in stop_words]


def frequencies(word_list):
    """Count how often each word occurs.

    Args:
        word_list: An iterable of hashable items (words).

    Returns:
        A dict mapping each distinct word to its number of occurrences.
    """
    word_freqs = {}
    for word in word_list:
        word_freqs[word] = word_freqs.get(word, 0) + 1
    return word_freqs


def sort(word_freq):
    """Order word counts from most to least frequent.

    Args:
        word_freq: A mapping of word to count.

    Returns:
        A list of ``(word, count)`` pairs sorted by count, highest first.
        Pairs with equal counts keep the order in which the mapping yields
        them, because the sort is stable.
    """
    return sorted(word_freq.items(), key=lambda x: x[1], reverse=True)


def print_all(word_freqs, n=10):
    """Print the first ``n`` entries as ``word - count`` lines.

    Args:
        word_freqs: A sequence of ``(word, count)`` pairs; it is sliced, so
            it must support slicing (e.g. a list).
        n: Maximum number of entries to print. Defaults to 10.
    """
    for word, freq in word_freqs[:n]:
        print(word, '-', freq)


def main():
    """Run the whole pipeline on ``testfilepath`` and print the result.

    ``read_file`` and ``testfilepath`` are not defined in this file; they are
    expected to be provided by the star import from ``cppy.cp_util``.
    """
    text = read_file(testfilepath)
    words = filter_chars_and_normalize(text)
    print_all(sort(frequencies(words)))


if __name__ == "__main__":
    main()