You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

48 lines
1.3 KiB

import sys, re, operator, string, inspect
from cppy.cp_util import *
#
# The functions
#
def extract_words(path_to_file):
    """Read a text file and return its words in lower case.

    The file is opened as UTF-8 text, its whole content is lower-cased, and
    every maximal run of regex word characters is returned as one word.

    Args:
        path_to_file: Path of the text file to read.

    Returns:
        A list of lower-cased words in order of appearance. If the file
        cannot be opened or read (``OSError``), the error is printed and an
        empty list is returned instead.
    """
    try:
        with open(path_to_file, 'r', encoding='utf-8') as f:
            str_data = f.read()
    except OSError as e:
        # Best effort: report the problem and let the pipeline continue
        # with no words rather than crash.
        print(f"I/O error({e.errno}) when opening {path_to_file}: {e.strerror}")
        return []
    # Raw string: '\w' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.findall(r'\w+', str_data.lower())
def remove_stop_words(word_list):
    """Return the entries of ``word_list`` that are not stop words.

    The stop-word set is built from ``get_stopwords()`` plus every single
    lower-case ASCII letter. ``get_stopwords`` is not defined in this
    section; presumably it comes from the ``cppy.cp_util`` star import.

    Args:
        word_list: Iterable of words to filter.

    Returns:
        A new list without the stop words, or ``word_list`` itself,
        unfiltered, when loading the stop words raises ``IOError`` (the
        error is printed).
    """
    try:
        excluded = set(get_stopwords())
    except IOError as e:
        print(f"I/O error({e.errno}) when opening stops_words.txt: {e.strerror}")
        return word_list
    else:
        # Single letters are treated as stop words as well.
        excluded |= set(string.ascii_lowercase)
    return [word for word in word_list if word not in excluded]
def frequencies(word_list):
    """Count how many times each word occurs in ``word_list``.

    Args:
        word_list: The words to count. Only an object whose type is exactly
            ``list`` is counted.

    Returns:
        A dict mapping each distinct word to its number of occurrences, with
        keys in order of first appearance. An empty dict is returned for an
        empty list or for any argument that is not exactly a ``list``.
    """
    # Defensive guard: anything other than a plain list yields "no counts".
    if type(word_list) is not list:
        return {}
    counts = {}
    for word in word_list:
        counts[word] = counts.get(word, 0) + 1
    return counts
def sort(word_freq):
    """Order word/count pairs from the highest count to the lowest.

    Args:
        word_freq: Mapping of word to count. Only an object whose type is
            exactly ``dict`` is processed.

    Returns:
        A list of ``(word, count)`` tuples sorted by count in descending
        order; pairs with equal counts keep their dict order. An empty list
        is returned for an empty dict or for any argument that is not
        exactly a ``dict``.
    """
    # Defensive guard: anything other than a plain dict yields "no pairs".
    if type(word_freq) is not dict:
        return []
    if not word_freq:
        return []
    pairs = list(word_freq.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs
if __name__ == '__main__':
    # Run the pipeline one stage at a time: read words, drop stop words,
    # count occurrences, order by count, then print.
    # testfilepath and print_word_freqs are not defined in this section;
    # presumably they come from the cppy.cp_util star import.
    words = extract_words(testfilepath)
    filtered_words = remove_stop_words(words)
    counts = frequencies(filtered_words)
    word_freqs = sort(counts)
    print_word_freqs(word_freqs)