You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
43 lines
1.2 KiB
43 lines
1.2 KiB
import re, operator, string
|
|
from cppy.cp_util import *
|
|
|
|
#
|
|
# The functions
|
|
#
|
|
def extract_words(path_to_file):
    """Read a text file and return its words in lower case.

    The file is opened as UTF-8 text, lower-cased as a whole and split
    into maximal runs of regex word characters (letters, digits and
    underscore).

    Args:
        path_to_file: Path of the text file to read.

    Returns:
        The lower-cased words in order of appearance. If the file cannot
        be opened or read (``IOError``), the error is printed and an
        empty list is returned.
    """
    try:
        with open(path_to_file, 'r', encoding='utf-8') as f:
            str_data = f.read()
    except IOError as e:
        print(f"I/O error({e.errno}) when opening {path_to_file}: {e.strerror}")
        return []

    # Raw string: a plain '\w' is an invalid escape sequence and triggers
    # a warning on current Python versions.
    return re.findall(r'\w+', str_data.lower())
|
|
|
|
|
|
def remove_stop_words(word_list):
    """Return the words of ``word_list`` that are not stop words.

    The stop-word collection comes from ``get_stopwords()`` and is
    extended with every single lowercase ASCII letter before filtering.

    Args:
        word_list: Iterable of words to filter.

    Returns:
        A new list containing the words that are not in the stop-word
        set. If ``get_stopwords()`` raises ``IOError``, the error is
        printed and ``word_list`` is returned unfiltered.
    """
    try:
        excluded = set(get_stopwords())
    except IOError as err:
        print(f"I/O error({err.errno}) when opening stops_words.txt: {err.strerror}")
        return word_list

    # Single letters count as stop words as well.
    excluded.update(string.ascii_lowercase)

    kept = []
    for word in word_list:
        if word in excluded:
            continue
        kept.append(word)
    return kept
|
|
|
|
|
|
def frequencies(word_list):
    """Compute word frequencies for a list of words.

    Args:
        word_list: List of words to count. List subclasses are accepted
            as well.

    Returns:
        An empty dict when ``word_list`` is not a list or is empty;
        otherwise whatever ``get_frequencies(word_list)`` returns
        (presumably a word -> count mapping; confirm against cp_util).
    """
    # isinstance instead of an exact type() comparison, so list subclasses
    # are counted rather than silently yielding an empty result.
    if not isinstance(word_list, list) or not word_list:
        return {}

    return get_frequencies(word_list)
|
|
|
|
def sort(word_freq):
    """Return the items of a frequency mapping sorted by descending count.

    Args:
        word_freq: Dict (or dict subclass such as ``collections.Counter``)
            mapping words to their counts.

    Returns:
        A list of ``(word, count)`` tuples ordered from highest to lowest
        count; entries with equal counts keep the mapping's iteration
        order. An empty list is returned when ``word_freq`` is not a dict
        or is empty.
    """
    # isinstance instead of an exact type() comparison: dict subclasses
    # (Counter, defaultdict, OrderedDict) must be sorted, not discarded.
    if not isinstance(word_freq, dict) or not word_freq:
        return []

    return sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)
|
|
|
|
|
|
if __name__ == '__main__':
    # Run the pipeline stage by stage: read the words, drop stop words,
    # count, sort by count, then print the result.
    words = extract_words(testfilepath)
    filtered_words = remove_stop_words(words)
    counts = frequencies(filtered_words)
    word_freqs = sort(counts)
    print_word_freqs(word_freqs)