forked from p46318075/CodePattern
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
39 lines
966 B
39 lines
966 B
9 months ago
|
import re, operator
|
||
|
from cppy.cp_util import *
|
||
|
|
||
|
def print_text(word_freqs, func):
    """Print the word frequencies, then hand control to the continuation.

    Args:
        word_freqs: Frequency data passed straight to ``print_word_freqs``
            (a helper that is not defined in this file; presumably it comes
            from the ``cppy.cp_util`` star import).
        func: Continuation invoked with ``None`` after printing.
    """
    print_word_freqs(word_freqs)
    continuation = func
    continuation(None)
|
||
|
|
||
|
def frequencies(word_list, func):
    """Compute word frequencies and forward them to the continuation.

    Args:
        word_list: Words handed to ``get_frequencies`` (a helper not defined
            in this file; presumably from ``cppy.cp_util``).
        func: Continuation called with the frequency result and
            ``print_text`` as the step that follows it.
    """
    counts = get_frequencies(word_list)
    func(counts, print_text)
|
||
|
|
||
|
def scan(str_data, func):
    """Split the text on whitespace and forward the words.

    Args:
        str_data: Text to split with ``str.split()``.
        func: Continuation called with the list of words and
            ``frequencies`` as the step that follows it.
    """
    words = str_data.split()
    func(words, frequencies)
|
||
|
|
||
|
def filter_chars(str_data, func):
    """Replace runs of non-word characters and underscores with one space.

    Args:
        str_data: Text to clean.
        func: Continuation called with the cleaned text and ``scan`` as the
            step that follows it.
    """
    # Raw string: in a plain literal '\W' is an invalid escape sequence,
    # which triggers a DeprecationWarning (a SyntaxWarning from Python 3.12).
    # The compiled pattern is the same as before.
    pattern = re.compile(r'[\W_]+')
    cleaned = pattern.sub(' ', str_data)
    func(cleaned, scan)
|
||
|
|
||
|
def remove_stop_words(word_list, func):
    """Drop stop words from the list and forward what remains.

    Args:
        word_list: Iterable of words to filter.
        func: Continuation called with the filtered list and ``sort`` as the
            step that follows it.
    """
    # get_stopwords is not defined in this file; presumably it comes from
    # the cppy.cp_util star import and returns a container of words.
    stop_words = get_stopwords()
    kept = [word for word in word_list if word not in stop_words]
    func(kept, sort)
|
||
|
|
||
|
def sort(wf, func):
    """Order the (word, count) pairs by count, highest first, and forward them.

    Args:
        wf: Mapping whose ``items()`` yields pairs; the second element of
            each pair is the sort key.
        func: Continuation called with the sorted list of pairs and
            ``no_op`` as the step that follows it.
    """
    pairs = list(wf.items())
    # Stable descending sort on the second element of each pair.
    pairs.sort(key=operator.itemgetter(1), reverse=True)
    func(pairs, no_op)
|
||
|
|
||
|
def no_op(func):
    """End the continuation chain: accept one argument and do nothing.

    Args:
        func: Ignored.

    Returns:
        None.
    """
    return None
|
||
|
|
||
|
def normalize(str_data, func):
    """Lower-case the text and forward it.

    Args:
        str_data: Text to convert with ``str.lower()``.
        func: Continuation called with the lower-cased text and
            ``remove_stop_words`` as the step that follows it.
    """
    lowered = str_data.lower()
    func(lowered, remove_stop_words)
|
||
|
|
||
|
def read_file(path_to_file, func):
    """Read a UTF-8 text file in full and forward its contents.

    Args:
        path_to_file: Path of the file to open for reading.
        func: Continuation called with the file's text and ``normalize`` as
            the step that follows it.
    """
    with open(path_to_file, encoding='utf-8') as source:
        text = source.read()
    func(text, normalize)
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Start the continuation chain: read the file, then pass its text to
    # filter_chars. testfilepath is not defined in this file; presumably it
    # is supplied by the cppy.cp_util star import.
    read_file(testfilepath, filter_chars)
|