diff --git a/基本结构/021 函数/A02.py b/基本结构/021 函数/A02.py
index 3be975a..aa37ebe 100644
--- a/基本结构/021 函数/A02.py
+++ b/基本结构/021 函数/A02.py
@@ -2,8 +2,10 @@ import string
 from collections import Counter
 from cppy.cp_util import *
 
+################################
 # data
-data = []
+################################
+data = ''
 words = []
 word_freqs = []
 
@@ -13,17 +15,12 @@ word_freqs = []
 def read_file(path_to_file):
     global data
     with open(path_to_file,encoding='utf-8') as f:
-        data = data + list(f.read())
+        data = f.read()
 
-def filter_chars_and_normalize():
+def extractwords():
     global data
-    global words
-    for i in range(len(data)):
-        data[i] = ' ' if not data[i].isalnum() else data[i].lower()
-
-    data_str = ''.join(data)
-    words = words + data_str.split()
-
+    global words
+    words = data.lower().split()
     with open(stopwordfilepath) as f:
         stop_words = set(f.read().split(','))
         stop_words.update(string.ascii_lowercase)
@@ -41,7 +38,7 @@ def sort():
 
 if __name__ == "__main__":
     read_file( testfilepath )
-    filter_chars_and_normalize()
+    extractwords()
     frequencies()
     sort()
 
diff --git a/基本结构/021 函数/A03.py b/基本结构/021 函数/A03.py
index 2cae2fd..8276209 100644
--- a/基本结构/021 函数/A03.py
+++ b/基本结构/021 函数/A03.py
@@ -2,31 +2,28 @@ import re
 from cppy.cp_util import *
 
 
-def filter_chars_and_normalize(str_data):
+def extractwords(str_data):
     pattern = re.compile('[\W_]+')
     word_list = pattern.sub(' ', str_data).lower().split()
     stop_words = get_stopwords()
     return [w for w in word_list if not w in stop_words]
 
-
 def frequencies(word_list):
     word_freqs = {}
     for word in word_list:
         word_freqs[word] = word_freqs.get(word, 0) + 1
     return word_freqs
 
-
 def sort(word_freq):
     return sorted( word_freq.items(), key=lambda x: x[1], reverse=True )
 
-
-def print_all(word_freqs, n = 10 ):
+def printall(word_freqs, n = 10 ):
     for word, freq in word_freqs[ :n ]:
         print(word, '-', freq)
 
 
 if __name__ == "__main__":
-    print_all(sort(frequencies(
-        filter_chars_and_normalize(
+    printall(sort(frequencies(
+        extractwords(
             read_file( testfilepath )))) )
\ No newline at end of file
diff --git a/基本结构/021 函数/A04.py b/基本结构/021 函数/A04.py
index e1ea0ec..048a004 100644
--- a/基本结构/021 函数/A04.py
+++ b/基本结构/021 函数/A04.py
@@ -1,39 +1,21 @@
-import re, operator
 from cppy.cp_util import *
 
-def print_text(word_freqs, func):
-    print_word_freqs(word_freqs)
-    func(None)
+def readfile(path_to_file, func):
+    data = read_file(path_to_file)
+    func(data, frequencies)
+
+def extractwords(str_data,func):
+    func(extract_str_words(str_data), sort)
 
 def frequencies(word_list, func):
     wf = get_frequencies(word_list)
-    func(wf, print_text)
-
-def scan(str_data, func):
-    func(str_data.split(), frequencies)
-
-def filter_chars(str_data, func):
-    pattern = re.compile('[\W_]+')
-    func(pattern.sub(' ', str_data), scan)
-
-def remove_stop_words(word_list, func):
-    stop_words = get_stopwords()
-    func([w for w in word_list if not w in stop_words], sort)
+    func(wf, printall)
 
 def sort(wf, func):
-    func(sorted(wf.items(), key=operator.itemgetter(1), reverse=True), no_op)
-
-def no_op(func):
-    return
-
-def normalize(str_data, func):
-    func(str_data.lower(), remove_stop_words)
-
-def read_file(path_to_file, func):
-    with open(path_to_file,encoding='utf-8') as f:
-        data = f.read()
-    func(data, normalize)
+    func(sort_dict(wf), None)
+def printall(word_freqs, func):
+    print_word_freqs(word_freqs)
 
 
 if __name__ == "__main__":
-    read_file(testfilepath, filter_chars)
\ No newline at end of file
+    readfile(testfilepath, extractwords)
\ No newline at end of file
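The refactored A04.py above threads the word-count pipeline through continuation-passing style: each step receives the next step as func and hands its result forward instead of returning it (readfile -> extractwords -> frequencies -> sort -> printall). The sketch below is a minimal, self-contained illustration of that chain; the bodies of read_file, extract_str_words, get_frequencies, sort_dict and print_word_freqs are stand-ins invented for this example, since the real helpers live in cppy.cp_util and are not shown in this patch.

import re
from collections import Counter

# Stand-ins for the cppy.cp_util helpers (assumed shapes, not the real code).
def read_file(path):
    with open(path, encoding='utf-8') as f:
        return f.read()

def extract_str_words(text):
    return re.sub(r'[\W_]+', ' ', text).lower().split()

def get_frequencies(words):
    return Counter(words)

def sort_dict(freqs):
    return sorted(freqs.items(), key=lambda kv: kv[1], reverse=True)

def print_word_freqs(pairs, n=10):
    for word, freq in pairs[:n]:
        print(word, '-', freq)

# The continuation-passing chain mirroring the new A04.py:
# readfile -> extractwords -> frequencies -> sort -> printall.
def readfile(path_to_file, func):
    data = read_file(path_to_file)
    func(data, frequencies)

def extractwords(str_data, func):
    func(extract_str_words(str_data), sort)

def frequencies(word_list, func):
    wf = get_frequencies(word_list)
    func(wf, printall)

def sort(wf, func):
    func(sort_dict(wf), None)

def printall(word_freqs, func):
    print_word_freqs(word_freqs)

if __name__ == "__main__":
    # A tiny sample file stands in for cp_util's testfilepath.
    with open('sample.txt', 'w', encoding='utf-8') as f:
        f.write('the quick brown fox jumps over the lazy dog the fox')
    readfile('sample.txt', extractwords)

Note that each step hard-codes its successor's successor (extractwords names sort, frequencies names printall), so the chain is fixed when the functions are defined rather than composed at the call site.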
diff --git a/基本结构/状态机/81A.py b/基本结构/状态机/81A.py
index c7c726b..581bf96 100644
--- a/基本结构/状态机/81A.py
+++ b/基本结构/状态机/81A.py
@@ -56,5 +56,4 @@ state_machine = WordFrequencyStateMachine(util.testfilepath)
 word_frequencies = state_machine.run()
 
 # 打印结果
-for word, freq in word_frequencies.most_common(10):
-    print(f"{word}: {freq}")
\ No newline at end of file
+util.print_word_freqs(word_frequencies.most_common(10))
\ No newline at end of file
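The 81A.py change swaps the hand-written print loop for the shared util.print_word_freqs helper, which only needs the list of (word, count) pairs that Counter.most_common(10) already produces. A minimal helper compatible with that call site might look like the sketch below; the actual cppy.cp_util implementation is not shown in this patch and may format its output differently.

from collections import Counter

def print_word_freqs(word_freqs):
    # word_freqs is a list of (word, count) pairs, e.g. Counter.most_common(n).
    for word, freq in word_freqs:
        print(word, '-', freq)

word_frequencies = Counter('the quick brown fox jumps over the lazy dog the fox'.split())
print_word_freqs(word_frequencies.most_common(10))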