pull/13/head
zj3D 8 months ago
parent 254c11c3c9
commit 041fced368

@ -2,8 +2,10 @@ import string
from collections import Counter from collections import Counter
from cppy.cp_util import * from cppy.cp_util import *
################################
# data # data
data = [] ################################
data = ''
words = [] words = []
word_freqs = [] word_freqs = []
@ -13,17 +15,12 @@ word_freqs = []
def read_file(path_to_file): def read_file(path_to_file):
global data global data
with open(path_to_file,encoding='utf-8') as f: with open(path_to_file,encoding='utf-8') as f:
data = data + list(f.read()) data = f.read()
def filter_chars_and_normalize(): def extractwords():
global data global data
global words global words
for i in range(len(data)): words = data.lower().split()
data[i] = ' ' if not data[i].isalnum() else data[i].lower()
data_str = ''.join(data)
words = words + data_str.split()
with open(stopwordfilepath) as f: with open(stopwordfilepath) as f:
stop_words = set(f.read().split(',')) stop_words = set(f.read().split(','))
stop_words.update(string.ascii_lowercase) stop_words.update(string.ascii_lowercase)
@ -41,7 +38,7 @@ def sort():
if __name__ == "__main__": if __name__ == "__main__":
read_file( testfilepath ) read_file( testfilepath )
filter_chars_and_normalize() extractwords()
frequencies() frequencies()
sort() sort()

@ -2,31 +2,28 @@ import re
from cppy.cp_util import * from cppy.cp_util import *
def filter_chars_and_normalize(str_data): def extractwords(str_data):
pattern = re.compile('[\W_]+') pattern = re.compile('[\W_]+')
word_list = pattern.sub(' ', str_data).lower().split() word_list = pattern.sub(' ', str_data).lower().split()
stop_words = get_stopwords() stop_words = get_stopwords()
return [w for w in word_list if not w in stop_words] return [w for w in word_list if not w in stop_words]
def frequencies(word_list): def frequencies(word_list):
word_freqs = {} word_freqs = {}
for word in word_list: for word in word_list:
word_freqs[word] = word_freqs.get(word, 0) + 1 word_freqs[word] = word_freqs.get(word, 0) + 1
return word_freqs return word_freqs
def sort(word_freq): def sort(word_freq):
return sorted( word_freq.items(), key=lambda x: x[1], reverse=True ) return sorted( word_freq.items(), key=lambda x: x[1], reverse=True )
def printall(word_freqs, n = 10 ):
def print_all(word_freqs, n = 10 ):
for word, freq in word_freqs[ :n ]: for word, freq in word_freqs[ :n ]:
print(word, '-', freq) print(word, '-', freq)
if __name__ == "__main__": if __name__ == "__main__":
print_all(sort(frequencies( printall(sort(frequencies(
filter_chars_and_normalize( extractwords(
read_file( testfilepath )))) read_file( testfilepath ))))
) )

@ -1,39 +1,21 @@
import re, operator
from cppy.cp_util import * from cppy.cp_util import *
def print_text(word_freqs, func): def readfile(path_to_file, func):
print_word_freqs(word_freqs) data = read_file(path_to_file)
func(None) func(data, frequencies)
def extractwords(str_data,func):
func(extract_str_words(str_data), sort)
def frequencies(word_list, func): def frequencies(word_list, func):
wf = get_frequencies(word_list) wf = get_frequencies(word_list)
func(wf, print_text) func(wf, printall)
def scan(str_data, func):
func(str_data.split(), frequencies)
def filter_chars(str_data, func):
pattern = re.compile('[\W_]+')
func(pattern.sub(' ', str_data), scan)
def remove_stop_words(word_list, func):
stop_words = get_stopwords()
func([w for w in word_list if not w in stop_words], sort)
def sort(wf, func): def sort(wf, func):
func(sorted(wf.items(), key=operator.itemgetter(1), reverse=True), no_op) func(sort_dict(wf), None)
def no_op(func):
return
def normalize(str_data, func):
func(str_data.lower(), remove_stop_words)
def read_file(path_to_file, func):
with open(path_to_file,encoding='utf-8') as f:
data = f.read()
func(data, normalize)
def printall(word_freqs, func):
print_word_freqs(word_freqs)
if __name__ == "__main__": if __name__ == "__main__":
read_file(testfilepath, filter_chars) readfile(testfilepath, extractwords)

@ -56,5 +56,4 @@ state_machine = WordFrequencyStateMachine(util.testfilepath)
word_frequencies = state_machine.run() word_frequencies = state_machine.run()
# 打印结果 # 打印结果
for word, freq in word_frequencies.most_common(10): util.print_word_freqs(word_frequencies.most_common(10))
print(f"{word}: {freq}")
Loading…
Cancel
Save