zj3D 9 months ago
parent 254c11c3c9
commit 041fced368

@ -2,8 +2,10 @@ import string
from collections import Counter
from cppy.cp_util import *
################################
# data
# Module-level state that the functions below reach through `global` statements.
# NOTE(review): `data` is bound twice in this span; the later string value replaces the list.
data = []
################################
data = ''
words = []  # reassigned by the word-extraction step
word_freqs = []
@ -13,17 +15,12 @@ word_freqs = []
def read_file(path_to_file):
    """Load the whole text file into the module-level ``data`` string.

    Args:
        path_to_file: Path of a UTF-8 encoded text file.
    """
    global data
    with open(path_to_file, encoding='utf-8') as f:
        # Read exactly once: a second read() on the same handle returns ''
        # and would wipe out the text that was just loaded.
        data = f.read()
def filter_chars_and_normalize():
def extractwords():
global data
global words
for i in range(len(data)):
data[i] = ' ' if not data[i].isalnum() else data[i].lower()
data_str = ''.join(data)
words = words + data_str.split()
global words
words = data.lower().split()
with open(stopwordfilepath) as f:
stop_words = set(f.read().split(','))
stop_words.update(string.ascii_lowercase)
@ -41,7 +38,7 @@ def sort():
# Script entry point: runs the pipeline stages in order on `testfilepath`.
# NOTE(review): both filter_chars_and_normalize() and extractwords() are
# invoked here; confirm that both stages are still meant to run.
if __name__ == "__main__":
read_file( testfilepath )
filter_chars_and_normalize()
extractwords()
frequencies()
sort()

@ -2,31 +2,28 @@ import re
from cppy.cp_util import *
def extractwords(str_data):
    """Return the lowercase words of ``str_data`` that are not stop words.

    Runs of non-word characters and underscores are treated as separators.

    Args:
        str_data: Text to tokenize.

    Returns:
        List of lowercase words, in order of appearance, excluding every
        word contained in ``get_stopwords()``.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    separators = re.compile(r'[\W_]+')
    word_list = separators.sub(' ', str_data).lower().split()
    stop_words = get_stopwords()
    return [w for w in word_list if w not in stop_words]


# Earlier name of the same function, kept so existing callers still work.
filter_chars_and_normalize = extractwords
def frequencies(word_list):
    """Count how many times each word occurs.

    Args:
        word_list: Iterable of hashable words.

    Returns:
        A plain ``dict`` mapping each word to its count, keyed in order of
        first appearance.
    """
    # Imported locally so the block does not rely on file-level imports.
    from collections import Counter

    return dict(Counter(word_list))
def sort(word_freq):
    """Return the (word, count) pairs ordered by descending count.

    Args:
        word_freq: Mapping of word to count.

    Returns:
        List of ``(word, count)`` tuples; pairs with equal counts keep the
        mapping's iteration order.
    """
    return sorted(word_freq.items(), key=lambda pair: pair[1], reverse=True)
def printall(word_freqs, n=10):
    """Print the first ``n`` (word, count) pairs as ``word - count`` lines.

    Args:
        word_freqs: Sequence of ``(word, count)`` pairs.
        n: Maximum number of pairs to print.
    """
    for word, freq in word_freqs[:n]:
        print(word, '-', freq)


# Earlier name of the same function, kept so existing callers still work.
print_all = printall
if __name__ == "__main__":
    # Pipeline, innermost call first: read the text, extract the words,
    # count them, order by count, print the head of the list.
    printall(sort(frequencies(extractwords(read_file(testfilepath)))))

@ -1,39 +1,21 @@
import re, operator
from cppy.cp_util import *
def print_text(word_freqs, func):
    """Hand the frequencies to ``print_word_freqs``, then call the next step.

    Args:
        word_freqs: Frequencies passed to ``print_word_freqs``.
        func: Next step; called with ``None`` as its only argument.
    """
    print_word_freqs(word_freqs)
    func(None)
def readfile(path_to_file, func):
    """Read the file and pass its content on to ``func``.

    Args:
        path_to_file: Path handed to ``read_file``.
        func: Next step; called as ``func(data, frequencies)``.
    """
    # NOTE(review): read_file is used here as a one-argument function that
    # returns the content; confirm that is the read_file in scope.
    data = read_file(path_to_file)
    func(data, frequencies)
def extractwords(str_data, func):
    """Extract the words of ``str_data`` and pass them on to ``func``.

    Args:
        str_data: Text handed to ``extract_str_words``.
        func: Next step; called as ``func(words, sort)``.
    """
    func(extract_str_words(str_data), sort)
def frequencies(word_list, func):
    """Compute the word frequencies and pass them on to ``func``.

    Args:
        word_list: Words handed to ``get_frequencies``.
        func: Next step; called as ``func(wf, print_text)``.
    """
    wf = get_frequencies(word_list)
    func(wf, print_text)
def scan(str_data, func):
    """Split ``str_data`` on whitespace and pass the words on to ``func``.

    Args:
        str_data: Text to split.
        func: Next step; called as ``func(words, frequencies)``.
    """
    func(str_data.split(), frequencies)
def filter_chars(str_data, func):
    """Replace separator runs with single spaces and pass the text to ``func``.

    A separator run is one or more non-word characters or underscores.

    Args:
        str_data: Text to clean.
        func: Next step; called as ``func(cleaned_text, scan)``.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    pattern = re.compile(r'[\W_]+')
    func(pattern.sub(' ', str_data), scan)
# Drop every word found in get_stopwords() and hand the remaining words to
# func, naming sort as the step after it.
def remove_stop_words(word_list, func):
stop_words = get_stopwords()
func([w for w in word_list if not w in stop_words], sort)
# NOTE(review): wf is not assigned anywhere in this function; confirm this call belongs here.
func(wf, printall)
def sort(wf, func):
    """Order the frequencies by descending count and pass them on to ``func``.

    Args:
        wf: Mapping of word to count.
        func: Next step; called as ``func(sorted_pairs, no_op)`` where
            ``sorted_pairs`` is a list of ``(word, count)`` tuples.
    """
    by_count = sorted(wf.items(), key=operator.itemgetter(1), reverse=True)
    func(by_count, no_op)
def no_op(func):
    """Do nothing: ignore ``func`` and return ``None``."""
    return None
def normalize(str_data, func):
    """Lowercase ``str_data`` and pass it on to ``func``.

    Args:
        str_data: Text to lowercase.
        func: Next step; called as ``func(lowered_text, remove_stop_words)``.
    """
    func(str_data.lower(), remove_stop_words)
# Read the UTF-8 file at path_to_file and pass its text to func, naming
# normalize as the step after it.
def read_file(path_to_file, func):
with open(path_to_file,encoding='utf-8') as f:
data = f.read()
func(data, normalize)
# NOTE(review): wf is not assigned anywhere in this function; confirm this call belongs here.
func(sort_dict(wf), None)
def printall(word_freqs, func):
    """Hand the frequencies to ``print_word_freqs``; last step of the chain.

    Args:
        word_freqs: Frequencies passed to ``print_word_freqs``.
        func: Accepted for a uniform step signature; not used.
    """
    print_word_freqs(word_freqs)
# Script entry point.
# NOTE(review): two chains are started here, one through read_file/filter_chars
# and one through readfile/extractwords; confirm both are intended.
if __name__ == "__main__":
read_file(testfilepath, filter_chars)
readfile(testfilepath, extractwords)

@ -56,5 +56,4 @@ state_machine = WordFrequencyStateMachine(util.testfilepath)
# Run the state machine and keep what it returns.
word_frequencies = state_machine.run()
# Print the results
for word, freq in word_frequencies.most_common(10):
print(f"{word}: {freq}")
# NOTE(review): the same top-10 list is also passed to util.print_word_freqs; confirm the output is not duplicated.
util.print_word_freqs(word_frequencies.most_common(10))
Loading…
Cancel
Save