You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

45 lines
1.2 KiB

from cppy.cp_util import *
###########################################
# 生成器
###########################################
def characters(filename):
    """Yield the characters of a UTF-8 text file one at a time.

    The file is consumed line by line, so it is never loaded into memory
    as a whole. Line-ending characters are yielded like any other
    character.

    Args:
        filename: Path of the text file to read (decoded as UTF-8).

    Yields:
        Each character of the file, in order.
    """
    # The context manager guarantees the file handle is closed once the
    # generator is exhausted, closed early, or garbage-collected.
    with open(filename, encoding='utf-8') as handle:
        for line in handle:
            yield from line
def all_words(filename):
    """Yield every word of the file, lower-cased, one at a time.

    A word is a maximal run of alphanumeric characters (as judged by
    ``str.isalnum``); any other character acts as a separator.

    Args:
        filename: Path of the text file; forwarded to ``characters``.

    Yields:
        Each word in order of appearance, converted to lower case.
    """
    letters = []  # lower-cased characters of the word being assembled
    for c in characters(filename):
        if c.isalnum():
            letters.append(c.lower())
        elif letters:
            # A separator terminates the current word: emit it and reset.
            yield "".join(letters)
            letters = []
    # Flush the final word when the input ends without a trailing
    # separator; otherwise the last word of such a file would be lost.
    if letters:
        yield "".join(letters)
def non_stop_words(filename, stopwords):
    """Yield the words of the file that are not stop words.

    Args:
        filename: Path of the text file; forwarded to ``all_words``.
        stopwords: Container tested with ``in``; any word found in it is
            skipped.

    Yields:
        Each word produced by ``all_words`` that is absent from
        ``stopwords``, in the order it appears.
    """
    yield from (
        candidate
        for candidate in all_words(filename)
        if candidate not in stopwords
    )
if __name__ == "__main__":
    # Tally every non-stop word streamed out of the generator pipeline,
    # then hand the counts to the shared helpers for sorting and printing.
    stop_set = get_stopwords()
    counts = {}
    for token in non_stop_words(testfilepath, stop_set):
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    print_word_freqs(sort_dict(counts))