You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
45 lines
1.2 KiB
45 lines
1.2 KiB
9 months ago
|
from cppy.cp_util import *
|
||
|
|
||
|
###########################################
|
||
|
# Generators
|
||
|
###########################################
|
||
|
def characters(filename):
    """Yield the characters of a UTF-8 text file one at a time.

    Args:
        filename: Path of the file to read; its content is decoded as UTF-8.

    Yields:
        Each character of the file in order, including the newline that
        ends each line.
    """
    # The context manager closes the file as soon as the generator is
    # exhausted or closed, rather than leaving the open handle to the
    # garbage collector.
    with open(filename, encoding='utf-8') as source:
        for line in source:
            yield from line
|
||
|
|
||
|
|
||
|
def all_words(filename):
    """Yield every word of a text file, lower-cased, one at a time.

    A word is a maximal run of characters for which ``str.isalnum()`` is
    true; any other character acts as a separator. Each character is
    lower-cased individually as it is collected.

    Args:
        filename: Path of the file to read, passed on to ``characters``.

    Yields:
        Each word in the order it appears in the file.
    """
    letters = []  # characters of the word currently being collected
    for c in characters(filename):
        if c.isalnum():
            letters.append(c.lower())
        elif letters:
            # A separator ends the current word: emit it and start over.
            yield "".join(letters)
            letters = []
    # The input may end without a trailing separator; emit the word that is
    # still pending so the last word of the file is not silently dropped.
    if letters:
        yield "".join(letters)
|
||
|
|
||
|
|
||
|
def non_stop_words(filename, stopwords):
    """Yield the words of a file that are not contained in ``stopwords``.

    Args:
        filename: Path of the file to read, passed on to ``all_words``.
        stopwords: Container checked with ``in``; words found in it are
            skipped.

    Yields:
        Each word produced by ``all_words`` that passes the filter, in
        order.
    """
    for candidate in all_words(filename):
        if candidate in stopwords:
            continue  # filtered out: this is a stop word
        yield candidate
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Script entry point: count how often each non-stop word occurs in the
    # test file, then hand the counts to the shared helpers for output.
    # NOTE(review): get_stopwords, testfilepath, sort_dict and
    # print_word_freqs are not defined in this file; presumably they come
    # from the star import of cppy.cp_util -- confirm.
    stop_set = get_stopwords()

    counts = {}
    for token in non_stop_words(testfilepath, stop_set):
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1

    ordered = sort_dict(counts)
    print_word_freqs(ordered)
|
||
|
|