You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
import re
from collections import Counter

from cppy.cp_util import *
|
|
|
|
|
|
|
|
|
|
|
|
def extractwords(str_data):
    """Split text into lowercase words and drop stop words.

    Every run of non-word characters or underscores is treated as a
    separator; the remaining tokens are lowercased and filtered against
    the collection returned by ``get_stopwords()``.

    Args:
        str_data: The text to tokenize.

    Returns:
        A list of lowercase words, in their original order, that are not
        stop words.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    separators = re.compile(r'[\W_]+')
    word_list = separators.sub(' ', str_data).lower().split()
    # Build a set once so each membership test below is O(1).
    # NOTE(review): assumes get_stopwords() yields hashable strings - confirm.
    stop_words = set(get_stopwords())
    return [w for w in word_list if w not in stop_words]
|
|
|
|
|
|
|
|
|
|
|
|
def frequencies(word_list):
    """Count how many times each word occurs.

    Args:
        word_list: Iterable of hashable words (typically a list of str).

    Returns:
        A plain ``dict`` mapping each distinct word to its number of
        occurrences, with keys in first-seen order.
    """
    # Counter does the tallying; convert back to a plain dict so the
    # return type stays exactly what callers received before.
    return dict(Counter(word_list))
|
|
|
|
|
|
|
|
|
|
|
|
def sort(word_freq):
    """Order word counts from most to least frequent.

    Args:
        word_freq: Mapping of word to occurrence count.

    Returns:
        A list of ``(word, count)`` tuples in descending count order.
        Entries with equal counts keep the mapping's iteration order.
    """
    pairs = list(word_freq.items())
    # In-place stable sort on the count (second tuple element).
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Pipeline: read text -> tokenize -> count -> rank.
    # NOTE(review): read_file and testfilepath are not defined in this file;
    # presumably they come from the cppy.cp_util star import - confirm.
    ranked = sort(frequencies(extractwords(read_file(testfilepath))))
    # Print the ten highest-count entries as "word - count".
    for word, count in ranked[:10]:
        print(word, '-', count)
|