"""Print the ten most frequent non-stop-words of a text file."""

import re
from collections import Counter

from cppy.cp_util import *

# Compiled once at import time: one or more characters that are either
# non-word characters (\W) or underscores. Raw string so that "\W" is not
# treated as an (invalid) string escape sequence.
_NON_WORD_RE = re.compile(r'[\W_]+')


def extractwords(str_data):
    """Split *str_data* into lowercase words, dropping stop words.

    Every run of non-word characters or underscores is replaced by a single
    space, the result is lowercased and split on whitespace, and any word
    reported by ``get_stopwords()`` is removed.

    Args:
        str_data: The text to tokenize.

    Returns:
        A list of the remaining words, in their original order.
    """
    word_list = _NON_WORD_RE.sub(' ', str_data).lower().split()
    # Build a set once so each membership test is O(1) rather than a scan.
    # NOTE(review): assumes get_stopwords() returns an iterable of hashable
    # words (e.g. a list of str) - confirm against cppy.cp_util.
    stop_words = frozenset(get_stopwords())
    return [w for w in word_list if w not in stop_words]


def frequencies(word_list):
    """Count how many times each word occurs in *word_list*.

    Args:
        word_list: An iterable of hashable words.

    Returns:
        A plain ``dict`` mapping each word to its number of occurrences,
        with keys in order of first appearance.
    """
    # Convert back to dict so callers keep getting a plain mapping
    # (a Counter would silently return 0 for missing keys).
    return dict(Counter(word_list))


def sort(word_freq):
    """Order word counts from most to least frequent.

    Args:
        word_freq: A mapping of word to count.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, descending.
        The sort is stable, so words with equal counts keep the mapping's
        iteration order.
    """
    return sorted(word_freq.items(), key=lambda x: x[1], reverse=True)


if __name__ == "__main__":
    # read_file and testfilepath come from the cppy.cp_util star import.
    txtcontent = read_file(testfilepath)
    word_list = extractwords(txtcontent)
    word_freqs = frequencies(word_list)
    word_sorts = sort(word_freqs)
    # Report only the ten highest-count entries.
    for word, count in word_sorts[:10]:
        print(word, '-', count)