You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

28 lines
619 B

import re
from collections import Counter
from cppy.cp_util import *
# Word-frequency script: read a text file, normalize it, drop stop words,
# count the remaining words and print them from most to least frequent.
# `testfilepath`, `get_stopwords` and `print_word_freqs` are not defined here;
# presumably they come from the star import of cppy.cp_util -- confirm.

# Read the whole file and lowercase it so that counting is case-insensitive.
with open(testfilepath, encoding='utf-8') as f:
    data = f.read().lower()

# Replace every run of non-word characters and underscores with one space.
# The pattern is a raw string: '\W' is not a valid escape sequence in a
# regular string literal and raises an invalid-escape warning on current
# Python versions.
data = re.sub(r'[\W_]+', ' ', data)

# Split on whitespace to get the individual words.
words = data.split()

# Drop stop words.
# NOTE(review): if get_stopwords() returns a list, wrapping it in set() would
# make each membership test O(1) -- confirm its return type before changing.
stop_words = get_stopwords()
words = [word for word in words if word not in stop_words]

# Count how often each remaining word occurs.
word_freqs = Counter(words)

# (word, count) pairs ordered by descending count; words with equal counts
# keep the order in which they were first seen, exactly as a stable
# sorted(..., reverse=True) on the count would produce.
sorted_word_freqs = word_freqs.most_common()
print_word_freqs(sorted_word_freqs)