You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
import re
|
|
|
|
from collections import Counter
|
|
|
|
from cppy.cp_util import *
|
|
|
|
|
|
|
|
# Term-frequency script: read a text file, normalize it, drop stop words,
# count the remaining words and print them from most to least frequent.

# Read the whole input file and lower-case it in one step.
with open(testfilepath, encoding='utf-8') as f:
    data = f.read().lower()

# Replace every run of non-word characters (and underscores) with a single
# space. The pattern is a raw string so that `\W` reaches the regex engine
# instead of being treated as an invalid string escape sequence.
data = re.sub(r'[\W_]+', ' ', data)

# Tokenize on whitespace.
words = data.split()

# Remove stop words. The stop words are turned into a set once so that the
# membership test below does not rescan a sequence for every word.
# NOTE(review): assumes get_stopwords() returns an iterable of hashable
# strings — confirm against cppy.cp_util.
stop_words = set(get_stopwords())
words = [word for word in words if word not in stop_words]

# Count how often each remaining word occurs.
word_freqs = Counter(words)

# Order the (word, count) pairs by descending count; words with equal counts
# stay in first-encountered order, same as a stable sort on the count.
sorted_word_freqs = word_freqs.most_common()

# Hand the sorted pairs to the project helper for output.
print_word_freqs(sorted_word_freqs)
|