You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

51 lines
1.7 KiB

# Import the paths of the stop-word list and the test file
from cppy.cp_util import stopwordfilepath, testfilepath
# Prepare the stop words: the stop-word file is a single comma-separated
# list. A set is used because the scan below tests membership once per word
# found, and set lookups do not slow down as the stop-word list grows.
with open(stopwordfilepath, encoding='utf-8') as f:
    stop_words = set(f.read().split(','))
# Every single lowercase ASCII letter is treated as a stop word as well.
stop_words.update('abcdefghijklmnopqrstuvwxyz')
# Read the test file line by line, scan each line character by character to
# find words (maximal runs of alphanumeric characters), lower-case them,
# skip stop words, and count the remaining ones.
counts = {}
with open(testfilepath, encoding='utf-8') as text_file:
    for line in text_file:
        # Index where the current word began, or None while between words.
        start_char = None
        # Scan one extra separator past the end of the line so that a word
        # touching the end of the line (e.g. a last line without a trailing
        # newline) is still closed and counted instead of being dropped.
        for i, c in enumerate(line + '\n'):
            if c.isalnum():
                if start_char is None:
                    # A word starts here.
                    start_char = i
            elif start_char is not None:
                # A word just ended.
                word = line[start_char:i].lower()
                # Skip stop words.
                if word not in stop_words:
                    counts[word] = counts.get(word, 0) + 1
                # Reset the start marker.
                start_char = None
# Expose the result as a list of [word, count] pairs. Dicts preserve
# insertion order, so the pairs are in order of first appearance.
word_freqs = [[word, count] for word, count in counts.items()]
# Sort the [word, count] pairs in place by count, highest first. The built-in
# sort is stable, so words with equal counts keep their existing relative
# order (the same tie behaviour as a strict-comparison bubble sort).
word_freqs.sort(key=lambda pair: pair[1], reverse=True)
# Print the ten most frequent words, one "word - count" line each.
for word, count in word_freqs[:10]:
    print(word, '-', count)