You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
46 lines
1.6 KiB
46 lines
1.6 KiB
9 months ago
|
import string
|
||
|
from cppy.cp_util import *
|
||
|
|
||
|
# Term-frequency script: count the words of the text file at `testfilepath`,
# skipping the stop words listed in `stopwordfilepath`, and print the ten
# most frequent words as "word - count".
# NOTE(review): both path names are presumably supplied by the wildcard
# import from cppy.cp_util -- confirm.

# Load the comma-separated stop-word list. Entries are stripped because a
# token consists only of alphanumeric characters and therefore could never
# equal an entry that still carries a trailing newline or padding space.
with open(stopwordfilepath, encoding='utf-8') as f:
    stop_words = [entry.strip() for entry in f.read().split(',')]
# Every single ASCII lowercase letter is treated as a stop word as well.
stop_words.extend(string.ascii_lowercase)
# Set copy of the list so the per-word membership test is O(1).
stop_word_set = set(stop_words)

# word -> [count, sequence number of the word's most recent occurrence]
tallies = {}
occurrence = 0

# The context manager closes the text file even if scanning raises.
with open(testfilepath, encoding='utf-8') as text_file:
    for line in text_file:
        start_char = None  # index where the current word began, if any
        # The appended newline is a sentinel: it terminates a word that runs
        # to the very end of the line (e.g. the last line of a file that has
        # no trailing newline), which would otherwise never be counted.
        for i, c in enumerate(line + '\n'):
            if c.isalnum():
                if start_char is None:
                    # A word starts here.
                    start_char = i
                continue
            if start_char is None:
                # Still between words.
                continue
            # A word just ended.
            word = line[start_char:i].lower()
            # Reset the start marker for the next word.
            start_char = None
            # Skip stop words.
            if word in stop_word_set:
                continue
            entry = tallies.setdefault(word, [0, 0])
            entry[0] += 1
            entry[1] = occurrence
            occurrence += 1

# Rank by descending count. Among words with equal counts, the one that
# reached that count first (i.e. whose latest occurrence is earliest) ranks
# first.
ranked = sorted(tallies.items(), key=lambda item: (-item[1][0], item[1][1]))
word_freqs = [[word, count] for word, (count, _) in ranked]

for tf in word_freqs[0:10]:
    print(tf[0], '-', tf[1])
|