You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
25 lines
850 B
25 lines
850 B
3 years ago
|
from operator import itemgetter
|
||
|
import jieba
|
||
|
import string
|
||
|
|
||
|
|
||
|
# Count word frequencies
|
||
|
def count_word(filename, top_n=20):
    """Count how often each multi-character token occurs in a UTF-8 text file.

    The text is lower-cased, ASCII punctuation is replaced with spaces, and
    the result is segmented with ``jieba.lcut(..., cut_all=False)``. Tokens
    of length 1 are ignored. The ``top_n`` most frequent tokens are printed,
    one ``(token, count)`` tuple per line.

    Args:
        filename: Path of the text file to read (decoded as UTF-8).
        top_n: Maximum number of leading entries to print. Defaults to 20,
            the number the original implementation printed.

    Returns:
        A list of ``(token, count)`` tuples sorted by count, highest first.
        Tokens with equal counts keep the order of their first occurrence
        in the text, because the sort is stable.
    """
    # Context manager so the file handle is closed even if reading fails.
    with open(filename, "r", encoding='utf-8') as source:
        file_txt = source.read().lower()

    # Replace every ASCII punctuation character with a space in one pass.
    # Only string.punctuation is covered; full-width (CJK) punctuation is
    # left in the text, exactly as before.
    punctuation_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    file_txt = file_txt.translate(punctuation_to_space)

    words = jieba.lcut(file_txt, cut_all=False)  # word segmentation

    # Single pass over the tokens instead of one words.count() call per
    # distinct token, which was quadratic in the length of the text.
    tally = {}
    for word in words:
        if len(word) > 1:
            tally[word] = tally.get(word, 0) + 1

    # Sort by frequency, highest first.
    counts = sorted(tally.items(), key=itemgetter(1), reverse=True)

    # Slicing instead of indexing range(20): a file with fewer distinct
    # tokens than top_n no longer raises IndexError.
    for entry in counts[:top_n]:
        print(entry)

    return counts
|