parent
							
								
									c68729b7fc
								
							
						
					
					
						commit
						350c4b2f07
					
				@ -0,0 +1,24 @@
 | 
				
			||||
from operator import itemgetter
 | 
				
			||||
import jieba
 | 
				
			||||
import string
 | 
				
			||||
 | 
				
			||||
 | 
				
			||||
# 统计词汇数
 | 
				
			||||
def count_word(filename, top_n=20):
    """Count how often each multi-character word occurs in a UTF-8 text file.

    The file content is lower-cased, every character in
    ``string.punctuation`` is replaced by a space, and the result is
    segmented with ``jieba.lcut(..., cut_all=False)``. Tokens whose length
    is 1 are ignored.

    The ``top_n`` most frequent entries are printed, one ``(word, count)``
    tuple per line. If fewer entries exist, only those are printed.

    Args:
        filename: Path of the text file to analyse; it is read as UTF-8.
        top_n: Number of leading entries to print. Defaults to 20, which is
            how many the function has always printed.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count.
        Words with equal counts keep the order of their first appearance
        in the text.
    """
    # Function-scope import so this block does not depend on edits to the
    # file's import section.
    from collections import Counter

    # The context manager guarantees the handle is closed even if reading
    # or decoding fails.
    with open(filename, "r", encoding="utf-8") as handle:
        text = handle.read().lower()

    # Replace all ASCII punctuation with spaces in a single pass.
    punctuation_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    text = text.translate(punctuation_to_space)

    # One linear pass over the tokens instead of list.count() per distinct
    # word; single-character tokens (including the spaces) are dropped.
    tally = Counter(
        word for word in jieba.lcut(text, cut_all=False) if len(word) > 1
    )
    counts = tally.most_common()

    # Slicing (rather than indexing 0..top_n-1) avoids IndexError when the
    # text contains fewer than top_n distinct words.
    for entry in counts[:max(top_n, 0)]:
        print(entry)
    return counts
 | 
				
			||||
					Loading…
					
					
				
		Reference in new issue