|
|
|
|
@ -0,0 +1,101 @@
|
|
|
|
|
from operator import itemgetter
|
|
|
|
|
import matplotlib.pyplot as mp
|
|
|
|
|
import jieba
|
|
|
|
|
import string
|
|
|
|
|
import wordcloud
|
|
|
|
|
import csv
|
|
|
|
|
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_file(filepath):
    """
    List the entries of a directory and record them in filename.csv.

    Every name returned by ``os.listdir(filepath)`` is numbered starting
    at 1 (in sorted name order), printed as ``<no>--<name>`` and written
    as a ``file_no, filename`` row to ``filename.csv`` in the current
    working directory. Any previous content of that CSV is overwritten.

    :param filepath: directory whose entries are listed
    :return: number of entries found (the last sequence number; 0 when
             the directory is empty)
    """
    file_no = 0
    # One handle for the header and all rows: mode 'w' truncates the old
    # listing, and the ``with`` block guarantees the file is flushed and
    # closed (previously the CSV was reopened in append mode for every
    # entry and those handles were never closed).
    with open('filename.csv', 'w', encoding='utf-8', newline='') as file_handler:
        file_writer = csv.writer(file_handler)
        file_writer.writerow(['file_no', 'filename'])

        # os.listdir() returns names in arbitrary order; sorting keeps the
        # number shown for a given file stable from one call to the next.
        files = sorted(os.listdir(filepath))

        print('files list:')
        for file_no, filename in enumerate(files, start=1):
            # Show "<no>--<name>" so the user can pick a file by number.
            print('%d--%s' % (file_no, filename))
            file_writer.writerow([file_no, filename])
    return file_no
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def count_word(filename):
    """
    Count word frequencies in a UTF-8 text file.

    The text is lower-cased, every character in ``string.punctuation``
    (ASCII punctuation only) is replaced by a space, and the result is
    segmented with ``jieba.lcut``. Whitespace-only tokens are skipped.
    The 30 most frequent words (or all of them, if there are fewer) are
    printed.

    :param filename: path of the text file to analyse
    :return: list of ``(word, count)`` tuples sorted by count, descending;
             an empty list when the file contains no words
    """
    # Read through a context manager so the handle is always closed.
    with open(filename, "r", encoding='utf-8') as text_file:
        file_txt = text_file.read().lower()

    # Replace all ASCII punctuation with spaces in a single pass.
    punctuation_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    file_txt = file_txt.translate(punctuation_to_space)

    wordfre = {}
    for word in jieba.lcut(file_txt, cut_all=False):  # word segmentation
        # The segmenter also hands back the whitespace between words
        # (including the spaces inserted above); those are not words.
        if not word.strip():
            continue
        wordfre[word] = wordfre.get(word, 0) + 1

    # Most frequent first; ties keep first-seen order (sorted() is stable).
    wordfre = sorted(wordfre.items(), key=itemgetter(1), reverse=True)

    # Slice instead of indexing 0..29 so short texts do not raise IndexError.
    for entry in wordfre[:30]:
        print(entry)
    return wordfre
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_cloud(numword):
    """
    Append word counts to temp.txt and display a word cloud of that file.

    Each entry of ``numword`` is appended to ``temp.txt`` as a
    ``count:word`` line. Existing content of ``temp.txt`` is kept, so the
    cloud is generated from everything accumulated in the file, not only
    from this call's entries. The image is shown with matplotlib
    (``mp.show()``).

    :param numword: iterable of ``(word, count)`` pairs, e.g. the result
                    of count_word
    """
    # Append mode on purpose: results of earlier queries stay in the file.
    with open('temp.txt', 'a', encoding='utf-8') as f:
        f.writelines("%d:%s\n" % (entry[1], entry[0]) for entry in numword)

    # Read back only after the writer is closed, so the freshly appended
    # lines are guaranteed to be on disk; ``with`` also closes this handle.
    with open("temp.txt", "r", encoding='utf-8') as f:
        temp_txt = f.read()

    # NOTE(review): generate() re-tokenises the raw text, so the numeric
    # counts written above presumably do not drive the word sizes;
    # generate_from_frequencies() may be what is wanted - confirm.
    cloud = wordcloud.WordCloud(
        # Font file must be resolvable by WordCloud; presumably Microsoft
        # YaHei Bold, chosen so CJK glyphs render - verify on non-Windows.
        font_path='msyhbd.ttc',
        width=1000,
        height=700,
        background_color="white",
    )
    word_cloud = cloud.generate(temp_txt)

    # Show the rendered cloud without axes.
    mp.imshow(word_cloud)
    mp.axis('off')
    mp.show()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Interactive loop: list the files, let the user pick one by number,
    # print its word counts and show the word cloud; 'X' quits.
    filepath = 'file'  # directory that holds the text files
    create_file(filepath)
    file_no = input("如果要退出,请输入X\n请输入你的选择:")
    while file_no != 'X':
        if file_no.isnumeric():  # only digits can match a file number
            # Look the number up in the listing written by create_file().
            selected = None
            with open('filename.csv', 'r', encoding='utf-8', newline='') as fd:
                for row in csv.reader(fd):
                    if row and row[0] == file_no:
                        selected = row[1]
                        break  # numbers are unique; stop at the first hit

            if selected is not None:
                # os.path.join instead of a hard-coded '\\' so the path
                # is also valid outside Windows.
                numword = count_word(os.path.join(filepath, selected))
                create_cloud(numword)
                # Refresh the listing only after the CSV reader is closed;
                # create_file() truncates filename.csv.
                create_file(filepath)
                # Success: show the normal menu prompt, not the error one.
                file_no = input("如果要退出,请输入X\n请输入你的选择:")
            else:
                # A number that is not in the listing.
                file_no = input("请输入正确的:")
        else:
            print("输入错误!")
            file_no = input("请输入正确的:")
|