@@ -0,0 +1,132 @@
import jieba
import re
import os
import wordcloud
import numpy as np
from PIL import Image
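# A minimal install sketch for the third-party packages this script imports
# (assuming pip is available; the names below are the standard PyPI names):
#   pip install jieba wordcloud numpy pillow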
now = 0  # run counter: incremented once per analysis run, used in the output file names
def show_files(path) -> str:
    '''
    path: the directory to search.
    Walk the directory tree, list every .txt file found, and let the user
    pick the one to process by its number.
    Returns the path of the selected file.
    '''
    numb = 0  # number of text files found
    file_list = []  # collected file paths
    for root, dirs, files in os.walk(path):
        for f in files:  # collect the text files
            fpath = os.path.join(root, f)
            if fpath.endswith(".txt"):
                file_list.append(fpath)
                numb = numb + 1
                print("%d---" % numb + f)  # show the file name
    file_num = int(input("These are all the file names. Enter the number of the file you want to process: "))
    while True:  # validate the chosen file number
        if file_num <= 0 or file_num > numb:  # out of range: ask again
            file_num = int(input("The number you entered is invalid, please enter it again: "))
        else:
            break
    file_path = file_list[file_num - 1]
    return file_path  # return the selected file path
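# Hypothetical illustration (the file names are made up) of what show_files
# prints for a folder containing two text files:
#   1---notes.txt
#   2---novel.txt
#   These are all the file names. Enter the number of the file you want to process: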
def get_fileDiretory() -> str:
    '''
    Use the os module to get the directory the program is run from
    (the current working directory).
    Returns the path of that directory.
    '''
    dqdirpath = os.getcwd()
    return dqdirpath
def Statistics_characters(path) -> list:
    '''
    path: path of the text file to analyse.
    Count how often each Chinese and English word appears in the file.
    Returns a list of (word, count) tuples sorted by count in descending order.
    '''
    fp = open(path, 'r', encoding='utf-8')  # open the file
    t = fp.read()  # read the whole text
    fp.close()
    dic = {}  # word -> occurrence count
    s = jieba.lcut(t)  # cut the text into words with jieba (accurate mode)
    for i in s:  # walk through the word list
        test_str = re.search(r"\W", i)  # use re to check whether the token contains a special character
        if test_str is None:  # keep only tokens made of word characters
            if i in dic:  # existing key: add 1 to its count
                dic[i] += 1
            else:  # new key: start at 1
                dic[i] = 1
    print("Word counting finished!")
    dic = sorted(dic.items(), key=lambda e: e[1], reverse=True)
    print("Sorted the words by frequency in descending order!")
    return dic
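# Hypothetical illustration of the value Statistics_characters returns: for a
# text in which "Python" appears twice and "你好" appears once, the result would
# look something like [('Python', 2), ('你好', 1)].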
def writ_file(li) -> None:
    '''
    Write the values in the list to a result text file.
    The argument is a list whose elements are two-item tuples.
    The key and value of each tuple are written to the result file:
    the key should be a Chinese or English word and the value an integer
    giving how many times it appears in the text.
    In the output the first column is the word and the second column is
    its occurrence count.
    '''
    path = "result" + str(now) + ".txt"
    fp = open(path, "w", encoding='utf-8')  # open the output file
    for key, value in li:  # walk through the list
        s = str(key) + "\t" + str(value)
        fp.write(s + '\n')  # write one word/count pair per line
    print("Results written successfully!")
    fp.close()
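# Hypothetical sketch of what a generated result file might contain (one word
# per line, separated from its count by a tab; the words and counts are made up):
#   Python  12
#   你好    8
#   文件    5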
def Gnerate_wordcloud(path) -> None:
    '''
    path: path of the text file to turn into a word cloud.
    Use the wordcloud library together with jieba to build a word cloud
    from the given text file, save it as an image and pop the image up.
    '''
    # mas = np.array(Image.open("D:\\pytest\\xt\\qs.png"))  # read the background/mask image
    fp = open(path, 'r', encoding='utf-8')  # open the file
    s = fp.read()  # read the text
    fp.close()
    li = jieba.lcut(s)  # cut the text into words
    tx = " ".join(li)
    w = wordcloud.WordCloud(font_path="msyhbd.ttc",
                            width=2000,
                            height=1700,
                            # mask=mas,
                            background_color="white"
                            )  # initialise the word cloud
    w.generate(tx)  # generate the word cloud
    s = "wordcloud_run_" + str(now) + ".png"
    w.to_file(s)  # save it to an image file
    img = Image.open(s)  # open the saved image
    img.show()  # display the word cloud
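# Note on assumptions: font_path="msyhbd.ttc" expects the Microsoft YaHei Bold
# font that ships with Windows; on other systems, point font_path at any font
# file that contains Chinese glyphs, otherwise the Chinese words will not render.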
if __name__ == '__main__':
    while True:
        now += 1
        Diretory_name = get_fileDiretory()  # directory the program runs in
        file_name = show_files(Diretory_name)  # find all text files under it and pick one
        li = Statistics_characters(file_name)  # count word occurrences in the chosen file
        writ_file(li)  # write the counts to a result file
        Gnerate_wordcloud(file_name)  # build and show the word cloud
        print("Do you want to run another count? (Y/N)")
        f = 0
        while True:
            s = input()
            if s == 'Y' or s == 'y':
                f = 0
                break
            elif s == 'N' or s == 'n':
                f = 1
                break
            else:
                print("Invalid input, please enter it again")
        if f == 1:
            print("Thank you for using the program, goodbye")
            break