mirror of https://gitee.com/hikerxiang/ciyun
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
263 lines
7.7 KiB
263 lines
7.7 KiB
# -*- coding=utf-8 -*-
|
|
|
|
import os
|
|
import glob
|
|
import os
|
|
import jieba
|
|
import wordcloud
|
|
from wordcloud import STOPWORDS
|
|
from matplotlib import pyplot as plt
|
|
|
|
####################################################################################
|
|
#检验是否全是中文字符
|
|
def is_all_chinese(strs):
    """Return True when every character of ``strs`` is a Chinese character.

    A character counts as Chinese when it lies in U+4E00..U+9FFF (the CJK
    Unified Ideographs block).  This is the same range ``is_chinese`` tests,
    so the two predicates agree on what a Chinese character is.  Whitespace,
    digits, Latin letters and punctuation all fall outside the range and
    make the result False.

    Args:
        strs: The text to inspect.

    Returns:
        bool: False if any character is outside the range, True otherwise.
        An empty string yields True because there is nothing to reject.
    """
    return all('\u4e00' <= ch <= '\u9fff' for ch in strs)
|
|
#检验是否包含中文字符
|
|
def is_chinese(strs):
    """Report whether ``strs`` contains at least one Chinese character.

    A character is treated as Chinese when it lies in U+4E00..U+9FFF.

    Args:
        strs: The text to scan.

    Returns:
        bool: True as soon as one character in the range is found,
        False when none is (which includes the empty string).
    """
    return any(u'\u4e00' <= symbol <= u'\u9fff' for symbol in strs)
|
|
####################################################################################
|
|
|
|
|
|
'''
|
|
纯中文词云
|
|
'''
|
|
def word_cloud_Chinese(file):
    """Build, save and show a word cloud for a Chinese text file.

    The text is segmented with ``jieba.lcut`` and the tokens are joined with
    single spaces before being passed to ``WordCloud.generate``.  Stop words
    are read, one per line, from ``cn_stopwords.txt`` (relative path).  The
    rendered image is written to ``123.png`` (relative path) and then drawn
    with matplotlib, ending in ``plt.show()``.

    Args:
        file: Path of the text file to read; it is decoded as UTF-8.
    """
    # Context managers close both files even when reading or decoding fails.
    with open(file, 'r', encoding="utf-8") as fb:
        text = fb.read()
    with open('cn_stopwords.txt', 'r', encoding="utf-8") as stop_file:
        stopwords = {line.strip() for line in stop_file}

    # Joining the jieba tokens with spaces gives WordCloud explicit word
    # boundaries to split on.
    segmented = " ".join(jieba.lcut(text))

    w = wordcloud.WordCloud(font_path="STSONG.TTF",
                            width=700,
                            height=700,
                            background_color="white",
                            stopwords=stopwords)
    w.generate(segmented)
    w.to_file("123.png")

    plt.imshow(w, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.show()
|
|
|
|
|
|
'''
|
|
纯英文词云
|
|
'''
|
|
|
|
|
|
def word_cloud_English(file):
    """Build, save and show a word cloud for an English text file.

    The raw file content is passed straight to ``WordCloud.generate`` with
    the ``STOPWORDS`` set imported from the wordcloud package.  The rendered
    image is written to ``123.png`` (relative path) and then drawn with
    matplotlib, ending in ``plt.show()``.

    Args:
        file: Path of the text file to read; it is decoded as UTF-8.
    """
    # The context manager closes the file even when reading or decoding fails.
    with open(file, 'r', encoding="utf-8") as fb:
        text = fb.read()

    w = wordcloud.WordCloud(font_path="arial.ttf",
                            width=1000,
                            height=700,
                            background_color="white",
                            stopwords=STOPWORDS)
    w.generate(text)
    w.to_file("123.png")

    plt.imshow(w, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.show()
|
|
|
|
|
|
'''
|
|
中英混合词云
|
|
'''
|
|
|
|
|
|
def word_cloud_English_and_Chinese(file):
    """Build, save and show a word cloud for mixed Chinese/English text.

    The text is segmented with ``jieba.lcut`` and the tokens are joined with
    single spaces before being passed to ``WordCloud.generate``.  Stop words
    are read, one per line, from ``cn_stopwords.txt`` (relative path), and
    ``collocations=False`` is passed to ``WordCloud``.  The rendered image is
    written to ``123.png`` (relative path) and then drawn with matplotlib.

    Args:
        file: Path of the text file to read; it is decoded as UTF-8.
    """
    # Context managers close both files even when reading or decoding fails.
    with open(file, 'r', encoding="utf-8") as fb:
        text = fb.read()
    with open('cn_stopwords.txt', 'r', encoding="utf-8") as stop_file:
        stopwords = {line.strip() for line in stop_file}

    w = wordcloud.WordCloud(font_path="STSONG.TTF",
                            width=1000,
                            height=700,
                            background_color="white",
                            stopwords=stopwords,
                            collocations=False)
    w.generate(" ".join(jieba.lcut(text)))
    w.to_file("123.png")

    # Finish the figure the same way word_cloud_Chinese and
    # word_cloud_English do; without plt.show() the image drawn by imshow
    # is never put on screen.
    plt.imshow(w, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.show()
|
|
#################################################################################################
|
|
|
|
'''
|
|
纯中文词频计数
|
|
'''
|
|
|
|
|
|
def Chineseword(file):
    """Print word frequencies for a Chinese text file, most frequent first.

    A fixed set of separator characters is deleted from the text, the rest
    is segmented with ``jieba.lcut`` (jieba's default mode), and every token
    whose length is not exactly one is counted.  Each ``(word, count)`` tuple
    is printed on its own line in descending count order; words with equal
    counts keep the order in which they were first seen.

    Args:
        file: Path of the text file to read; it is decoded as UTF-8.
    """
    from collections import Counter

    # The context manager closes the file as soon as it has been read.
    with open(file, "r", encoding='utf-8') as fb:
        txt = fb.read()

    # These characters are removed outright (not replaced by spaces) before
    # segmentation.  The backslash is written as "\\" so the literal holds
    # the same characters as before without an invalid escape sequence.
    removed = str.maketrans("", "", " ,。:;,《》!?“\\”'\n")
    txt = txt.translate(removed)

    # Single-character tokens are left out of the tally.
    counts = Counter(word for word in jieba.lcut(txt) if len(word) != 1)

    for item in counts.most_common():
        print(item)
|
|
|
|
|
|
'''
|
|
纯英文词频计数
|
|
'''
|
|
|
|
|
|
def Englishword(file):
    """Print word frequencies for an English text file, most frequent first.

    Every line is lower-cased and split on whitespace; punctuation is not
    stripped, so ``"cat"`` and ``"cat,"`` are counted as different words.
    Each ``(count, word)`` tuple is printed on its own line, sorted in
    descending order: by count first, then by word in reverse
    lexicographic order for equal counts.

    Args:
        file: Path of the text file to read; it is decoded as UTF-8.
    """
    from collections import Counter

    tally = Counter()
    # The context manager closes the file even if decoding a line fails.
    with open(file, 'r', encoding="utf-8") as fb:
        for line in fb:
            tally.update(line.lower().split())

    # Tuples are (count, word) so the plain tuple sort orders by count first.
    for entry in sorted(((count, word) for word, count in tally.items()),
                        reverse=True):
        print(entry)
|
|
|
|
|
|
'''
|
|
中英混合词频计数
|
|
'''
|
|
|
|
|
|
def English_and_Chinese(file):
    """Print word frequencies for a mixed Chinese/English text file.

    The text is segmented with ``jieba.lcut``, the tokens are joined with
    spaces and lower-cased, a fixed set of punctuation characters is turned
    into spaces, and the result is split on whitespace.  Each
    ``(count, word)`` tuple is printed on its own line, sorted in descending
    order: by count first, then by word in reverse lexicographic order for
    equal counts.

    Args:
        file: Path of the text file to read; it is decoded as UTF-8.
    """
    from collections import Counter

    # The context manager closes the file as soon as it has been read.
    with open(file, 'r', encoding="utf-8") as fb:
        text = fb.read()

    text = " ".join(jieba.lcut(text)).lower()

    # Each listed character becomes a space so that it separates words
    # instead of sticking to them.  Repeated characters in the literal are
    # harmless: they map to the same replacement.
    separators = ",。?:;’“!——、~,《》.--?;:'\"!~' ''\n'"
    text = text.translate(str.maketrans(dict.fromkeys(separators, " ")))

    # One whitespace split yields the same words as splitting on " " first
    # and then splitting every piece again.
    tally = Counter(text.split())

    # Tuples are (count, word) so the plain tuple sort orders by count first.
    for entry in sorted(((count, word) for word, count in tally.items()),
                        reverse=True):
        print(entry)
|
|
###########################################################################################################
|
|
|
|
if __name__ == '__main__':
    # Interactive console front end: ask for a folder, list its files with a
    # language tag (Chinese only / English only / mixed), let the user pick
    # one by number, then draw its word cloud and print its word frequencies.
    print("欢迎使用小浣熊词云转换器")
    print('''使用介绍:
1.将你想要转换成词云图的文本放入一个文件夹
2.告诉我们这个文件夹的地址
3.确认是否将文本导入(是/否)
4.我们会用序号标好您的所有文本,由您用序号选择转换哪一个文本

开发团队:
李世健,卢婉梅,李子祥,
鲁朕家,兰晶晶,闭玉婷''')

    print("请输入目标文件夹:")
    targetfile = input()
    print('是否已将文本导入')
    fa = input()

    # NOTE(review): the original indentation is not preserved in this copy
    # of the file.  The exit prompt is assumed to sit at loop level, so the
    # loop can always be left -- confirm against the upstream source.
    while True:
        if fa == '是':
            path = targetfile
            files = os.listdir(path)  # names of every entry in the folder
            judg = []  # one tag per file: 'z' Chinese, 'y' English, 'm' mixed

            for i, file in enumerate(files, 1):
                # os.path.join uses the platform's separator instead of a
                # hard-coded backslash.
                position = os.path.join(path, file)
                print(i, '--- ', end='')
                print(file, end='')

                with open(position, "r", encoding='utf-8') as f:
                    data = f.read()

                if is_all_chinese(data):
                    print(" (纯中文)")
                    judg.append('z')
                elif is_chinese(data):
                    print(" (有英文有中文)")
                    judg.append('m')
                else:
                    print(" (纯英文)")
                    judg.append('y')

            print("输入你要选择的文本")
            try:
                atwo = int(input())
            except ValueError:
                atwo = 0  # not a number: handled as an invalid choice below

            if 1 <= atwo <= len(files):
                # The number shown to the user is 1-based.
                resultlj = os.path.join(path, files[atwo - 1])
                print(resultlj)

                kind = judg[atwo - 1]
                print(kind)
                if kind == 'z':
                    word_cloud_Chinese(resultlj)
                    Chineseword(resultlj)
                elif kind == 'y':
                    word_cloud_English(resultlj)
                    Englishword(resultlj)
                else:
                    word_cloud_English_and_Chinese(resultlj)
                    English_and_Chinese(resultlj)
            else:
                # Without this guard a bad number reached the processing
                # code with no file selected and raised an exception.
                print("序号无效,请输入列表中的序号")

        print("是否退出程序")
        ans = input()
        if ans == '是':
            break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|