# -*- coding=utf-8 -*-
import os
import glob
import jieba
import wordcloud
from wordcloud import STOPWORDS
from matplotlib import pyplot as plt

####################################################################################
# Character classification helpers.


def _is_cjk(char):
    """Return True if *char* lies in U+4E00..U+9FFF (CJK Unified Ideographs)."""
    return '\u4e00' <= char <= '\u9fff'


def is_all_chinese(strs):
    """Return True if every character of *strs* is a CJK ideograph.

    Punctuation, digits, whitespace and Latin letters all make the result
    False. An empty string yields True (there is no offending character).
    """
    return all(_is_cjk(ch) for ch in strs)


def is_chinese(strs):
    """Return True if *strs* contains at least one CJK ideograph."""
    return any(_is_cjk(ch) for ch in strs)


####################################################################################
# Word-cloud rendering.


def _read_text(path):
    """Return the whole content of the UTF-8 text file at *path*."""
    with open(path, 'r', encoding="utf-8") as handle:
        return handle.read()


def _load_cn_stopwords():
    """Return the stripped lines of ``cn_stopwords.txt`` as a set.

    The file is looked up relative to the current working directory.
    """
    with open('cn_stopwords.txt', 'r', encoding="utf-8") as handle:
        return {line.strip() for line in handle}


def _save_and_show(cloud):
    """Write *cloud* to ``123.png`` and display it in a matplotlib window."""
    cloud.to_file("123.png")
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.show()


def word_cloud_Chinese(file):
    """Render a word cloud for a Chinese text file.

    The text is segmented with ``jieba.lcut`` and re-joined with spaces
    before being handed to WordCloud. The image is saved to ``123.png``
    and then shown.

    Args:
        file: Path of a UTF-8 encoded text file.
    """
    text = _read_text(file)
    segmented = " ".join(jieba.lcut(text))
    cloud = wordcloud.WordCloud(font_path="STSONG.TTF",
                                width=700, height=700,
                                background_color="white",
                                stopwords=_load_cn_stopwords())
    cloud.generate(segmented)
    _save_and_show(cloud)


def word_cloud_English(file):
    """Render a word cloud for an English text file.

    Uses wordcloud's built-in ``STOPWORDS``. The image is saved to
    ``123.png`` and then shown.

    Args:
        file: Path of a UTF-8 encoded text file.
    """
    text = _read_text(file)
    cloud = wordcloud.WordCloud(font_path="arial.ttf",
                                width=1000, height=700,
                                background_color="white",
                                stopwords=STOPWORDS)
    cloud.generate(text)
    _save_and_show(cloud)


def word_cloud_English_and_Chinese(file):
    """Render a word cloud for a text file mixing Chinese and English.

    The text is segmented with ``jieba.lcut`` and the cloud is built with
    ``collocations=False``. The image is saved to ``123.png`` and then
    shown, exactly like the two single-language variants.

    Args:
        file: Path of a UTF-8 encoded text file.
    """
    text = _read_text(file)
    cloud = wordcloud.WordCloud(font_path="STSONG.TTF",
                                width=1000, height=700,
                                background_color="white",
                                stopwords=_load_cn_stopwords(),
                                collocations=False)
    cloud.generate(" ".join(jieba.lcut(text)))
    _save_and_show(cloud)

#################################################################################################
# Word-frequency counting.


def _print_by_frequency(counts):
    """Print ``(count, word)`` tuples, highest count first.

    Ties are broken by the word itself in descending order, because the
    tuples are sorted as a whole with ``reverse=True``.
    """
    for pair in sorted(((n, w) for w, n in counts.items()), reverse=True):
        print(pair)


def Chineseword(file):
    """Print the word frequencies of a Chinese text file.

    Punctuation is replaced by spaces so that words on either side of a
    punctuation mark cannot be fused together, then the text is segmented
    with ``jieba.lcut``. Tokens shorter than two characters after
    stripping whitespace are ignored. Each result is printed as a
    ``(word, count)`` tuple, most frequent first; equal counts keep
    first-seen order.

    Args:
        file: Path of a UTF-8 encoded text file.
    """
    with open(file, "r", encoding='utf-8') as handle:
        txt = handle.read()
    # The backslash is escaped explicitly; the character set is unchanged.
    separators = " ,。:;,《》!?“\\”' ''\n'"
    txt = txt.translate(str.maketrans(dict.fromkeys(separators, " ")))
    counts = {}  # word -> number of occurrences
    for word in jieba.lcut(txt):
        if len(word.strip()) < 2:
            continue
        counts[word] = counts.get(word, 0) + 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    for item in ranked:
        print(item)


def Englishword(file):
    """Print the word frequencies of an English text file.

    Lines are lower-cased and split on whitespace; punctuation is not
    removed. Each result is printed as a ``(count, word)`` tuple, highest
    count first.

    Args:
        file: Path of a UTF-8 encoded text file.
    """
    counts = {}
    with open(file, 'r', encoding="utf-8") as handle:
        for line in handle:
            for word in line.lower().split():
                counts[word] = counts.get(word, 0) + 1
    _print_by_frequency(counts)


def English_and_Chinese(file):
    """Print the word frequencies of a mixed Chinese/English text file.

    The text is segmented with ``jieba.lcut``, lower-cased, stripped of
    the listed punctuation (each mark becomes a space) and split on
    whitespace. Each result is printed as a ``(count, word)`` tuple,
    highest count first.

    Args:
        file: Path of a UTF-8 encoded text file.
    """
    with open(file, 'r', encoding="utf-8") as handle:
        text = handle.read()
    text = " ".join(jieba.lcut(text)).lower()
    separators = ",。?:;’“!——、~,《》.--?;:'\"!~' ''\n'"
    text = text.translate(str.maketrans(dict.fromkeys(separators, " ")))
    counts = {}
    for word in text.split():
        counts[word] = counts.get(word, 0) + 1
    _print_by_frequency(counts)


###########################################################################################################
# Interactive entry point.


def _list_texts(folder):
    """Print a numbered menu of the files in *folder* and classify each.

    Only regular files are listed; sub-directories are skipped because
    they cannot be opened as text. Every file is read as UTF-8.

    Returns:
        A list of ``(path, kind)`` tuples in menu order, where *kind* is
        ``'z'`` (only Chinese characters), ``'m'`` (some Chinese
        characters) or ``'y'`` (no Chinese characters).
    """
    entries = []
    for name in os.listdir(folder):
        position = os.path.join(folder, name)
        if not os.path.isfile(position):
            continue
        with open(position, "r", encoding='utf-8') as f:
            data = f.read()
        print(len(entries) + 1, '--- ', end='')
        print(name, end='')
        if is_all_chinese(data):
            print(" (纯中文)")
            kind = 'z'
        elif is_chinese(data):
            print(" (有英文有中文)")
            kind = 'm'
        else:
            print(" (纯英文)")
            kind = 'y'
        entries.append((position, kind))
    return entries


def _ask_selection(count):
    """Prompt until the user enters an integer in ``1..count``; return it."""
    while True:
        print("输入你要选择的文本")
        try:
            choice = int(input())
        except ValueError:
            continue
        if 1 <= choice <= count:
            return choice


def _convert_one(folder):
    """List *folder*, let the user pick a text, then draw and count it."""
    entries = _list_texts(folder)
    if not entries:
        # Nothing to choose from; go back to the exit prompt.
        return
    resultlj, kind = entries[_ask_selection(len(entries)) - 1]
    print(resultlj)
    print(kind)
    if kind == 'z':
        word_cloud_Chinese(resultlj)
        Chineseword(resultlj)
    elif kind == 'y':
        word_cloud_English(resultlj)
        Englishword(resultlj)
    else:
        word_cloud_English_and_Chinese(resultlj)
        English_and_Chinese(resultlj)


def main():
    """Run the interactive word-cloud converter until the user quits."""
    print("欢迎使用小浣熊词云转换器")
    # NOTE(review): the banner text is kept verbatim; any line breaks it
    # may once have had are not recoverable from the flattened source.
    print('''使用介绍: 1.将你想要转换成词云图的文本放入一个文件夹 2.告诉我们这个文件夹的地址 3.确认是否将文本导入(是/否) 4.我们会用序号标好您的所有文本,由您用序号选择转换哪一个文本 开发团队: 李世健,卢婉梅,李子祥, 鲁朕家,兰晶晶,闭玉婷''')
    print("请输入目标文件夹:")
    targetfile = input()
    print('是否已将文本导入')
    fa = input()
    while True:
        if fa == '是':
            _convert_one(targetfile)
        # Asked on every pass so the loop can always be left.
        print("是否退出程序")
        ans = input()
        if ans == '是':
            break


if __name__ == '__main__':
    main()