diff --git a/cn_stopwords.txt b/cn_stopwords.txt
new file mode 100644
index 0000000..e69de29
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..e49925a
--- /dev/null
+++ b/main.py
@@ -0,0 +1,237 @@
+# -*- coding: utf-8 -*-
+
+import os
+import jieba
+import wordcloud
+from wordcloud import STOPWORDS
+from matplotlib import pyplot as plt
+
+####################################################################################
+# Check whether a string consists entirely of Chinese characters
+def is_all_chinese(strs):
+    for _char in strs:
+        if not '\u4e00' <= _char <= '\u9fa5':
+            return False
+    return True
+
+# Check whether a string contains any Chinese character
+def is_chinese(strs):
+    for ch in strs:
+        if '\u4e00' <= ch <= '\u9fff':
+            return True
+    return False
+####################################################################################
+
+
+'''
+Word cloud for Chinese-only text
+'''
+def word_cloud_Chinese(file):
+    with open(file, 'r', encoding="utf-8") as fb:
+        t = fb.read()
+
+    # load the Chinese stopword list
+    with open('cn_stopwords.txt', 'r', encoding="utf-8") as f:
+        stopwords = set(line.strip() for line in f)
+
+    ls = jieba.lcut(t)  # segment the text with jieba
+    txt = " ".join(ls)
+    w = wordcloud.WordCloud(font_path="STSONG.TTF",
+                            width=700,
+                            height=700,
+                            background_color="white",
+                            stopwords=stopwords)
+    w.generate(txt)
+    w.to_file("123.png")
+
+    plt.imshow(w, interpolation='bilinear')
+    plt.axis('off')
+    plt.tight_layout()
+    plt.show()
+
+
+'''
+Word cloud for English-only text
+'''
+
+
+def word_cloud_English(file):
+    with open(file, 'r', encoding="utf-8") as fb:
+        t = fb.read()
+    w = wordcloud.WordCloud(font_path="arial.ttf",
+                            width=1000,
+                            height=700,
+                            background_color="white",
+                            stopwords=STOPWORDS)
+    w.generate(t)
+    w.to_file("123.png")
+
+    plt.imshow(w, interpolation='bilinear')
+    plt.axis('off')
+    plt.tight_layout()
+    plt.show()
+
+
+'''
+Word cloud for mixed Chinese and English text
+'''
+
+
+def word_cloud_English_and_Chinese(file):
+    with open(file, 'r', encoding="utf-8") as fb:
+        t = fb.read()
+    with open('cn_stopwords.txt', 'r', encoding="utf-8") as f:
+        stopwords = set(line.strip() for line in f)
+    w = wordcloud.WordCloud(font_path="STSONG.TTF",
+                            width=1000,
+                            height=700,
+                            background_color="white",
+                            stopwords=stopwords,
+                            collocations=False)
+    ls = jieba.lcut(t)
+    t = " ".join(ls)
+    w.generate(t)
+    w.to_file("123.png")
+
+    plt.imshow(w, interpolation='bilinear')
+    plt.axis('off')
+    plt.tight_layout()
+    plt.show()
+#################################################################################################
+
+'''
+Word frequency count for Chinese-only text
+'''
+
+
+def Chineseword(file):
+    with open(file, "r", encoding='utf-8') as fb:
+        txt = fb.read()
+    counts = {}  # map each word to its occurrence count
+    for ch in " ,。:;,《》!?“”'\n":
+        txt = txt.replace(ch, "")  # strip punctuation and whitespace
+    words = jieba.lcut(txt)  # segment the text with jieba's accurate mode
+
+    for word in words:
+        if len(word) == 1:  # skip single characters
+            continue
+        counts[word] = counts.get(word, 0) + 1
+
+    items = list(counts.items())
+    items.sort(key=lambda x: x[1], reverse=True)  # sort by frequency, descending
+    for item in items:
+        print(item)
+
+
+'''
+Word frequency count for English-only text
+'''
+
+
+def Englishword(file):
+    wordfile = {}
+    with open(file, 'r', encoding="utf-8") as fb:
+        for line in fb:
+            for word in line.lower().strip().split():
+                if word in wordfile:
+                    wordfile[word] += 1
+                else:
+                    wordfile[word] = 1
+    wordfrehigh = []
+    for wd, fy in wordfile.items():
+        wordfrehigh.append((fy, wd))
+    wordfrehigh.sort(reverse=True)  # highest frequency first
+    for wd in wordfrehigh:
+        print(wd)
+
+
+'''
+Word frequency count for mixed Chinese and English text
+'''
+
+
+def English_and_Chinese(file):
+    with open(file, 'r', encoding="utf-8") as fb:
+        t = fb.read()
+    ls = jieba.lcut(t)
+    t = " ".join(ls)
+    t = t.lower()
+    for ch in ",。?:;’“”!、~,《》.--?;:'\"!~\n":
+        t = t.replace(ch, " ")  # replace punctuation with spaces
+
+    wordfile = {}
+    for word in t.split():
+        if word in wordfile:
+            wordfile[word] += 1
+        else:
+            wordfile[word] = 1
+    wordfrehigh = []
+    for wd, fy in wordfile.items():
+        wordfrehigh.append((fy, wd))
+    wordfrehigh.sort(reverse=True)
+    for wd in wordfrehigh:
+        print(wd)
+###########################################################################################################
+
+if __name__ == '__main__':
+    print("Welcome to the Little Raccoon word cloud converter")
+    print('''How to use:
+    1. Put the texts you want to turn into word clouds into one folder
+    2. Enter the path of that folder
+    3. Confirm that the texts have been imported (yes/no)
+    4. Every text is listed with an index number; choose the one to convert by its number
+
+    Development team:
+    李世健, 卢婉梅, 李子祥,
+    鲁朕家, 兰晶晶, 闭玉婷''')
+
+    print("Enter the target folder:")
+    targetfile = input()
+    print("Have the texts been imported? (yes/no)")
+    fa = input()
+    while True:
+        if fa == 'yes':
+            path = targetfile
+            files = os.listdir(path)  # every file name in the folder
+            i = 1
+            judg = []
+
+            for file in files:  # walk the folder and classify each text
+                position = os.path.join(path, file)  # build the full path
+                print(i, '--- ', end='')
+                print(file, end='')
+                i = i + 1
+
+                with open(position, "r", encoding='utf-8') as f:
+                    data = f.read()
+                if is_all_chinese(data):
+                    print(" (Chinese only)")
+                    judg.append('z')
+                elif is_chinese(data):
+                    print(" (mixed Chinese and English)")
+                    judg.append('m')
+                else:
+                    print(" (English only)")
+                    judg.append('y')
+            ##################################################################################
+            print("Enter the number of the text to convert:")
+            atwo = int(input())
+            resultlj = os.path.join(path, files[atwo - 1])
+            print(resultlj)
+            #################################################################################
+            print(judg[atwo - 1])
+            if judg[atwo - 1] == 'z':
+                word_cloud_Chinese(resultlj)
+                Chineseword(resultlj)
+            elif judg[atwo - 1] == 'y':
+                word_cloud_English(resultlj)
+                Englishword(resultlj)
+            else:
+                word_cloud_English_and_Chinese(resultlj)
+                English_and_Chinese(resultlj)
+            print("Exit the program? (yes/no)")
+            ans = input()
+            if ans == 'yes':
+                break
+        else:
+            break