import jieba
import wordcloud
from wordcloud import STOPWORDS
from matplotlib import pyplot as plt

'''
Chinese-only word cloud
'''
def word_cloud_Chinese(file):
    with open(file, 'r', encoding="utf-8") as fb:
        t = fb.read()
    stopwords = set()
    with open('cn_stopwords.txt', 'r', encoding="utf-8") as f:
        stopwords.update(line.strip() for line in f)
    ls = jieba.lcut(t)   # segment the text with jieba
    txt = " ".join(ls)   # WordCloud expects space-separated tokens
    w = wordcloud.WordCloud(font_path="STSONG.TTF",
                            width=700,
                            height=700,
                            background_color="white",
                            stopwords=stopwords)
    w.generate(txt)
    w.to_file("123.png")
    plt.imshow(w, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.show()

'''
English-only word cloud
'''
def word_cloud_English(file):
    with open(file, 'r', encoding="utf-8") as fb:
        t = fb.read()
    w = wordcloud.WordCloud(font_path="arial.ttf",
                            width=1000,
                            height=700,
                            background_color="white",
                            stopwords=STOPWORDS)  # wordcloud's built-in English stopword list
    w.generate(t)
    w.to_file("123.png")
    plt.imshow(w, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.show()

'''
Mixed Chinese-English word cloud
'''
def word_cloud_English_and_Chinese(file):
    with open(file, 'r', encoding="utf-8") as fb:
        t = fb.read()
    stopwords = set()
    with open('cn_stopwords.txt', 'r', encoding="utf-8") as f:
        stopwords.update(line.strip() for line in f)
    w = wordcloud.WordCloud(font_path="STSONG.TTF",
                            width=1000,
                            height=700,
                            background_color="white",
                            stopwords=stopwords,
                            collocations=False)  # suppress bigrams so words are not double-counted
    ls = jieba.lcut(t)   # jieba segments Chinese and passes English tokens through
    t = " ".join(ls)
    w.generate(t)
    w.to_file("123.png")
    plt.imshow(w, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.show()

'''
Chinese-only word frequency count
'''
def Chineseword(file):
    with open(file, "r", encoding='utf-8') as fb:
        txt = fb.read()
    counts = {}  # store each word and its count as key-value pairs
    for ch in " ,。:;,《》!?“”‘’'\"\n":
        txt = txt.replace(ch, "")  # strip punctuation and line breaks from the text
    words = jieba.lcut(txt)  # segment the text with jieba's accurate mode
    for word in words:
        if len(word) == 1:  # skip single characters
            continue
        counts[word] = counts.get(word, 0) + 1  # increment the count on every occurrence
    items = list(counts.items())
    items.sort(key=lambda x: x[1], reverse=True)  # sort by frequency, descending
    for item in items:
        print(item)

'''
English-only word frequency count
'''
def Englishword(file):
    wordfile = {}
    with open(file, 'r', encoding="utf-8") as fb:
        for line in fb:
            for word in line.lower().strip().split():
                wordfile[word] = wordfile.get(word, 0) + 1
    wordfrehigh = [(fy, wd) for wd, fy in wordfile.items()]
    wordfrehigh.sort(reverse=True)  # sort by frequency, descending
    for wd in wordfrehigh:
        print(wd)

'''
Mixed Chinese-English word frequency count
'''
def English_and_Chinese(file):
    with open(file, 'r', encoding="utf-8") as fb:
        t = fb.read()
    ls = jieba.lcut(t)   # segment Chinese; English tokens pass through unchanged
    t = " ".join(ls).lower()
    for ch in ",。?:;’“!——、~,《》.--?;:'\"!~\n":
        t = t.replace(ch, " ")  # normalize punctuation to spaces before splitting
    wordfile = {}
    for word in t.split():
        wordfile[word] = wordfile.get(word, 0) + 1
    wordfrehigh = [(fy, wd) for wd, fy in wordfile.items()]
    wordfrehigh.sort(reverse=True)  # sort by frequency, descending
    for wd in wordfrehigh:
        print(wd)

English_and_Chinese("file.txt")
word_cloud_English_and_Chinese("file.txt")
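
# A minimal usage sketch for the remaining helpers, kept commented out so the
# script's behavior is unchanged. It assumes the same inputs as the two calls
# above: a "file.txt" in the working directory, a "cn_stopwords.txt" stopword
# list, and the STSONG.TTF / arial.ttf fonts referenced by the functions.
# word_cloud_Chinese("file.txt")
# word_cloud_English("file.txt")
# Chineseword("file.txt")
# Englishword("file.txt")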