import jieba
import wordcloud
from wordcloud import STOPWORDS
from matplotlib import pyplot as plt
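
# All six functions below rely on the same idea: jieba segments Chinese text
# (which has no spaces between words) into tokens, and the space-joined result
# can then be handled like Western text. A minimal sketch of that step (tokens
# shown follow jieba's README example; exact output can vary with the
# dictionary version):
#
#   >>> jieba.lcut("我来到北京清华大学")
#   ['我', '来到', '北京', '清华大学']
#
# Dependencies: pip install jieba wordcloud matplotlib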
'''
Word cloud for pure Chinese text
'''
def word_cloud_Chinese(file):
    with open(file, 'r', encoding="utf-8") as fb:
        t = fb.read()
    # Load the Chinese stop-word list, one word per line.
    stopwords = set()
    with open('cn_stopwords.txt', 'r', encoding="utf-8") as f:
        stopwords.update(line.strip() for line in f)
    # Segment with jieba and space-join so WordCloud can tokenize the text.
    ls = jieba.lcut(t)
    txt = " ".join(ls)
    w = wordcloud.WordCloud(font_path="STSONG.TTF",  # a CJK-capable font is required
                            width=700,
                            height=700,
                            background_color="white",
                            stopwords=stopwords)
    w.generate(txt)
    w.to_file("123.png")
    plt.imshow(w, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.show()
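
# Note: WordCloud's default collocations=True also counts two-word pairs in
# the space-joined jieba output, which can surface duplicated bigrams; the
# mixed Chinese/English function below passes collocations=False for that reason.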
'''
Word cloud for pure English text
'''
def word_cloud_English(file):
    with open(file, 'r', encoding="utf-8") as fb:
        t = fb.read()
    # English text already has spaces, so no segmentation step is needed.
    w = wordcloud.WordCloud(font_path="arial.ttf",
                            width=1000,
                            height=700,
                            background_color="white",
                            stopwords=STOPWORDS)  # wordcloud's built-in English stop words
    w.generate(t)
    w.to_file("123.png")
    plt.imshow(w, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.show()
'''
Word cloud for mixed Chinese and English text
'''
def word_cloud_English_and_Chinese(file):
    with open(file, 'r', encoding="utf-8") as fb:
        t = fb.read()
    # Load the Chinese stop-word list, one word per line.
    stopwords = set()
    with open('cn_stopwords.txt', 'r', encoding="utf-8") as f:
        stopwords.update(line.strip() for line in f)
    # Segment with jieba so Chinese words are space-separated like the English ones.
    t = " ".join(jieba.lcut(t))
    w = wordcloud.WordCloud(font_path="STSONG.TTF",  # a CJK font also covers Latin text
                            width=1000,
                            height=700,
                            background_color="white",
                            stopwords=stopwords,
                            collocations=False)  # avoid duplicated two-word pairs
    w.generate(t)
    w.to_file("123.png")
    plt.imshow(w, interpolation='bilinear')
    plt.axis('off')
    plt.show()
'''
Word-frequency count for pure Chinese text
'''
def Chineseword(file):
    with open(file, "r", encoding='utf-8') as f:
        txt = f.read()
    # Remove common punctuation so it is not counted as words.
    for ch in ",。:;、!?“”‘’《》\n":
        txt = txt.replace(ch, "")
    words = jieba.lcut(txt)  # precise-mode segmentation
    counts = {}              # word -> number of occurrences
    for word in words:
        if len(word) == 1:   # skip single characters
            continue
        counts[word] = counts.get(word, 0) + 1
    # Sort by count, largest first, and print every (word, count) pair.
    items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    for item in items:
        print(item)
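
# The same count can be produced with the standard library; a sketch, assuming
# txt holds the cleaned text as above (most_common replaces the manual sort):
#
#   from collections import Counter
#   counts = Counter(w for w in jieba.lcut(txt) if len(w) > 1)
#   print(counts.most_common(20))  # the 20 most frequent words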
'''
Word-frequency count for pure English text
'''
def Englishword(file):
    wordfile = {}            # word -> number of occurrences
    with open(file, 'r', encoding="utf-8") as fb:
        for line in fb:
            for word in line.lower().strip().split():
                wordfile[word] = wordfile.get(word, 0) + 1
    # Build (count, word) pairs so sorting puts the most frequent words first.
    wordfrehigh = sorted(((fy, wd) for wd, fy in wordfile.items()), reverse=True)
    for wd in wordfrehigh:
        print(wd)
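
# Unlike the cloud functions, the counters do no stop-word filtering; the
# STOPWORDS set imported above could be applied to the result, as a sketch
# using the names from Englishword:
#
#   top = [(n, w) for n, w in wordfrehigh if w not in STOPWORDS]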
'''
Word-frequency count for mixed Chinese and English text
'''
def English_and_Chinese(file):
    with open(file, 'r', encoding="utf-8") as fb:
        t = fb.read()
    # Segment first so Chinese words are space-separated like the English ones.
    t = " ".join(jieba.lcut(t)).lower()
    # Replace Chinese and English punctuation with spaces before splitting.
    for ch in ",。?:;’‘“”!、~《》,.?;:'\"!~-\n":
        t = t.replace(ch, " ")
    wordfile = {}            # word -> number of occurrences
    for word in t.split():
        wordfile[word] = wordfile.get(word, 0) + 1
    # Build (count, word) pairs so sorting puts the most frequent words first.
    wordfrehigh = sorted(((fy, wd) for wd, fy in wordfile.items()), reverse=True)
    for wd in wordfrehigh:
        print(wd)
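
# The two halves of this file can also be combined: WordCloud accepts a
# word -> count mapping directly via generate_from_frequencies, bypassing its
# own tokenizer. A sketch, assuming counts is a dict like the one Chineseword
# builds ("freq_cloud.png" is an illustrative file name):
#
#   w = wordcloud.WordCloud(font_path="STSONG.TTF", background_color="white")
#   w.generate_from_frequencies(counts)
#   w.to_file("freq_cloud.png")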
if __name__ == "__main__":
    English_and_Chinese("file.txt")
    word_cloud_English_and_Chinese("file.txt")