import re

import imageio
import jieba
import pandas as pd
import wordcloud
from gensim import corpora, models, similarities


class processor():

    def __init__(self, data_path, train_path, userdict, exl_path, wordcloud_path):
        # Path of the raw danmaku (bullet-comment) data
        self.data_path = data_path
        # Path of the reference file used for similarity comparison
        self.train_path = train_path
        # Path of the user dictionary that extends jieba's vocabulary
        self.userdict = userdict
        # Path of the Excel file to write
        self.exl_path = exl_path
        # Path of the word-cloud image to write
        self.wordcloud_path = wordcloud_path
        # Stop words
        self.stop_words = ['什么', '亚洲', '官方', '俄罗斯']
        # Bag-of-words dictionary of the corpus, built in analyse_vector_tfidf()
        self.dictionary = None

    def read_file(self, path):
        '''Read a file and return its lines as a list.'''
        with open(path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        return lines

    def save_file(self, word_list):
        '''Write the joined word list to ./temp.txt and return the path.'''
        path = './temp.txt'
        with open(path, 'w', encoding='utf-8') as f:
            f.write(''.join(word_list))
        return path

    def analyse_vector_tfidf(self):
        '''Extract the danmaku that are most strongly related to AI.'''
        # Load the user dictionary
        jieba.load_userdict(self.userdict)
        # Collect the tokens of every line of the data set in all_doc
        all_lines = self.read_file(self.data_path)
        all_doc = []
        # Reference text about AI applications
        test_lines = self.read_file(self.train_path)
        test_doc = []
        # Tokenize the data set to be analysed
        for line in all_lines:
            # Strip punctuation with a regex; an empty line would score zero similarity
            line = re.sub(r'[^\w\s]', '', line)
            token = [word for word in jieba.lcut(line.strip())
                     if word not in self.stop_words and len(word) > 1]
            all_doc.append(token)
        # Tokenize the AI reference text into a single document
        for line in test_lines:
            line = re.sub(r'[^\w\s]', '', line)
            test_doc += [word for word in jieba.lcut(line.strip())
                         if word not in self.stop_words and len(word) > 1]
        # Build the bag-of-words dictionary
        self.dictionary = corpora.Dictionary(all_doc)
        # Build the corpus: a bag-of-words model with term frequencies (word id, word frequency)
        corpus = [self.dictionary.doc2bow(doc) for doc in all_doc]
        test_doc_vec = [self.dictionary.doc2bow(test_doc)]
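        # doc2bow returns sparse (word_id, count) pairs; e.g. a tokenized line such
        # as ['AI', '裁判', 'AI'] (illustrative tokens, not from the data) would map
        # to [(id_of_AI, 2), (id_of_裁判, 1)]. The reference document is wrapped in
        # a list so it forms a one-document corpus for the similarity query below.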
        # TF-IDF weight of every word
        tfidf = models.TfidfModel(corpus)
        # Cosine-similarity index over the TF-IDF corpus
        sparse_matrix = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(self.dictionary.keys()))
        sim = sparse_matrix[tfidf[test_doc_vec]][0]
        sim = sorted(enumerate(sim), key=lambda item: item[1], reverse=True)
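        # sim is now a list of (line_index, similarity) tuples, sorted by cosine
        # similarity against the reference document in descending order.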

        # Keep the danmaku whose similarity reaches the (empirically chosen) threshold
        key_words = []
        for i in range(len(all_lines)):
            if sim[i][1] >= 0.135:
                key_words.append(all_lines[sim[i][0]])
            else:
                # sim is sorted, so every later entry is also below the threshold
                break

        self.save_file(key_words)

    def draw_wordcloud(self):
        '''Build the word-frequency table and render the word cloud.

        Assumes analyse_vector_tfidf() has run first, so that self.dictionary
        and ./temp.txt exist.
        '''
        # Build the word-frequency table
        key_word_dict = {}
        key_words = self.read_file('./temp.txt')
        key_words_list = []
        for line in key_words:
            # Strip punctuation with a regex
            line = re.sub(r'[^\w\s]', '', line)
            key_words_list += [word for word in jieba.lcut(line.strip())
                               if word not in self.stop_words and len(word) > 1]
        key_words_vec = self.dictionary.doc2bow(key_words_list)
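        # doc2bow over the concatenated tokens yields (word_id, count) pairs,
        # which serve directly as the word-frequency data for the cloud.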

        # Map each (word_id, count) pair back to the word itself
        for tup in key_words_vec:
            key_word_dict[self.dictionary[tup[0]]] = tup[1]
        # Sort into a list of (word, frequency) tuples, most frequent first
        key_word_dict = sorted(key_word_dict.items(), key=lambda x: x[1], reverse=True)
        print(key_word_dict)

        # Persist the "word frequency" pairs for inspection
        self.save_file([f'{word} {freq} ' for word, freq in key_word_dict])

        # Save the Excel sheet
        self.save_to_excel(key_word_dict)
        # Draw the word cloud; a Chinese font path is required,
        # otherwise Chinese characters will not render
        w = wordcloud.WordCloud(background_color='white', mask=imageio.imread('./AI.png'),
                                font_path='./24华康POP2体.TTC')
        w.generate(' '.join(self.read_file('./temp.txt')))
        w.to_file(self.wordcloud_path)

    def save_to_excel(self, key_word_dict):
        # key_word_dict arrives as a list of (word, frequency) tuples
        execl_data = pd.DataFrame(key_word_dict, columns=['word', 'frequency'])
        execl_data.to_excel(self.exl_path)
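

# A minimal, self-contained sketch of the gensim TF-IDF similarity pipeline used
# in analyse_vector_tfidf(), shown on a toy corpus. The toy documents, the query,
# and the helper name _tfidf_demo are illustrative assumptions, not part of the
# original script.
def _tfidf_demo():
    docs = [['ai', 'referee'], ['ai', 'camera', 'replay'], ['football', 'goal']]
    query = ['ai', 'replay']
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(d) for d in docs]
    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
    scores = index[tfidf[dictionary.doc2bow(query)]]
    # scores[i] is the cosine similarity of docs[i] to the query
    print(sorted(enumerate(scores), key=lambda item: item[1], reverse=True))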


if __name__ == '__main__':
    path = './关于b站视频2024年巴黎运动会AI应用的弹幕.txt'
    test_path = './巴黎奥运会AI文本库.txt'
    userdict_path = './AI专有名词.txt'
    exl_path = './b站视频2024年巴黎运动会AI应用的弹幕的统计表格.xlsx'
    wordcloud_path = './AImap.png'
    process = processor(path, test_path, userdict_path, exl_path, wordcloud_path)
    process.analyse_vector_tfidf()
    process.draw_wordcloud()