import jieba
import re
from gensim import corpora, models, similarities
import wordcloud
import pandas as pd
import imageio


class processor():
    def __init__(self, data_path, train_path, userdict, exl_path, wordcloud_path):
        # path of the raw danmaku (bullet-comment) data file
        self.data_path = data_path
        # path of the reference file used for the similarity comparison
        self.train_path = train_path
        # path of the custom user dictionary for jieba
        self.userdict = userdict
        # path of the Excel file to write
        self.exl_path = exl_path
        # path of the word-cloud image to write
        self.wordcloud_path = wordcloud_path
        # stop words filtered out of the tokenized text
        self.stop_words = ['什么', '亚洲', '官方', '俄罗斯']
        # bag-of-words dictionary of this corpus
        self.dictionary = None
    def read_file(self, path):
        '''
        Read a file and return its lines as a list.
        '''
        with open(path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        return lines
    def save_file(self, word_list):
        '''
        Write the given strings to a temporary file and return its path.
        '''
        path = './temp.txt'
        with open(path, 'w+', encoding='utf-8') as f:
            f.write(''.join(word_list))
        return path
    def analyse_vector_tfidf(self):
        '''
        Extract the danmaku (bullet comments) that are
        most strongly related to AI.
        '''
        # load the custom user dictionary
        jieba.load_userdict(self.userdict)
        # all tokenized danmaku are collected in all_doc
        all_lines = self.read_file(self.data_path)
        all_doc = []
        # reference text related to AI applications
        test_lines = self.read_file(self.train_path)
        test_doc = []
        # tokenize the dataset to be analysed
        for line in all_lines:
            # strip punctuation with a regular expression; an empty line has zero similarity
            line = re.sub(r'[^\w\s]', '', line)
            token = [word for word in jieba.lcut(line.strip()) if word not in self.stop_words and len(word) > 1]
            all_doc.append(token)
        # tokenize the AI-related reference text into a single document
        for line in test_lines:
            line = re.sub(r'[^\w\s]', '', line)
            token = [word for word in jieba.lcut(line.strip()) if word not in self.stop_words and len(word) > 1]
            if len(test_doc) == 0:
                test_doc = token
            else:
                test_doc += token
        # build the bag-of-words dictionary
        self.dictionary = corpora.Dictionary(all_doc)
        # build the corpus, i.e. bag-of-words vectors of (word ID, word frequency)
        corpus = [self.dictionary.doc2bow(doc) for doc in all_doc]
        test_doc_vec = [self.dictionary.doc2bow(test_doc)]
        # TF-IDF weight of every word
        tfidf = models.TfidfModel(corpus)
        # sparse-matrix similarity index over the TF-IDF corpus
        sparse_matrix = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(self.dictionary.keys()))
        sim = sparse_matrix[tfidf[test_doc_vec]][0]
        sim = sorted(enumerate(sim), key=lambda item: item[1], reverse=True)
        # print(sim)
        # keep the danmaku whose similarity is above the threshold
        key_words = []
        for i in range(len(all_lines)):
            if sim[i][1] >= 0.135:
                key_words.append(all_lines[sim[i][0]])
            else:
                break
        self.save_file(key_words)
    def draw_wordcloud(self):
        # build the word-frequency table
        key_word_dict = {}
        key_words = self.read_file('./temp.txt')
        key_words_list = []
        for line in key_words:
            # strip punctuation with a regular expression; an empty line has zero similarity
            line = re.sub(r'[^\w\s]', '', line)
            token = [word for word in jieba.lcut(line.strip()) if word not in self.stop_words and len(word) > 1]
            if len(key_words_list) == 0:
                key_words_list = token
            else:
                key_words_list += token
        key_words_vec = self.dictionary.doc2bow(key_words_list)
        # print(key_words_vec)
        for tup in key_words_vec:
            key_word_dict[self.dictionary[tup[0]]] = tup[1]
        key_word_dict = sorted(key_word_dict.items(), key=lambda x: x[1], reverse=True)
        print(key_word_dict)
        self.save_file([f'{items[0]} {items[1]} ' for items in key_word_dict])
        # save the Excel table
        self.save_to_excel(key_word_dict)
        # draw the word cloud
        # a Chinese font path is required, otherwise Chinese characters are not rendered
        w = wordcloud.WordCloud(background_color='white',
                                mask=imageio.imread('./AI.png'),
                                font_path='./24华康POP2体.TTC')
        w.generate(str(self.read_file('./temp.txt')))
        w.to_file(self.wordcloud_path)
    def save_to_excel(self, key_word_dict):
        # key_word_dict is a list of (word, count) tuples sorted by count
        excel_data = pd.DataFrame(key_word_dict, columns=['word', 'count'])
        excel_data.to_excel(self.exl_path)
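

# ---------------------------------------------------------------------------
# Illustrative alternative (a sketch, not used by the class above): the word
# cloud can also be built directly from the (word, count) pairs produced in
# draw_wordcloud() via WordCloud.generate_from_frequencies, instead of
# re-reading temp.txt as a string. The mask image and font path repeat the
# ones used in draw_wordcloud(); out_path is a hypothetical output file name.
# ---------------------------------------------------------------------------
def _wordcloud_from_frequencies_demo(key_word_dict, out_path='./AImap_demo.png'):
    # key_word_dict: list of (word, count) tuples, as built in draw_wordcloud()
    freqs = dict(key_word_dict)
    w = wordcloud.WordCloud(background_color='white',
                            mask=imageio.imread('./AI.png'),
                            font_path='./24华康POP2体.TTC')
    w.generate_from_frequencies(freqs)
    w.to_file(out_path)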


if __name__ == '__main__':
    # input danmaku file, AI reference corpus, jieba user dictionary,
    # output Excel file and output word-cloud image
    path = './关于b站视频2024年巴黎运动会AI应用的弹幕.txt'
    test_path = './巴黎奥运会AI文本库.txt'
    userdict_path = './AI专有名词.txt'
    exl_path = './b站视频2024年巴黎运动会AI应用的弹幕的统计表格.xlsx'
    wordcloud_path = './AImap.png'
    process = processor(path, test_path, userdict_path, exl_path, wordcloud_path)
    process.analyse_vector_tfidf()
    process.draw_wordcloud()
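

# ---------------------------------------------------------------------------
# Illustrative sketch (not called above): a minimal, self-contained example of
# the gensim TF-IDF similarity pipeline that analyse_vector_tfidf() relies on.
# The toy documents and query below are made up purely for demonstration.
# ---------------------------------------------------------------------------
def _tfidf_similarity_demo():
    # tokenized "documents" (in the real pipeline these come from jieba.lcut)
    docs = [['ai', 'referee', 'score'], ['weather', 'rain'], ['ai', 'referee', 'medal']]
    query = ['ai', 'score']
    # bag-of-words dictionary and corpus of (word ID, frequency) vectors
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(d) for d in docs]
    # TF-IDF weighting learned from the corpus
    tfidf = models.TfidfModel(corpus)
    # sparse similarity index, queried with the TF-IDF vector of the query
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
    sims = index[tfidf[dictionary.doc2bow(query)]]
    # documents with the highest score are the most related to the query
    print(sorted(enumerate(sims), key=lambda x: x[1], reverse=True))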