import jieba
import re
from gensim import corpora, models, similarities
import wordcloud
import pandas as pd
import imageio


class processor:
    def __init__(self, data_path, train_path, userdict, exl_path, wordcloud_path):
        # Path to the danmaku (bullet-comment) data file
        self.data_path = data_path
        # Path to the reference file used for similarity comparison
        self.train_path = train_path
        # Path to the custom user dictionary for jieba
        self.userdict = userdict
        # Path of the Excel file to write
        self.exl_path = exl_path
        # Path of the word-cloud image to write
        self.wordcloud_path = wordcloud_path
        # Stop-word list
        self.stop_words = ['什么', '亚洲', '官方', '俄罗斯']
        # Bag-of-words dictionary built from the corpus
        self.dictionary = None

    def read_file(self, path):
        '''
        Read a file and return its lines as a list.
        '''
        with open(path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        return lines

    def save_file(self, word_list):
        path = './temp.txt'
        with open(path, 'w+', encoding='utf-8') as f:
            f.write(''.join(word_list))
        return path

    def analyse_vector_tfidf(self):
        '''
        Extract the danmaku that are most strongly related to AI.
        '''
        # Load the custom dictionary
        jieba.load_userdict(self.userdict)
        # All tokenized danmaku lines go into all_doc
        all_lines = self.read_file(self.data_path)
        all_doc = []
        # Reference text about AI applications
        test_lines = self.read_file(self.train_path)
        test_doc = []
        # Tokenize the dataset to be analysed
        for line in all_lines:
            # Strip punctuation with a regex; an empty line simply gets zero similarity
            line = re.sub(r'[^\w\s]', '', line)
            token = [word for word in jieba.lcut(line.strip())
                     if word not in self.stop_words and len(word) > 1]
            all_doc.append(token)
        # Tokenize the AI reference text
        for line in test_lines:
            line = re.sub(r'[^\w\s]', '', line)
            token = [word for word in jieba.lcut(line.strip())
                     if word not in self.stop_words and len(word) > 1]
            if len(test_doc) == 0:
                test_doc = token
            else:
                test_doc += token
        # Build the bag-of-words dictionary
        self.dictionary = corpora.Dictionary(all_doc)
        # Build the corpus: bag-of-words vectors of (token id, term frequency)
        corpus = [self.dictionary.doc2bow(doc) for doc in all_doc]
        test_doc_vec = [self.dictionary.doc2bow(test_doc)]
        # TF-IDF weight for every token
        tfidf = models.TfidfModel(corpus)
        # Sparse cosine-similarity index over the TF-IDF corpus
        sparse_matrix = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                            num_features=len(self.dictionary.keys()))
        sim = sparse_matrix[tfidf[test_doc_vec]][0]
        sim = sorted(enumerate(sim), key=lambda item: item[1], reverse=True)
        # print(sim)
        # Keep the danmaku whose similarity reaches the threshold
        key_words = []
        for i in range(len(all_lines)):
            if sim[i][1] >= 0.135:
                key_words.append(all_lines[sim[i][0]])
            else:
                break
        self.save_file(key_words)

    def draw_wordcloud(self):
        # Build the word-frequency table
        key_word_dict = {}
        key_words = self.read_file('temp.txt')
        key_words_list = []
        for line in key_words:
            # Strip punctuation with a regex
            line = re.sub(r'[^\w\s]', '', line)
            token = [word for word in jieba.lcut(line.strip())
                     if word not in self.stop_words and len(word) > 1]
            if len(key_words_list) == 0:
                key_words_list = token
            else:
                key_words_list += token
        key_words_vec = self.dictionary.doc2bow(key_words_list)
        # print(key_words_vec)
        for tup in key_words_vec:
            key_word_dict[self.dictionary[tup[0]]] = tup[1]
        key_word_dict = sorted(key_word_dict.items(), key=lambda x: x[1], reverse=True)
        print(key_word_dict)
        # list = []
        # for k,v in key_words_list:
        #     list.append(k+' '+v)
        self.save_file([f'{items[0]} {items[1]} ' for items in key_word_dict])
        # Save the Excel sheet
        self.save_to_excel(key_word_dict)
        # Draw the word cloud
        # A Chinese font path is required, otherwise Chinese characters will not render
        w = wordcloud.WordCloud(background_color='white',
                                mask=imageio.imread('./AI.png'),
                                font_path='./24华康POP2体.TTC')
        w.generate(str(self.read_file('./temp.txt')))
        w.to_file(self.wordcloud_path)

    def save_to_excel(self, key_word_dict):
        excel_data = pd.DataFrame(key_word_dict)
        excel_data.to_excel(self.exl_path)


if __name__ == '__main__':
    # analyse_vector_tfidf()
    path = './关于b站视频2024年巴黎运动会AI应用的弹幕.txt'
    test_path = './巴黎奥运会AI文本库.txt'
    userdict_path = './AI专有名词.txt'
    exl_path = './b站视频2024年巴黎运动会AI应用的弹幕的统计表格.xlsx'
    wordcloud_path = './AImap.png'
    process = processor(path, test_path, userdict_path, exl_path, wordcloud_path)
    process.analyse_vector_tfidf()
    process.draw_wordcloud()
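

# ---------------------------------------------------------------------------
# A minimal, self-contained sketch (not part of the original pipeline) of the
# doc2bow -> TfidfModel -> SparseMatrixSimilarity flow that
# analyse_vector_tfidf() relies on. The toy documents, the query, and the
# function name _tfidf_similarity_demo are hypothetical and exist only for
# illustration; call the function manually to sanity-check the gensim steps.
# ---------------------------------------------------------------------------
def _tfidf_similarity_demo():
    # Hypothetical, already-tokenized toy documents and query
    toy_docs = [['AI', 'judge', 'score'],
                ['camera', 'replay'],
                ['AI', 'caption', 'translate']]
    toy_query = ['AI', 'caption']
    # Same steps as analyse_vector_tfidf(): dictionary -> bow corpus -> TF-IDF -> similarity index
    toy_dict = corpora.Dictionary(toy_docs)
    toy_corpus = [toy_dict.doc2bow(doc) for doc in toy_docs]
    toy_tfidf = models.TfidfModel(toy_corpus)
    toy_index = similarities.SparseMatrixSimilarity(toy_tfidf[toy_corpus],
                                                    num_features=len(toy_dict))
    # Cosine similarity of every toy document against the query, ranked high to low
    sims = toy_index[toy_tfidf[toy_dict.doc2bow(toy_query)]]
    return sorted(enumerate(sims), key=lambda item: item[1], reverse=True)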