import re

import jieba
from gensim import corpora, models, similarities


def read_file(path):
    '''Read a text file and return its lines as a list.'''
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    return lines


def analyse_vector_tfidf(analyse_path, test_path, userdict_path):
    '''Extract the danmaku (bullet comments) most strongly related to AI.'''
    # Optionally load a custom word dictionary for jieba.
    # jieba.load_userdict(userdict_path)
    stop_words = []

    # Danmaku to be analysed; the tokens of every line are collected in all_doc.
    all_lines = read_file(analyse_path)
    all_doc = []

    # Reference text about AI applications; its tokens form a single query document.
    test_lines = read_file(test_path)
    test_doc = []

    # Tokenise the danmaku to be analysed.
    for line in all_lines:
        # Strip punctuation (including Chinese punctuation) with a regular expression.
        line = re.sub(r'[^\w\s]', '', line)
        token = [word for word in jieba.lcut(line.strip())
                 if word not in stop_words and len(word) > 1]
        all_doc.append(token)

    # Tokenise the AI-related reference text and merge it into one document.
    for line in test_lines:
        line = re.sub(r'[^\w\s]', '', line)
        token = [word for word in jieba.lcut(line.strip())
                 if word not in stop_words and len(word) > 1]
        test_doc += token

    # Build the dictionary (bag-of-words vocabulary).
    dictionary = corpora.Dictionary(all_doc)

    # Build the corpus: a bag-of-words vector (word id, word frequency) per danmaku.
    corpus = [dictionary.doc2bow(doc) for doc in all_doc]
    test_doc_vec = dictionary.doc2bow(test_doc)

    # TF-IDF weights for every word.
    tfidf = models.TfidfModel(corpus)

    # Similarity between the reference document and every danmaku.
    sparse_matrix = similarities.SparseMatrixSimilarity(
        tfidf[corpus], num_features=len(dictionary))
    sim = sparse_matrix[tfidf[test_doc_vec]]
    sim = sorted(enumerate(sim), key=lambda item: item[1], reverse=True)

    # Print the ranked similarities and one ranked danmaku for inspection.
    print(sim)
    print(all_lines[sim[100][0]])


if __name__ == '__main__':
    path = './关于b站视频2024年巴黎运动会AI应用的弹幕.txt'
    test_path = './巴黎奥运会AI文本库.txt'
    userdict_path = ''
    analyse_vector_tfidf(path, test_path, userdict_path)
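
# ---------------------------------------------------------------------------
# Sketch: loading stop words from a file.
# The pipeline above initialises stop_words as an empty list, so the
# "word not in stop_words" filter is currently a no-op. The helper below is a
# minimal sketch of how such a list could be loaded; the file name passed to it
# (e.g. './stopwords.txt') is a hypothetical placeholder and is not part of the
# original script. To use it, assign stop_words = load_stopwords(...) near the
# top of analyse_vector_tfidf.
def load_stopwords(path):
    '''Return the set of stop words stored one per line in the given file.'''
    with open(path, 'r', encoding='utf-8') as f:
        return {line.strip() for line in f if line.strip()}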