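"""Pick out the danmaku (bullet comments) most strongly related to AI.

Builds a gensim TF-IDF model over all danmaku of a Bilibili video about
AI applications at the 2024 Paris Olympics, then ranks each danmaku by
its cosine similarity to an AI-themed reference text.
"""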
import re

import jieba
from gensim import corpora, models, similarities

def read_file(path):
    """Read a UTF-8 text file and return its lines as a list."""
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    return lines

def analyse_vector_tfidf(analyse_path, test_path, userdict_path):
    """Extract the danmaku most strongly related to AI."""
    # Load a custom user dictionary for jieba (optional)
    # jieba.load_userdict(userdict_path)
    stop_words = []
    # Danmaku to analyse; their token lists are collected in all_doc
    all_lines = read_file(analyse_path)
    all_doc = []
    # Reference text about AI applications
    test_lines = read_file(test_path)
    test_doc = []
    # Tokenize the danmaku dataset to be analysed
    for line in all_lines:
        # Strip punctuation (Chinese and ASCII) with a regex
        line = re.sub(r'[^\w\s]', '', line)
        token = [word for word in jieba.lcut(line.strip())
                 if word not in stop_words and len(word) > 1]
        all_doc.append(token)
    # Tokenize the AI reference text into one flat token list
    for line in test_lines:
        line = re.sub(r'[^\w\s]', '', line)
        token = [word for word in jieba.lcut(line.strip())
                 if word not in stop_words and len(word) > 1]
        test_doc += token
    # Build the dictionary (bag of words) over all danmaku
    dictionary = corpora.Dictionary(all_doc)
    # Build the corpus: each document as (word id, word frequency) pairs
    corpus = [dictionary.doc2bow(doc) for doc in all_doc]
    test_doc_vec = [dictionary.doc2bow(test_doc)]
    # TF-IDF weight of every word
    tfidf = models.TfidfModel(corpus)
    # Cosine similarity between the reference text and every danmaku
    sparse_matrix = similarities.SparseMatrixSimilarity(
        tfidf[corpus], num_features=len(dictionary))
    sim = sparse_matrix[tfidf[test_doc_vec]][0]
    # Rank danmaku indices by descending similarity to the reference text
    sim = sorted(enumerate(sim), key=lambda item: item[1], reverse=True)
    print(sim)
    # Spot-check: print the danmaku ranked 101st by similarity
    print(all_lines[sim[100][0]])
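
def print_top_matches(lines, ranked_sim, n=10):
    """Hypothetical helper (an assumption, not in the original script):
    print the n danmaku most similar to the reference text along with
    their similarity scores, instead of dumping the whole ranking."""
    for idx, score in ranked_sim[:n]:
        print(f'{score:.4f}\t{lines[idx].strip()}')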
if __name__ == '__main__':
    path = './关于b站视频2024年巴黎运动会AI应用的弹幕.txt'
    test_path = './巴黎奥运会AI文本库.txt'
    userdict_path = ''
    analyse_vector_tfidf(path, test_path, userdict_path)