import re

import jieba
from gensim import corpora, models, similarities
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def read_file(path):
    '''
    Read a file and return its lines as a list.
    '''
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    return lines

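# A minimal sketch (not used by the original flow) of how a stop-word list could be
# loaded with read_file, assuming a hypothetical one-word-per-line file such as
# './stopwords.txt'; the helper name and path are illustrative, not from the source.
def load_stop_words(path):
    # Strip whitespace and drop empty lines so each entry is a clean token.
    return [word.strip() for word in read_file(path) if word.strip()]
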
def analyse_vector_tfidf(analyse_path, test_path, userdict_path):
    '''
    Extract the danmaku (bullet comments) most strongly related to AI.
    '''
    # Load the custom dictionary
    # jieba.load_userdict(userdict_path)
    stop_words = []
    # Tokenize the full dataset and store it in all_doc
    all_lines = read_file(analyse_path)
    all_doc = []
    # Reference text data related to AI applications
    test_lines = read_file(test_path)
    test_doc = []
    # Start tokenizing
    # Tokenize the dataset to be analysed
    for line in all_lines:
        # Remove punctuation (including Chinese punctuation) with a regular expression
        line = re.sub(r'[^\w\s]', '', line)
        token = [word for word in jieba.lcut(line.strip()) if word not in stop_words and len(word) > 1]
        all_doc.append(token)
    # Tokenize the AI-related reference text
    for line in test_lines:
        line = re.sub(r'[^\w\s]', '', line)
        token = [word for word in jieba.lcut(line.strip()) if word not in stop_words and len(word) > 1]
        # Accumulate all reference tokens into a single query document
        test_doc += token
    # Build the dictionary (bag of words)
    dictionary = corpora.Dictionary(all_doc)
    # Build the corpus, i.e. a bag-of-words model with term frequencies (word id, word frequency)
    corpus = [dictionary.doc2bow(doc) for doc in all_doc]
    test_doc_vec = [dictionary.doc2bow(test_doc)]
    # TF-IDF weight of every word
    tfidf = models.TfidfModel(corpus)
    # Similarity query (Sim) against the TF-IDF corpus
    sparse_matrix = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary.keys()))
    sim = sparse_matrix[tfidf[test_doc_vec]][0]
    # Sort (line index, similarity score) pairs by score in descending order
    sim = sorted(enumerate(sim), key=lambda item: item[1], reverse=True)
    print(sim)
    # Spot-check: print the original line ranked at index 100 by similarity
    print(all_lines[sim[100][0]])

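# A small illustrative helper (an assumption, not part of the original script): given the
# sorted (line_index, score) pairs produced in analyse_vector_tfidf and the raw lines,
# return the n most AI-related danmaku lines together with their similarity scores.
def top_n_similar(sim, all_lines, n=10):
    # Each entry of sim is (index into all_lines, cosine similarity under TF-IDF).
    return [(all_lines[idx].strip(), float(score)) for idx, score in sim[:n]]
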
if __name__ == '__main__':
    # analyse_vector_tfidf()
    path = './关于b站视频2024年巴黎运动会AI应用的弹幕.txt'
    test_path = './巴黎奥运会AI文本库.txt'
    userdict_path = ''
    analyse_vector_tfidf(path, test_path, userdict_path)