import jieba.analyse
import jieba.posseg
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


# Preprocessing: tokenization, stopword removal, part-of-speech filtering
def dataPrepos(text, stopkey):
    words = []
    pos = ['n', 'nz', 'v', 'vd', 'vn', 'l', 'a', 'd']  # POS tags to keep
    seg = jieba.posseg.cut(text)  # segment the text with POS tagging
    for i in seg:
        if i.word not in stopkey and i.flag in pos:  # stopword removal + POS filter
            words.append(i.word)
    return words
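
# Illustrative call (a sketch; the exact tokens depend on jieba's dictionary):
#   dataPrepos("自然语言处理很有趣", stopkey)
# returns the segmented words whose POS tag appears in `pos` and which are
# not listed in `stopkey`.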


# TF-IDF: extract the topK keywords from each document in the corpus
def getKeywords_tfidf(corpus, stopkey, topK):
    # 0. Tokenize, drop stopwords, and filter POS with dataPrepos, then re-join
    #    each document with spaces so CountVectorizer can split on whitespace
    #    (note: its default token_pattern drops single-character tokens)
    corpus = [" ".join(dataPrepos(text, stopkey)) for text in corpus]
    # 1. Build the term-frequency matrix of the corpus
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)  # term-frequency matrix
    # 2. Compute each term's tf-idf weight
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)
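    # Note: with default settings (smooth_idf=True, norm='l2'), TfidfTransformer
    # computes idf(t) = ln((1 + n) / (1 + df(t))) + 1 over the n documents and
    # then L2-normalizes each document's row of tf-idf weights.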
    # 3. Vocabulary of the bag-of-words model
    word = vectorizer.get_feature_names_out()
    # 4. Dense tf-idf matrix: weight[i][j] is the weight of word j in document i
    weight = tfidf.toarray()
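    # weight has shape (n_documents, vocabulary_size); toarray() densifies the
    # sparse matrix, which is acceptable for a small corpus like this one.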
    # 5. Print every word's weight, rank per document, and keep the topK
    keys = []
    for i in range(len(weight)):
        print("------- tf-idf weights for document", i + 1, "-------")
        df_word, df_weight = [], []  # this document's words and matching weights
        for j in range(len(word)):
            print(word[j], weight[i][j])
            df_word.append(word[j])
            df_weight.append(weight[i][j])
        df_word = pd.DataFrame(df_word, columns=['word'])
        df_weight = pd.DataFrame(df_weight, columns=['weight'])
        word_weight = pd.concat([df_word, df_weight], axis=1)  # join words with their weights
        word_weight = word_weight.sort_values(by="weight", ascending=False)  # sort by weight, descending
        keyword = np.array(word_weight['word'])  # word column as an array
        word_split = " ".join(keyword[:topK])  # keep the topK words as keywords
        keys.append(word_split)
    return keys
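

# For comparison: jieba bundles its own TF-IDF extractor, jieba.analyse.extract_tags,
# which collapses the pipeline above into a single call. A minimal sketch; the
# allowPOS whitelist mirrors dataPrepos and is an assumption, not part of the
# original script.
def getKeywords_jieba_tfidf(text, topK):
    return jieba.analyse.extract_tags(
        text, topK=topK,
        allowPOS=('n', 'nz', 'v', 'vd', 'vn', 'l', 'a', 'd'))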


def main():
    # Read the source file as one document
    dataFile = 'data/sample.txt'
    with open(dataFile, 'r', encoding='utf-8') as file:
        corpus = file.read().strip()  # whole file, surrounding whitespace removed

    # Stopword list, one word per line
    with open('data/stopWord.txt', 'r', encoding='utf-8') as f:
        stopkey = [w.strip() for w in f]

    # TF-IDF keyword extraction; the whole file is passed as a single document
    result = getKeywords_tfidf([corpus], stopkey, 10)
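
    # getKeywords_tfidf returns one space-separated keyword string per input
    # document; with a single document here, result is a one-element list.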

    # Collect the results in a DataFrame and export as CSV
    result_df = pd.DataFrame({"key": result})
    result_df.to_csv("data/keys_TFIDF.txt", index=False)


if __name__ == '__main__':
    main()