import sys
import codecs
import pandas as pd
import numpy as np
import jieba.posseg
import jieba.analyse
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
def dataPrepos(text, stopkey):
    """Segment *text* and keep only the words worth indexing.

    The text is tokenised with ``jieba.posseg.cut``; a token is kept when
    its word is not a stop word and its part-of-speech flag is one of the
    selected flags below.

    Args:
        text: The raw string to segment.
        stopkey: Iterable of stop-word strings to discard.

    Returns:
        A list of the retained words, in the order they occur in *text*.
    """
    # Part-of-speech flags that are allowed through the filter.
    allowed_flags = {'n', 'nz', 'v', 'vd', 'vn', 'l', 'a', 'd'}
    # Build the lookup set once so each membership test is O(1) instead of
    # a linear scan over the stop-word list for every token.
    stop_words = set(stopkey)
    return [
        token.word
        for token in jieba.posseg.cut(text)
        if token.word not in stop_words and token.flag in allowed_flags
    ]
def getKeywords_tfidf(corpus, stopkey, topK):
    """Extract the top-weighted tf-idf words of every document in *corpus*.

    Each document is vectorised with ``CountVectorizer`` and re-weighted
    with ``TfidfTransformer``. For every document, each vocabulary word is
    printed with its weight, then the words are ranked by weight in
    descending order.

    Args:
        corpus: Iterable of document strings handed to
            ``CountVectorizer.fit_transform``.
        stopkey: Stop-word collection. It is not used by this function;
            the parameter is kept so existing callers keep working.
        topK: Maximum number of keywords to keep per document.

    Returns:
        A list with one string per document: that document's highest
        weighted words (at most *topK*), joined by single spaces.
    """
    # 1. Build the term-count matrix for the corpus.
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(corpus)
    # 2. Turn the raw counts into tf-idf weights.
    tfidf = TfidfTransformer().fit_transform(counts)
    # 3. Vocabulary of the bag-of-words model; word[j] labels column j.
    word = vectorizer.get_feature_names_out()
    # 4. Dense tf-idf matrix: one row per document.
    weight = tfidf.toarray()
    # 5. Print the weights and collect the keywords of each document.
    keys = []
    for i, doc_weights in enumerate(weight):
        print("-------这里输出第", i + 1, "篇文本的词语tf-idf------")
        for term, score in zip(word, doc_weights):
            print(term, score)
        # Pair every vocabulary word with its weight in this document and
        # rank the pairs from highest to lowest weight.
        word_weight = pd.DataFrame({'word': word, 'weight': doc_weights})
        word_weight = word_weight.sort_values(by="weight", ascending=False)
        # Slicing tolerates vocabularies smaller than topK.
        keys.append(" ".join(word_weight['word'].tolist()[:topK]))
    return keys
def main():
    """Run tf-idf keyword extraction on the sample text and save the result.

    Reads ``data/sample.txt`` as a single document and ``data/stopWord.txt``
    as the stop-word list, extracts the top 10 keywords with
    ``getKeywords_tfidf`` and writes them to ``data/keys_TFIDF.txt`` in CSV
    format under a single ``key`` column.
    """
    # Read the whole text file; strip() only trims leading and trailing
    # whitespace, interior blank lines are left in place.
    dataFile = 'data/sample.txt'
    with codecs.open(dataFile, 'r', encoding='utf-8') as file:
        corpus = file.read().strip()
    # Stop-word list, one entry per line. The context manager makes sure
    # the file handle is closed once the list has been read.
    with codecs.open('data/stopWord.txt', 'r', encoding='utf-8') as stop_file:
        stopkey = [w.strip() for w in stop_file.readlines()]
    # tf-idf keyword extraction; the entire text is passed as one document.
    # NOTE(review): the text reaches the vectorizer without going through
    # dataPrepos (word segmentation / stop-word filtering) -- confirm that
    # this is intended.
    result = getKeywords_tfidf([corpus], stopkey, 10)
    # Store the keywords in a DataFrame and export them as CSV.
    result_df = pd.DataFrame({"key": result}, columns=['key'])
    result_df.to_csv("data/keys_TFIDF.txt", index=False)
if __name__ == '__main__':