parent
57aa171824
commit
bb625c67b8
@ -0,0 +1,130 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
import pandas as pd
|
||||||
|
from collections import Counter
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
|
import spacy
|
||||||
|
from spacy.lang.zh import Chinese
|
||||||
|
|
||||||
|
# Input and output file paths (relative paths, resolved against the
# current working directory).
sample_file = '样本.txt'
standard_file = '标准.txt'
output_file = '评论提取结果.txt'
excel_file = '评论提取结果.xlsx'

# Create the spaCy Chinese language object (class imported from
# spacy.lang.zh).
# NOTE(review): `nlp` is not referenced by any function visible in this
# file -- confirm whether it is still needed.
nlp = Chinese()
|
||||||
|
|
||||||
|
|
||||||
|
def read_file(file_path):
    """Return the full contents of *file_path*, decoded as UTF-8.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file's text as a single string.
    """
    with open(file_path, encoding='utf-8') as source:
        contents = source.read()
    return contents
|
||||||
|
|
||||||
|
|
||||||
|
def clean_text(text):
    """Normalise *text* to word characters separated by single spaces.

    Every run of non-word characters (punctuation, symbols, whitespace)
    is replaced by one space, whitespace runs are collapsed, and leading
    and trailing spaces are removed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # Inner call: swap non-word runs for a space.
    # Outer call: squeeze any remaining whitespace runs.
    squeezed = re.sub(r'\s+', ' ', re.sub(r'\W+', ' ', text))
    return squeezed.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_keywords_using_tfidf(text, top_n=10):
    """Return the highest-scoring TF-IDF terms of *text*.

    The vectorizer is fitted on *text* as one single document.

    NOTE(review): with only one document the IDF component cannot tell
    terms apart, so the ranking is presumably driven by term frequency
    alone -- confirm this is intended.

    Args:
        text: The (already cleaned) text to analyse.
        top_n: Maximum number of terms to return.

    Returns:
        A list of ``(term, score)`` tuples sorted by descending score,
        at most *top_n* long. Blank or empty *text* yields ``[]``.
    """
    # Fitting a vectorizer on blank text fails with an "empty vocabulary"
    # ValueError; there are simply no keywords to report in that case.
    if not text or not text.strip():
        return []

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()

    # Only one document was fitted, so row 0 holds every score.
    # toarray() gives a plain ndarray instead of the legacy np.matrix
    # returned by todense().
    tfidf_scores = tfidf_matrix.toarray()[0].tolist()

    keywords = sorted(
        zip(feature_names, tfidf_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return keywords[:top_n]
|
||||||
|
|
||||||
|
|
||||||
|
def get_high_frequency_words(text, num=20):
    """Count whitespace-separated words and return the most common ones.

    Args:
        text: Text whose words are separated by whitespace.
        num: Maximum number of words to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    return Counter(text.split()).most_common(num)
|
||||||
|
|
||||||
|
|
||||||
|
def get_related_keywords(high_freq_words, standard_keywords):
    """Select the high-frequency words that are also standard keywords.

    Args:
        high_freq_words: Iterable of candidate words.
        standard_keywords: Container of reference keywords.

    Returns:
        A list of the candidates found in *standard_keywords*, in their
        original order (duplicates are kept).
    """
    matched = []
    for candidate in high_freq_words:
        if candidate in standard_keywords:
            matched.append(candidate)
    return matched
|
||||||
|
|
||||||
|
|
||||||
|
def get_similar_sentences(text, keywords):
    """Return the sentences of *text* that mention any keyword, with counts.

    The text is split into sentences on ASCII and full-width (Chinese)
    sentence punctuation and on line breaks. A sentence is kept when at
    least one keyword occurs in it as a substring.

    Args:
        text: Text to search. It must still contain its punctuation,
            otherwise it cannot be split into sentences.
        keywords: Iterable of keyword strings. Empty strings are ignored
            because the empty string is a substring of every sentence.

    Returns:
        A list of ``(sentence, count)`` tuples ordered from most to least
        frequent; ``[]`` when nothing matches.
    """
    # Materialise once: the keywords are scanned for every sentence, so a
    # one-shot iterable would be exhausted after the first sentence.
    needles = [keyword for keyword in keywords if keyword]
    if not needles:
        return []

    # Besides the ASCII marks, split on their full-width counterparts used
    # in Chinese text. Line breaks also end a sentence, so text on separate
    # lines is never merged into one entry.
    fragments = (part.strip() for part in re.split(r'[.!?,。!?,\r\n]+', text))

    relevant_sentences = [
        sentence
        for sentence in fragments
        if sentence and any(needle in sentence for needle in needles)
    ]
    return Counter(relevant_sentences).most_common()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Compare the sample text with the standard text and save a report.

    Steps:
      1. Read both input files and clean them.
      2. Extract the top TF-IDF terms of each cleaned text.
      3. Keep the sample terms that also occur among the standard keywords.
      4. Collect the sample sentences that mention those shared terms.
      5. Write a plain-text report and an Excel workbook.

    Problems while writing the workbook are reported on stdout instead of
    being raised.
    """
    sample_text = read_file(sample_file)
    standard_text = read_file(standard_file)

    cleaned_sample_text = clean_text(sample_text)
    cleaned_standard_text = clean_text(standard_text)

    # Keywords of the standard (reference) text.
    standard_keywords_data = extract_keywords_using_tfidf(cleaned_standard_text, top_n=10)
    standard_keywords = [word for word, _ in standard_keywords_data]
    print(f"标准文本中提取到的关键词: {standard_keywords}")

    # Top-scoring terms of the sample text.
    high_freq_words_data = extract_keywords_using_tfidf(cleaned_sample_text, top_n=20)
    high_freq_words = [word for word, _ in high_freq_words_data]
    print(f"样本文本中提取到的高频词汇: {high_freq_words}")

    # Sample terms that are also standard keywords.
    related_keywords = get_related_keywords(high_freq_words, standard_keywords)
    print(f"与标准关键字相关的高频词汇:{related_keywords}")

    # Sentence splitting relies on punctuation, which clean_text() strips.
    # Search the raw sample text; the cleaned text would come back as one
    # single "sentence".
    high_freq_sentences = get_similar_sentences(sample_text, related_keywords)

    # Save the results to the text report.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("标准关键词:\n")
        for word, score in standard_keywords_data:
            f.write(f"{word}: {score}\n")

        f.write("\n样本高频词汇:\n")
        for word, score in high_freq_words_data:
            f.write(f"{word}: {score}\n")

        f.write("\n相关高频句子:\n")
        for sentence, freq in high_freq_sentences:
            f.write(f"{sentence.strip()}: {freq}\n")

    # Build the Excel sheets.
    high_freq_df = pd.DataFrame(high_freq_words_data, columns=['词汇', 'TF-IDF值'])
    related_df = pd.Series(related_keywords, name='相关高频词汇').to_frame()

    try:
        # Removing the stale workbook happens inside the try block: it can
        # raise PermissionError while the file is open in another program,
        # which is exactly the case reported below.
        if os.path.exists(excel_file):
            os.remove(excel_file)

        with pd.ExcelWriter(excel_file) as writer:
            high_freq_df.to_excel(writer, sheet_name='样本高频词汇', index=False)
            related_df.to_excel(writer, sheet_name='相关高频词汇', index=False)
    except PermissionError:
        print(f"无法写入文件 '{excel_file}',请确保该文件未在其他程序中打开。")
    except Exception as e:
        print(f"文件写入过程中发生错误: {e}")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
Loading…
Reference in new issue