From bb625c67b8fdc64110d2b5e9e74811f4e0f5315c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E8=8D=A3=E6=9D=B0?= <2986694301@qq.com>
Date: Thu, 31 Oct 2024 11:37:00 +0800
Subject: [PATCH] =?UTF-8?q?=E8=AF=84=E8=AE=BA=E6=80=BB=E7=BB=93?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 练习二.py | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 练习二.py

diff --git a/练习二.py b/练习二.py
new file mode 100644
index 0000000..79fa3e0
--- /dev/null
+++ b/练习二.py
@@ -0,0 +1,130 @@
+import os
+import re
+import pandas as pd
+from collections import Counter
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import spacy
+from spacy.lang.zh import Chinese
+
+# File paths for the two inputs and the two reports (bare names, so resolved against the working directory)
+sample_file = '样本.txt'
+standard_file = '标准.txt'
+output_file = '评论提取结果.txt'
+excel_file = '评论提取结果.xlsx'
+
+# Instantiate spaCy's Chinese language class; NOTE(review): `nlp` is not used by any function in this patch
+nlp = Chinese()
+
+
+def read_file(file_path):
+    """Return the whole content of the text file at ``file_path``, decoded as UTF-8."""
+    with open(file_path, mode='r', encoding='utf-8') as source:
+        return source.read()
+
+
+def clean_text(text):
+    """Return ``text`` with every run of non-word characters collapsed to one space and the ends trimmed."""
+    # Punctuation, symbols and whitespace are all non-word characters, so each run becomes a single space.
+    without_symbols = re.sub(r'\W+', ' ', text)
+    single_spaced = re.sub(r'\s+', ' ', without_symbols)
+    return single_spaced.strip()
+
+
+def extract_keywords_using_tfidf(text, top_n=10):
+    """Return up to ``top_n`` ``(term, score)`` pairs from ``text``, highest TF-IDF score first.
+
+    ``text`` is fitted as a one-document corpus with a default ``TfidfVectorizer``. Returns ``[]``
+    when the vectorizer finds no terms (e.g. empty text) instead of crashing with its ``ValueError``.
+    """
+    vectorizer = TfidfVectorizer()
+    try:
+        scores = vectorizer.fit_transform([text]).toarray()[0].tolist()
+    except ValueError:  # raised by scikit-learn when the fitted vocabulary is empty
+        return []
+    return sorted(zip(vectorizer.get_feature_names_out(), scores), key=lambda p: p[1], reverse=True)[:top_n]
+
+
+def get_high_frequency_words(text, num=20):
+    """Return the ``num`` most common tokens of ``text`` as ``(token, count)`` pairs."""
+    # Tokens are whatever str.split() yields, i.e. runs of non-whitespace characters.
+    frequencies = Counter(text.split())
+    return frequencies.most_common(num)
+
+
+def get_related_keywords(high_freq_words, standard_keywords):
+    """Return the items of ``high_freq_words`` that also occur in ``standard_keywords``, order preserved."""
+    return list(filter(lambda candidate: candidate in standard_keywords, high_freq_words))
+
+
+def get_similar_sentences(text, keywords):
+    """Return ``(sentence, count)`` pairs for the sentences of ``text`` that contain any keyword.
+
+    Sentences end at ``.!?``, at the Chinese marks ``，。！？`` or at a line break. A sentence matches
+    when any item of ``keywords`` is a substring of it; pairs come from ``Counter.most_common()``.
+    """
+    # Full-width ！？ and line breaks are delimiters too, so Chinese reviews are not glued together.
+    pieces = re.split(r'[.!?，。！？\r\n]+', text)
+    sentences = [piece.strip() for piece in pieces if piece.strip()]
+
+    relevant_sentences = [s for s in sentences if any(keyword in s for keyword in keywords)]
+    return Counter(relevant_sentences).most_common()
+
+
+def main():
+    """Compare the sample text with the standard text and write the keyword/sentence reports.
+
+    Writes ``output_file`` (text) and ``excel_file`` (workbook); workbook errors are printed, not raised.
+    """
+    sample_text = read_file(sample_file)
+    standard_text = read_file(standard_file)
+    cleaned_sample_text = clean_text(sample_text)
+    cleaned_standard_text = clean_text(standard_text)
+
+    # Extract keywords from the standard text
+    standard_keywords_data = extract_keywords_using_tfidf(cleaned_standard_text, top_n=10)
+    standard_keywords = [word for word, score in standard_keywords_data]
+    print(f"标准文本中提取到的关键词: {standard_keywords}")
+
+    # Extract keywords from the sample text
+    high_freq_words_data = extract_keywords_using_tfidf(cleaned_sample_text, top_n=20)
+    high_freq_words = [word for word, score in high_freq_words_data]
+    print(f"样本文本中提取到的高频词汇: {high_freq_words}")
+
+    # Keep the sample keywords that also appear among the standard keywords
+    related_keywords = get_related_keywords(high_freq_words, standard_keywords)
+    print(f"与标准关键字相关的高频词汇：{related_keywords}")
+
+    # Split the raw text: clean_text() already replaced the punctuation get_similar_sentences() splits on
+    high_freq_sentences = get_similar_sentences(sample_text, related_keywords)
+
+    # Save the results to the text file
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write("标准关键词:\n")
+        for word, score in standard_keywords_data:
+            f.write(f"{word}: {score}\n")
+
+        f.write("\n样本高频词汇:\n")
+        for word, score in high_freq_words_data:
+            f.write(f"{word}: {score}\n")
+
+        f.write("\n相关高频句子:\n")
+        for sentence, freq in high_freq_sentences:
+            f.write(f"{sentence.strip()}: {freq}\n")
+
+    # Build the Excel workbook
+    high_freq_df = pd.DataFrame(high_freq_words_data, columns=['词汇', 'TF-IDF值'])
+    try:  # the old workbook is removed inside the try so a locked file reaches the handler below
+        if os.path.exists(excel_file):
+            os.remove(excel_file)
+        with pd.ExcelWriter(excel_file) as writer:
+            high_freq_df.to_excel(writer, sheet_name='样本高频词汇', index=False)
+            pd.Series(related_keywords, name='相关高频词汇').to_frame().to_excel(writer, sheet_name='相关高频词汇', index=False)
+    except PermissionError:
+        print(f"无法写入文件 '{excel_file}'，请确保该文件未在其他程序中打开。")
+    except Exception as e:
+        print(f"文件写入过程中发生错误: {e}")
+
+
+if __name__ == "__main__":  # run the pipeline only when this file is executed as a script
+    main()
\ No newline at end of file