"""Summarise review comments against a reference text ("评论总结" exercise).

Pipeline (see ``main``):

1. read the sample file and the standard (reference) file;
2. clean both texts and rank their terms with TF-IDF;
3. keep the sample terms that also occur among the standard keywords;
4. collect the sample sentences that mention one of those terms;
5. write the results to a plain-text report and to an Excel workbook.
"""

import os
import re
from collections import Counter

import pandas as pd

# scikit-learn is only needed by ``extract_keywords_using_tfidf``; the import
# is guarded so the pure text helpers stay importable without it.  A clear
# ImportError is raised at the point of use instead.
try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
except ImportError:
    TfidfVectorizer = None
    cosine_similarity = None

# spaCy is not used by any function in this module.  The names are kept for
# backward compatibility, but a missing installation no longer prevents the
# script from running.
try:
    import spacy
    from spacy.lang.zh import Chinese
except ImportError:
    spacy = None
    Chinese = None

# File locations
sample_file = '样本.txt'
standard_file = '标准.txt'
output_file = '评论提取结果.txt'
excel_file = '评论提取结果.xlsx'

# Blank Chinese spaCy pipeline (``None`` when spaCy is not installed)
nlp = Chinese() if Chinese is not None else None

# Sentence delimiters: the ASCII marks ``.!?,`` and the Chinese full stop,
# plus their full-width counterparts and line breaks, so that Chinese
# comments (often one per line) are separated correctly.
_SENTENCE_SPLIT_RE = re.compile(r'[.!?,。！？，\r\n]+')


def read_file(file_path):
    """Return the whole content of ``file_path`` decoded as UTF-8.

    ``utf-8-sig`` is used so that a leading byte-order mark is dropped
    instead of ending up in the text as U+FEFF; files without a BOM are read
    exactly as with plain ``utf-8``.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        return f.read()


def clean_text(text):
    """Replace every run of non-word characters with one space and trim.

    ``\\W+`` also matches whitespace, so punctuation and repeated blanks are
    collapsed in a single pass.
    """
    return re.sub(r'\W+', ' ', text).strip()


def extract_keywords_using_tfidf(text, top_n=10):
    """Rank the terms of ``text`` by TF-IDF weight.

    Args:
        text: the (cleaned) document to analyse.
        top_n: maximum number of terms to return.

    Returns:
        A list of ``(term, score)`` pairs sorted by descending score, at most
        ``top_n`` long.  Empty or whitespace-only input yields ``[]``.

    Raises:
        ImportError: if scikit-learn is not installed.
        ValueError: raised by scikit-learn when the text contains no usable
            token.

    NOTE(review): the vectorizer is fitted on a single document, so the IDF
    part is constant and the score is effectively a normalised term
    frequency.  The default tokenizer also splits on word boundaries only, so
    unsegmented Chinese text produces whole phrases as "terms" - consider
    word segmentation before calling this.
    """
    if not text or not text.strip():
        return []
    if TfidfVectorizer is None:
        raise ImportError(
            "scikit-learn is required for TF-IDF keyword extraction"
        )

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()

    # Single document -> the first (only) row holds every term weight.
    tfidf_scores = tfidf_matrix.toarray()[0].tolist()
    keywords = sorted(
        zip(feature_names, tfidf_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return keywords[:top_n]


def get_high_frequency_words(text, num=20):
    """Return the ``num`` most common whitespace-separated words of ``text``.

    The result is a list of ``(word, count)`` pairs, most frequent first.
    """
    return Counter(text.split()).most_common(num)


def get_related_keywords(high_freq_words, standard_keywords):
    """Keep the words of ``high_freq_words`` found in ``standard_keywords``.

    The order (and any duplicates) of ``high_freq_words`` is preserved.
    """
    return [word for word in high_freq_words if word in standard_keywords]


def get_similar_sentences(text, keywords):
    """Count the sentences of ``text`` that contain at least one keyword.

    ``text`` must still contain its punctuation: sentences are obtained by
    splitting on ``.!?,。！？，`` and on line breaks.  Matching is a plain
    substring test.

    Args:
        text: raw (uncleaned) text.
        keywords: iterable of keyword strings; empty strings are ignored
            because they would match every sentence.

    Returns:
        A list of ``(sentence, count)`` pairs, most frequent first.
    """
    keywords = [keyword for keyword in keywords if keyword]
    if not keywords:
        return []

    sentences = (part.strip() for part in _SENTENCE_SPLIT_RE.split(text))
    relevant_sentences = [
        s for s in sentences
        if s and any(keyword in s for keyword in keywords)
    ]
    return Counter(relevant_sentences).most_common()


def main():
    """Run the extraction pipeline and write the text and Excel reports."""
    sample_text = read_file(sample_file)
    standard_text = read_file(standard_file)

    cleaned_sample_text = clean_text(sample_text)
    cleaned_standard_text = clean_text(standard_text)

    # Keywords of the standard text
    standard_keywords_data = extract_keywords_using_tfidf(cleaned_standard_text, top_n=10)
    standard_keywords = [word for word, score in standard_keywords_data]
    print(f"标准文本中提取到的关键词: {standard_keywords}")

    # Keywords of the sample text
    high_freq_words_data = extract_keywords_using_tfidf(cleaned_sample_text, top_n=20)
    high_freq_words = [word for word, score in high_freq_words_data]
    print(f"样本文本中提取到的高频词汇: {high_freq_words}")

    # Sample keywords that are also standard keywords
    related_keywords = get_related_keywords(high_freq_words, standard_keywords)
    print(f"与标准关键字相关的高频词汇:{related_keywords}")

    # Use the RAW sample text here: the cleaned text has lost the punctuation
    # that sentence splitting relies on.
    high_freq_sentences = get_similar_sentences(sample_text, related_keywords)

    # Plain-text report
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("标准关键词:\n")
        for word, score in standard_keywords_data:
            f.write(f"{word}: {score}\n")

        f.write("\n样本高频词汇:\n")
        for word, score in high_freq_words_data:
            f.write(f"{word}: {score}\n")

        f.write("\n相关高频句子:\n")
        for sentence, freq in high_freq_sentences:
            f.write(f"{sentence.strip()}: {freq}\n")

    # Excel report
    high_freq_df = pd.DataFrame(high_freq_words_data, columns=['词汇', 'TF-IDF值'])

    try:
        # Removing the old workbook is inside the ``try`` so that a file
        # locked by another program is reported by the handler below instead
        # of aborting the script.
        if os.path.exists(excel_file):
            os.remove(excel_file)

        with pd.ExcelWriter(excel_file) as writer:
            high_freq_df.to_excel(writer, sheet_name='样本高频词汇', index=False)
            pd.Series(related_keywords, name='相关高频词汇').to_frame().to_excel(
                writer, sheet_name='相关高频词汇', index=False)
    except PermissionError:
        print(f"无法写入文件 '{excel_file}',请确保该文件未在其他程序中打开。")
    except Exception as e:
        print(f"文件写入过程中发生错误: {e}")


if __name__ == "__main__":
    main()