# %% [markdown]
# # Chinese sentiment classification with Keras
#
# End-to-end demo on a ten-sentence toy corpus:
#
# 1. build the sample data,
# 2. clean and segment the text with jieba,
# 3. encode it with a Keras `Tokenizer` and pad to a fixed length,
# 4. train and evaluate a small Conv1D classifier,
# 5. predict the sentiment of new sentences,
# 6. define a larger BiLSTM variant,
# 7. save / load the model together with its preprocessing state.

# %%
# Imports (kept in one cell so a fresh kernel fails fast on missing packages).
import pickle
import re

import jieba  # Chinese word segmentation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras import layers

# %%
# Seed the TensorFlow and NumPy RNGs so repeated runs are comparable.
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)


# %%
def create_sample_data():
    """Build the toy corpus used throughout the notebook.

    Replace this with real data in an actual application.

    Returns:
        pandas.DataFrame with a ``text`` column (ten Chinese review
        sentences) and a ``label`` column (1 = positive, 0 = negative).
    """
    texts = [
        "这个电影太棒了,非常好看",
        "质量很差,后悔买了",
        "服务态度特别好,很满意",
        "产品很垃圾,不要买",
        "强烈推荐,非常不错",
        "太差了,想退货",
        "性价比很高,值得购买",
        "体验极差,再也不来了",
        "非常满意,下次还来",
        "垃圾产品,差评",
    ]
    labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]  # 1: positive, 0: negative

    return pd.DataFrame({'text': texts, 'label': labels})


# %%
df = create_sample_data()
print("数据预览:")
print(df.head())


# %%
class TextPreprocessor:
    """Clean, segment, and integer-encode Chinese text for a Keras model.

    Attributes:
        max_vocab_size: passed to the Keras tokenizer as ``num_words``.
        max_sequence_length: length every encoded sequence is padded or
            truncated to.
        tokenizer: the fitted Keras ``Tokenizer``; ``None`` until
            ``fit_tokenizer`` has been called.
    """

    def __init__(self, max_vocab_size=10000, max_sequence_length=100):
        self.max_vocab_size = max_vocab_size
        self.max_sequence_length = max_sequence_length
        self.tokenizer = None

    def clean_text(self, text):
        """Strip unwanted characters and normalise whitespace.

        Every character that is not in U+4E00-U+9FA5, an ASCII letter or
        digit, or whitespace is removed; whitespace runs are then collapsed
        to single spaces and the ends are trimmed.
        """
        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s]', '', text)
        return ' '.join(text.split())

    def tokenize_chinese(self, text):
        """Segment ``text`` with jieba and join the pieces with spaces."""
        return ' '.join(jieba.cut(text))

    def preprocess(self, texts):
        """Run ``clean_text`` then ``tokenize_chinese`` on every text.

        Args:
            texts: iterable of strings.

        Returns:
            list of space-separated, segmented strings, one per input.
        """
        return [self.tokenize_chinese(self.clean_text(text)) for text in texts]

    def fit_tokenizer(self, texts):
        """Fit a fresh Keras ``Tokenizer`` on already-preprocessed texts."""
        self.tokenizer = keras.preprocessing.text.Tokenizer(
            num_words=self.max_vocab_size,
            # Explicit placeholder for out-of-vocabulary words; the previous
            # empty string worked as a dictionary key but was unreadable in
            # ``word_index`` / ``index_word``.
            oov_token='<OOV>'
        )
        self.tokenizer.fit_on_texts(texts)

    def texts_to_sequences(self, texts):
        """Encode texts as integer sequences of length ``max_sequence_length``.

        Sequences are zero-padded and truncated at the end ('post').

        Raises:
            RuntimeError: if ``fit_tokenizer`` has not been called yet.
        """
        if self.tokenizer is None:
            raise RuntimeError(
                "fit_tokenizer() must be called before texts_to_sequences()"
            )
        sequences = self.tokenizer.texts_to_sequences(texts)
        return keras.preprocessing.sequence.pad_sequences(
            sequences,
            maxlen=self.max_sequence_length,
            padding='post',
            truncating='post'
        )


# %%
preprocessor = TextPreprocessor(max_vocab_size=5000, max_sequence_length=50)

df['processed_text'] = preprocessor.preprocess(df['text'])
print("\n预处理后的文本:")
print(df[['text', 'processed_text']].head())

# %%
# Stratified train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'],
    df['label'],
    test_size=0.2,
    random_state=SEED,
    stratify=df['label']
)

# Fit the vocabulary on the training portion only, then encode both splits.
preprocessor.fit_tokenizer(X_train)

X_train_seq = preprocessor.texts_to_sequences(X_train)
X_test_seq = preprocessor.texts_to_sequences(X_test)

print(f"训练集形状: {X_train_seq.shape}")
print(f"测试集形状: {X_test_seq.shape}")
print(f"词汇表大小: {len(preprocessor.tokenizer.word_index)}")


# %%
def create_model(vocab_size, embedding_dim=100, max_length=50):
    """Build and compile the Conv1D sentiment classifier.

    Args:
        vocab_size: ``input_dim`` of the embedding layer.
        embedding_dim: size of each embedding vector.
        max_length: length of the padded input sequences.

    Returns:
        A compiled ``keras.Sequential`` model with a single sigmoid output,
        trained with binary cross-entropy and reporting accuracy.
    """
    model = keras.Sequential([
        # Declare the input shape explicitly (same approach as
        # create_advanced_model) instead of the deprecated Embedding
        # ``input_length`` argument, so the model is built before summary().
        keras.Input(shape=(max_length,)),

        # Embedding layer. No ``mask_zero``: the Conv1D layer that follows
        # does not consume masks, so the flag had no effect on the result.
        layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim
        ),

        # Convolution + pooling
        layers.Conv1D(128, 5, activation='relu'),
        layers.GlobalMaxPooling1D(),

        # Fully connected head
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model


# %%
# The largest index the tokenizer emits is bounded both by ``num_words`` and
# by the number of distinct words it saw (+1 because index 0 is the padding).
vocab_size = min(preprocessor.max_vocab_size, len(preprocessor.tokenizer.word_index) + 1)
model = create_model(vocab_size, max_length=preprocessor.max_sequence_length)

model.summary()

# %%
callbacks = [
    # Stop when validation loss stalls and roll back to the best weights.
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    ),
    # Halve the learning rate when validation loss plateaus.
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=2,
        min_lr=1e-6
    ),
    # Keep the checkpoint with the best validation accuracy on disk.
    keras.callbacks.ModelCheckpoint(
        'best_model.h5',
        monitor='val_accuracy',
        save_best_only=True
    )
]

# Labels are handed over as a plain NumPy array rather than a pandas Series.
history = model.fit(
    X_train_seq, y_train.to_numpy(),
    epochs=20,
    batch_size=4,
    validation_split=0.2,
    callbacks=callbacks,
    verbose=1
)


# %%
def plot_training_history(history):
    """Plot training/validation loss and accuracy side by side.

    Args:
        history: object whose ``history`` dict has the keys ``loss``,
            ``val_loss``, ``accuracy`` and ``val_accuracy`` (as returned by
            ``model.fit`` above).
    """
    # NOTE(review): the labels below are Chinese; if they render as empty
    # boxes, configure a CJK-capable Matplotlib font for this environment.
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # Loss curves
    axes[0].plot(history.history['loss'], label='训练损失')
    axes[0].plot(history.history['val_loss'], label='验证损失')
    axes[0].set_title('模型损失')
    axes[0].set_xlabel('Epoch')
    axes[0].set_ylabel('Loss')
    axes[0].legend()

    # Accuracy curves
    axes[1].plot(history.history['accuracy'], label='训练准确率')
    axes[1].plot(history.history['val_accuracy'], label='验证准确率')
    axes[1].set_title('模型准确率')
    axes[1].set_xlabel('Epoch')
    axes[1].set_ylabel('Accuracy')
    axes[1].legend()

    plt.tight_layout()
    plt.show()


# %%
plot_training_history(history)

# Evaluate on the held-out test split.
test_loss, test_accuracy = model.evaluate(X_test_seq, y_test.to_numpy())
print(f"\n测试集准确率: {test_accuracy:.4f}")
print(f"测试集损失: {test_loss:.4f}")


# %%
def predict_sentiment(texts, model, preprocessor):
    """Predict the sentiment of raw texts.

    Args:
        texts: iterable of raw strings.
        model: object whose ``predict`` returns one probability per input
            row (the sigmoid models above return shape ``(n, 1)``).
        preprocessor: fitted object providing ``preprocess`` and
            ``texts_to_sequences``.

    Returns:
        list of dicts with keys ``text`` (the original string),
        ``sentiment`` ("正面" when the probability exceeds 0.5, otherwise
        "负面") and ``confidence`` (probability of the chosen class, as a
        Python float). An empty input yields an empty list.
    """
    texts = list(texts)
    if not texts:
        return []

    processed_texts = preprocessor.preprocess(texts)
    sequences = preprocessor.texts_to_sequences(processed_texts)

    # Flatten to one scalar per text so the comparison below works on plain
    # floats instead of relying on the truthiness of 1-element arrays.
    probabilities = np.asarray(model.predict(sequences)).reshape(-1)

    results = []
    for text, probability in zip(texts, probabilities):
        probability = float(probability)
        is_positive = probability > 0.5
        results.append({
            'text': text,
            'sentiment': "正面" if is_positive else "负面",
            'confidence': probability if is_positive else 1 - probability
        })

    return results


# %%
test_texts = [
    "这部电影真的很精彩,值得一看",
    "质量太差了,完全不值这个价格",
    "服务还不错,挺满意的"
]

results = predict_sentiment(test_texts, model, preprocessor)
print("\n预测结果:")
for r in results:
    print(f"文本: {r['text']}")
    print(f"情感: {r['sentiment']} (置信度: {r['confidence']:.4f})")
print("-" * 50)


# %%
def create_advanced_model(vocab_size, embedding_dim=100, max_length=50):
    """Build and compile a stacked bidirectional-LSTM classifier.

    Args:
        vocab_size: ``input_dim`` of the embedding layer.
        embedding_dim: size of each embedding vector.
        max_length: length of the padded input sequences.

    Returns:
        A compiled functional ``keras.Model`` with a single sigmoid output,
        reporting accuracy, precision and recall.
    """
    inputs = keras.Input(shape=(max_length,))

    # NOTE(review): padding zeros reach the LSTMs unmasked here; consider
    # ``mask_zero=True`` on this layer once all-padding inputs are ruled out.
    embedding = layers.Embedding(vocab_size, embedding_dim)(inputs)

    # Two stacked bidirectional LSTMs; only the first returns full sequences.
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(embedding)
    x = layers.Bidirectional(layers.LSTM(32))(x)

    # Fully connected head
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(32, activation='relu')(x)

    outputs = layers.Dense(1, activation='sigmoid')(x)

    model = keras.Model(inputs, outputs)

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
    )

    return model


# %%
advanced_model = create_advanced_model(vocab_size, max_length=preprocessor.max_sequence_length)
print("\n高级模型结构:")
advanced_model.summary()


# %%
def save_model_and_tokenizer(model, preprocessor, filepath='sentiment_model'):
    """Persist the model and the preprocessor state next to each other.

    Writes ``{filepath}.h5`` (the model) and ``{filepath}_preprocessor.pkl``
    (a pickled dict with the tokenizer, ``max_vocab_size`` and
    ``max_sequence_length``).
    """
    model.save(f'{filepath}.h5')

    with open(f'{filepath}_preprocessor.pkl', 'wb') as f:
        pickle.dump({
            'tokenizer': preprocessor.tokenizer,
            'max_vocab_size': preprocessor.max_vocab_size,
            'max_sequence_length': preprocessor.max_sequence_length
        }, f)
    print(f"模型和预处理器已保存到 {filepath}")


def load_model_and_tokenizer(filepath='sentiment_model'):
    """Load what ``save_model_and_tokenizer`` wrote.

    Returns:
        ``(model, preprocessor)`` where ``preprocessor`` is a
        ``TextPreprocessor`` rebuilt from the pickled configuration.
    """
    model = keras.models.load_model(f'{filepath}.h5')

    # SECURITY: unpickling executes arbitrary code from the file. Only load
    # preprocessor files that this notebook (or a trusted source) produced.
    with open(f'{filepath}_preprocessor.pkl', 'rb') as f:
        config = pickle.load(f)

    preprocessor = TextPreprocessor(
        max_vocab_size=config['max_vocab_size'],
        max_sequence_length=config['max_sequence_length']
    )
    preprocessor.tokenizer = config['tokenizer']

    return model, preprocessor


# %%
save_model_and_tokenizer(model, preprocessor)

# %%
print("Hello World!")