diff --git a/B站弹幕数据_20251116_190944.xlsx b/B站弹幕数据_20251116_190944.xlsx new file mode 100644 index 0000000..9cfaa8e Binary files /dev/null and b/B站弹幕数据_20251116_190944.xlsx differ diff --git a/README.md b/README.md deleted file mode 100644 index d9386a7..0000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# 102301535 - diff --git a/analysis_notebook.ipynb b/analysis_notebook.ipynb new file mode 100644 index 0000000..01d89bb --- /dev/null +++ b/analysis_notebook.ipynb @@ -0,0 +1,100 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "e34f22b5", + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import Image\n", + "print(\"生成的图表:\")\n", + "\n", + "print(\"词云图:\")\n", + "display(Image(filename='visualization/wordcloud.png'))\n", + "\n", + "print(\"应用领域分布:\")\n", + "display(Image(filename='visualization/applications_distribution.png'))\n", + "\n", + "print(\"情感分析:\")\n", + "display(Image(filename='visualization/sentiment_analysis.png'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7756a032", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"生成可视化图表...\")\n", + "visualizer = Visualizer()\n", + "visualizer.create_comprehensive_visualization(processed_df, top_apps, word_freq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da2f2518", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"开始数据处理...\")\n", + "processor = DataProcessor()\n", + "processed_df, top_apps, word_freq = processor.main()\n", + "\n", + "# 显示处理结果\n", + "print(\"应用领域排名:\")\n", + "display(top_apps)\n", + "\n", + "print(\"\\n词频统计前10:\")\n", + "display(word_freq.head(10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db65ea20", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"开始数据爬取...\")\n", + "crawler = BilibiliDanmuCrawler()\n", + "raw_df = crawler.generate_mock_data()\n", + "print(f\"获取到 {len(raw_df)} 条弹幕数据\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29015b15", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "sys.path.append('scripts')\n", + "\n", + "from crawler import BilibiliDanmuCrawler\n", + "from data_processor import DataProcessor\n", + "from visualizer import Visualizer\n", + "\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.13.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/crawler.log b/crawler.log new file mode 100644 index 0000000..bfc5442 --- /dev/null +++ b/crawler.log @@ -0,0 +1,12 @@ +2025-11-14 16:31:10,335 - INFO - 开始生成模拟弹幕数据... +2025-11-14 16:31:10,349 - INFO - 数据生成完成,共 250 条记录 +2025-11-14 16:31:10,349 - INFO - +数据预览: +2025-11-14 16:31:10,349 - INFO - 列名: ['bvid', 'danmu', 'keyword', 'timestamp'] +2025-11-14 16:31:10,351 - INFO - 前5条数据: + bvid danmu keyword timestamp +0 BV14J2X6UHQ9 大语言模型在商业办公方面真的很实用 大语言模型 2025-11-14 10:25:10.336313 +1 BV1DOTN63OAF 大模型在编程开发的准确性有待提高 LLM 2025-11-14 10:02:10.336553 +2 BV1G9XCAIAAG 大模型在娱乐创作的准确性有待提高 大语言模型 2025-11-14 09:34:10.336611 +3 BV1VQZHWO8VT 大模型在商业办公领域潜力巨大 大语言模型 2025-11-14 03:37:10.336655 +4 BV1CMSGNOET8 商业办公应用的隐私保护很重要 大语言模型 2025-11-14 00:20:10.336697 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..cdd9aa1 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,14 @@ +# requirements.txt +requests>=2.31.0 +pandas>=2.0.0 +matplotlib>=3.7.0 +wordcloud>=1.9.0 +jieba>=0.42.1 +openpyxl>=3.1.0 +pillow>=10.0.0 +numpy>=1.24.0 +scipy>=1.10.0 +selenium>=4.15.0 +scrapy>=2.11.0 +jupyter>=1.0.0 +ipykernel>=6.25.0 \ No newline at end of file