import requests
import re
import json
import pandas as pd
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import time
import random
from bs4 import BeautifulSoup
import numpy as np
from PIL import Image

# Configure matplotlib with fonts that can render Chinese glyphs.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign renderable with CJK fonts

# FIX: every network call now carries an explicit timeout so a stalled
# connection cannot hang the whole crawl indefinitely.
REQUEST_TIMEOUT = 10


class BilibiliDanmakuAnalyzer:
    """Crawl danmaku (bullet comments) from Bilibili videos matching LLM-related
    keywords, persist them to disk, and run simple keyword-frequency, sentiment
    and word-cloud analyses over the collected text.
    """

    def __init__(self):
        # Browser-like request headers to lower the chance of being blocked.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "*/*",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive"
        }
        # Flat list of every danmaku text string collected so far.
        self.all_danmakus = []
        # One metadata dict per video (id, title, uploader, plays, danmaku count).
        self.video_info = []

    def get_video_ids(self, keyword, page_count=36):
        """Search Bilibili for *keyword* and collect video ids (aids).

        Roughly 10 videos per page, so the default 36 pages yields ~360 videos.
        Returns a de-duplicated list of aids and appends per-video metadata to
        ``self.video_info`` as a side effect.
        """
        video_ids = []
        for page in range(1, page_count + 1):
            try:
                # Bilibili web search endpoint, restricted to video results.
                url = f"https://api.bilibili.com/x/web-interface/search/type?keyword={keyword}&search_type=video&page={page}"
                response = requests.get(url, headers=self.headers, timeout=REQUEST_TIMEOUT)
                data = json.loads(response.text)

                if data.get("code") == 0 and data.get("data"):
                    results = data["data"]["result"]
                    for item in results:
                        video_id = item["aid"]
                        video_ids.append(video_id)
                        # Keep the video's metadata for the final Excel report.
                        self.video_info.append({
                            "video_id": video_id,
                            "title": item["title"],
                            "up主": item["author"],
                            "播放量": item["play"],
                            "弹幕数": item["video_review"]
                        })

                print(f"已获取第{page}页视频,累计{len(video_ids)}个视频ID")

                # Random pause to avoid tripping the anti-crawler throttle.
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                # Best-effort: a failed page is reported and skipped, not fatal.
                print(f"获取第{page}页视频ID失败: {str(e)}")
                continue

        return list(set(video_ids))  # de-duplicate across pages

    def get_danmakus(self, video_id):
        """Return the list of danmaku strings for one video (empty list on failure)."""
        try:
            # Step 1: resolve the video's cid (danmaku-pool id) from its aid.
            url = f"https://api.bilibili.com/x/web-interface/view?aid={video_id}"
            response = requests.get(url, headers=self.headers, timeout=REQUEST_TIMEOUT)
            data = json.loads(response.text)

            if data.get("code") == 0 and data.get("data"):
                cid = data["data"]["cid"]

                # Step 2: fetch the raw danmaku XML for that cid.
                danmaku_url = f"https://comment.bilibili.com/{cid}.xml"
                response = requests.get(danmaku_url, headers=self.headers, timeout=REQUEST_TIMEOUT)
                response.encoding = "utf-8"

                # Each <d> element in the XML holds one danmaku's text.
                soup = BeautifulSoup(response.text, "xml")
                danmakus = soup.find_all("d")

                danmaku_texts = [danmaku.text.strip() for danmaku in danmakus]
                print(f"视频ID {video_id} 获取到 {len(danmaku_texts)} 条弹幕")

                return danmaku_texts
            else:
                print(f"获取视频 {video_id} 的cid失败")
                return []
        except Exception as e:
            # Best-effort: report and return empty so the crawl loop continues.
            print(f"获取视频 {video_id} 弹幕失败: {str(e)}")
            return []

    def crawl_all_danmakus(self, keywords=None, max_videos=360):
        """Crawl danmaku for every video found under *keywords*, capped at *max_videos*.

        Progress is checkpointed to disk every 10 videos, and the per-video
        metadata is written to 视频信息.xlsx at the end.
        Returns ``self.all_danmakus``.
        """
        # FIX: avoid a mutable default argument; the effective default is unchanged.
        if keywords is None:
            keywords = ["大语言模型", "大模型", "LLM"]

        all_video_ids = []

        # Gather candidate video ids from each search keyword.
        for keyword in keywords:
            print(f"开始搜索关键词: {keyword}")
            video_ids = self.get_video_ids(keyword)
            all_video_ids.extend(video_ids)
            time.sleep(2)

        # De-duplicate across keywords and cap the total workload.
        unique_video_ids = list(set(all_video_ids))[:max_videos]
        print(f"共获取到 {len(unique_video_ids)} 个不重复的视频ID,开始爬取弹幕...")

        for i, video_id in enumerate(unique_video_ids):
            danmakus = self.get_danmakus(video_id)
            self.all_danmakus.extend(danmakus)

            # Checkpoint every 10 videos so a crash loses at most 10 videos' data.
            if (i + 1) % 10 == 0:
                self.save_danmakus_to_file()
                print(f"已完成 {i + 1}/{len(unique_video_ids)} 个视频的弹幕爬取,累计弹幕数: {len(self.all_danmakus)}")

            # Random pause to avoid tripping the anti-crawler throttle.
            time.sleep(random.uniform(1, 2))

        # Final flush of the complete danmaku list.
        self.save_danmakus_to_file()
        print(f"所有视频弹幕爬取完成,共获取 {len(self.all_danmakus)} 条弹幕")

        # Persist per-video metadata for later inspection.
        df = pd.DataFrame(self.video_info)
        df.to_excel("视频信息.xlsx", index=False)

        return self.all_danmakus

    def save_danmakus_to_file(self, filename="弹幕数据.txt"):
        """Write all collected danmaku to *filename*, one per line (overwrites)."""
        with open(filename, "w", encoding="utf-8") as f:
            for danmaku in self.all_danmakus:
                f.write(danmaku + "\n")

    def load_danmakus_from_file(self, filename="弹幕数据.txt"):
        """Load danmaku from *filename* into ``self.all_danmakus``.

        Returns the loaded list, or an empty list if the file cannot be read
        (``self.all_danmakus`` is left untouched in that case).
        """
        try:
            with open(filename, "r", encoding="utf-8") as f:
                # Drop blank lines; keep one danmaku per non-empty line.
                self.all_danmakus = [line.strip() for line in f.readlines() if line.strip()]
            print(f"从文件加载了 {len(self.all_danmakus)} 条弹幕数据")
            return self.all_danmakus
        except Exception as e:
            print(f"加载弹幕数据失败: {str(e)}")
            return []

    def analyze_application_cases(self, top_n=8):
        """Count mentions of common LLM application domains in the danmaku.

        Returns the *top_n* (keyword, count) pairs sorted by count descending,
        and also writes them to LLM应用案例统计.xlsx.
        """
        # Curated list of common LLM application-domain keywords.
        application_keywords = [
            "聊天机器人", "智能客服", "内容创作", "代码生成",
            "教育辅导", "翻译", "数据分析", "医疗诊断",
            "自动写作", "语音助手", "图像生成", "游戏开发",
            "推荐系统", "法律咨询", "金融分析", "市场营销"
        ]

        # Substring-match each danmaku against each domain keyword.
        application_counts = {keyword: 0 for keyword in application_keywords}
        for danmaku in self.all_danmakus:
            for keyword in application_keywords:
                if keyword in danmaku:
                    application_counts[keyword] += 1

        # Sort domains by mention count, highest first.
        sorted_applications = sorted(application_counts.items(), key=lambda x: x[1], reverse=True)
        top_applications = sorted_applications[:top_n]

        # Persist the ranking for the report.
        df = pd.DataFrame(top_applications, columns=["应用案例", "出现次数"])
        df.to_excel("LLM应用案例统计.xlsx", index=False)

        return top_applications

    def generate_wordcloud(self, mask=None, filename="弹幕词云图.png"):
        """Build a word cloud from all danmaku, save it to *filename* and show it."""
        # Merge every danmaku into one text blob for segmentation.
        text = " ".join(self.all_danmakus)

        # Segment Chinese text with jieba; drop single-character tokens as noise.
        words = jieba.cut(text)
        words = [word for word in words if len(word) > 1]
        words_text = " ".join(words)

        wc = WordCloud(
            font_path="simhei.ttf",  # requires a Chinese-capable font file on disk
            background_color="white",
            max_words=200,
            mask=mask,
            contour_width=1,
            contour_color="steelblue"
        )
        wc.generate(words_text)

        # Render the cloud with matplotlib.
        plt.figure(figsize=(12, 8))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.title("B站大语言模型相关视频弹幕词云")
        plt.tight_layout(pad=0)

        wc.to_file(filename)
        # FIX: the original printed a literal placeholder instead of the
        # actual output path.
        print(f"词云图已保存为 {filename}")

        plt.show()

    def analyze_sentiment(self):
        """Bucket danmaku by theme (cost / application field / negative / positive).

        A danmaku counts toward a bucket if it contains any keyword of that
        bucket. Returns a dict mapping each theme to its share of all danmaku
        as a percentage string, or ``{}`` when there are no danmaku.
        """
        cost_keywords = ["贵", "便宜", "成本", "收费", "免费", "价格"]
        field_keywords = ["教育", "医疗", "工作", "学习", "娱乐", "创作", "办公"]
        negative_keywords = ["失业", "取代", "错误", "偏见", "隐私", "风险", "依赖"]
        positive_keywords = ["方便", "高效", "有用", "帮助", "创新", "进步", "强大"]

        cost_count = sum(1 for danmaku in self.all_danmakus if any(kw in danmaku for kw in cost_keywords))
        field_count = sum(1 for danmaku in self.all_danmakus if any(kw in danmaku for kw in field_keywords))
        negative_count = sum(1 for danmaku in self.all_danmakus if any(kw in danmaku for kw in negative_keywords))
        positive_count = sum(1 for danmaku in self.all_danmakus if any(kw in danmaku for kw in positive_keywords))

        total = len(self.all_danmakus)
        if total == 0:
            # Guard against division by zero when nothing was crawled/loaded.
            return {}

        result = {
            "成本相关讨论占比": f"{cost_count/total*100:.2f}%",
            "应用领域讨论占比": f"{field_count/total*100:.2f}%",
            "不利影响讨论占比": f"{negative_count/total*100:.2f}%",
            "积极影响讨论占比": f"{positive_count/total*100:.2f}%"
        }

        return result

    def predict_trend(self):
        """Return a static list of trend statements about LLM applications."""
        trends = [
            "1. 行业垂直化:大语言模型将更深入各个专业领域,如医疗、法律、教育等",
            "2. 个性化增强:模型将更加了解用户需求,提供个性化服务",
            "3. 多模态融合:文本、图像、语音等多模态能力将深度融合",
            "4. 边缘部署增加:更多模型将在边缘设备上运行,提升响应速度和隐私性",
            "5. 监管加强:随着应用广泛,相关法律法规将逐步完善",
            "6. 低代码/无代码结合:降低AI应用门槛,使更多人能使用LLM能力"
        ]
        return trends


def main():
    """Entry point: crawl or load danmaku, then run every analysis step."""
    analyzer = BilibiliDanmakuAnalyzer()

    # Let the user choose between a fresh crawl and a cached dataset.
    choice = input("请选择操作 (1: 爬取新数据, 2: 加载已有数据): ")

    if choice == "1":
        analyzer.crawl_all_danmakus()
    else:
        analyzer.load_danmakus_from_file()

    if not analyzer.all_danmakus:
        print("没有可用的弹幕数据,程序退出")
        return

    # Top-8 LLM application domains by danmaku mention count.
    print("\n===== LLM应用案例排名前8 =====")
    top_applications = analyzer.analyze_application_cases(8)
    for i, (app, count) in enumerate(top_applications, 1):
        print(f"{i}. {app}: {count}次")

    print("\n===== 生成词云图 =====")
    # Use a custom mask image when available; fall back to the default shape.
    # FIX: was a bare ``except:``, which also swallows KeyboardInterrupt/SystemExit.
    try:
        mask = np.array(Image.open("cloud_mask.png"))
        analyzer.generate_wordcloud(mask)
    except Exception:
        analyzer.generate_wordcloud()

    print("\n===== 用户观点分析 =====")
    sentiment = analyzer.analyze_sentiment()
    for key, value in sentiment.items():
        print(f"{key}: {value}")

    print("\n===== 分析结论 =====")
    print("1. 从弹幕讨论来看,B站用户最关注的大语言模型应用领域是:" +
          ", ".join([app for app, _ in top_applications[:3]]))

    # Compare positive vs. negative discussion share (strings like "12.34%").
    if float(sentiment["积极影响讨论占比"].rstrip('%')) > float(sentiment["不利影响讨论占比"].rstrip('%')):
        print("2. 整体来看,用户对大语言模型的评价偏向积极,更多讨论其带来的便利和效率提升")
    else:
        print("2. 整体来看,用户对大语言模型存在较多担忧,主要集中在其可能带来的负面影响")

    print("3. 应用领域的讨论最为广泛,说明用户普遍关注大语言模型的实际落地场景")

    print("\n===== 大语言模型应用发展趋势预测 =====")
    trends = analyzer.predict_trend()
    for trend in trends:
        print(trend)


if __name__ == "__main__":
    main()