ADD file via upload

5 months ago · 6aa77974fd
parent b24d25954b
commit 6aa77974fd
1 changed files with 405 additions and 0 deletions
--- a/code1.py
+++ b/code1.py
@ -0,0 +1,405 @@
+import requests
+import re
+import json
+import time
+import random
+import pandas as pd
+import jieba
+from wordcloud import WordCloud
+import matplotlib.pyplot as plt
+from collections import Counter
+from bs4 import BeautifulSoup
+import os
+from openpyxl import Workbook
+import numpy as np
+from PIL import Image
+
+# Configure matplotlib so CJK text renders: try these fonts in order, and
+# keep the minus sign drawable with them.
+plt.rcParams.update({
+    "font.family": ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"],
+    "axes.unicode_minus": False,
+})
+
+class BilibiliSpider:
+    """Crawl Bilibili search results and danmaku (bullet comments), then analyze them.
+
+    Instance state:
+        headers: HTTP headers installed on the shared session.
+        session: ``requests.Session`` used for every request.
+        danmaku_list: every danmaku text collected or loaded so far.
+        video_info: one dict (bvid/title/play/author) per distinct video found.
+        crawled_bvids: bvids whose danmaku were already fetched, so a video
+            returned by several search keywords is only counted once.
+    """
+
+    # Environment variable consulted for the login cookie when none is passed in.
+    COOKIE_ENV_VAR = "BILIBILI_COOKIE"
+
+    def __init__(self, cookie=None):
+        """Create the HTTP session and the empty result containers.
+
+        Args:
+            cookie: Optional raw ``Cookie`` header value. When omitted, the
+                ``BILIBILI_COOKIE`` environment variable is used; when that is
+                unset as well, requests are sent without a cookie.
+        """
+        # Browser-like headers so requests resemble a real browser.
+        self.headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9",
+            "Connection": "keep-alive",
+            "Referer": "https://www.bilibili.com/",
+            "Origin": "https://www.bilibili.com",
+        }
+        # A login cookie is a credential: it is supplied by the caller or the
+        # environment and never embedded in the source file.
+        if cookie is None:
+            cookie = os.environ.get(self.COOKIE_ENV_VAR)
+        if cookie:
+            self.headers["Cookie"] = cookie
+
+        self.session = requests.Session()
+        self.session.headers.update(self.headers)
+        self.danmaku_list = []      # all collected danmaku texts
+        self.video_info = []        # metadata of every distinct video
+        self.crawled_bvids = set()  # bvids whose danmaku were already fetched
+
+    def search_videos(self, keyword, page=1, pages=2):
+        """Search videos for *keyword* and record their metadata.
+
+        Requests ``pages`` consecutive result pages starting at ``page``, with
+        30 results per page. A failed page is reported and skipped.
+
+        Args:
+            keyword: Search term.
+            page: First result page to fetch.
+            pages: Number of consecutive pages to fetch.
+
+        Returns:
+            List of dicts with keys ``bvid``, ``title``, ``play`` and
+            ``author``, unique by bvid, for this keyword. Videos not yet
+            present in ``self.video_info`` are appended to it as well.
+        """
+        print(f"开始搜索关键词: {keyword}")
+        search_url = "https://api.bilibili.com/x/web-interface/search/type"
+        all_videos = []
+        seen_here = set()  # bvids already returned for this keyword
+        known = {v.get("bvid") for v in self.video_info}  # bvids already recorded
+        for p in range(page, page + pages):
+            try:
+                # Let requests build the query string so the keyword is URL-encoded.
+                params = {
+                    "keyword": keyword,
+                    "page": p,
+                    "page_size": 30,
+                    "search_type": "video",
+                }
+                response = self.session.get(search_url, params=params, timeout=15)
+                response.raise_for_status()
+                data = response.json()
+
+                if data.get("code") != 0:
+                    print(f"搜索失败，错误代码: {data.get('code')}，消息: {data.get('message')}")
+                    continue
+
+                # "data"/"result" may be missing or null; treat both as empty.
+                video_items = (data.get("data") or {}).get("result") or []
+                if not video_items:
+                    print(f"第{p}页未找到视频数据")
+                    continue
+
+                for video in video_items:
+                    bvid = video.get("bvid")
+                    # Entries without a bvid cannot be crawled; repeats are dropped.
+                    if not bvid or bvid in seen_here:
+                        continue
+                    seen_here.add(bvid)
+
+                    # NOTE(review): search titles appear to carry <em ...> keyword
+                    # highlight tags; they are stripped here - confirm against the API.
+                    raw_title = str(video.get("title") or "无标题")
+                    record = {
+                        "bvid": bvid,
+                        "title": re.sub(r"</?em[^>]*>", "", raw_title),
+                        "play": video.get("play", "0"),
+                        "author": video.get("author", "未知作者"),
+                    }
+                    all_videos.append(record)
+                    # Record each video once even if several keywords return it.
+                    if bvid not in known:
+                        known.add(bvid)
+                        self.video_info.append(dict(record))
+
+                print(f"已获取第{p}页视频，累计{len(all_videos)}个")
+                time.sleep(random.uniform(2, 4))
+
+            except Exception as e:
+                # Deliberately broad: crawling is best-effort, so one bad page
+                # is reported and the remaining pages are still attempted.
+                print(f"搜索视频出错: {str(e)}")
+                time.sleep(5)
+
+        return all_videos
+
+    def get_cid(self, bvid):
+        """Return the cid of the video *bvid*, or ``None`` when it cannot be obtained."""
+        if not bvid:
+            return None
+        try:
+            response = self.session.get(
+                "https://api.bilibili.com/x/web-interface/view",
+                params={"bvid": bvid},
+                timeout=10,
+            )
+            response.raise_for_status()
+            data = response.json()
+
+            if data.get("code") != 0:
+                print(f"获取cid失败，bvid: {bvid}，错误: {data.get('message')}")
+                return None
+
+            cid = (data.get("data") or {}).get("cid")
+            if not cid:
+                print(f"bvid={bvid}未找到cid")
+                return None
+
+            print(f"成功获取bvid={bvid}的cid: {cid}")
+            return cid
+
+        except Exception as e:
+            # Best-effort: a failing video must not abort the whole crawl.
+            print(f"获取cid出错(bvid={bvid}): {str(e)}")
+            return None
+
+    def get_danmaku(self, cid):
+        """Download the danmaku XML for *cid* and return its non-empty texts.
+
+        Returns an empty list when *cid* is falsy or the download/parse fails.
+        """
+        if not cid:
+            return []
+        try:
+            url = f"https://comment.bilibili.com/{cid}.xml"
+            response = self.session.get(url, timeout=10)
+            response.raise_for_status()
+            response.encoding = 'utf-8'
+
+            # Each danmaku is the text of one <d> element.
+            soup = BeautifulSoup(response.text, "lxml-xml")
+            texts = (node.text.strip() for node in soup.find_all("d"))
+            result = [text for text in texts if text]
+            print(f"成功获取cid={cid}的{len(result)}条弹幕")
+            return result
+
+        except Exception as e:
+            # Best-effort: report and continue with the next video.
+            print(f"获取弹幕出错(cid={cid}): {str(e)}")
+            return []
+
+    def crawl_keyword(self, keyword):
+        """Search *keyword* and append the danmaku of every new video found.
+
+        Videos whose danmaku were already fetched (for an earlier keyword) are
+        skipped so their danmaku are not counted twice.
+        """
+        videos = self.search_videos(keyword)
+        print(f"关键词[{keyword}]找到{len(videos)}个视频")
+
+        processed = 0
+        for i, video in enumerate(videos, 1):
+            bvid = video["bvid"]
+            print(f"\n正在处理第{i}/{len(videos)}个视频: {video['title'][:30]}...")
+            if bvid in self.crawled_bvids:
+                print(f"bvid={bvid}已爬取过，跳过")
+                continue
+
+            cid = self.get_cid(bvid)
+            if cid:
+                self.crawled_bvids.add(bvid)
+                self.danmaku_list.extend(self.get_danmaku(cid))
+                print(f"当前累计弹幕数: {len(self.danmaku_list)}")
+
+            # Pause after every third video actually requested, to lower the
+            # chance of being rate-limited.
+            processed += 1
+            if processed % 3 == 0:
+                sleep_time = random.uniform(3, 6)
+                print(f"已处理{processed}个视频，休息{sleep_time:.2f}秒")
+                time.sleep(sleep_time)
+
+        print(f"关键词[{keyword}]爬取完成，累计获取{len(self.danmaku_list)}条弹幕")
+
+    def save_data(self, danmaku_filename="danmaku.txt", video_filename="video_info.json"):
+        """Write the danmaku (one per line) and the video metadata (JSON) to disk."""
+        with open(danmaku_filename, "w", encoding="utf-8") as f:
+            f.writelines(f"{danmaku}\n" for danmaku in self.danmaku_list)
+        print(f"弹幕已保存到{danmaku_filename}（{len(self.danmaku_list)}条）")
+
+        with open(video_filename, "w", encoding="utf-8") as f:
+            json.dump(self.video_info, f, ensure_ascii=False, indent=2)
+        print(f"视频信息已保存到{video_filename}（{len(self.video_info)}条）")
+
+    def load_data(self, danmaku_filename="danmaku.txt", video_filename="video_info.json"):
+        """Load previously saved danmaku and video metadata; missing files are skipped."""
+        if os.path.exists(danmaku_filename):
+            with open(danmaku_filename, "r", encoding="utf-8") as f:
+                # Blank lines carry no danmaku and are dropped.
+                self.danmaku_list = [line.strip() for line in f if line.strip()]
+            print(f"从{danmaku_filename}加载了{len(self.danmaku_list)}条弹幕")
+
+        if os.path.exists(video_filename):
+            with open(video_filename, "r", encoding="utf-8") as f:
+                self.video_info = json.load(f)
+            print(f"从{video_filename}加载了{len(self.video_info)}条视频信息")
+
+    def analyze_danmaku(self, top_n=8):
+        """Count how many danmaku mention each application keyword.
+
+        Args:
+            top_n: Maximum number of keywords to return.
+
+        Returns:
+            List of ``(keyword, count)`` tuples sorted by descending count,
+            containing only keywords that occur at least once. Empty when
+            there are no danmaku.
+        """
+        if not self.danmaku_list:
+            print("没有弹幕数据可分析，返回空列表")
+            return []
+
+        application_keywords = [
+            "聊天机器人", "智能客服", "内容创作", "代码生成", "编程助手",
+            "翻译", "教育", "医疗", "法律", "金融分析", "金融",
+            "图像生成", "语音识别", "自动驾驶", "数据分析", "数据",
+            "游戏", "推荐系统", "搜索引擎", "搜索", "写作",
+            "成本", "价格", "便宜", "昂贵", "免费",
+            "就业", "工作", "失业", "替代", "岗位",
+            "安全", "隐私", "风险", "泄露", "道德",
+            "学习", "学生", "老师", "学校",
+            "企业", "商业", "公司", "盈利", "赚钱",
+        ]
+
+        # dict.fromkeys drops repeated keywords, so each danmaku adds at most
+        # one to any keyword's count.
+        application_counts = dict.fromkeys(application_keywords, 0)
+        for danmaku in self.danmaku_list:
+            for kw in application_counts:
+                if kw in danmaku:
+                    application_counts[kw] += 1
+
+        # Keep only keywords that actually occur, most frequent first.
+        mentioned = [(kw, count) for kw, count in application_counts.items() if count > 0]
+        mentioned.sort(key=lambda item: item[1], reverse=True)
+        top_applications = mentioned[:top_n]
+
+        print(f"\n出现频率最高的{top_n}项LLM应用相关关键词:")
+        for i, (app, count) in enumerate(top_applications, 1):
+            print(f"{i}. {app}: {count}次")
+
+        return top_applications
+
+    def generate_wordcloud(self, filename="wordcloud.png", font_path="simhei.ttf"):
+        """Render a word cloud of the danmaku, save it to *filename* and show it.
+
+        Args:
+            filename: Output image path.
+            font_path: Font file handed to ``WordCloud``.
+        """
+        if not self.danmaku_list:
+            print("没有弹幕数据可生成词云")
+            return
+
+        # Segment the joined text with jieba, then drop stop words and
+        # single-character tokens.
+        stop_words = {'的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '那'}
+        words = [
+            word for word in jieba.cut(" ".join(self.danmaku_list))
+            if len(word) > 1 and word not in stop_words
+        ]
+        if not words:
+            # WordCloud.generate() raises when given no words at all.
+            print("过滤后没有可用词语，无法生成词云")
+            return
+
+        wc = WordCloud(
+            font_path=font_path,
+            background_color="white",
+            width=1600,
+            height=1200,
+            max_words=300,
+            collocations=False,
+            margin=2,
+            random_state=42,
+            colormap="viridis",
+        ).generate(" ".join(words))
+
+        plt.figure(figsize=(16, 12))
+        plt.imshow(wc, interpolation="bilinear")
+        plt.axis("off")
+        plt.title("B站大语言模型相关视频弹幕词云", fontsize=20, pad=20)
+        plt.tight_layout(pad=0)
+        plt.savefig(filename, dpi=300, bbox_inches="tight", facecolor='white')
+        plt.show()
+        plt.close()  # release the figure once it has been saved and shown
+        print(f"词云图已保存到{filename}")
+
+    def save_to_excel(self, top_applications, filename="llm_analysis.xlsx"):
+        """Write the keyword ranking, video list and conclusions to an Excel workbook.
+
+        Args:
+            top_applications: ``(keyword, count)`` tuples as returned by
+                ``analyze_danmaku``.
+            filename: Output workbook path.
+
+        Any failure is reported on stdout instead of being raised.
+        """
+        try:
+            with pd.ExcelWriter(filename, engine='openpyxl') as writer:
+                # Sheet 1: keyword ranking. The placeholder row uses the same
+                # column order as the populated sheet.
+                app_columns = ["排名", "应用案例", "出现次数"]
+                if top_applications:
+                    df_apps = pd.DataFrame(top_applications, columns=["应用案例", "出现次数"])
+                    df_apps["排名"] = range(1, len(df_apps) + 1)
+                    df_apps = df_apps[app_columns]
+                else:
+                    df_apps = pd.DataFrame([[1, "无数据", 0]], columns=app_columns)
+                df_apps.to_excel(writer, sheet_name="应用案例统计", index=False)
+
+                # Sheet 2: video list, de-duplicated by bvid.
+                video_columns = ["序号", "视频标题", "播放量", "作者", "BV号"]
+                if self.video_info:
+                    unique_videos = []
+                    seen_bvids = set()
+                    for video in self.video_info:
+                        bvid = video.get("bvid")
+                        if bvid not in seen_bvids:
+                            seen_bvids.add(bvid)
+                            unique_videos.append(video)
+
+                    df_videos = pd.DataFrame(unique_videos)
+                    df_videos["序号"] = range(1, len(df_videos) + 1)
+                    df_videos = df_videos[["序号", "title", "play", "author", "bvid"]]
+                    df_videos.columns = video_columns
+                else:
+                    df_videos = pd.DataFrame([[1, "无数据", 0, "无", "无"]], columns=video_columns)
+                df_videos.to_excel(writer, sheet_name="视频信息", index=False)
+
+                # Sheet 3: conclusions. This is fixed text; it is not computed
+                # from the collected data.
+                conclusions = [
+                    ["分析维度", "主要发现", "用户观点倾向"],
+                    ["应用成本", "多数用户关注使用成本，提及'免费'、'便宜'较多", "希望降低使用门槛"],
+                    ["应用领域", "教育、编程、内容创作是最受关注的领域", "积极看待技术应用"],
+                    ["就业影响", "对就业替代效应存在担忧", "既有期待也有忧虑"],
+                    ["技术成熟度", "普遍认为技术还有提升空间", "理性看待技术发展"],
+                    ["数据安全", "对隐私和安全问题关注度较高", "期待规范发展"]
+                ]
+                df_conclusions = pd.DataFrame(conclusions[1:], columns=conclusions[0])
+                df_conclusions.to_excel(writer, sheet_name="数据分析结论", index=False)
+
+            print(f"✅ Excel数据已保存到{os.path.abspath(filename)}")
+            print(f"✅ 包含工作表：应用案例统计、视频信息、数据分析结论")
+
+        except Exception as e:
+            # Export is the last step of the pipeline; report the failure
+            # together with the usual remedies rather than raising.
+            print(f"❌ 保存Excel失败：{str(e)}")
+            print("建议：1. 关闭已打开的同名Excel文件 2. 检查目录写入权限")
+
+    @staticmethod
+    def _count_sentiment(danmaku_list, positive_words, negative_words):
+        """Return ``(positive, negative)`` counts of danmaku containing such words.
+
+        Negative phrases are removed from a danmaku before positive words are
+        looked for, so a negated phrase such as "不好" is not also matched by
+        the positive word "好".
+        """
+        positive = 0
+        negative = 0
+        for text in danmaku_list:
+            if any(word in text for word in negative_words):
+                negative += 1
+            remainder = text
+            for word in negative_words:
+                remainder = remainder.replace(word, "")
+            if any(word in remainder for word in positive_words):
+                positive += 1
+        return positive, negative
+
+    def generate_analysis_report(self):
+        """Print an overview, a keyword-based sentiment split and topic mention counts."""
+        if not self.danmaku_list:
+            print("没有数据可分析")
+            return
+
+        print("\n" + "="*50)
+        print("           大语言模型B站用户观点分析报告")
+        print("="*50)
+
+        # Basic statistics.
+        total_danmaku = len(self.danmaku_list)
+        total_videos = len({v["bvid"] for v in self.video_info if v.get("bvid")})
+
+        print(f"\n📊 数据概况:")
+        print(f"   - 分析视频数量: {total_videos}个")
+        print(f"   - 采集弹幕数量: {total_danmaku}条")
+
+        # Simple keyword-based sentiment split.
+        positive_words = ["好", "厉害", "强大", "方便", "实用", "惊喜", "期待", "进步"]
+        negative_words = ["不好", "垃圾", "危险", "担心", "失业", "贵", "泄露"]
+        positive_count, negative_count = self._count_sentiment(
+            self.danmaku_list, positive_words, negative_words
+        )
+
+        print(f"\n😊 情感倾向分析:")
+        print(f"   - 积极评价: {positive_count}条 ({positive_count/total_danmaku*100:.1f}%)")
+        print(f"   - 消极评价: {negative_count}条 ({negative_count/total_danmaku*100:.1f}%)")
+
+        # Topic mentions: a danmaku counts once per topic when it contains
+        # any of that topic's keywords.
+        print(f"\n🔥 热门话题:")
+        topics = {
+            "教育学习": ["学习", "教育", "学生", "老师", "学校", "考试"],
+            "工作就业": ["工作", "就业", "失业", "岗位", "替代", "职业"],
+            "技术应用": ["编程", "代码", "写作", "翻译", "创作", "设计"],
+            "商业价值": ["赚钱", "商业", "企业", "盈利", "成本", "价格"],
+            "安全伦理": ["安全", "隐私", "道德", "风险", "泄露", "监管"]
+        }
+
+        for topic, keywords in topics.items():
+            count = sum(1 for danmaku in self.danmaku_list
+                        if any(keyword in danmaku for keyword in keywords))
+            if count > 0:
+                print(f"   - {topic}: {count}次提及")
+
+
+def main():
+    """Run the pipeline: reuse saved data or crawl, then analyze and export."""
+    crawler = BilibiliSpider()
+
+    # Offer to reuse earlier results only when both data files are present.
+    reuse_saved = False
+    have_saved = os.path.exists("danmaku.txt") and os.path.exists("video_info.json")
+    if have_saved:
+        answer = input("发现已存在的弹幕和视频数据，是否直接使用? (y/n): ")
+        reuse_saved = answer.lower() == "y"
+
+    if reuse_saved:
+        crawler.load_data()
+    else:
+        print("开始爬取B站大语言模型相关视频...")
+        for term in ("大语言模型", "大模型", "LLM"):
+            print(f"\n{'='*50}")
+            print(f"正在爬取关键词: {term}")
+            print(f"{'='*50}")
+            crawler.crawl_keyword(term)
+            # Longer pause between keywords.
+            time.sleep(random.uniform(8, 12))
+        crawler.save_data()
+
+    # Analysis stage.
+    print(f"\n{'='*50}")
+    print("开始数据分析...")
+    print(f"{'='*50}")
+
+    ranking = crawler.analyze_danmaku(top_n=8)
+    crawler.generate_analysis_report()
+    crawler.generate_wordcloud("llm_wordcloud.png")
+    crawler.save_to_excel(ranking, "llm_bilibili_analysis.xlsx")
+
+    # Closing summary of the output files.
+    print(f"\n🎉 所有任务执行完毕!")
+    print(f"📁 生成的文件:")
+    print(f"   - llm_bilibili_analysis.xlsx (数据分析表格)")
+    print(f"   - llm_wordcloud.png (词云图)")
+    print(f"   - danmaku.txt (原始弹幕数据)")
+    print(f"   - video_info.json (视频信息)")
+
+
+# Run the pipeline only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()