import json
import random
import re
import time
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import pandas as pd
import requests
from openpyxl import Workbook
from wordcloud import WordCloud
# Configure matplotlib with CJK-capable fonts so Chinese labels/titles render
# instead of showing missing-glyph boxes. Tried in order; first installed wins.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
|
|
|
|
|
|
|
|
|
|
class BilibiliDanmakuSpider:
    """Search Bilibili videos by keyword and scrape their danmaku (bullet comments).

    Usage: ``spider.run()`` returns the accumulated list of danmaku strings.
    All network failures are logged and skipped so a single bad video/page
    cannot abort the whole crawl.
    """

    # Seconds before a hung HTTP connection is abandoned (bug fix: the
    # original requests.get calls had no timeout and could block forever).
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Browser-like headers; the Referer is expected by Bilibili endpoints.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Referer": "https://www.bilibili.com/"
        }
        # Every danmaku text collected across all crawled videos.
        self.danmaku_list = []

    def get_video_ids(self, keyword, page_count=36):
        """Collect video aids for *keyword* from the search API.

        Each result page holds ~10 videos, so the default 36 pages yield at
        most ~360 ids.  Returns a deduplicated list capped at 360 entries;
        first-seen order is preserved (dict.fromkeys instead of set, which
        would return a nondeterministic ordering).
        """
        video_ids = []
        for page in range(1, page_count + 1):
            try:
                url = f"https://api.bilibili.com/x/web-interface/search/type?keyword={keyword}&search_type=video&page={page}"
                response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
                data = response.json()

                if data["code"] == 0 and data["data"]["result"]:
                    for item in data["data"]["result"]:
                        video_ids.append(item["aid"])

                # Random delay between pages to avoid anti-scraping blocks.
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                print(f"获取第{page}页视频ID失败: {e}")

        return list(dict.fromkeys(video_ids))[:360]  # dedupe, cap at 360

    def get_danmakus(self, aid):
        """Fetch all danmakus of one video into ``self.danmaku_list``.

        Returns True on success, False on any failure (failure is logged,
        not raised, so the caller's loop keeps going).
        """
        try:
            # Step 1: resolve the video's cid, which keys the danmaku endpoint.
            url = f"https://api.bilibili.com/x/web-interface/view?aid={aid}"
            response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            cid = response.json()["data"]["cid"]

            # Step 2: download the raw danmaku XML for that cid.
            danmaku_url = f"https://comment.bilibili.com/{cid}.xml"
            response = requests.get(danmaku_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            response.encoding = "utf-8"

            # Each danmaku is the text of a <d ...>text</d> element.
            danmakus = re.findall(r'<d.*?>(.*?)</d>', response.text)
            self.danmaku_list.extend(danmakus)

            print(f"成功获取视频{aid}的{len(danmakus)}条弹幕")
            time.sleep(random.uniform(0.5, 1.5))
            return True
        except Exception as e:
            print(f"获取视频{aid}弹幕失败: {e}")
            return False

    def run(self, keywords=("大语言模型", "大模型", "LLM")):
        """Crawl danmakus for every keyword and return the combined list.

        *keywords* may be any iterable of search strings.  The default is a
        tuple rather than a list to avoid the shared mutable-default pitfall.
        """
        all_video_ids = []
        for keyword in keywords:
            print(f"搜索关键词: {keyword}")
            all_video_ids.extend(self.get_video_ids(keyword))

        # Deduplicate across keywords and cap the total at 360 videos.
        unique_video_ids = list(dict.fromkeys(all_video_ids))[:360]
        print(f"共获取{len(unique_video_ids)}个视频ID,开始爬取弹幕...")

        for idx, aid in enumerate(unique_video_ids, 1):
            print(f"正在爬取第{idx}/{len(unique_video_ids)}个视频")
            self.get_danmakus(aid)

        print(f"爬取完成,共获取{len(self.danmaku_list)}条弹幕")
        return self.danmaku_list
|
|
|
|
|
|
|
|
|
|
class DanmakuAnalyzer:
    """Statistics, word-cloud and opinion analysis over a list of danmakus."""

    def __init__(self, danmakus):
        # Normalize: strip whitespace and drop empty entries up front.
        self.danmakus = [d.strip() for d in danmakus if d.strip()]
        self.stopwords = self.load_stopwords()

    def load_stopwords(self):
        """Load stopwords from stopwords.txt, falling back to a built-in set.

        Bug fix: the original used a bare ``except:``, which would also
        swallow KeyboardInterrupt/SystemExit; only file errors are expected.
        """
        try:
            with open("stopwords.txt", "r", encoding="utf-8") as f:
                return {line.strip() for line in f}
        except OSError:
            # Default minimal Chinese stopword set.
            return set(["的", "了", "是", "在", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这"])

    def count_danmakus(self):
        """Count danmaku frequencies.

        Returns ``(all_counter, ai_counter)``: a Counter over every danmaku,
        and one over the subset containing at least one AI-related keyword.
        """
        # Keywords that mark a danmaku as relevant to AI technology/usage.
        ai_related_words = ["AI", "人工智能", "大模型", "语言模型", "LLM", "训练", "算法", "数据", "应用", "成本", "效率", "伦理", "隐私", "安全", "未来", "发展", "编程", "创作", "教育"]
        ai_danmakus = [d for d in self.danmakus if any(word in d for word in ai_related_words)]

        all_counter = Counter(self.danmakus)
        ai_counter = Counter(ai_danmakus)
        return all_counter, ai_counter

    def save_to_excel(self, ai_counter, filename="弹幕统计.xlsx"):
        """Write the AI-related counts to *filename*; returns the DataFrame."""
        df = pd.DataFrame(ai_counter.most_common(), columns=["弹幕内容", "出现次数"])
        df.to_excel(filename, index=False)
        # Bug fix: the f-string had no placeholder, so the destination file
        # name was never actually printed.
        print(f"统计结果已保存到{filename}")
        return df

    def generate_wordcloud(self, filename="弹幕词云.png"):
        """Segment all danmakus with jieba and render a word cloud to *filename*."""
        all_text = " ".join(self.danmakus)
        words = jieba.cut(all_text)
        # Drop stopwords and single-character tokens (mostly noise).
        words = [word for word in words if word not in self.stopwords and len(word) > 1]
        text = " ".join(words)

        wc = WordCloud(
            font_path="simhei.ttf",  # NOTE(review): path must point at a local CJK font file
            background_color="white",
            width=1200,
            height=800,
            max_words=200,
            collocations=False  # avoid duplicated bigram artifacts
        ).generate(text)

        plt.figure(figsize=(12, 8))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.tight_layout()
        plt.savefig(filename, dpi=300)
        plt.show()
        # Bug fix: placeholderless f-string — print the real file name.
        print(f"词云图已保存到{filename}")

    def get_main_views(self, ai_counter):
        """Bucket AI-related danmakus into broad opinion categories.

        Each distinct danmaku goes to the FIRST category whose keyword list
        matches — cost, then application, then negative, then positive —
        falling through to "其他观点"; counts are weighted by occurrence.
        (Same first-match semantics as the original four copy-pasted loops,
        collapsed into one rule table.)
        """
        cost_related = ["成本", "价格", "费用", "免费", "付费"]
        application_related = ["应用", "使用", "场景", "领域", "行业", "教育", "医疗", "工作", "创作", "编程"]
        negative_related = ["风险", "危险", "伦理", "隐私", "安全", "失业", "替代", "问题"]
        positive_related = ["方便", "高效", "厉害", "强大", "有用", "帮助", "进步"]

        # (keyword list, category) pairs in matching-priority order.
        rules = [
            (cost_related, "应用成本"),
            (application_related, "应用领域"),
            (negative_related, "不利影响"),
            (positive_related, "积极影响"),
        ]

        categories = {
            "应用成本": 0,
            "应用领域": 0,
            "不利影响": 0,
            "积极影响": 0,
            "其他观点": 0
        }

        for danmaku, count in ai_counter.items():
            for words, category in rules:
                if any(word in danmaku for word in words):
                    categories[category] += count
                    break
            else:
                # No rule matched: miscellaneous opinion.
                categories["其他观点"] += count

        return categories
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Crawl danmakus for the default keyword set.
    spider = BilibiliDanmakuSpider()
    danmakus = spider.run()

    # Count all danmakus and the AI-related subset.
    analyzer = DanmakuAnalyzer(danmakus)
    all_counter, ai_counter = analyzer.count_danmakus()

    # Print the top-8 AI-related danmakus by frequency.
    print("\nAI技术应用相关弹幕数量排名前8:")
    top8 = ai_counter.most_common(8)
    for i, (danmaku, count) in enumerate(top8, 1):
        print(f"{i}. {danmaku}: {count}次")

    # Persist the counts to an Excel workbook.
    df = analyzer.save_to_excel(ai_counter)

    # Render the word cloud image.
    analyzer.generate_wordcloud()

    # Categorize and report the main viewpoints.
    main_views = analyzer.get_main_views(ai_counter)
    print("\nB站用户对大语言模型技术的主流看法统计:")
    for view, count in main_views.items():
        print(f"{view}: {count}条相关弹幕")

    # Bar chart of the viewpoint distribution, with counts labeled above bars.
    plt.figure(figsize=(10, 6))
    plt.bar(main_views.keys(), main_views.values(), color=['#4CAF50', '#2196F3', '#f44336', '#FFC107', '#9E9E9E'])
    plt.title('用户对大语言模型的主要关注点分布')
    plt.ylabel('弹幕数量')
    plt.xticks(rotation=30)
    for i, v in enumerate(main_views.values()):
        plt.text(i, v + 5, str(v), ha='center')
    plt.tight_layout()
    plt.savefig('用户观点分布.png', dpi=300)
    plt.show()
|