"""Scrape Bilibili danmu (bullet comments) for videos matching the keyword
"大语言模型" (large language model), then:

1. fetch the danmu of every video on up to 6 search-result pages,
2. save the top-20 danmu frequency counts to an Excel sheet, and
3. render a word-cloud image from the (jieba-segmented) danmu text.

Outputs are written next to this script: ``danmu_statistics.xlsx`` and
``wc_image.png``.
"""

import requests
import json
import re
import openpyxl
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
import numpy as np
import logging
import jieba
import time
import random
import os
import platform
import matplotlib.font_manager as fm

# Run relative to the script's own directory so all output files land beside it.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
print(f"工作目录已设置为: {os.getcwd()}")

# Accumulates every danmu string fetched across all videos.
danmu_list = []

# Bilibili search API: 50 results per page, keyword "大语言模型" (URL-encoded),
# restricted to videos.  The page number is appended per request below.
url = "https://api.bilibili.com/x/web-interface/wbi/search/type?page_size=50&keyword=%E5%A4%A7%E8%AF%AD%E8%A8%80%E6%A8%A1%E5%9E%8B&search_type=video"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.6 Safari/605.1.15",
    "Referer": "https://www.bilibili.com/"
}
# NOTE(review): SECURITY — these are real, live account credentials
# (SESSDATA / bili_jct / DedeUserID) committed to source control.  They
# should be revoked immediately and loaded from environment variables or a
# git-ignored config file instead of being hard-coded here.
cookies = {
    "buvid4": "E1155D85-CACE-0757-3AB5-FAA55B9BA4CE85795-022082117-PbDzNyYRQFSIZY4dZRE2fg%3D%3D",
    "theme-tip-show": "SHOWED",
    "browser_resolution": "1329-262",
    "home_feed_column": "4",
    "b_lsid": "62213F55_19A436DA31C",
    "CURRENT_FNVAL": "4048",
    "sid": "em2kxc9a",
    "bp_t_offset_114343848": "1130356626901958656",
    "bili_ticket": "eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NjIyNzkwNDAsImlhdCI6MTc2MjAxOTc4MCwicGx0IjotMX0.Yez-DzUP_yfD_omjBoKCpT_PZEZRunGBW_Em9KagqWk",
    "bili_ticket_expires": "1762278980",
    "enable_web_push": "DISABLE",
    "theme-avatar-tip-show": "SHOWED",
    "DedeUserID": "114343848",
    "DedeUserID__ckMd5": "a1ad3ab2cc29d68f",
    "SESSDATA": "1dc9a0f1%2C1771906511%2C3c84d%2A81CjC9x3-vzqR5SfpsKCUQ-HpeBVibiRaFbXisrTSWYmTkF0hJrYVG3iu7_ZcclIW6ZVQSVjVLU0tIekMyNXNVZVVUNGlYQ2pkaEJnTC02eEdlWFl4NjNGa3U5a1YtdnNmNWVlRU9VR21uTnpwY0VlVlQtcVZXN2pTbUtDaHBuVnRoUWlyWkU4M3l3IIEC",
    "bili_jct": "1b4c0293f347e832f38b30180cc959eb",
    "_uuid": "4E75EE8A-3B34-4C2D-5591-C5E6FE654BF858202infoc",
    "b_nut": "1756354457",
    "buvid3": "46F5C470-8FAD-264F-4587-E064C939733057781infoc",
    "enable_feed_channel": "DISABLE",
    "header_theme_version": "CLOSE",
    "CURRENT_QUALITY": "80",
    "buvid_fp": "206bf26677599ed3a4a4915740f56e57",
    "rpdid": "0zbfAGEhWy|OtunE0Mn|182|3w1OW2y6",
    "LIVE_BUVID": "AUTO5916605750922671"
}


def get_absolute_path(filename):
    """Return *filename* resolved against the directory containing this script."""
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)


def get_font_path():
    """Return the path of an installed CJK-capable font, or None.

    Probes a short list of well-known font locations per platform
    (macOS / Windows / Linux) and returns the first one that exists,
    so the word cloud can render Chinese text instead of boxes.
    """
    print("正在查找可用字体...")

    if platform.system() == "Darwin":
        # macOS system fonts.
        mac_fonts = [
            "/System/Library/Fonts/PingFang.ttc",        # PingFang
            "/System/Library/Fonts/STHeiti Medium.ttc",  # Heiti (some versions)
            "/Library/Fonts/Arial Unicode.ttf",          # Arial Unicode (fallback)
        ]
        for font_path in mac_fonts:
            if os.path.exists(font_path):
                print(f"✓ 找到系统字体: {os.path.basename(font_path)}")
                return font_path

    elif platform.system() == "Windows":
        # Windows system fonts.
        win_fonts = [
            "C:/Windows/Fonts/msyh.ttc",     # Microsoft YaHei
            "C:/Windows/Fonts/simhei.ttf",   # SimHei
            "C:/Windows/Fonts/simsun.ttc",   # SimSun
            "C:/Windows/Fonts/arialuni.ttf"  # Arial Unicode (fallback)
        ]
        for font_path in win_fonts:
            if os.path.exists(font_path):
                print(f"✓ 找到系统字体: {os.path.basename(font_path)}")
                return font_path

    elif platform.system() == "Linux":
        # Linux system fonts.
        linux_fonts = [
            "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",   # WenQuanYi Micro Hei
            "/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc",     # WenQuanYi Zen Hei
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"   # fallback (may lack CJK)
        ]
        for font_path in linux_fonts:
            if os.path.exists(font_path):
                print(f"✓ 找到系统字体: {os.path.basename(font_path)}")
                return font_path

    # Nothing matched on any platform.
    print("⚠ 未找到特定中文字体,使用系统默认字体(中文可能显示为方框)")
    return None


def load_mask_image():
    """Return a mask image (numpy array) for the word-cloud shape, or None.

    Looks for any of several conventional image filenames next to this
    script; a missing or unreadable image falls back to a rectangular cloud.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # Commonly used shape-image filenames, tried in order.
    mask_files = [
        "mask.png", "mask.jpg", "shape.png", "cloud.png",
        "heart.png", "star.png", "circle.png", "wordcloud_mask.png"
    ]

    for mask_file in mask_files:
        mask_path = os.path.join(script_dir, mask_file)
        if os.path.exists(mask_path):
            try:
                mask_image = np.array(Image.open(mask_path))
                print(f"✓ 找到形状图片: {mask_file}")
                return mask_image
            except Exception as e:
                print(f"加载形状图片 {mask_file} 失败: {e}")
                continue

    print("⚠ 未找到形状图片,将使用矩形词云")
    return None


# Task 1: fetch the danmu of a single video.
def get_danmu(bvid, headers):
    """Fetch all danmu texts for the video identified by *bvid*.

    Resolves the video's cid via the web-interface view API, downloads the
    comment XML from comment.bilibili.com, and extracts each ``<d>`` tag's
    text.  Returns a (possibly empty) list of danmu strings; any network or
    parse error is reported and yields an empty list.
    """
    try:
        cid_url = "https://api.bilibili.com/x/web-interface/view?bvid=" + bvid
        cid_req = requests.get(cid_url, headers=headers, timeout=10)
        cid_res = json.loads(cid_req.text)

        if cid_res.get('code') != 0:
            print(f"获取cid失败: {cid_res.get('message')}")
            return []

        if 'data' not in cid_res or 'cid' not in cid_res['data']:
            print(f"视频 {bvid} 无法获取cid,可能已删除或无法访问")
            return []

        cid = cid_res['data']['cid']

        danmu_url = "https://comment.bilibili.com/" + str(cid) + ".xml"
        danmu_req = requests.get(danmu_url, headers=headers, timeout=10)
        danmu_req.encoding = 'utf-8'
        # BUGFIX: the pattern had lost its XML tags and matched only empty
        # strings; the comment XML stores each danmu as <d p="...">text</d>.
        # (local renamed from danmu_list to avoid shadowing the global.)
        danmus = re.findall(r'<d p=".*?">(.*?)</d>', danmu_req.text)

        print(f"视频 {bvid} 获取到 {len(danmus)} 条弹幕")
        return danmus

    except Exception as e:
        print(f"获取弹幕时出错: {e}")
        return []


# Crawl up to 6 search-result pages and collect danmu from every video found.
print("开始搜索'大语言模型'相关视频...")
for i in range(6):
    try:
        sess = requests.session()
        req = sess.get(url + "&page=" + str(i + 1), headers=headers, cookies=cookies, timeout=10)
        res = json.loads(req.text)

        print(f"第{i+1}页请求状态码: {req.status_code}")

        if res.get('code') != 0:
            print(f"请求失败,状态码: {res.get('code')}")
            continue

        if 'data' not in res or 'result' not in res['data']:
            print(f"第{i+1}页未找到视频数据")
            continue

        videos = res['data']['result']
        print(f"第{i+1}页找到 {len(videos)} 个视频")

        video_count = 0
        for item in videos:
            # Search results mix entry types; only 'video' items carry a bvid.
            if item.get('type') == 'video' and 'bvid' in item:
                danmu_list.extend(get_danmu(item['bvid'], headers))
                video_count += 1
                time.sleep(1)  # be polite: throttle per-video requests

        print(f"第{i+1}页处理了 {video_count} 个视频")

    except Exception as e:
        print(f"处理第{i+1}页时出错: {e}")

    time.sleep(2)  # throttle between pages

# Fall back to canned sample data when the crawl produced nothing,
# so the statistics and word-cloud stages still have input.
if not danmu_list:
    print("未获取到弹幕数据,使用示例数据")
    danmu_list = [
        "大语言模型", "AI", "人工智能", "深度学习", "机器学习",
        "Transformer", "gpt", "神经网络", "deepseek", "预训练",
        "计算机", "ChatGPT", "LLM", "语言模型", "生成式AI",
        "大模型", "深度学习", "人工智能", "机器学习", "自然语言处理"
    ] * 10

print(f"总共获取到 {len(danmu_list)} 条弹幕")

# Task 2: count identical danmu and save the top 20 to an Excel sheet.
print("开始统计弹幕...")
danmu_count = Counter(danmu_list)
top_20_count = danmu_count.most_common(20)

# Build the workbook: column A = danmu text, column B = occurrence count.
workbook = openpyxl.Workbook()
sheet = workbook.active
sheet['A1'] = '弹幕'
sheet['B1'] = '数量'

for i, (danmu, count) in enumerate(top_20_count, start=2):
    sheet[f'A{i}'] = danmu
    sheet[f'B{i}'] = count

# Save the workbook next to this script.
excel_path = get_absolute_path('danmu_statistics.xlsx')
workbook.save(excel_path)
print(f"✓ Excel表已保存到: {excel_path}")

print("数量排名前20的弹幕:")
for danmu, count in top_20_count:
    print(f"{danmu}: {count}")

# Task 3: generate the word-cloud image.
print("开始生成词云图...")
jieba.setLogLevel(logging.WARNING)  # silence jieba's startup chatter


def filter_danmu(danmu):
    """Return *danmu* if it contains at least one CJK or ASCII letter, else None."""
    if re.search(r'[\u4e00-\u9fff]', danmu) or re.search(r'[a-zA-Z]', danmu):
        return danmu
    return None


filtered_danmu = [d for d in danmu_list if filter_danmu(d)]
print(f"弹幕已过滤,剩余 {len(filtered_danmu)} 条")

# Join the danmu into one text and segment it with jieba so the word cloud
# splits on Chinese word boundaries rather than whole danmu strings.
text = ' '.join(filtered_danmu)

try:
    wc_list = jieba.lcut(text)
    wc_text = ' '.join(wc_list)
    print("✓ 中文分词完成")

    # First 20 tokens, printed for debugging the segmentation.
    print("分词示例:", wc_list[:20])
except Exception as e:
    print(f"分词时出错: {e}")
    wc_text = text  # fall back to the unsegmented text

# Stopwords: wordcloud's English defaults plus a minimal set of Chinese
# particles/pronouns — deliberately small so meaningful words survive.
stopwords = set(STOPWORDS)
basic_stopwords = [
    # Interjections / modal particles.
    "啊", "呀", "呢", "吗", "吧", "啦", "哇", "哦", "唉", "嗯",
    # Meaningless single characters.
    "都", "就", "也", "还", "又", "很", "太", "真", "更", "最",
    "这", "那", "哪", "啥", "怎", "么",
    # Basic function words.
    "的", "了", "和", "是", "在", "我", "你", "他", "她", "它",
    "我们", "你们", "他们", "这个", "那个", "这些", "那些",
    "就是", "可以", "什么", "怎么", "为什么", "因为", "所以",
    "但是", "然后", "如果", "虽然", "不过", "其实", "当然"
]
stopwords.update(basic_stopwords)

print(f"停用词数量: {len(stopwords)}")

try:
    font_path = get_font_path()     # CJK-capable font, if any
    mask_image = load_mask_image()  # optional shape mask

    wordcloud_config = {
        'width': 1200,
        'height': 800,
        'background_color': 'white',
        'stopwords': stopwords,
        'mask': mask_image,
        'max_words': 100,           # fewer words => the important ones stand out
        'colormap': 'viridis',
        'relative_scaling': 0.3,    # soften frequency -> font-size mapping
        'min_font_size': 8,
        'max_font_size': 120,
        'random_state': 42,         # reproducible layout
        'collocations': False,      # avoid duplicated bigrams
    }

    if font_path:
        wordcloud_config['font_path'] = font_path
        print(f"ℹ 使用字体: {font_path}")
    else:
        print("ℹ 使用系统默认字体")

    wc = WordCloud(**wordcloud_config)

    # Sanity-check the input size before generating.
    print(f"生成词云的文本长度: {len(wc_text)}")

    # Pad sparse crawls with sample vocabulary so the cloud is not empty.
    if len(filtered_danmu) < 50:
        print("弹幕数据较少,混合示例数据")
        example_words = ["大语言模型", "AI", "人工智能", "深度学习", "机器学习",
                         "Transformer", "GPT", "神经网络", "预训练", "ChatGPT",
                         "LLM", "语言模型", "生成式AI", "大模型", "自然语言处理",
                         "计算机", "算法", "数据", "训练", "推理"]
        extra_text = ' '.join(example_words * 3)
        wc_text = wc_text + " " + extra_text

    wc = wc.generate(wc_text)

    # Render and save the cloud.
    plt.figure(figsize=(14, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title('大语言模型相关视频弹幕词云图', fontsize=18, pad=20)
    plt.tight_layout()

    wc_image_path = get_absolute_path("wc_image.png")
    wc.to_file(wc_image_path)
    print(f"✓ 词云图已保存为: {wc_image_path}")

    plt.show()

    # Report the dominant words and their normalized frequencies.
    word_freq = wc.words_
    top_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:20]
    print("词云中的主要词汇:")
    for word, freq in top_words:
        print(f"  {word}: {freq:.3f}")

except Exception as e:
    print(f"生成词云图时出错: {e}")
    import traceback
    traceback.print_exc()

# Final sanity check: confirm both output files exist and report their sizes.
print("\n=== 生成文件检查 ===")
files_to_check = ['danmu_statistics.xlsx', 'wc_image.png']

all_files_exist = True
for filename in files_to_check:
    file_path = get_absolute_path(filename)
    if os.path.exists(file_path):
        file_size = os.path.getsize(file_path)
        # BUGFIX: the filename placeholder had been replaced by a literal
        # "(unknown)"; interpolate the loop variable as intended.
        print(f"✓ {filename}: 存在 ({file_size} 字节)")
    else:
        print(f"✗ {filename}: 不存在")
        all_files_exist = False

if all_files_exist:
    print("🎉 所有文件生成成功!")
else:
    print("⚠ 部分文件生成失败")

print("程序执行完毕!")