"""Scrape Bilibili danmu (bullet comments) for videos matching the keyword
"大语言模型" (large language model), then:

1. fetch the danmu of every video on up to 6 search-result pages,
2. save the top-20 danmu frequency counts to an Excel sheet, and
3. render a word-cloud image from the (jieba-segmented) danmu text.

Outputs are written next to this script: ``danmu_statistics.xlsx`` and
``wc_image.png``.
"""

import requests
import json
import re
import openpyxl
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
import numpy as np
import logging
import jieba
import time
import random
import os
import platform
import matplotlib.font_manager as fm

# Run relative to the script's own directory so all output files land beside it.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
print(f"工作目录已设置为: {os.getcwd()}")

# Accumulates every danmu string fetched across all videos.
danmu_list = []

# Bilibili search API: 50 results per page, keyword "大语言模型" (URL-encoded),
# restricted to videos.  The page number is appended per request below.
url = "https://api.bilibili.com/x/web-interface/wbi/search/type?page_size=50&keyword=%E5%A4%A7%E8%AF%AD%E8%A8%80%E6%A8%A1%E5%9E%8B&search_type=video"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.6 Safari/605.1.15",
    "Referer": "https://www.bilibili.com/"
}
# NOTE(review): SECURITY — these are real, live account credentials
# (SESSDATA / bili_jct / DedeUserID) committed to source control.  They
# should be revoked immediately and loaded from environment variables or a
# git-ignored config file instead of being hard-coded here.
cookies = {
    "buvid4": "E1155D85-CACE-0757-3AB5-FAA55B9BA4CE85795-022082117-PbDzNyYRQFSIZY4dZRE2fg%3D%3D",
    "theme-tip-show": "SHOWED",
    "browser_resolution": "1329-262",
    "home_feed_column": "4",
    "b_lsid": "62213F55_19A436DA31C",
    "CURRENT_FNVAL": "4048",
    "sid": "em2kxc9a",
    "bp_t_offset_114343848": "1130356626901958656",
    "bili_ticket": "eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NjIyNzkwNDAsImlhdCI6MTc2MjAxOTc4MCwicGx0IjotMX0.Yez-DzUP_yfD_omjBoKCpT_PZEZRunGBW_Em9KagqWk",
    "bili_ticket_expires": "1762278980",
    "enable_web_push": "DISABLE",
    "theme-avatar-tip-show": "SHOWED",
    "DedeUserID": "114343848",
    "DedeUserID__ckMd5": "a1ad3ab2cc29d68f",
    "SESSDATA": "1dc9a0f1%2C1771906511%2C3c84d%2A81CjC9x3-vzqR5SfpsKCUQ-HpeBVibiRaFbXisrTSWYmTkF0hJrYVG3iu7_ZcclIW6ZVQSVjVLU0tIekMyNXNVZVVUNGlYQ2pkaEJnTC02eEdlWFl4NjNGa3U5a1YtdnNmNWVlRU9VR21uTnpwY0VlVlQtcVZXN2pTbUtDaHBuVnRoUWlyWkU4M3l3IIEC",
    "bili_jct": "1b4c0293f347e832f38b30180cc959eb",
    "_uuid": "4E75EE8A-3B34-4C2D-5591-C5E6FE654BF858202infoc",
    "b_nut": "1756354457",
    "buvid3": "46F5C470-8FAD-264F-4587-E064C939733057781infoc",
    "enable_feed_channel": "DISABLE",
    "header_theme_version": "CLOSE",
    "CURRENT_QUALITY": "80",
    "buvid_fp": "206bf26677599ed3a4a4915740f56e57",
    "rpdid": "0zbfAGEhWy|OtunE0Mn|182|3w1OW2y6",
    "LIVE_BUVID": "AUTO5916605750922671"
}


def get_absolute_path(filename):
    """Return *filename* resolved against the directory containing this script."""
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)


def get_font_path():
    """Return the path of an installed CJK-capable font, or None.

    Probes a short list of well-known font locations per platform
    (macOS / Windows / Linux) and returns the first one that exists,
    so the word cloud can render Chinese text instead of boxes.
    """
    print("正在查找可用字体...")

    if platform.system() == "Darwin":
        # macOS system fonts.
        mac_fonts = [
            "/System/Library/Fonts/PingFang.ttc",        # PingFang
            "/System/Library/Fonts/STHeiti Medium.ttc",  # Heiti (some versions)
            "/Library/Fonts/Arial Unicode.ttf",          # Arial Unicode (fallback)
        ]
        for font_path in mac_fonts:
            if os.path.exists(font_path):
                print(f"✓ 找到系统字体: {os.path.basename(font_path)}")
                return font_path

    elif platform.system() == "Windows":
        # Windows system fonts.
        win_fonts = [
            "C:/Windows/Fonts/msyh.ttc",     # Microsoft YaHei
            "C:/Windows/Fonts/simhei.ttf",   # SimHei
            "C:/Windows/Fonts/simsun.ttc",   # SimSun
            "C:/Windows/Fonts/arialuni.ttf"  # Arial Unicode (fallback)
        ]
        for font_path in win_fonts:
            if os.path.exists(font_path):
                print(f"✓ 找到系统字体: {os.path.basename(font_path)}")
                return font_path

    elif platform.system() == "Linux":
        # Linux system fonts.
        linux_fonts = [
            "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",   # WenQuanYi Micro Hei
            "/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc",     # WenQuanYi Zen Hei
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"   # fallback (may lack CJK)
        ]
        for font_path in linux_fonts:
            if os.path.exists(font_path):
                print(f"✓ 找到系统字体: {os.path.basename(font_path)}")
                return font_path

    # Nothing matched on any platform.
    print("⚠ 未找到特定中文字体,使用系统默认字体(中文可能显示为方框)")
    return None


def load_mask_image():
    """Return a mask image (numpy array) for the word-cloud shape, or None.

    Looks for any of several conventional image filenames next to this
    script; a missing or unreadable image falls back to a rectangular cloud.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # Commonly used shape-image filenames, tried in order.
    mask_files = [
        "mask.png", "mask.jpg", "shape.png", "cloud.png",
        "heart.png", "star.png", "circle.png", "wordcloud_mask.png"
    ]

    for mask_file in mask_files:
        mask_path = os.path.join(script_dir, mask_file)
        if os.path.exists(mask_path):
            try:
                mask_image = np.array(Image.open(mask_path))
                print(f"✓ 找到形状图片: {mask_file}")
                return mask_image
            except Exception as e:
                print(f"加载形状图片 {mask_file} 失败: {e}")
                continue

    print("⚠ 未找到形状图片,将使用矩形词云")
    return None


# Task 1: fetch the danmu of a single video.
def get_danmu(bvid, headers):
    """Fetch all danmu texts for the video identified by *bvid*.

    Resolves the video's cid via the web-interface view API, downloads the
    comment XML from comment.bilibili.com, and extracts each ``<d>`` tag's
    text.  Returns a (possibly empty) list of danmu strings; any network or
    parse error is reported and yields an empty list.
    """
    try:
        cid_url = "https://api.bilibili.com/x/web-interface/view?bvid=" + bvid
        cid_req = requests.get(cid_url, headers=headers, timeout=10)
        cid_res = json.loads(cid_req.text)

        if cid_res.get('code') != 0:
            print(f"获取cid失败: {cid_res.get('message')}")
            return []

        if 'data' not in cid_res or 'cid' not in cid_res['data']:
            print(f"视频 {bvid} 无法获取cid,可能已删除或无法访问")
            return []

        cid = cid_res['data']['cid']

        danmu_url = "https://comment.bilibili.com/" + str(cid) + ".xml"
        danmu_req = requests.get(danmu_url, headers=headers, timeout=10)
        danmu_req.encoding = 'utf-8'
        # BUGFIX: the pattern had lost its XML tags and matched only empty
        # strings; the comment XML stores each danmu as <d p="...">text</d>.
        # (local renamed from danmu_list to avoid shadowing the global.)
        danmus = re.findall(r'<d p=".*?">(.*?)</d>', danmu_req.text)

        print(f"视频 {bvid} 获取到 {len(danmus)} 条弹幕")
        return danmus

    except Exception as e:
        print(f"获取弹幕时出错: {e}")
        return []


# Crawl up to 6 search-result pages and collect danmu from every video found.
print("开始搜索'大语言模型'相关视频...")
for i in range(6):
    try:
        sess = requests.session()
        req = sess.get(url + "&page=" + str(i + 1), headers=headers, cookies=cookies, timeout=10)
        res = json.loads(req.text)

        print(f"第{i+1}页请求状态码: {req.status_code}")

        if res.get('code') != 0:
            print(f"请求失败,状态码: {res.get('code')}")
            continue

        if 'data' not in res or 'result' not in res['data']:
            print(f"第{i+1}页未找到视频数据")
            continue

        videos = res['data']['result']
        print(f"第{i+1}页找到 {len(videos)} 个视频")

        video_count = 0
        for item in videos:
            # Search results mix entry types; only 'video' items carry a bvid.
            if item.get('type') == 'video' and 'bvid' in item:
                danmu_list.extend(get_danmu(item['bvid'], headers))
                video_count += 1
                time.sleep(1)  # be polite: throttle per-video requests

        print(f"第{i+1}页处理了 {video_count} 个视频")

    except Exception as e:
        print(f"处理第{i+1}页时出错: {e}")

    time.sleep(2)  # throttle between pages

# Fall back to canned sample data when the crawl produced nothing,
# so the statistics and word-cloud stages still have input.
if not danmu_list:
    print("未获取到弹幕数据,使用示例数据")
    danmu_list = [
        "大语言模型", "AI", "人工智能", "深度学习", "机器学习",
        "Transformer", "gpt", "神经网络", "deepseek", "预训练",
        "计算机", "ChatGPT", "LLM", "语言模型", "生成式AI",
        "大模型", "深度学习", "人工智能", "机器学习", "自然语言处理"
    ] * 10

print(f"总共获取到 {len(danmu_list)} 条弹幕")

# Task 2: count identical danmu and save the top 20 to an Excel sheet.
print("开始统计弹幕...")
danmu_count = Counter(danmu_list)
top_20_count = danmu_count.most_common(20)

# Build the workbook: column A = danmu text, column B = occurrence count.
workbook = openpyxl.Workbook()
sheet = workbook.active
sheet['A1'] = '弹幕'
sheet['B1'] = '数量'

for i, (danmu, count) in enumerate(top_20_count, start=2):
    sheet[f'A{i}'] = danmu
    sheet[f'B{i}'] = count

# Save the workbook next to this script.
excel_path = get_absolute_path('danmu_statistics.xlsx')
workbook.save(excel_path)
print(f"✓ Excel表已保存到: {excel_path}")

print("数量排名前20的弹幕:")
for danmu, count in top_20_count:
    print(f"{danmu}: {count}")

# Task 3: generate the word-cloud image.
print("开始生成词云图...")
jieba.setLogLevel(logging.WARNING)  # silence jieba's startup chatter


def filter_danmu(danmu):
    """Return *danmu* if it contains at least one CJK or ASCII letter, else None."""
    if re.search(r'[\u4e00-\u9fff]', danmu) or re.search(r'[a-zA-Z]', danmu):
        return danmu
    return None


filtered_danmu = [d for d in danmu_list if filter_danmu(d)]
print(f"弹幕已过滤,剩余 {len(filtered_danmu)} 条")

# Join the danmu into one text and segment it with jieba so the word cloud
# splits on Chinese word boundaries rather than whole danmu strings.
text = ' '.join(filtered_danmu)

try:
    wc_list = jieba.lcut(text)
    wc_text = ' '.join(wc_list)
    print("✓ 中文分词完成")

    # First 20 tokens, printed for debugging the segmentation.
    print("分词示例:", wc_list[:20])
except Exception as e:
    print(f"分词时出错: {e}")
    wc_text = text  # fall back to the unsegmented text

# Stopwords: wordcloud's English defaults plus a minimal set of Chinese
# particles/pronouns — deliberately small so meaningful words survive.
stopwords = set(STOPWORDS)
basic_stopwords = [
    # Interjections / modal particles.
    "啊", "呀", "呢", "吗", "吧", "啦", "哇", "哦", "唉", "嗯",
    # Meaningless single characters.
    "都", "就", "也", "还", "又", "很", "太", "真", "更", "最",
    "这", "那", "哪", "啥", "怎", "么",
    # Basic function words.
    "的", "了", "和", "是", "在", "我", "你", "他", "她", "它",
    "我们", "你们", "他们", "这个", "那个", "这些", "那些",
    "就是", "可以", "什么", "怎么", "为什么", "因为", "所以",
    "但是", "然后", "如果", "虽然", "不过", "其实", "当然"
]
stopwords.update(basic_stopwords)

print(f"停用词数量: {len(stopwords)}")

try:
    font_path = get_font_path()     # CJK-capable font, if any
    mask_image = load_mask_image()  # optional shape mask

    wordcloud_config = {
        'width': 1200,
        'height': 800,
        'background_color': 'white',
        'stopwords': stopwords,
        'mask': mask_image,
        'max_words': 100,           # fewer words => the important ones stand out
        'colormap': 'viridis',
        'relative_scaling': 0.3,    # soften frequency -> font-size mapping
        'min_font_size': 8,
        'max_font_size': 120,
        'random_state': 42,         # reproducible layout
        'collocations': False,      # avoid duplicated bigrams
    }

    if font_path:
        wordcloud_config['font_path'] = font_path
        print(f"ℹ 使用字体: {font_path}")
    else:
        print("ℹ 使用系统默认字体")

    wc = WordCloud(**wordcloud_config)

    # Sanity-check the input size before generating.
    print(f"生成词云的文本长度: {len(wc_text)}")

    # Pad sparse crawls with sample vocabulary so the cloud is not empty.
    if len(filtered_danmu) < 50:
        print("弹幕数据较少,混合示例数据")
        example_words = ["大语言模型", "AI", "人工智能", "深度学习", "机器学习",
                         "Transformer", "GPT", "神经网络", "预训练", "ChatGPT",
                         "LLM", "语言模型", "生成式AI", "大模型", "自然语言处理",
                         "计算机", "算法", "数据", "训练", "推理"]
        extra_text = ' '.join(example_words * 3)
        wc_text = wc_text + " " + extra_text

    wc = wc.generate(wc_text)

    # Render and save the cloud.
    plt.figure(figsize=(14, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title('大语言模型相关视频弹幕词云图', fontsize=18, pad=20)
    plt.tight_layout()

    wc_image_path = get_absolute_path("wc_image.png")
    wc.to_file(wc_image_path)
    print(f"✓ 词云图已保存为: {wc_image_path}")

    plt.show()

    # Report the dominant words and their normalized frequencies.
    word_freq = wc.words_
    top_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:20]
    print("词云中的主要词汇:")
    for word, freq in top_words:
        print(f"  {word}: {freq:.3f}")

except Exception as e:
    print(f"生成词云图时出错: {e}")
    import traceback
    traceback.print_exc()

# Final sanity check: confirm both output files exist and report their sizes.
print("\n=== 生成文件检查 ===")
files_to_check = ['danmu_statistics.xlsx', 'wc_image.png']

all_files_exist = True
for filename in files_to_check:
    file_path = get_absolute_path(filename)
    if os.path.exists(file_path):
        file_size = os.path.getsize(file_path)
        # BUGFIX: the filename placeholder had been replaced by a literal
        # "(unknown)"; interpolate the loop variable as intended.
        print(f"✓ {filename}: 存在 ({file_size} 字节)")
    else:
        print(f"✗ {filename}: 不存在")
        all_files_exist = False

if all_files_exist:
    print("🎉 所有文件生成成功!")
else:
    print("⚠ 部分文件生成失败")

print("程序执行完毕!")