import requests
import re
import json
import pandas as pd
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import time
import random
from bs4 import BeautifulSoup
import numpy as np
from PIL import Image

# Configure matplotlib to render Chinese text
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams["axes.unicode_minus"] = False  # render minus signs correctly
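# Runtime notes (assumptions worth verifying for your environment):
# - DataFrame.to_excel() needs an Excel backend such as openpyxl (pip install openpyxl).
# - BeautifulSoup(..., "xml") needs the lxml parser (pip install lxml).
# - The word cloud below points font_path at "simhei.ttf"; supply a CJK-capable
#   font file that actually exists on your system.
# - Bilibili's web APIs change over time and may require cookies or request
#   signing; if a request returns a non-zero "code", check the current API requirements.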
class BilibiliDanmakuAnalyzer:
    def __init__(self):
        # Request headers that mimic a regular browser
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "*/*",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive"
        }
        # All crawled danmaku (bullet comments)
        self.all_danmakus = []
        # Metadata of the crawled videos
        self.video_info = []

    def get_video_ids(self, keyword, page_count=36):
        """
        Search videos by keyword and collect their IDs.
        At roughly 10 videos per page, 36 pages yields about 360 videos.
        """
        video_ids = []
        for page in range(1, page_count + 1):
            try:
                # Bilibili web search API
                url = f"https://api.bilibili.com/x/web-interface/search/type?keyword={keyword}&search_type=video&page={page}"
                response = requests.get(url, headers=self.headers, timeout=10)
                data = json.loads(response.text)

                if data.get("code") == 0 and data.get("data"):
                    results = data["data"]["result"]
                    for item in results:
                        video_id = item["aid"]
                        video_ids.append(video_id)
                        # Save video metadata; the search API wraps matched
                        # keywords in <em> tags, so strip HTML from the title
                        self.video_info.append({
                            "video_id": video_id,
                            "title": re.sub(r"<[^>]+>", "", item["title"]),
                            "uploader": item["author"],
                            "views": item["play"],
                            "danmaku_count": item["video_review"]
                        })

                print(f"Fetched page {page}; {len(video_ids)} video IDs collected so far")
                # Sleep for a random interval to stay under anti-crawler limits
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                print(f"Failed to fetch video IDs on page {page}: {str(e)}")
                continue

        return list(dict.fromkeys(video_ids))  # de-duplicate, preserving order

    def get_danmakus(self, video_id):
        """Fetch the danmaku of a single video."""
        try:
            # Look up the cid (danmaku pool ID) via the video view API
            url = f"https://api.bilibili.com/x/web-interface/view?aid={video_id}"
            response = requests.get(url, headers=self.headers, timeout=10)
            data = json.loads(response.text)

            if data.get("code") == 0 and data.get("data"):
                cid = data["data"]["cid"]
                # Download the danmaku XML for that cid
                danmaku_url = f"https://comment.bilibili.com/{cid}.xml"
                response = requests.get(danmaku_url, headers=self.headers, timeout=10)
                response.encoding = "utf-8"

                # Parse the XML danmaku (the "xml" parser requires lxml)
                soup = BeautifulSoup(response.text, "xml")
                danmakus = soup.find_all("d")

                # Extract the danmaku text
                danmaku_texts = [danmaku.text.strip() for danmaku in danmakus]
                print(f"Video {video_id}: fetched {len(danmaku_texts)} danmaku")
                return danmaku_texts
            else:
                print(f"Failed to get the cid of video {video_id}")
                return []
        except Exception as e:
            print(f"Failed to fetch danmaku for video {video_id}: {str(e)}")
            return []

    def crawl_all_danmakus(self, keywords=("大语言模型", "大模型", "LLM"), max_videos=360):
        """Crawl the danmaku of every matching video."""
        all_video_ids = []

        # Collect video IDs for each keyword
        for keyword in keywords:
            print(f"Searching keyword: {keyword}")
            video_ids = self.get_video_ids(keyword)
            all_video_ids.extend(video_ids)
            time.sleep(2)

        # De-duplicate and cap the total number of videos
        unique_video_ids = list(dict.fromkeys(all_video_ids))[:max_videos]
        print(f"Collected {len(unique_video_ids)} unique video IDs; crawling danmaku...")

        # Crawl the danmaku of each video
        for i, video_id in enumerate(unique_video_ids):
            danmakus = self.get_danmakus(video_id)
            self.all_danmakus.extend(danmakus)

            # Save every 10 videos to guard against losing data mid-run
            if (i + 1) % 10 == 0:
                self.save_danmakus_to_file()
                print(f"Crawled {i + 1}/{len(unique_video_ids)} videos; {len(self.all_danmakus)} danmaku so far")

            # Sleep for a random interval to stay under anti-crawler limits
            time.sleep(random.uniform(1, 2))

        # Final save
        self.save_danmakus_to_file()
        print(f"Crawling finished; {len(self.all_danmakus)} danmaku collected in total")

        # Save the video metadata, dropping duplicates from overlapping searches
        df = pd.DataFrame(self.video_info).drop_duplicates(subset="video_id")
        df.to_excel("video_info.xlsx", index=False)

        return self.all_danmakus

    def save_danmakus_to_file(self, filename="danmaku_data.txt"):
        """Save the danmaku to a text file, one per line."""
        with open(filename, "w", encoding="utf-8") as f:
            for danmaku in self.all_danmakus:
                f.write(danmaku + "\n")

    def load_danmakus_from_file(self, filename="danmaku_data.txt"):
        """Load previously saved danmaku from a text file."""
        try:
            with open(filename, "r", encoding="utf-8") as f:
                self.all_danmakus = [line.strip() for line in f if line.strip()]
            print(f"Loaded {len(self.all_danmakus)} danmaku from file")
            return self.all_danmakus
        except Exception as e:
            print(f"Failed to load danmaku data: {str(e)}")
            return []

    def analyze_application_cases(self, top_n=8):
        """Count mentions of LLM application cases and rank the top N."""
        # Common LLM application-domain keywords, kept in Chinese because
        # they are matched against Chinese danmaku text
        application_keywords = [
            "聊天机器人", "智能客服", "内容创作", "代码生成", "教育辅导",
            "翻译", "数据分析", "医疗诊断", "自动写作", "语音助手",
            "图像生成", "游戏开发", "推荐系统", "法律咨询", "金融分析", "市场营销"
        ]

        # Count how often each application domain is mentioned
        application_counts = {keyword: 0 for keyword in application_keywords}
        for danmaku in self.all_danmakus:
            for keyword in application_keywords:
                if keyword in danmaku:
                    application_counts[keyword] += 1

        # Sort by mention count and keep the top N
        sorted_applications = sorted(application_counts.items(), key=lambda x: x[1], reverse=True)
        top_applications = sorted_applications[:top_n]

        # Save the ranking to Excel
        df = pd.DataFrame(top_applications, columns=["application_case", "mentions"])
        df.to_excel("llm_application_cases.xlsx", index=False)

        return top_applications

    def generate_wordcloud(self, mask=None, filename="danmaku_wordcloud.png"):
        """Generate a word cloud from all danmaku."""
        # Merge all danmaku into one text blob
        text = " ".join(self.all_danmakus)

        # Tokenize with jieba and drop single-character tokens
        words = [word for word in jieba.cut(text) if len(word) > 1]
        words_text = " ".join(words)

        # Configure the word cloud; font_path must point to a Chinese-capable font
        wc = WordCloud(
            font_path="simhei.ttf",
            background_color="white",
            max_words=200,
            mask=mask,
            contour_width=1,
            contour_color="steelblue"
        )

        # Generate and display the word cloud
        wc.generate(words_text)
        plt.figure(figsize=(12, 8))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.title("Danmaku word cloud for LLM-related Bilibili videos")
        plt.tight_layout(pad=0)

        # Save the image
        wc.to_file(filename)
        print(f"Word cloud saved as {filename}")
        plt.show()

    def analyze_sentiment(self):
        """A simple keyword-based breakdown of user opinions."""
        # Keyword groups, kept in Chinese to match the danmaku text
        cost_keywords = ["贵", "便宜", "成本", "收费", "免费", "价格"]              # cost
        field_keywords = ["教育", "医疗", "工作", "学习", "娱乐", "创作", "办公"]    # application fields
        negative_keywords = ["失业", "取代", "错误", "偏见", "隐私", "风险", "依赖"]  # downsides
        positive_keywords = ["方便", "高效", "有用", "帮助", "创新", "进步", "强大"]  # upsides

        # Count danmaku containing at least one keyword from each group
        cost_count = sum(1 for danmaku in self.all_danmakus if any(kw in danmaku for kw in cost_keywords))
        field_count = sum(1 for danmaku in self.all_danmakus if any(kw in danmaku for kw in field_keywords))
        negative_count = sum(1 for danmaku in self.all_danmakus if any(kw in danmaku for kw in negative_keywords))
        positive_count = sum(1 for danmaku in self.all_danmakus if any(kw in danmaku for kw in positive_keywords))

        # Express each group as a share of all danmaku
        total = len(self.all_danmakus)
        if total == 0:
            return {}

        result = {
            "Cost-related discussion": f"{cost_count / total * 100:.2f}%",
            "Application-field discussion": f"{field_count / total * 100:.2f}%",
            "Negative-impact discussion": f"{negative_count / total * 100:.2f}%",
            "Positive-impact discussion": f"{positive_count / total * 100:.2f}%"
        }
        return result

    def predict_trend(self):
        """A static, opinion-based forecast of LLM application trends."""
        trends = [
            "1. Vertical specialization: LLMs will go deeper into professional domains such as medicine, law, and education",
            "2. Stronger personalization: models will understand user needs better and deliver tailored services",
            "3. Multimodal fusion: text, image, and speech capabilities will merge more deeply",
            "4. More edge deployment: more models will run on edge devices, improving latency and privacy",
            "5. Tighter regulation: as adoption widens, laws and regulations will gradually mature",
            "6. Low-code/no-code integration: lower barriers will let more people tap LLM capabilities"
        ]
        return trends
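# --- Optional visualization helper ---
# A minimal sketch, not part of the original pipeline: it charts the
# (keyword, count) pairs returned by analyze_application_cases() using the
# matplotlib setup already imported above. The function name and the output
# filename are illustrative assumptions.
def plot_top_applications(top_applications, filename="llm_application_ranking.png"):
    """Render the ranked application cases as a horizontal bar chart."""
    if not top_applications:
        return
    labels = [app for app, _ in top_applications]
    counts = [count for _, count in top_applications]
    plt.figure(figsize=(10, 6))
    # Reverse so the most-mentioned case ends up at the top of the chart
    plt.barh(labels[::-1], counts[::-1], color="steelblue")
    plt.xlabel("Mentions in danmaku")
    plt.title("Top LLM application cases in Bilibili danmaku")
    plt.tight_layout()
    plt.savefig(filename)
    plt.show()
# Usage: call plot_top_applications(top_applications) right after
# analyze_application_cases() in main() if a chart is wanted alongside the Excel export.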
def main():
    # Create the analyzer
    analyzer = BilibiliDanmakuAnalyzer()

    # Crawl fresh data or load previously saved data
    choice = input("Choose an action (1: crawl new data, 2: load saved data): ")

    if choice == "1":
        analyzer.crawl_all_danmakus()
    else:
        analyzer.load_danmakus_from_file()

    if not analyzer.all_danmakus:
        print("No danmaku data available; exiting")
        return

    # Rank the top 8 application cases
    print("\n===== Top 8 LLM application cases =====")
    top_applications = analyzer.analyze_application_cases(8)
    for i, (app, count) in enumerate(top_applications, 1):
        print(f"{i}. {app}: {count} mentions")

    # Generate the word cloud
    print("\n===== Generating word cloud =====")
    # Optionally use a custom shape as the word-cloud mask
    try:
        mask = np.array(Image.open("cloud_mask.png"))  # if a mask image exists
        analyzer.generate_wordcloud(mask)
    except Exception:
        analyzer.generate_wordcloud()

    # Analyze user opinions
    print("\n===== User opinion analysis =====")
    sentiment = analyzer.analyze_sentiment()
    for key, value in sentiment.items():
        print(f"{key}: {value}")

    # Draw conclusions
    print("\n===== Conclusions =====")
    print("1. Judging by the danmaku, the LLM application areas Bilibili users care about most are: "
          + ", ".join([app for app, _ in top_applications[:3]]))

    if float(sentiment["Positive-impact discussion"].rstrip("%")) > float(sentiment["Negative-impact discussion"].rstrip("%")):
        print("2. Overall, users lean positive about LLMs, focusing on the convenience and efficiency gains")
    else:
        print("2. Overall, users voice considerable concern about LLMs, focusing on their potential downsides")

    print("3. Application fields draw the broadest discussion, suggesting users care most about how LLMs land in practice")

    # Predict development trends
    print("\n===== Predicted trends for LLM applications =====")
    trends = analyzer.predict_trend()
    for trend in trends:
        print(trend)


if __name__ == "__main__":
    main()
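# Non-interactive usage sketch (assumes danmaku_data.txt already exists from a
# previous crawl); it mirrors main() without the input() prompt, which suits
# scheduled or scripted runs:
#
#   analyzer = BilibiliDanmakuAnalyzer()
#   analyzer.load_danmakus_from_file()
#   if analyzer.all_danmakus:
#       analyzer.analyze_application_cases(8)
#       analyzer.generate_wordcloud()
#       print(analyzer.analyze_sentiment())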