|
|
import requests
|
|
|
import re
|
|
|
import json
|
|
|
import pandas as pd
|
|
|
import jieba
|
|
|
from wordcloud import WordCloud
|
|
|
import matplotlib.pyplot as plt
|
|
|
from collections import Counter
|
|
|
import time
|
|
|
import random
|
|
|
from bs4 import BeautifulSoup
|
|
|
import numpy as np
|
|
|
from PIL import Image
|
|
|
|
|
|
# Configure matplotlib for Chinese text rendering (first available CJK font wins).
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly alongside CJK fonts
|
|
|
|
|
|
class BilibiliDanmakuAnalyzer:
    """Crawl and analyze Bilibili danmaku (bullet comments) for LLM-related videos.

    Pipeline:
      1. Search videos by keyword via the Bilibili web search API.
      2. Resolve each video's cid (danmaku pool id) and download its XML danmaku feed.
      3. Run keyword statistics / simple opinion analysis and render a word cloud.

    Results are persisted to text/Excel/PNG files in the current working directory.
    """

    # Default search keywords used by crawl_all_danmakus when none are supplied.
    # (A tuple, so it cannot be accidentally mutated between calls.)
    DEFAULT_KEYWORDS = ("大语言模型", "大模型", "LLM")

    # Seconds before an HTTP request is aborted.  The original code passed no
    # timeout, so a single stalled connection could hang the crawler forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Browser-like headers to reduce the chance of being blocked by anti-crawling.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "*/*",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive"
        }
        # Every danmaku text collected so far, across all crawled videos.
        self.all_danmakus = []
        # One metadata dict per discovered video (id, title, uploader, play/danmaku counts).
        self.video_info = []

    def get_video_ids(self, keyword, page_count=36):
        """Search Bilibili for *keyword* and collect video ids (aids).

        Each result page holds roughly 10 videos, so the default 36 pages
        yield about 360 candidates.  One metadata record per new video is
        appended to self.video_info.  Failed pages are logged and skipped.

        Returns a de-duplicated list of aids in discovery order.
        """
        video_ids = []
        seen = set()  # avoid duplicate ids AND duplicate video_info rows across pages
        for page in range(1, page_count + 1):
            try:
                # Bilibili web search endpoint (video type only).
                url = (
                    "https://api.bilibili.com/x/web-interface/search/type"
                    f"?keyword={keyword}&search_type=video&page={page}"
                )
                response = requests.get(url, headers=self.headers,
                                        timeout=self.REQUEST_TIMEOUT)
                data = response.json()

                if data.get("code") == 0 and data.get("data"):
                    for item in data["data"]["result"]:
                        video_id = item["aid"]
                        if video_id in seen:
                            continue
                        seen.add(video_id)
                        video_ids.append(video_id)
                        # Keep the search metadata for the final Excel report.
                        self.video_info.append({
                            "video_id": video_id,
                            "title": item["title"],
                            "up主": item["author"],
                            "播放量": item["play"],
                            "弹幕数": item["video_review"]
                        })
                    print(f"已获取第{page}页视频,累计{len(video_ids)}个视频ID")

                # Random pause to stay under the anti-crawler radar.
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                # Best effort: log the failed page and continue with the next one.
                print(f"获取第{page}页视频ID失败: {str(e)}")
                continue

        return video_ids

    def get_danmakus(self, video_id):
        """Return the list of danmaku texts for one video (empty list on any failure)."""
        try:
            # Step 1: resolve the video's cid, which identifies its danmaku pool.
            url = f"https://api.bilibili.com/x/web-interface/view?aid={video_id}"
            response = requests.get(url, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
            data = response.json()

            if data.get("code") == 0 and data.get("data"):
                cid = data["data"]["cid"]

                # Step 2: fetch the XML danmaku feed for that cid.
                danmaku_url = f"https://comment.bilibili.com/{cid}.xml"
                response = requests.get(danmaku_url, headers=self.headers,
                                        timeout=self.REQUEST_TIMEOUT)
                response.encoding = "utf-8"

                # Each <d> element in the feed carries one danmaku's text.
                soup = BeautifulSoup(response.text, "xml")
                danmaku_texts = [d.text.strip() for d in soup.find_all("d")]
                print(f"视频ID {video_id} 获取到 {len(danmaku_texts)} 条弹幕")
                return danmaku_texts

            print(f"获取视频 {video_id} 的cid失败")
            return []
        except Exception as e:
            print(f"获取视频 {video_id} 弹幕失败: {str(e)}")
            return []

    def crawl_all_danmakus(self, keywords=None, max_videos=360):
        """Crawl danmakus for every video matching *keywords*.

        keywords   -- iterable of search terms; defaults to DEFAULT_KEYWORDS.
                      (Was a mutable list default in the original — fixed.)
        max_videos -- cap on how many distinct videos to crawl.

        Saves partial danmaku data every 10 videos, the full data at the end,
        and the video metadata to "视频信息.xlsx".  Returns self.all_danmakus.
        """
        if keywords is None:
            keywords = self.DEFAULT_KEYWORDS

        all_video_ids = []

        # Gather candidate video ids from every keyword.
        for keyword in keywords:
            print(f"开始搜索关键词: {keyword}")
            all_video_ids.extend(self.get_video_ids(keyword))
            time.sleep(2)

        # De-duplicate while preserving discovery order (list(set(...)) would
        # make the crawl order nondeterministic), then cap the count.
        unique_video_ids = list(dict.fromkeys(all_video_ids))[:max_videos]
        print(f"共获取到 {len(unique_video_ids)} 个不重复的视频ID,开始爬取弹幕...")

        for i, video_id in enumerate(unique_video_ids):
            self.all_danmakus.extend(self.get_danmakus(video_id))

            # Checkpoint every 10 videos so a crash doesn't lose everything.
            if (i + 1) % 10 == 0:
                self.save_danmakus_to_file()
                print(f"已完成 {i + 1}/{len(unique_video_ids)} 个视频的弹幕爬取,累计弹幕数: {len(self.all_danmakus)}")

            # Random pause to stay under the anti-crawler radar.
            time.sleep(random.uniform(1, 2))

        # Final flush of the collected danmakus.
        self.save_danmakus_to_file()
        print(f"所有视频弹幕爬取完成,共获取 {len(self.all_danmakus)} 条弹幕")

        # Persist the per-video metadata for later inspection.
        df = pd.DataFrame(self.video_info)
        df.to_excel("视频信息.xlsx", index=False)

        return self.all_danmakus

    def save_danmakus_to_file(self, filename="弹幕数据.txt"):
        """Write all collected danmakus to *filename*, one per line (overwrites)."""
        with open(filename, "w", encoding="utf-8") as f:
            f.writelines(danmaku + "\n" for danmaku in self.all_danmakus)

    def load_danmakus_from_file(self, filename="弹幕数据.txt"):
        """Load danmakus from *filename* into self.all_danmakus.

        Blank lines are skipped.  Returns the loaded list, or [] on failure.
        """
        try:
            with open(filename, "r", encoding="utf-8") as f:
                self.all_danmakus = [line.strip() for line in f if line.strip()]
            print(f"从文件加载了 {len(self.all_danmakus)} 条弹幕数据")
            return self.all_danmakus
        except Exception as e:
            print(f"加载弹幕数据失败: {str(e)}")
            return []

    def analyze_application_cases(self, top_n=8):
        """Count mentions of known LLM application areas in the danmakus.

        Returns the top_n (keyword, count) pairs sorted by count descending,
        and saves the same table to "LLM应用案例统计.xlsx".
        """
        # Common LLM application-domain keywords to look for.
        application_keywords = [
            "聊天机器人", "智能客服", "内容创作", "代码生成",
            "教育辅导", "翻译", "数据分析", "医疗诊断",
            "自动写作", "语音助手", "图像生成", "游戏开发",
            "推荐系统", "法律咨询", "金融分析", "市场营销"
        ]

        # Number of danmakus mentioning each application area.
        application_counts = {keyword: 0 for keyword in application_keywords}
        for danmaku in self.all_danmakus:
            for keyword in application_keywords:
                if keyword in danmaku:
                    application_counts[keyword] += 1

        # Rank by mention count, keep the top N.
        top_applications = sorted(application_counts.items(),
                                  key=lambda x: x[1], reverse=True)[:top_n]

        # Persist the ranking for the report.
        df = pd.DataFrame(top_applications, columns=["应用案例", "出现次数"])
        df.to_excel("LLM应用案例统计.xlsx", index=False)

        return top_applications

    def generate_wordcloud(self, mask=None, filename="弹幕词云图.png"):
        """Render a word cloud of all danmakus, save it to *filename*, show it.

        mask -- optional numpy image array shaping the cloud (None = rectangle).
        """
        # Tokenize the concatenated danmaku text with jieba; drop single
        # characters, which are mostly noise in Chinese word clouds.
        text = " ".join(self.all_danmakus)
        words = [word for word in jieba.cut(text) if len(word) > 1]
        words_text = " ".join(words)

        wc = WordCloud(
            font_path="simhei.ttf",  # a Chinese font file must be available
            background_color="white",
            max_words=200,
            mask=mask,
            contour_width=1,
            contour_color="steelblue"
        )
        wc.generate(words_text)

        # Display the cloud.
        plt.figure(figsize=(12, 8))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.title("B站大语言模型相关视频弹幕词云")
        plt.tight_layout(pad=0)

        wc.to_file(filename)
        # Bug fix: the original printed the literal text "(unknown)" instead of
        # interpolating the actual output filename.
        print(f"词云图已保存为 {filename}")

        plt.show()

    def analyze_sentiment(self):
        """Rough opinion analysis: share of danmakus touching each topic group.

        Counts danmakus containing at least one keyword per group (cost,
        application field, negative impact, positive impact) and returns the
        percentages as formatted strings.  Returns {} when there is no data.
        """
        topic_groups = {
            "成本相关讨论占比": ["贵", "便宜", "成本", "收费", "免费", "价格"],
            "应用领域讨论占比": ["教育", "医疗", "工作", "学习", "娱乐", "创作", "办公"],
            "不利影响讨论占比": ["失业", "取代", "错误", "偏见", "隐私", "风险", "依赖"],
            "积极影响讨论占比": ["方便", "高效", "有用", "帮助", "创新", "进步", "强大"],
        }

        total = len(self.all_danmakus)
        if total == 0:
            return {}

        result = {}
        for label, kws in topic_groups.items():
            # A danmaku counts once per group, however many keywords it matches.
            count = sum(1 for danmaku in self.all_danmakus
                        if any(kw in danmaku for kw in kws))
            result[label] = f"{count/total*100:.2f}%"

        return result

    def predict_trend(self):
        """Return a fixed list of qualitative LLM development-trend statements."""
        # These are curated editorial statements, not derived from the data.
        trends = [
            "1. 行业垂直化:大语言模型将更深入各个专业领域,如医疗、法律、教育等",
            "2. 个性化增强:模型将更加了解用户需求,提供个性化服务",
            "3. 多模态融合:文本、图像、语音等多模态能力将深度融合",
            "4. 边缘部署增加:更多模型将在边缘设备上运行,提升响应速度和隐私性",
            "5. 监管加强:随着应用广泛,相关法律法规将逐步完善",
            "6. 低代码/无代码结合:降低AI应用门槛,使更多人能使用LLM能力"
        ]
        return trends
|
|
|
|
|
|
def main():
    """Entry point: crawl or load danmaku data, then run every analysis step."""
    analyzer = BilibiliDanmakuAnalyzer()

    # Let the user choose between a fresh crawl and previously saved data.
    choice = input("请选择操作 (1: 爬取新数据, 2: 加载已有数据): ")

    if choice == "1":
        analyzer.crawl_all_danmakus()
    else:
        analyzer.load_danmakus_from_file()

    # Nothing to analyze without data.
    if not analyzer.all_danmakus:
        print("没有可用的弹幕数据,程序退出")
        return

    # Top-8 application areas mentioned in the danmakus.
    print("\n===== LLM应用案例排名前8 =====")
    top_applications = analyzer.analyze_application_cases(8)
    for i, (app, count) in enumerate(top_applications, 1):
        print(f"{i}. {app}: {count}次")

    # Word cloud, optionally shaped by a mask image if one is present.
    print("\n===== 生成词云图 =====")
    try:
        # Only the mask loading goes in the try block; the original wrapped the
        # whole rendering call in a bare `except:`, which also swallowed
        # KeyboardInterrupt and unrelated rendering errors.
        mask = np.array(Image.open("cloud_mask.png"))
    except (FileNotFoundError, OSError):
        # No usable mask image; WordCloud falls back to a rectangle for mask=None.
        mask = None
    analyzer.generate_wordcloud(mask)

    # Keyword-based opinion statistics.
    print("\n===== 用户观点分析 =====")
    sentiment = analyzer.analyze_sentiment()
    for key, value in sentiment.items():
        print(f"{key}: {value}")

    # Conclusions derived from the statistics above.
    print("\n===== 分析结论 =====")
    print("1. 从弹幕讨论来看,B站用户最关注的大语言模型应用领域是:" +
          ", ".join([app for app, _ in top_applications[:3]]))

    # sentiment is non-empty here because all_danmakus was checked above.
    if float(sentiment["积极影响讨论占比"].rstrip('%')) > float(sentiment["不利影响讨论占比"].rstrip('%')):
        print("2. 整体来看,用户对大语言模型的评价偏向积极,更多讨论其带来的便利和效率提升")
    else:
        print("2. 整体来看,用户对大语言模型存在较多担忧,主要集中在其可能带来的负面影响")

    print("3. 应用领域的讨论最为广泛,说明用户普遍关注大语言模型的实际落地场景")

    # Static trend forecast (editorial list, not data-driven).
    print("\n===== 大语言模型应用发展趋势预测 =====")
    for trend in analyzer.predict_trend():
        print(trend)
|
|
|
|
|
|
# Run the crawl/analysis pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()