ADD file via upload

main
fzu102301541 4 months ago
parent b24d25954b
commit 6aa77974fd

@ -0,0 +1,405 @@
import requests
import re
import json
import time
import random
import pandas as pd
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
from bs4 import BeautifulSoup
import os
from openpyxl import Workbook
import numpy as np
from PIL import Image
# Configure matplotlib for Chinese text: list candidate CJK-capable font
# families and disable the Unicode minus glyph for axis labels.
plt.rcParams.update({
    "font.family": ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"],
    "axes.unicode_minus": False,
})
class BilibiliSpider:
    """Collect danmaku (bullet comments) from Bilibili videos and analyse them.

    Typical flow: ``crawl_keyword`` searches for videos and downloads their
    danmaku, ``save_data`` / ``load_data`` persist the raw data, and
    ``analyze_danmaku``, ``generate_analysis_report``, ``generate_wordcloud``
    and ``save_to_excel`` turn the collected text into statistics and files.

    Attributes:
        headers: HTTP headers sent with every request.
        session: Shared ``requests.Session`` carrying ``headers``.
        danmaku_list: Every danmaku string collected so far.
        video_info: One dict (``bvid``/``title``/``play``/``author``) per
            distinct video found by ``search_videos``.
    """

    # Environment variable consulted for the login cookie when the caller
    # does not pass one explicitly.
    COOKIE_ENV_VAR = "BILIBILI_COOKIE"
    SEARCH_URL = "https://api.bilibili.com/x/web-interface/search/type"
    VIEW_URL = "https://api.bilibili.com/x/web-interface/view"

    def __init__(self, cookie=None):
        """Create the HTTP session and empty result containers.

        Args:
            cookie: Optional raw ``Cookie`` header value. When omitted, the
                value of the ``BILIBILI_COOKIE`` environment variable is used.
                The cookie is a login credential, so it is deliberately not
                stored in the source file.
        """
        # Browser-like headers so requests resemble ordinary web traffic.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Referer": "https://www.bilibili.com/",
            "Origin": "https://www.bilibili.com",
        }
        cookie = cookie or os.environ.get(self.COOKIE_ENV_VAR, "")
        if cookie:
            self.headers["Cookie"] = cookie
        else:
            print(f"未设置环境变量{self.COOKIE_ENV_VAR},将以未登录状态发送请求")
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        self.danmaku_list = []  # all collected danmaku strings
        self.video_info = []  # metadata of every distinct video found
        # bvids whose danmaku were already downloaded, so a video matched by
        # several keywords is fetched and counted only once.
        self._crawled_bvids = set()

    def search_videos(self, keyword, page=1, pages=2):
        """Search videos matching ``keyword`` and return the distinct hits.

        Requests ``pages`` consecutive result pages starting at ``page``,
        30 results per page. Each video not yet present in
        ``self.video_info`` is also appended there.

        Args:
            keyword: Search term.
            page: First result page to request.
            pages: Number of consecutive pages to request.

        Returns:
            A list of dicts with keys ``bvid``, ``title``, ``play`` and
            ``author``, without duplicate ``bvid`` values.
        """
        print(f"开始搜索关键词: {keyword}")
        all_videos = []
        seen = set()  # bvids already returned for this keyword
        recorded = {v.get("bvid") for v in self.video_info}
        for p in range(page, page + pages):
            try:
                # Let requests URL-encode the query instead of interpolating
                # the raw keyword into the URL.
                response = self.session.get(
                    self.SEARCH_URL,
                    params={
                        "keyword": keyword,
                        "page": p,
                        "page_size": 30,
                        "search_type": "video",
                    },
                    timeout=15,
                )
                response.raise_for_status()
                data = json.loads(response.text)
                if data.get("code") != 0:
                    print(f"搜索失败,错误代码: {data.get('code')},消息: {data.get('message')}")
                    continue
                # "data" or "result" may be present but null; treat as empty.
                video_items = (data.get("data") or {}).get("result") or []
                if not video_items:
                    print(f"{p}页未找到视频数据")
                    continue
                for video in video_items:
                    bvid = video.get("bvid")
                    if not bvid or bvid in seen:
                        continue
                    seen.add(bvid)
                    entry = {
                        "bvid": bvid,
                        "title": video.get("title", "无标题"),
                        "play": video.get("play", "0"),
                        "author": video.get("author", "未知作者"),
                    }
                    all_videos.append(entry)
                    if bvid not in recorded:
                        recorded.add(bvid)
                        self.video_info.append(dict(entry))
                print(f"已获取第{p}页视频,累计{len(all_videos)}")
                time.sleep(random.uniform(2, 4))
            except Exception as e:
                # Best effort: one failing page must not abort the crawl.
                print(f"搜索视频出错: {str(e)}")
                time.sleep(5)
        return all_videos

    def get_cid(self, bvid):
        """Look up the ``cid`` of the video identified by ``bvid``.

        Returns:
            The ``cid`` value from the view API response, or ``None`` when
            the request fails, the API reports an error, or no cid is present.
        """
        try:
            response = self.session.get(
                self.VIEW_URL, params={"bvid": bvid}, timeout=10
            )
            response.raise_for_status()
            data = json.loads(response.text)
            if data.get("code") != 0:
                print(f"获取cid失败bvid: {bvid},错误: {data.get('message')}")
                return None
            cid = (data.get("data") or {}).get("cid")
            if not cid:
                print(f"bvid={bvid}未找到cid")
                return None
            print(f"成功获取bvid={bvid}的cid: {cid}")
            return cid
        except Exception as e:
            # Best effort: report and let the caller skip this video.
            print(f"获取cid出错(bvid={bvid}): {str(e)}")
            return None

    def get_danmaku(self, cid):
        """Download the danmaku of the video part identified by ``cid``.

        Returns:
            A list of non-empty, stripped danmaku strings taken from the
            ``<d>`` elements of the XML response; an empty list when ``cid``
            is falsy or the download/parsing fails.
        """
        if not cid:
            return []
        try:
            url = f"https://comment.bilibili.com/{cid}.xml"
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, "lxml-xml")
            texts = (node.text.strip() for node in soup.find_all("d"))
            result = [text for text in texts if text]
            # Separator added so the cid and the count no longer run together.
            print(f"成功获取cid={cid}的{len(result)}条弹幕")
            return result
        except Exception as e:
            print(f"获取弹幕出错(cid={cid}): {str(e)}")
            return []

    def crawl_keyword(self, keyword):
        """Search ``keyword`` and collect the danmaku of every video found.

        Danmaku are appended to ``self.danmaku_list``. Videos whose danmaku
        were already downloaded for an earlier keyword are skipped so they
        are not counted twice.
        """
        videos = self.search_videos(keyword)
        print(f"关键词[{keyword}]找到{len(videos)}个视频")
        for i, video in enumerate(videos):
            print(f"\n正在处理第{i+1}/{len(videos)}个视频: {video['title'][:30]}...")
            bvid = video["bvid"]
            if bvid in self._crawled_bvids:
                print(f"bvid={bvid}的弹幕已采集,跳过")
                continue
            cid = self.get_cid(bvid)
            if cid:
                self._crawled_bvids.add(bvid)
                danmakus = self.get_danmaku(cid)
                self.danmaku_list.extend(danmakus)
                print(f"当前累计弹幕数: {len(self.danmaku_list)}")
            # Pause after every third video to lower the request rate.
            if (i + 1) % 3 == 0:
                sleep_time = random.uniform(3, 6)
                print(f"已处理{i+1}个视频,休息{sleep_time:.2f}")
                time.sleep(sleep_time)
        print(f"关键词[{keyword}]爬取完成,累计获取{len(self.danmaku_list)}条弹幕")

    def save_data(self, danmaku_filename="danmaku.txt", video_filename="video_info.json"):
        """Write the danmaku (one per line) and the video metadata (JSON) to disk."""
        with open(danmaku_filename, "w", encoding="utf-8") as f:
            f.writelines(danmaku + "\n" for danmaku in self.danmaku_list)
        print(f"弹幕已保存到{danmaku_filename}(共{len(self.danmaku_list)}条)")
        with open(video_filename, "w", encoding="utf-8") as f:
            json.dump(self.video_info, f, ensure_ascii=False, indent=2)
        print(f"视频信息已保存到{video_filename}(共{len(self.video_info)}条)")

    def load_data(self, danmaku_filename="danmaku.txt", video_filename="video_info.json"):
        """Load previously saved danmaku and video metadata, if the files exist.

        A missing file leaves the corresponding attribute unchanged.
        """
        if os.path.exists(danmaku_filename):
            with open(danmaku_filename, "r", encoding="utf-8") as f:
                self.danmaku_list = [line.strip() for line in f if line.strip()]
            print(f"{danmaku_filename}加载了{len(self.danmaku_list)}条弹幕")
        if os.path.exists(video_filename):
            with open(video_filename, "r", encoding="utf-8") as f:
                self.video_info = json.load(f)
            print(f"{video_filename}加载了{len(self.video_info)}条视频信息")

    def analyze_danmaku(self, top_n=8):
        """Count how many danmaku mention each application keyword.

        A danmaku contributes at most one to a keyword's count, however
        often the keyword occurs in it.

        Args:
            top_n: Maximum number of keywords to return.

        Returns:
            Up to ``top_n`` ``(keyword, count)`` tuples with a non-zero
            count, sorted by count in descending order; an empty list when
            no danmaku have been collected.
        """
        if not self.danmaku_list:
            print("没有弹幕数据可分析,返回空列表")
            return []
        application_keywords = [
            "聊天机器人", "智能客服", "内容创作", "代码生成", "编程助手",
            "翻译", "教育", "医疗", "法律", "金融分析", "金融",
            "图像生成", "语音识别", "自动驾驶", "数据分析", "数据",
            "游戏", "推荐系统", "搜索引擎", "搜索", "写作",
            "成本", "价格", "便宜", "昂贵", "免费",
            "就业", "工作", "失业", "替代", "岗位",
            "安全", "隐私", "风险", "泄露", "道德",
            "学习", "学生", "老师", "学校",
            "企业", "商业", "公司", "盈利", "赚钱",
        ]
        # dict.fromkeys collapses any repeated keyword; iterating the dict
        # (not the list) guarantees each keyword is counted once per danmaku.
        application_counts = dict.fromkeys(application_keywords, 0)
        for danmaku in self.danmaku_list:
            for kw in application_counts:
                if kw in danmaku:
                    application_counts[kw] += 1
        mentioned = [(kw, count) for kw, count in application_counts.items() if count > 0]
        # sorted() is stable, so ties keep the keyword-list order.
        mentioned.sort(key=lambda item: item[1], reverse=True)
        top_applications = mentioned[:top_n]
        print(f"\n出现频率最高的{top_n}项LLM应用相关关键词:")
        for i, (app, count) in enumerate(top_applications, 1):
            print(f"{i}. {app}: {count}")
        return top_applications

    def generate_wordcloud(self, filename="wordcloud.png", font_path="simhei.ttf"):
        """Render a word cloud of the collected danmaku and save it as an image.

        Args:
            filename: Output image path.
            font_path: Font file handed to ``WordCloud``; it must contain
                the glyphs of the danmaku text.
        """
        if not self.danmaku_list:
            print("没有弹幕数据可生成词云")
            return
        text = " ".join(self.danmaku_list)
        # Words of length <= 1 are dropped by the length filter, so only
        # multi-character stop words need to be listed.
        stop_words = {'一个', '没有', '自己'}
        words = [
            word for word in jieba.cut(text)
            if len(word) > 1 and word not in stop_words
        ]
        if not words:
            # Nothing survived filtering; there is nothing to draw.
            print("没有弹幕数据可生成词云")
            return
        words_text = " ".join(words)
        wc = WordCloud(
            font_path=font_path,
            background_color="white",
            width=1600,
            height=1200,
            max_words=300,
            collocations=False,
            margin=2,
            random_state=42,
            colormap="viridis",
        ).generate(words_text)
        plt.figure(figsize=(16, 12))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.title("B站大语言模型相关视频弹幕词云", fontsize=20, pad=20)
        plt.tight_layout(pad=0)
        plt.savefig(filename, dpi=300, bbox_inches="tight", facecolor='white')
        plt.show()
        plt.close()  # release the figure once it has been saved and shown
        print(f"词云图已保存到{filename}")

    def save_to_excel(self, top_applications, filename="llm_analysis.xlsx"):
        """Write keyword statistics, video metadata and conclusions to Excel.

        The workbook gets three sheets: keyword statistics, video
        information and a fixed table of conclusions. Failures are reported
        on stdout instead of being raised.

        Args:
            top_applications: ``(keyword, count)`` tuples as returned by
                ``analyze_danmaku``; may be empty.
            filename: Output workbook path.
        """
        try:
            with pd.ExcelWriter(filename, engine='openpyxl') as writer:
                # Sheet 1: keyword statistics.
                if top_applications:
                    df_apps = pd.DataFrame(top_applications, columns=["应用案例", "出现次数"])
                    df_apps["排名"] = range(1, len(df_apps) + 1)
                    df_apps = df_apps[["排名", "应用案例", "出现次数"]]
                else:
                    # Placeholder row, same column order as the populated sheet.
                    df_apps = pd.DataFrame(
                        [[1, "无数据", 0]],
                        columns=["排名", "应用案例", "出现次数"],
                    )
                df_apps.to_excel(writer, sheet_name="应用案例统计", index=False)

                # Sheet 2: video information, one row per distinct bvid.
                if self.video_info:
                    unique_videos = []
                    seen_bvids = set()
                    for video in self.video_info:
                        if video["bvid"] not in seen_bvids:
                            seen_bvids.add(video["bvid"])
                            unique_videos.append(video)
                    df_videos = pd.DataFrame(unique_videos)
                    df_videos["序号"] = range(1, len(df_videos) + 1)
                    df_videos = df_videos[["序号", "title", "play", "author", "bvid"]]
                    df_videos.columns = ["序号", "视频标题", "播放量", "作者", "BV号"]
                else:
                    df_videos = pd.DataFrame(
                        [[1, "无数据", 0, "", ""]],
                        columns=["序号", "视频标题", "播放量", "作者", "BV号"],
                    )
                df_videos.to_excel(writer, sheet_name="视频信息", index=False)

                # Sheet 3: fixed conclusions table (header row first).
                conclusions = [
                    ["分析维度", "主要发现", "用户观点倾向"],
                    ["应用成本", "多数用户关注使用成本,提及'免费''便宜'较多", "希望降低使用门槛"],
                    ["应用领域", "教育、编程、内容创作是最受关注的领域", "积极看待技术应用"],
                    ["就业影响", "对就业替代效应存在担忧", "既有期待也有忧虑"],
                    ["技术成熟度", "普遍认为技术还有提升空间", "理性看待技术发展"],
                    ["数据安全", "对隐私和安全问题关注度较高", "期待规范发展"],
                ]
                df_conclusions = pd.DataFrame(conclusions[1:], columns=conclusions[0])
                df_conclusions.to_excel(writer, sheet_name="数据分析结论", index=False)
            print(f"✅ Excel数据已保存到{os.path.abspath(filename)}")
            print("✅ 包含工作表:应用案例统计、视频信息、数据分析结论")
        except Exception as e:
            # Deliberately broad: the export is the last step, so report the
            # problem with a hint rather than crash the whole run.
            print(f"❌ 保存Excel失败{str(e)}")
            print("建议1. 关闭已打开的同名Excel文件 2. 检查目录写入权限")

    def generate_analysis_report(self):
        """Print a console report: totals, keyword-based sentiment, topics.

        Sentiment and topic figures count danmaku containing at least one
        word of the respective word list (substring match).
        """
        if not self.danmaku_list:
            print("没有数据可分析")
            return
        print("\n" + "=" * 50)
        print(" 大语言模型B站用户观点分析报告")
        print("=" * 50)

        total_danmaku = len(self.danmaku_list)
        total_videos = len({v["bvid"] for v in self.video_info})
        print("\n📊 数据概况:")
        print(f" - 分析视频数量: {total_videos}")
        print(f" - 采集弹幕数量: {total_danmaku}")

        # The word lists must not contain empty strings: "" is a substring
        # of every danmaku and would make every comment match.
        positive_words = ["厉害", "强大", "方便", "实用", "惊喜", "期待", "进步"]
        negative_words = ["不好", "垃圾", "危险", "担心", "失业", "泄露"]
        positive_count = sum(
            1 for danmaku in self.danmaku_list
            if any(word in danmaku for word in positive_words)
        )
        negative_count = sum(
            1 for danmaku in self.danmaku_list
            if any(word in danmaku for word in negative_words)
        )
        print("\n😊 情感倾向分析:")
        print(f" - 积极评价: {positive_count}条 ({positive_count/total_danmaku*100:.1f}%)")
        print(f" - 消极评价: {negative_count}条 ({negative_count/total_danmaku*100:.1f}%)")

        print("\n🔥 热门话题:")
        topics = {
            "教育学习": ["学习", "教育", "学生", "老师", "学校", "考试"],
            "工作就业": ["工作", "就业", "失业", "岗位", "替代", "职业"],
            "技术应用": ["编程", "代码", "写作", "翻译", "创作", "设计"],
            "商业价值": ["赚钱", "商业", "企业", "盈利", "成本", "价格"],
            "安全伦理": ["安全", "隐私", "道德", "风险", "泄露", "监管"],
        }
        for topic, keywords in topics.items():
            count = sum(
                1 for danmaku in self.danmaku_list
                if any(keyword in danmaku for keyword in keywords)
            )
            if count > 0:
                print(f" - {topic}: {count}次提及")
def main():
    """Run the whole pipeline: obtain data, analyse it, write the outputs.

    If both cached data files exist the user is asked whether to reuse
    them; otherwise (or when the user declines) the three LLM-related
    keywords are crawled and the results saved. Afterwards the keyword
    statistics, console report, word cloud and Excel workbook are produced.
    """
    divider = "=" * 50
    crawler = BilibiliSpider()

    # Offer to reuse cached data only when both files are present.
    reuse_cached = False
    cached_files = ("danmaku.txt", "video_info.json")
    if all(os.path.exists(path) for path in cached_files):
        answer = input("发现已存在的弹幕和视频数据,是否直接使用? (y/n): ")
        if answer.lower() == "y":
            crawler.load_data()
            reuse_cached = True

    if not reuse_cached:
        print("开始爬取B站大语言模型相关视频...")
        for term in ("大语言模型", "大模型", "LLM"):
            print(f"\n{divider}")
            print(f"正在爬取关键词: {term}")
            print(divider)
            crawler.crawl_keyword(term)
            # Longer pause between keywords than between individual videos.
            time.sleep(random.uniform(8, 12))
        crawler.save_data()

    # Analysis and output generation.
    print(f"\n{divider}")
    print("开始数据分析...")
    print(divider)
    ranked_keywords = crawler.analyze_danmaku(top_n=8)
    crawler.generate_analysis_report()
    crawler.generate_wordcloud("llm_wordcloud.png")
    crawler.save_to_excel(ranked_keywords, "llm_bilibili_analysis.xlsx")

    summary_lines = (
        "\n🎉 所有任务执行完毕!",
        "📁 生成的文件:",
        " - llm_bilibili_analysis.xlsx (数据分析表格)",
        " - llm_wordcloud.png (词云图)",
        " - danmaku.txt (原始弹幕数据)",
        " - video_info.json (视频信息)",
    )
    for line in summary_lines:
        print(line)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Loading…
Cancel
Save