|
|
import requests
|
|
|
import re
|
|
|
import json
|
|
|
import time
|
|
|
import random
|
|
|
import pandas as pd
|
|
|
import jieba
|
|
|
from wordcloud import WordCloud
|
|
|
import matplotlib.pyplot as plt
|
|
|
from collections import Counter
|
|
|
from bs4 import BeautifulSoup
|
|
|
import os
|
|
|
from openpyxl import Workbook
|
|
|
|
|
|
# Configure matplotlib for Chinese (CJK) text rendering: try these font
# families in order until one is available on the host system.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
# CJK fonts often lack the Unicode minus glyph; fall back to ASCII '-'.
plt.rcParams["axes.unicode_minus"] = False
|
|
|
|
|
|
class BilibiliSpider:
    """Crawler for Bilibili search results and video danmaku (bullet comments).

    Typical workflow: ``crawl_keyword()`` (search -> cid -> danmaku), then
    ``analyze_danmaku()``, ``generate_wordcloud()`` and ``save_to_excel()``.
    Collected danmaku accumulate in ``self.danmaku_list``; video metadata
    accumulates in ``self.video_info``.
    """

    def __init__(self):
        # Browser-like headers; the Referer header is required by Bilibili's APIs.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "application/json, text/plain, */*",
            "Referer": "https://www.bilibili.com/"
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        self.danmaku_list = []  # every danmaku string collected so far
        self.video_info = []  # one metadata dict per video found via search

    def search_videos(self, keyword, page=1, pages=36):
        """Search Bilibili for ``keyword`` over ``pages`` result pages.

        Returns up to 360 dicts with keys ``bvid``/``title``/``play``/``author``
        and mirrors each entry into ``self.video_info``. Failed pages are
        skipped after a short back-off.
        """
        print(f"开始搜索关键词: {keyword}")
        all_videos = []
        for p in range(page, page + pages):
            try:
                url = (
                    "https://api.bilibili.com/x/web-interface/search/all/v2"
                    f"?keyword={keyword}&page={p}&page_size=10"
                )
                # Fix: add a timeout so one stalled request cannot hang the crawl.
                response = self.session.get(url, timeout=10)
                data = response.json()
                if data.get("code") != 0:
                    print(f"搜索失败,错误代码: {data.get('code')}")
                    continue
                video_items = data.get("data", {}).get("result", [])
                for item in video_items:
                    # The "all" search mixes result types; keep only videos.
                    if item.get("result_type") == "video":
                        for video in item.get("data", []):
                            record = {
                                "bvid": video.get("bvid"),
                                "title": video.get("title"),
                                "play": video.get("play"),
                                "author": video.get("author"),
                            }
                            all_videos.append(record)
                            self.video_info.append(dict(record))
                print(f"已获取第{p}页视频,累计{len(all_videos)}个")
                # Random pause between pages to avoid triggering rate limits.
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                print(f"搜索视频出错: {e}")
                time.sleep(5)  # back off longer after a failure
        return all_videos[:360]

    def get_cid(self, bvid):
        """Return the cid (danmaku pool id) for video ``bvid``, or None on failure."""
        try:
            url = f"https://api.bilibili.com/x/web-interface/view?bvid={bvid}"
            # Fix: add a timeout so the lookup cannot block indefinitely.
            response = self.session.get(url, timeout=10)
            data = response.json()
            if data.get("code") == 0:
                return data.get("data", {}).get("cid")
            print(f"获取cid失败,bvid: {bvid},错误代码: {data.get('code')}")
            return None
        except Exception as e:
            print(f"获取cid出错: {e}")
            return None

    def get_danmaku(self, cid):
        """Fetch the danmaku XML feed for ``cid`` and return the comment texts.

        Returns [] for a falsy cid or on any request/parse error.
        """
        if not cid:
            return []
        try:
            url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
            # Fix: add a timeout so a stalled download cannot hang the crawl.
            response = self.session.get(url, timeout=10)
            response.encoding = "utf-8"
            # NOTE(review): "lxml" requires the third-party lxml package;
            # each danmaku lives in a <d> element of the XML feed.
            soup = BeautifulSoup(response.text, "lxml")
            result = [d.text.strip() for d in soup.find_all("d")]
            print(f"成功获取{len(result)}条弹幕")
            return result
        except Exception as e:
            print(f"获取弹幕出错: {e}")
            return []

    def crawl_keyword(self, keyword):
        """Search ``keyword`` and collect danmaku from every result video."""
        videos = self.search_videos(keyword)
        for i, video in enumerate(videos):
            print(f"正在处理第{i+1}/{len(videos)}个视频: {video['title']}")
            cid = self.get_cid(video["bvid"])
            if cid:
                self.danmaku_list.extend(self.get_danmaku(cid))
            # Extra pause every 5 videos to stay under rate limits.
            if (i + 1) % 5 == 0:
                time.sleep(random.uniform(3, 5))
        print(f"关键词[{keyword}]爬取完成,累计获取{len(self.danmaku_list)}条弹幕")

    def save_danmaku(self, filename="danmaku.txt"):
        """Write all collected danmaku to ``filename``, one per line (UTF-8)."""
        with open(filename, "w", encoding="utf-8") as f:
            for danmaku in self.danmaku_list:
                f.write(danmaku + "\n")
        # Fix: the f-string had no placeholder and printed the literal
        # "(unknown)" instead of the actual file name.
        print(f"弹幕已保存到{filename}")

    def load_danmaku(self, filename="danmaku.txt"):
        """Load previously saved danmaku from ``filename`` if it exists."""
        if os.path.exists(filename):
            with open(filename, "r", encoding="utf-8") as f:
                self.danmaku_list = [line.strip() for line in f if line.strip()]
            # Fix: placeholder-less f-string printed "(unknown)" for the path.
            print(f"从{filename}加载了{len(self.danmaku_list)}条弹幕")

    def analyze_danmaku(self, top_n=8):
        """Count LLM-application keyword hits across danmaku.

        Returns the ``top_n`` (keyword, count) pairs sorted by descending
        count, or an empty list when there is no danmaku data (the explicit
        empty return keeps the later Excel export from failing).
        """
        if not self.danmaku_list:
            print("没有弹幕数据可分析,返回空列表")
            return []
        application_keywords = [
            "聊天机器人", "智能客服", "内容创作", "代码生成",
            "翻译", "教育", "医疗", "法律", "金融分析",
            "图像生成", "语音识别", "自动驾驶", "数据分析",
            "游戏", "推荐系统", "搜索引擎"
        ]
        application_counts = {kw: 0 for kw in application_keywords}
        # Each danmaku contributes at most one hit per keyword (substring match).
        for danmaku in self.danmaku_list:
            for kw in application_keywords:
                if kw in danmaku:
                    application_counts[kw] += 1
        sorted_applications = sorted(application_counts.items(), key=lambda x: x[1], reverse=True)
        top_applications = sorted_applications[:top_n]
        print(f"出现频率最高的{top_n}项LLM应用:")
        for i, (app, count) in enumerate(top_applications, 1):
            print(f"{i}. {app}: {count}次")
        # Warn when nothing matched, but still return the (all-zero) list so the
        # Excel export has rows to write.
        if all(count == 0 for _, count in top_applications):
            print("未匹配到任何LLM应用关键词,Excel将写入空数据")
        return top_applications

    def generate_wordcloud(self, filename="wordcloud.png"):
        """Segment all danmaku with jieba and render a word-cloud PNG."""
        if not self.danmaku_list:
            print("没有弹幕数据可生成词云")
            return
        text = " ".join(self.danmaku_list)
        # Drop single-character tokens (mostly particles/punctuation).
        words = [word for word in jieba.cut(text) if len(word) > 1]
        words_text = " ".join(words)
        wc = WordCloud(
            font_path="simhei.ttf",  # assumes SimHei font file is reachable — TODO confirm
            background_color="white",
            width=1200,
            height=800,
            max_words=200,
            collocations=False  # avoid duplicated bigrams in the cloud
        ).generate(words_text)
        plt.figure(figsize=(12, 8))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.savefig(filename, dpi=300, bbox_inches="tight")
        plt.show()
        # Fix: placeholder-less f-string printed "(unknown)" for the path.
        print(f"词云图已保存到{filename}")

    def save_to_excel(self, top_applications, filename="llm_analysis.xlsx"):
        """Export keyword stats and video metadata to an Excel workbook.

        Writes placeholder rows when either data set is empty, and catches all
        save errors so a locked/readonly file does not crash the pipeline.
        """
        try:
            wb = Workbook()
            # Sheet 1: application keyword statistics (placeholder row if empty).
            ws_apps = wb.active
            ws_apps.title = "应用案例统计"
            ws_apps.append(["排名", "应用案例", "出现次数"])
            if top_applications:
                for i, (app, count) in enumerate(top_applications, 1):
                    ws_apps.append([i, app, count])
            else:
                ws_apps.append([1, "无匹配数据", 0])

            # Sheet 2: video metadata (placeholder row if empty).
            ws_videos = wb.create_sheet(title="视频信息")
            ws_videos.append(["序号", "视频标题", "播放量", "作者", "BV号"])
            if self.video_info:
                for i, video in enumerate(self.video_info[:360], 1):
                    ws_videos.append([i, video["title"], video["play"], video["author"], video["bvid"]])
            else:
                ws_videos.append([1, "无视频数据", 0, "无", "无"])

            wb.save(filename)
            print(f"✅ Excel数据已成功保存到{os.path.abspath(filename)}")
        except Exception as e:
            # Most common causes: the file is open in Excel, or the directory
            # is not writable.
            print(f"❌ 保存Excel失败:{e}")
            print("建议:1. 关闭已打开的同名Excel文件 2. 检查当前目录是否有写入权限")
|
|
|
|
|
|
|
|
|
def main():
    """Entry point: collect danmaku (cached or freshly crawled), then analyze,
    render the word cloud, and export results to Excel."""
    spider = BilibiliSpider()

    def _crawl_all_keywords():
        # One-line purpose: crawl every target keyword, then persist the danmaku.
        # (Fix: this sequence was duplicated verbatim in both branches below.)
        keywords = ["大语言模型", "大模型", "LLM"]
        for keyword in keywords:
            spider.crawl_keyword(keyword)
            # Pause between keywords to stay under rate limits.
            time.sleep(random.uniform(5, 10))
        spider.save_danmaku()

    if os.path.exists("danmaku.txt"):
        choice = input("发现已存在的弹幕数据,是否直接使用? (y/n): ")
        if choice.lower() == "y":
            spider.load_danmaku()
        else:
            _crawl_all_keywords()
    else:
        _crawl_all_keywords()

    # Analyze danmaku (an empty result list still flows into the Excel export).
    top_applications = spider.analyze_danmaku(top_n=8)

    # Render the word cloud (no-op when there is no danmaku data).
    spider.generate_wordcloud()

    # Excel export; save_to_excel handles its own errors, this guard keeps
    # main alive even on unexpected failures.
    try:
        spider.save_to_excel(top_applications)
    except Exception as e:
        print(f"Excel保存最终失败:{e}")

    print("所有任务执行完毕(含异常处理)!")
|
|
|
|
|
|
|
|
|
# Run the full crawl/analysis pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()