"""Bilibili danmaku (bullet-comment) spider and analyzer.

Searches Bilibili for keyword-matched videos, downloads each video's
danmaku, counts LLM-application keywords, renders a word cloud, and
exports the statistics plus video metadata to an Excel workbook.
"""
import requests
import re
import json
import time
import random
import pandas as pd
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
from bs4 import BeautifulSoup
import os
from openpyxl import Workbook
# Configure matplotlib so Chinese glyphs render correctly and the minus
# sign is not replaced by a missing-glyph box when a CJK font is active.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams["axes.unicode_minus"] = False
class BilibiliSpider:
    """Crawler that searches Bilibili videos by keyword, downloads their
    danmaku (bullet comments), and analyzes/exports the results.

    State:
        danmaku_list: every danmaku string collected so far.
        video_info:   per-video dicts with keys bvid/title/play/author.
    """

    # Seconds before a hung HTTP request is abandoned; failures fall into
    # the existing per-request except handlers.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Browser-like headers; the Referer is expected by Bilibili's APIs.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "application/json, text/plain, */*",
            "Referer": "https://www.bilibili.com/"
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        self.danmaku_list = []  # all collected danmaku strings
        self.video_info = []    # dicts: bvid, title, play, author

    def search_videos(self, keyword, page=1, pages=36):
        """Search Bilibili for `keyword` over `pages` result pages starting
        at `page`. Records every hit in self.video_info and returns at most
        360 video dicts (bvid/title/play/author)."""
        print(f"开始搜索关键词: {keyword}")
        all_videos = []
        for p in range(page, page + pages):
            try:
                url = f"https://api.bilibili.com/x/web-interface/search/all/v2?keyword={keyword}&page={p}&page_size=10"
                response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
                data = response.json()
                if data.get("code") != 0:
                    print(f"搜索失败,错误代码: {data.get('code')}")
                    continue
                video_items = data.get("data", {}).get("result", [])
                for item in video_items:
                    # Only the "video" result section contains actual videos
                    # (the mixed search also returns users, articles, etc.).
                    if item.get("result_type") == "video":
                        for video in item.get("data", []):
                            record = {
                                "bvid": video.get("bvid"),
                                "title": video.get("title"),
                                "play": video.get("play"),
                                "author": video.get("author"),
                            }
                            all_videos.append(record)
                            self.video_info.append(dict(record))
                print(f"已获取第{p}页视频,累计{len(all_videos)}")
                time.sleep(random.uniform(1, 3))  # polite crawl delay
            except Exception as e:
                print(f"搜索视频出错: {e}")
                time.sleep(5)  # back off after a failure
        return all_videos[:360]  # hard cap on videos processed downstream

    def get_cid(self, bvid):
        """Resolve a video's bvid to its cid (the id of its danmaku stream).

        Returns the cid, or None on any API or network failure."""
        try:
            url = f"https://api.bilibili.com/x/web-interface/view?bvid={bvid}"
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            data = response.json()
            if data.get("code") == 0:
                return data.get("data", {}).get("cid")
            print(f"获取cid失败bvid: {bvid},错误代码: {data.get('code')}")
            return None
        except Exception as e:
            print(f"获取cid出错: {e}")
            return None

    def get_danmaku(self, cid):
        """Download the danmaku XML for `cid` and return the comment texts.

        Returns [] for a falsy cid or on any failure."""
        if not cid:
            return []
        try:
            url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            response.encoding = "utf-8"  # force correct decoding of the XML
            soup = BeautifulSoup(response.text, "lxml")
            # Each <d> element carries one danmaku's text content.
            danmakus = soup.find_all("d")
            result = [danmaku.text.strip() for danmaku in danmakus]
            print(f"成功获取{len(result)}条弹幕")
            return result
        except Exception as e:
            print(f"获取弹幕出错: {e}")
            return []

    def crawl_keyword(self, keyword):
        """Search `keyword`, then fetch danmaku for each resulting video,
        accumulating everything into self.danmaku_list."""
        videos = self.search_videos(keyword)
        for i, video in enumerate(videos):
            print(f"正在处理第{i+1}/{len(videos)}个视频: {video['title']}")
            cid = self.get_cid(video["bvid"])
            if cid:
                self.danmaku_list.extend(self.get_danmaku(cid))
            if (i + 1) % 5 == 0:
                time.sleep(random.uniform(3, 5))  # longer pause every 5 videos
        print(f"关键词[{keyword}]爬取完成,累计获取{len(self.danmaku_list)}条弹幕")

    def save_danmaku(self, filename="danmaku.txt"):
        """Write all collected danmaku to `filename`, one per line (UTF-8)."""
        with open(filename, "w", encoding="utf-8") as f:
            for danmaku in self.danmaku_list:
                f.write(danmaku + "\n")
        # Fixed: the message had lost its {filename} interpolation.
        print(f"弹幕已保存到{filename}")

    def load_danmaku(self, filename="danmaku.txt"):
        """Replace self.danmaku_list with the non-blank lines of `filename`.
        Silently does nothing when the file does not exist."""
        if os.path.exists(filename):
            with open(filename, "r", encoding="utf-8") as f:
                self.danmaku_list = [line.strip() for line in f if line.strip()]
            # Fixed: the message had lost its {filename} interpolation.
            print(f"{filename}加载了{len(self.danmaku_list)}条弹幕")

    def analyze_danmaku(self, top_n=8):
        """Count LLM-application keywords across the danmaku and return the
        top `top_n` as (keyword, count) tuples, highest count first.

        Returns [] (with a message) when there is no danmaku data, so the
        downstream Excel writer never crashes on missing input."""
        if not self.danmaku_list:
            print("没有弹幕数据可分析,返回空列表")
            return []
        application_keywords = [
            "聊天机器人", "智能客服", "内容创作", "代码生成",
            "翻译", "教育", "医疗", "法律", "金融分析",
            "图像生成", "语音识别", "自动驾驶", "数据分析",
            "游戏", "推荐系统", "搜索引擎"
        ]
        # Each danmaku contributes at most 1 to a keyword (membership test).
        application_counts = {kw: 0 for kw in application_keywords}
        for danmaku in self.danmaku_list:
            for kw in application_keywords:
                if kw in danmaku:
                    application_counts[kw] += 1
        sorted_applications = sorted(application_counts.items(), key=lambda x: x[1], reverse=True)
        top_applications = sorted_applications[:top_n]
        print(f"出现频率最高的{top_n}项LLM应用:")
        for i, (app, count) in enumerate(top_applications, 1):
            print(f"{i}. {app}: {count}")
        # Even if every top entry is zero, still return the list so Excel
        # export has rows to write; just warn the user.
        if all(count == 0 for _, count in top_applications):
            print("未匹配到任何LLM应用关键词Excel将写入空数据")
        return top_applications

    def generate_wordcloud(self, filename="wordcloud.png"):
        """Render a word-cloud PNG of the collected danmaku to `filename`."""
        if not self.danmaku_list:
            print("没有弹幕数据可生成词云")
            return
        text = " ".join(self.danmaku_list)
        # Segment Chinese text; drop single-character tokens as noise.
        words = [word for word in jieba.cut(text) if len(word) > 1]
        wc = WordCloud(
            font_path="simhei.ttf",  # assumes SimHei is present locally — TODO confirm
            background_color="white",
            width=1200,
            height=800,
            max_words=200,
            collocations=False  # avoid duplicated bigrams in the cloud
        ).generate(" ".join(words))
        plt.figure(figsize=(12, 8))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.savefig(filename, dpi=300, bbox_inches="tight")
        plt.show()
        # Fixed: the message had lost its {filename} interpolation.
        print(f"词云图已保存到{filename}")

    def save_to_excel(self, top_applications, filename="llm_analysis.xlsx"):
        """Write keyword statistics and video metadata to an Excel workbook.

        Never raises: every failure is caught, reported, and swallowed so the
        caller's pipeline keeps going."""
        try:
            wb = Workbook()
            # Sheet 1: application keyword statistics (placeholder row if empty).
            ws_apps = wb.active
            ws_apps.title = "应用案例统计"
            ws_apps.append(["排名", "应用案例", "出现次数"])
            if top_applications:
                for i, (app, count) in enumerate(top_applications, 1):
                    ws_apps.append([i, app, count])
            else:
                ws_apps.append([1, "无匹配数据", 0])
            # Sheet 2: crawled video metadata (placeholder row if empty).
            ws_videos = wb.create_sheet(title="视频信息")
            ws_videos.append(["序号", "视频标题", "播放量", "作者", "BV号"])
            if self.video_info:
                for i, video in enumerate(self.video_info[:360], 1):
                    ws_videos.append([i, video["title"], video["play"], video["author"], video["bvid"]])
            else:
                ws_videos.append([1, "无视频数据", 0, "", ""])
            wb.save(filename)
            print(f"✅ Excel数据已成功保存到{os.path.abspath(filename)}")
        except Exception as e:
            print(f"❌ 保存Excel失败{e}")
            print("建议1. 关闭已打开的同名Excel文件 2. 检查当前目录是否有写入权限")
def main():
    """Entry point: reuse or crawl danmaku data, then analyze, render the
    word cloud, and export everything to Excel."""
    spider = BilibiliSpider()

    def _crawl_all():
        # Crawl every keyword with a pause between them, then persist the
        # danmaku to disk. (Was duplicated verbatim in two branches.)
        for keyword in ["大语言模型", "大模型", "LLM"]:
            spider.crawl_keyword(keyword)
            time.sleep(random.uniform(5, 10))
        spider.save_danmaku()

    if os.path.exists("danmaku.txt"):
        choice = input("发现已存在的弹幕数据,是否直接使用? (y/n): ")
        if choice.lower() == "y":
            spider.load_danmaku()
        else:
            _crawl_all()
    else:
        _crawl_all()

    # analyze_danmaku returns [] when no data matched; saving still runs.
    top_applications = spider.analyze_danmaku(top_n=8)
    spider.generate_wordcloud()
    # save_to_excel handles its own errors; this is a last-resort guard so
    # main() always reaches the final status line.
    try:
        spider.save_to_excel(top_applications)
    except Exception as e:
        print(f"Excel保存最终失败{e}")
    print("所有任务执行完毕(含异常处理)!")


if __name__ == "__main__":
    main()