|
|
|
|
@ -0,0 +1,69 @@
|
|
|
|
|
// src/main/java/com/llm/analysis/BiliBiliCrawler.java
|
|
|
|
|
package com.llm.analysis;
|
|
|
|
|
|
|
|
|
|
import org.jsoup.Jsoup;
|
|
|
|
|
import org.jsoup.nodes.Document;
|
|
|
|
|
import com.google.gson.JsonParser;
|
|
|
|
|
import com.google.gson.JsonObject;
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
|
|
|
|
|
public class BiliBiliCrawler {
|
|
|
|
|
|
|
|
|
|
private static final String DM_BASE_URL = "https://comment.bilibili.com/";
|
|
|
|
|
// 新增:用于通过 BVID 获取 CID 的 API
|
|
|
|
|
private static final String PAGELIST_API = "https://api.bilibili.com/x/player/pagelist";
|
|
|
|
|
|
|
|
|
|
// User-Agent 保持一致,防止被服务器识别为不同来源
|
|
|
|
|
private static final String USER_AGENT =
|
|
|
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36";
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 根据 BVID 获取视频的 CID (用于弹幕请求)。
|
|
|
|
|
* @param bvid 视频的BVID
|
|
|
|
|
* @return 视频的CID字符串,如果失败返回null
|
|
|
|
|
*/
|
|
|
|
|
public static String getCidFromBvid(String bvid) {
|
|
|
|
|
String url = String.format("%s?bvid=%s", PAGELIST_API, bvid);
|
|
|
|
|
try {
|
|
|
|
|
// 访问 pagelist API
|
|
|
|
|
String jsonStr = Jsoup.connect(url)
|
|
|
|
|
.ignoreContentType(true)
|
|
|
|
|
.userAgent(USER_AGENT)
|
|
|
|
|
.execute()
|
|
|
|
|
.body();
|
|
|
|
|
|
|
|
|
|
// 注意:新版 Gson 使用 parseString
|
|
|
|
|
JsonObject jsonResponse = JsonParser.parseString(jsonStr).getAsJsonObject();
|
|
|
|
|
|
|
|
|
|
if (jsonResponse.get("code").getAsInt() == 0) {
|
|
|
|
|
// CID 通常在 data 数组的第一个元素里
|
|
|
|
|
return jsonResponse.getAsJsonArray("data")
|
|
|
|
|
.get(0).getAsJsonObject()
|
|
|
|
|
.get("cid").getAsString();
|
|
|
|
|
}
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
// 错误处理,但不再打印红色错误,而是安静失败
|
|
|
|
|
}
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 根据CID获取弹幕的原始XML内容。
|
|
|
|
|
* @param cid 视频的CID
|
|
|
|
|
* @return 弹幕XML字符串
|
|
|
|
|
*/
|
|
|
|
|
public static String getDanmuXml(String cid) {
|
|
|
|
|
String url = DM_BASE_URL + cid + ".xml";
|
|
|
|
|
try {
|
|
|
|
|
Document doc = Jsoup.connect(url)
|
|
|
|
|
.userAgent(USER_AGENT)
|
|
|
|
|
.ignoreContentType(true)
|
|
|
|
|
.get();
|
|
|
|
|
return doc.outerHtml();
|
|
|
|
|
} catch (IOException e) {
|
|
|
|
|
// 失败时返回 null
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|