|
|
// src/main/java/com/llm/analysis/BiliBiliSearchCrawler.java
|
|
|
package com.llm.analysis;
|
|
|
|
|
|
import org.jsoup.Connection;
|
|
|
import org.jsoup.Jsoup;
|
|
|
import com.google.gson.JsonArray;
|
|
|
import com.google.gson.JsonObject;
|
|
|
import com.google.gson.JsonParser;
|
|
|
import com.google.gson.JsonSyntaxException;
|
|
|
|
|
|
import java.io.IOException;
|
|
|
import java.net.URLEncoder;
|
|
|
import java.nio.charset.StandardCharsets;
|
|
|
import java.util.ArrayList;
|
|
|
import java.util.HashMap;
|
|
|
import java.util.List;
|
|
|
import java.util.Map;
|
|
|
|
|
|
public class BiliBiliSearchCrawler {
|
|
|
|
|
|
private static final String SEARCH_API = "https://api.bilibili.com/x/web-interface/search/type";
|
|
|
|
|
|
// 你提供的 B站 SESSDATA Cookie 值
|
|
|
private static final String BILI_SESSDATA = "24d61dbf%2C1777714611%2C44d8f%2Ab1CjBJl1nuF4mxKurV7W0WGijaO58BOVfVjSzeBUQynz2TUGqXiUIx2DA1UyDblAjNPF8SVkg3eWg0UWdneW9sWXFfZGE5U2tzcDFZZVhtX1N4dnBaS0tELXAwSEtmUTNGOW00UEUxdmJGMFg5SHJERW5pYlViaU5xLXFjYWp1dmFBNFprck8ydThRIIEC";
|
|
|
|
|
|
private static final int ITEMS_PER_PAGE = 20;
|
|
|
private static final int TARGET_PAGES = 300 / ITEMS_PER_PAGE;
|
|
|
|
|
|
private static final String USER_AGENT =
|
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36";
|
|
|
|
|
|
/**
|
|
|
* 调用 B站搜索 API,获取指定关键词和数量的视频列表。只获取 BVID 和 Title。
|
|
|
* @param keyword 搜索关键词 (如 "大语言模型")
|
|
|
* @return 包含 BVID 和 Title 的 Map 列表
|
|
|
*/
|
|
|
public static List<Map<String, String>> getTopVideos(String keyword) {
|
|
|
List<Map<String, String>> videoList = new ArrayList<>();
|
|
|
|
|
|
try {
|
|
|
String encodedKeyword = URLEncoder.encode(keyword, StandardCharsets.UTF_8);
|
|
|
|
|
|
for (int page = 1; page <= TARGET_PAGES; page++) {
|
|
|
String url = String.format("%s?search_type=video&keyword=%s&page=%d&pagesize=%d",
|
|
|
SEARCH_API, encodedKeyword, page, ITEMS_PER_PAGE);
|
|
|
|
|
|
System.out.println("🤖 正在爬取第 " + page + " 页数据...");
|
|
|
|
|
|
Connection.Response response = Jsoup.connect(url)
|
|
|
.method(Connection.Method.GET)
|
|
|
.ignoreContentType(true)
|
|
|
.userAgent(USER_AGENT)
|
|
|
.cookie("SESSDATA", BILI_SESSDATA)
|
|
|
.execute();
|
|
|
|
|
|
String jsonStr = response.body();
|
|
|
|
|
|
// 诊断代码:只打印第一页,用于调试
|
|
|
if (page == 1) {
|
|
|
System.out.println("--- 诊断:第 1 页 JSON 响应 (仅截取前 500 字符) ---");
|
|
|
System.out.println(jsonStr.substring(0, Math.min(jsonStr.length(), 500)));
|
|
|
System.out.println("-----------------------------------------------------");
|
|
|
}
|
|
|
|
|
|
// 解析 JSON
|
|
|
JsonObject jsonResponse = JsonParser.parseString(jsonStr).getAsJsonObject();
|
|
|
|
|
|
if (jsonResponse.get("code").getAsInt() != 0) {
|
|
|
System.err.println("API 调用失败,返回消息: " + jsonResponse.get("message").getAsString());
|
|
|
break;
|
|
|
}
|
|
|
|
|
|
JsonObject data = jsonResponse.getAsJsonObject("data");
|
|
|
if (data == null) break;
|
|
|
|
|
|
JsonArray results = data.getAsJsonArray("result");
|
|
|
if (results == null || results.size() == 0) break;
|
|
|
|
|
|
for (int i = 0; i < results.size(); i++) {
|
|
|
JsonObject item = results.get(i).getAsJsonObject();
|
|
|
|
|
|
if (!item.has("type") || !"video".equals(item.get("type").getAsString())) {
|
|
|
continue;
|
|
|
}
|
|
|
|
|
|
// 提取 BVID 和 Title
|
|
|
String bvid = item.get("bvid") != null ? item.get("bvid").getAsString() : null;
|
|
|
String title = item.get("title") != null ? item.get("title").getAsString() : "无标题";
|
|
|
|
|
|
// 关键修正:只检查 bvid 是否存在
|
|
|
if (bvid != null) {
|
|
|
Map<String, String> videoInfo = new HashMap<>();
|
|
|
videoInfo.put("bvid", bvid);
|
|
|
videoInfo.put("title", title.replaceAll("<em[^>]*>|</em>", ""));
|
|
|
videoList.add(videoInfo);
|
|
|
}
|
|
|
if (videoList.size() >= 300) break;
|
|
|
}
|
|
|
|
|
|
if (videoList.size() >= 300) break;
|
|
|
|
|
|
// 礼貌暂停 1 秒
|
|
|
Thread.sleep(1000);
|
|
|
}
|
|
|
} catch (IOException | InterruptedException | JsonSyntaxException e) {
|
|
|
System.err.println("爬取过程中发生错误:" + e.getMessage());
|
|
|
}
|
|
|
|
|
|
System.out.println("🎉 成功获取到 " + videoList.size() + " 条视频的 BVID/Title 数据。");
|
|
|
return videoList;
|
|
|
}
|
|
|
} |