// src/main/java/com/llm/analysis/BiliBiliSearchCrawler.java package com.llm.analysis; import org.jsoup.Connection; import org.jsoup.Jsoup; import com.google.gson.JsonArray; import com.google.gson.JsonObject; import com.google.gson.JsonParser; import com.google.gson.JsonSyntaxException; import java.io.IOException; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; public class BiliBiliSearchCrawler { private static final String SEARCH_API = "https://api.bilibili.com/x/web-interface/search/type"; // 你提供的 B站 SESSDATA Cookie 值 private static final String BILI_SESSDATA = "24d61dbf%2C1777714611%2C44d8f%2Ab1CjBJl1nuF4mxKurV7W0WGijaO58BOVfVjSzeBUQynz2TUGqXiUIx2DA1UyDblAjNPF8SVkg3eWg0UWdneW9sWXFfZGE5U2tzcDFZZVhtX1N4dnBaS0tELXAwSEtmUTNGOW00UEUxdmJGMFg5SHJERW5pYlViaU5xLXFjYWp1dmFBNFprck8ydThRIIEC"; private static final int ITEMS_PER_PAGE = 20; private static final int TARGET_PAGES = 300 / ITEMS_PER_PAGE; private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"; /** * 调用 B站搜索 API,获取指定关键词和数量的视频列表。只获取 BVID 和 Title。 * @param keyword 搜索关键词 (如 "大语言模型") * @return 包含 BVID 和 Title 的 Map 列表 */ public static List> getTopVideos(String keyword) { List> videoList = new ArrayList<>(); try { String encodedKeyword = URLEncoder.encode(keyword, StandardCharsets.UTF_8); for (int page = 1; page <= TARGET_PAGES; page++) { String url = String.format("%s?search_type=video&keyword=%s&page=%d&pagesize=%d", SEARCH_API, encodedKeyword, page, ITEMS_PER_PAGE); System.out.println("🤖 正在爬取第 " + page + " 页数据..."); Connection.Response response = Jsoup.connect(url) .method(Connection.Method.GET) .ignoreContentType(true) .userAgent(USER_AGENT) .cookie("SESSDATA", BILI_SESSDATA) .execute(); String jsonStr = response.body(); // 诊断代码:只打印第一页,用于调试 if (page == 1) { System.out.println("--- 诊断:第 1 页 JSON 响应 (仅截取前 500 字符) ---"); System.out.println(jsonStr.substring(0, Math.min(jsonStr.length(), 500))); System.out.println("-----------------------------------------------------"); } // 解析 JSON JsonObject jsonResponse = JsonParser.parseString(jsonStr).getAsJsonObject(); if (jsonResponse.get("code").getAsInt() != 0) { System.err.println("API 调用失败,返回消息: " + jsonResponse.get("message").getAsString()); break; } JsonObject data = jsonResponse.getAsJsonObject("data"); if (data == null) break; JsonArray results = data.getAsJsonArray("result"); if (results == null || results.size() == 0) break; for (int i = 0; i < results.size(); i++) { JsonObject item = results.get(i).getAsJsonObject(); if (!item.has("type") || !"video".equals(item.get("type").getAsString())) { continue; } // 提取 BVID 和 Title String bvid = item.get("bvid") != null ? item.get("bvid").getAsString() : null; String title = item.get("title") != null ? item.get("title").getAsString() : "无标题"; // 关键修正:只检查 bvid 是否存在 if (bvid != null) { Map videoInfo = new HashMap<>(); videoInfo.put("bvid", bvid); videoInfo.put("title", title.replaceAll("]*>|", "")); videoList.add(videoInfo); } if (videoList.size() >= 300) break; } if (videoList.size() >= 300) break; // 礼貌暂停 1 秒 Thread.sleep(1000); } } catch (IOException | InterruptedException | JsonSyntaxException e) { System.err.println("爬取过程中发生错误:" + e.getMessage()); } System.out.println("🎉 成功获取到 " + videoList.size() + " 条视频的 BVID/Title 数据。"); return videoList; } }