You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
JAVA/BiliBiliSearchCrawler.java

112 lines
5.1 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

// src/main/java/com/llm/analysis/BiliBiliSearchCrawler.java
package com.llm.analysis;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.JsonSyntaxException;
import java.io.IOException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
/**
 * Queries the Bilibili web search API page by page and collects the BVID and
 * title of each video hit.
 */
public class BiliBiliSearchCrawler {
    private static final String SEARCH_API = "https://api.bilibili.com/x/web-interface/search/type";

    // Bilibili SESSDATA cookie value sent with every search request.
    // SECURITY NOTE(review): a session credential is committed in source. It should be
    // rotated and loaded from configuration or the environment; it is left in place here
    // only so that existing behavior does not change.
    private static final String BILI_SESSDATA = "24d61dbf%2C1777714611%2C44d8f%2Ab1CjBJl1nuF4mxKurV7W0WGijaO58BOVfVjSzeBUQynz2TUGqXiUIx2DA1UyDblAjNPF8SVkg3eWg0UWdneW9sWXFfZGE5U2tzcDFZZVhtX1N4dnBaS0tELXAwSEtmUTNGOW00UEUxdmJGMFg5SHJERW5pYlViaU5xLXFjYWp1dmFBNFprck8ydThRIIEC";

    private static final int ITEMS_PER_PAGE = 20;

    /** Maximum number of videos collected by one call to {@link #getTopVideos(String)}. */
    private static final int TARGET_COUNT = 300;

    private static final int TARGET_PAGES = TARGET_COUNT / ITEMS_PER_PAGE;

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36";

    /** Pause between consecutive page requests, in milliseconds. */
    private static final long PAGE_DELAY_MILLIS = 1000L;

    /** Matches the {@code <em ...>} / {@code </em>} tags that are stripped from titles. */
    private static final Pattern EM_TAG = Pattern.compile("<em[^>]*>|</em>");

    /**
     * Calls the Bilibili search API for the given keyword and collects up to
     * {@code TARGET_COUNT} videos. Only the BVID and the title are extracted.
     *
     * <p>Failures (I/O errors, malformed JSON, interruption) are reported on
     * {@code System.err} and end the crawl early; whatever was collected up to that
     * point is still returned. If the thread is interrupted, its interrupt flag is
     * restored before returning.
     *
     * @param keyword search keyword (e.g. "大语言模型"); a {@code null} or blank keyword
     *     yields an empty list without any request being made
     * @return a list of maps, each with the keys {@code "bvid"} and {@code "title"};
     *     never {@code null}
     */
    public static List<Map<String, String>> getTopVideos(String keyword) {
        List<Map<String, String>> videoList = new ArrayList<>();
        if (keyword == null || keyword.trim().isEmpty()) {
            System.err.println("搜索关键词为空,无法爬取。");
            return videoList;
        }
        try {
            String encodedKeyword = URLEncoder.encode(keyword, StandardCharsets.UTF_8);
            for (int page = 1; page <= TARGET_PAGES; page++) {
                JsonArray results = fetchPage(encodedKeyword, page);
                if (results == null || results.size() == 0) {
                    break;
                }
                collectVideos(results, videoList);
                // No point pausing when no further request will follow.
                if (videoList.size() >= TARGET_COUNT || page == TARGET_PAGES) {
                    break;
                }
                // Be polite to the server between page requests.
                Thread.sleep(PAGE_DELAY_MILLIS);
            }
        } catch (InterruptedException e) {
            // Restore the flag so callers further up can observe the interruption.
            Thread.currentThread().interrupt();
            System.err.println("爬取过程中发生错误:" + e);
        } catch (IOException | JsonSyntaxException | IllegalStateException | NumberFormatException e) {
            // IllegalStateException / NumberFormatException come from Gson when the
            // response root is not an object or "code" is not an integer.
            System.err.println("爬取过程中发生错误:" + e);
        }
        System.out.println("🎉 成功获取到 " + videoList.size() + " 条视频的 BVID/Title 数据。");
        return videoList;
    }

    /**
     * Requests one page of search results and returns its {@code data.result} array,
     * or {@code null} when the API reports an error or the array is absent.
     */
    private static JsonArray fetchPage(String encodedKeyword, int page) throws IOException {
        // Plain concatenation keeps the numbers locale-independent (String.format("%d")
        // follows the default locale).
        String url = SEARCH_API
                + "?search_type=video&keyword=" + encodedKeyword
                + "&page=" + page
                + "&pagesize=" + ITEMS_PER_PAGE;
        System.out.println("🤖 正在爬取第 " + page + " 页数据...");
        Connection.Response response = Jsoup.connect(url)
                .method(Connection.Method.GET)
                .ignoreContentType(true)
                .userAgent(USER_AGENT)
                .cookie("SESSDATA", BILI_SESSDATA)
                .execute();
        String jsonStr = response.body();
        // Diagnostics: dump the beginning of the first page only, for debugging.
        if (page == 1) {
            System.out.println("--- 诊断:第 1 页 JSON 响应 (仅截取前 500 字符) ---");
            System.out.println(jsonStr.substring(0, Math.min(jsonStr.length(), 500)));
            System.out.println("-----------------------------------------------------");
        }
        return extractResults(jsonStr);
    }

    /**
     * Parses a raw API response and returns its {@code data.result} array, or
     * {@code null} when the status code is missing or non-zero, or when
     * {@code data} / {@code result} do not have the expected JSON type.
     */
    private static JsonArray extractResults(String jsonStr) {
        JsonObject jsonResponse = JsonParser.parseString(jsonStr).getAsJsonObject();
        boolean success = jsonResponse.has("code")
                && jsonResponse.get("code").isJsonPrimitive()
                && jsonResponse.get("code").getAsInt() == 0;
        if (!success) {
            System.err.println("API 调用失败,返回消息: " + stringOrNull(jsonResponse, "message"));
            return null;
        }
        // "data" may be absent or JSON null; getAsJsonObject(name) would throw on the latter.
        if (!jsonResponse.has("data") || !jsonResponse.get("data").isJsonObject()) {
            return null;
        }
        JsonObject data = jsonResponse.getAsJsonObject("data");
        if (!data.has("result") || !data.get("result").isJsonArray()) {
            return null;
        }
        return data.getAsJsonArray("result");
    }

    /**
     * Appends a {@code bvid}/{@code title} map to {@code sink} for every video entry in
     * {@code results}, stopping once {@code sink} holds {@code TARGET_COUNT} entries.
     */
    private static void collectVideos(JsonArray results, List<Map<String, String>> sink) {
        for (int i = 0; i < results.size() && sink.size() < TARGET_COUNT; i++) {
            if (!results.get(i).isJsonObject()) {
                continue;
            }
            JsonObject item = results.get(i).getAsJsonObject();
            if (!"video".equals(stringOrNull(item, "type"))) {
                continue;
            }
            // Only the presence of a BVID is required; a missing title gets a placeholder.
            String bvid = stringOrNull(item, "bvid");
            if (bvid == null) {
                continue;
            }
            String title = stringOrNull(item, "title");
            if (title == null) {
                title = "无标题";
            }
            Map<String, String> videoInfo = new HashMap<>();
            videoInfo.put("bvid", bvid);
            videoInfo.put("title", EM_TAG.matcher(title).replaceAll(""));
            sink.add(videoInfo);
        }
    }

    /**
     * Returns the member {@code key} of {@code obj} as a string, or {@code null} when
     * the member is absent or is not a JSON primitive (e.g. JSON null).
     */
    private static String stringOrNull(JsonObject obj, String key) {
        if (!obj.has(key) || !obj.get(key).isJsonPrimitive()) {
            return null;
        }
        return obj.get(key).getAsString();
    }
}