You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

54 lines
2.1 KiB

2 months ago
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
public class BilibiliDanmuCrawler {
// Browser-style User-Agent string (Chrome 128 on 64-bit Windows 10).
// NOTE(review): presumably sent as a header on outgoing HTTP requests; the request code is not visible in this section - confirm.
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36";
// Limit of 300. NOTE(review): going by the name, this caps how many danmu are collected - confirm against the code that enforces it.
private static final int MAX_DANMU_COUNT = 300;
// Mutable class-wide counter, starting at 0. getCid checks it at the top of its page loop;
// NOTE(review): where it is incremented is not visible in this section - confirm.
private static int count = 0;
/**
 * Entry point: builds the URLs of the first 10 search pages, resolves them to a cid list,
 * fetches the danmu for those cids, and writes every danmu to {@code danmu.txt}
 * (one per line, file truncated if it already exists).
 *
 * @param args command-line arguments (unused)
 * @throws IOException if resolving the cids fails or the output file cannot be written
 */
public static void main(String[] args) throws IOException {
    List<String> pageUrlList = getPageUrl(10);
    List<String> cidList = getCid(pageUrlList);
    List<String> danmuList = getDanmu(cidList);
    // Write the danmu to the file. The charset is pinned to UTF-8 because FileWriter(String)
    // uses the platform default charset, which on JDKs before 18 is frequently not UTF-8 and
    // would turn Chinese text into '?' or mojibake.
    try (BufferedWriter writer = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream("danmu.txt"), StandardCharsets.UTF_8))) {
        for (String danmu : danmuList) {
            writer.write(danmu + "\n");
        }
    }
    System.out.println("弹幕爬取完成!");
}
// Get the URLs of multiple search result pages
/**
 * Builds the search-result URLs for pages 1 through {@code n} of the fixed keyword query.
 * Page 1 is the bare search URL; every later page appends {@code &page=<number>}.
 *
 * @param n number of page URLs to produce; zero or a negative value yields an empty list
 * @return a new mutable list of URLs in page order
 */
private static List<String> getPageUrl(int n) {
    final String firstPageUrl = "https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A";
    List<String> urls = new ArrayList<>();
    int pageNumber = 1;
    while (urls.size() < n) {
        // Only pages after the first carry an explicit page parameter.
        urls.add(pageNumber == 1 ? firstPageUrl : firstPageUrl + "&page=" + pageNumber);
        pageNumber++;
    }
    return urls;
}
// Get the list of video cids
private static List<String> getCid(List<String> pageUrlList) throws IOException {
List<String> cidList = new ArrayList<>();
for (String pageUrl : pageUrlList) {
if (count >= MAX_DANMU_