You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
54 lines
2.1 KiB
54 lines
2.1 KiB
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
|
|
|
|
public class BilibiliDanmuCrawler {
|
|
|
|
// Desktop Chrome-on-Windows User-Agent string.
// NOTE(review): presumably sent as the User-Agent header of outgoing HTTP requests; its use is not visible in this part of the file - confirm.
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36";
|
|
// Fixed limit of 300.
// NOTE(review): appears to cap how many danmu are collected (getCid compares `count` against it) - confirm.
private static final int MAX_DANMU_COUNT = 300;
|
|
// Mutable class-wide counter, initialised to 0 and checked inside getCid's loop.
// NOTE(review): what exactly is counted, and where it is incremented, is not visible here - confirm.
private static int count = 0;
|
|
|
|
public static void main(String[] args) throws IOException {
|
|
List<String> pageUrlList = getPageUrl(10);
|
|
List<String> cidList = getCid(pageUrlList);
|
|
List<String> danmuList = getDanmu(cidList);
|
|
|
|
// 将弹幕写入文件
|
|
try (FileWriter writer = new FileWriter("danmu.txt")) {
|
|
for (String danmu : danmuList) {
|
|
writer.write(danmu + "\n");
|
|
}
|
|
}
|
|
System.out.println("弹幕爬取完成!");
|
|
}
|
|
|
|
// Build the URLs of the search-result pages
|
|
private static List<String> getPageUrl(int n) {
|
|
List<String> pageUrlList = new ArrayList<>();
|
|
for (int i = 0; i < n; i++) {
|
|
String pageUrl;
|
|
if (i == 0) {
|
|
pageUrl = "https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A";
|
|
} else {
|
|
pageUrl = "https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&page=" + (i + 1);
|
|
}
|
|
pageUrlList.add(pageUrl);
|
|
}
|
|
return pageUrlList;
|
|
}
|
|
|
|
// Get the list of video cids
|
|
private static List<String> getCid(List<String> pageUrlList) throws IOException {
|
|
List<String> cidList = new ArrayList<>();
|
|
for (String pageUrl : pageUrlList) {
|
|
if (count >= MAX_DANMU_
|