From 78c706bd1012eceae92617c0e6598529a7fb832e Mon Sep 17 00:00:00 2001 From: pqjhl759w <115566001@qq.com> Date: Wed, 18 Sep 2024 23:58:03 +0800 Subject: [PATCH] =?UTF-8?q?=E7=88=AC=E8=99=AB=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 爬虫.txt | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 爬虫.txt diff --git a/爬虫.txt b/爬虫.txt new file mode 100644 index 0000000..9c0078b --- /dev/null +++ b/爬虫.txt @@ -0,0 +1,53 @@ +import org.apache.http.HttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.util.EntityUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import java.io.FileWriter; +import java.io.IOException; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class BilibiliDanmuCrawler { + + private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"; + private static final int MAX_DANMU_COUNT = 300; + private static int count = 0; + + public static void main(String[] args) throws IOException { + List pageUrlList = getPageUrl(10); + List cidList = getCid(pageUrlList); + List danmuList = getDanmu(cidList); + + // 将弹幕写入文件 + try (FileWriter writer = new FileWriter("danmu.txt")) { + for (String danmu : danmuList) { + writer.write(danmu + "\n"); + } + } + System.out.println("弹幕爬取完成!"); + } + + // 获取多个页面的URL + private static List getPageUrl(int n) { + List pageUrlList = new ArrayList<>(); + for (int i = 0; i < n; i++) { + String pageUrl; + if (i == 0) { + pageUrl = "https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A"; + } else { + pageUrl = "https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&page=" + (i + 1); + } + pageUrlList.add(pageUrl); + } + return pageUrlList; + } + + // 获取视频的 cid 列表 + private static List getCid(List pageUrlList) throws IOException { + List cidList = new ArrayList<>(); + for (String pageUrl : pageUrlList) { + if (count >= MAX_DANMU_