diff --git a/README.md b/README.md index 2cd09b2..34c8db5 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,38 @@ -# bilibili-spider +## 一.主任务要求 +**1.数据获取** + +利用爬虫从B站爬取所需弹幕数据,搜索关键词“2024巴黎奥运会”,爬取综合排序前300的所有视频弹幕。 + +**2.数据统计** + +统计AI技术应用方面的每种弹幕数量,并输出数量排名前8的弹幕。 +将统计的数据利用编程工具或开发包自动写入Excel表中。 + +**3.数据可视化** + +对采集的数据集进行可视化表示,制作词云图,越美观越好。 + +**4.数据结论** + +通过统计数据得出当前B站用户对于2024巴黎奥运会应用AI技术的主流看法。 + +## 二.附加题 + +爬取福州大学的通知、文件系统 + +地址:https://info22.fzu.edu.cn/lm_list.jsp?wbtreeid=1460【要开校园网访问】 + +包含发布时间,作者,标题以及正文。 + +可自动翻页(爬虫可以自动对后续页面进行爬取,而不需要我们指定第几页) + +指定爬取范围:如2020年1月1号 - 2021年9月1号 + +## 三.使用技术栈 + +* Http请求:OkHttp Jsoup +* Json解析:Gson +* Excel写入:EasyExcel +* 词云生成:Kumo +* 日志打印:Slf4j diff --git a/pom.xml b/pom.xml index bf41f92..70799a8 100644 --- a/pom.xml +++ b/pom.xml @@ -14,14 +14,6 @@ 2.6.13 - - org.springframework.boot - spring-boot-starter - - - org.springframework.boot - spring-boot-starter-web - org.springframework.boot spring-boot-starter-test @@ -36,14 +28,16 @@ org.projectlombok lombok + - com.squareup.okhttp3 - okhttp - 4.10.0 + ch.qos.logback + logback-classic + 1.2.10 + - org.mockito - mockito-inline + com.squareup.okhttp3 + okhttp 4.10.0 @@ -64,12 +58,6 @@ 1.27 - - org.apache.commons - commons-collections4 - 4.4 - - com.alibaba easyexcel diff --git a/src/main/java/com/flyingpig/bilibilispider/additionalWork/FzuNewsCrawler.java b/src/main/java/com/flyingpig/bilibilispider/additionalWork/FzuNewsCrawler.java index 1440449..68664d8 100644 --- a/src/main/java/com/flyingpig/bilibilispider/additionalWork/FzuNewsCrawler.java +++ b/src/main/java/com/flyingpig/bilibilispider/additionalWork/FzuNewsCrawler.java @@ -1,6 +1,7 @@ package com.flyingpig.bilibilispider.additionalWork; +import com.flyingpig.bilibilispider.constant.HeaderConstant; import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; @@ -22,7 +23,7 @@ public class FzuNewsCrawler { private static final String BASE_URL = "https://info22.fzu.edu.cn/lm_list.jsp?"; private static final 
String NEWS_URL = BASE_URL + "totalpage=948&PAGENUM=%d&urltype=tree.TreeTempUrl&wbtreeid=1460"; - + private static final String CONTENT_URL = "https://info22.fzu.edu.cn/content.jsp?"; // 爬取福大通知和文件系统 public List crawlFzuNotificationsAndFileSystems(String newsBeginTime, String newsEndTime) throws Exception { List fzuNewsList = new ArrayList<>(); // 保存新闻列表 @@ -61,7 +62,7 @@ public class FzuNewsCrawler { // 获取新闻的作者、标题和正文链接 String author = newsElement.getElementsByTag("a").eq(0).text(); String title = newsElement.getElementsByTag("a").eq(1).text(); - String textHref = BASE_URL + newsElement.getElementsByTag("a").get(1).attr("href"); + String textHref = CONTENT_URL + newsElement.getElementsByTag("a").get(1).attr("href"); String text = fetchNewsText(textHref); // 将新闻信息写入文件中 @@ -89,7 +90,8 @@ public class FzuNewsCrawler { // 获取新闻正文内容 private String fetchNewsText(String textHref) throws Exception { StringBuilder textBuilder = new StringBuilder(); // 使用StringBuilder拼接正文内容 - Document document = Jsoup.connect(textHref).get(); + Document document = Jsoup.connect(textHref) + .header("User-Agent", HeaderConstant.USER_AGENT).get(); // 获取所有
<p>
标签中的正文 Elements paragraphs = document.getElementsByTag("p"); diff --git a/src/main/java/com/flyingpig/bilibilispider/constant/HeaderConstant.java b/src/main/java/com/flyingpig/bilibilispider/constant/HeaderConstant.java index 60a172a..10debc1 100644 --- a/src/main/java/com/flyingpig/bilibilispider/constant/HeaderConstant.java +++ b/src/main/java/com/flyingpig/bilibilispider/constant/HeaderConstant.java @@ -6,4 +6,5 @@ package com.flyingpig.bilibilispider.constant; public class HeaderConstant { public static final String COOKIE = "buvid3=06742E53-942C-670A-D496-6E2A79F196FF52832infoc; b_nut=1694915352; i-wanna-go-back=-1; b_ut=7; _uuid=F744F108A-8F810-A165-2EA4-7B4956ACF910C53152infoc; buvid4=8D3A427A-5715-145B-3AA8-9ACC6E30889D54861-023091709-5h4N7ejh5A4ENQhvWFdRwQ%3D%3D; hit-new-style-dyn=1; hit-dyn-v2=1; header_theme_version=CLOSE; rpdid=0zbfAGEiSg|12slY8UuG|pS|3w1QHGXk; LIVE_BUVID=AUTO6216955654982035; buvid_fp_plain=undefined; enable_web_push=DISABLE; dy_spec_agreed=1; is-2022-channel=1; CURRENT_BLACKGAP=0; DedeUserID=398014090; DedeUserID__ckMd5=da87c9926c73fac5; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; SESSDATA=618de7e5%2C1726673190%2C8e231%2A32CjAB8DOA4RwqxG7hQy82813je1HM5n0r08KRLyQfyA9zqBalG7QRrNNsuqvI7RejMHwSVnloTURqNE5qWmZmV0M3b0hqS0dTU21ES3NEbU9BX1JwNTNHU0VYMXc2YXNYRU9aOTJ2Q1ZXZkV4aWUyMzQ1VFo0eWpOMGVxUVQydVFfVmVGWjNoY1d3IIEC; bili_jct=51d42647c6b3da22794e018c68e239ba; CURRENT_FNVAL=4048; CURRENT_QUALITY=80; fingerprint=86d24a6d98af903f094c42f9498dfc3d; PVID=3; buvid_fp=86d24a6d98af903f094c42f9498dfc3d; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjQ4ODkxMDUsImlhdCI6MTcyNDYyOTg0NSwicGx0IjotMX0.SuRMicGtcUVVh2xh7DrFQCJReFKXavWzf07sThaixyU; bili_ticket_expires=1724889045; home_feed_column=5; browser_resolution=1440-641; sid=7oephj0f; bp_t_offset_398014090=970621780936884224; b_lsid=410A8B269_1919809748E; bsource=search_google; xxoo-tmp=zhHans"; + public static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; 
x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"; } diff --git a/src/main/java/com/flyingpig/bilibilispider/task/BilibiliSpiderTask.java b/src/main/java/com/flyingpig/bilibilispider/task/BilibiliSpiderTask.java index 4a2d601..1835fd5 100644 --- a/src/main/java/com/flyingpig/bilibilispider/task/BilibiliSpiderTask.java +++ b/src/main/java/com/flyingpig/bilibilispider/task/BilibiliSpiderTask.java @@ -4,8 +4,12 @@ import com.flyingpig.bilibilispider.constant.FileName; import com.google.gson.JsonArray; import com.google.gson.JsonParser; import lombok.extern.slf4j.Slf4j; -import org.springframework.web.util.UriComponentsBuilder; -import java.io.*; +import okhttp3.HttpUrl; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -13,8 +17,10 @@ import java.util.concurrent.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.zip.Inflater; + import static com.flyingpig.bilibilispider.constant.UrlConstant.*; -import static com.flyingpig.bilibilispider.util.RequestUtil.*; +import static com.flyingpig.bilibilispider.util.RequestUtil.requestToGetBodyBytes; +import static com.flyingpig.bilibilispider.util.RequestUtil.requesttToGetBodyString; @Slf4j public class BilibiliSpiderTask { @@ -31,11 +37,13 @@ public class BilibiliSpiderTask { final int page = j; Future> future = executor.submit(() -> { List pageCidList = new ArrayList<>(); - String searchUrl = UriComponentsBuilder.fromHttpUrl(BILIBILI_SEARCH_URL) - .queryParam("keyword", keyword) - .queryParam("search_type", "video") - .queryParam("page", page) - .queryParam("page_size", 50).toUriString(); + + String searchUrl = HttpUrl.parse(BILIBILI_SEARCH_URL).newBuilder() + .addQueryParameter("keyword", keyword) + .addQueryParameter("search_type", "video") + .addQueryParameter("page", String.valueOf(page)) + 
.addQueryParameter("page_size", String.valueOf(50)) + .build().toString(); log.info("爬取第 {} 页", page); @@ -45,11 +53,12 @@ public class BilibiliSpiderTask { for (int i = 0; i < searchResultArray.size(); i++) { String bvid = searchResultArray.get(i).getAsJsonObject().get("bvid").getAsString(); - log.info("视频bvid: {}", bvid); - String getCidUrl = UriComponentsBuilder.fromHttpUrl(BILIBILI_GETCID_URL) - .queryParam("bvid", bvid) - .queryParam("jsonp", "jsonp").toUriString(); + String getCidUrl = HttpUrl.parse(BILIBILI_GETCID_URL).newBuilder() + .addQueryParameter("bvid", bvid) + .addQueryParameter("jsonp", "jsonp") + .build() + .toString(); Long cid = JsonParser.parseString(requesttToGetBodyString(getCidUrl)) .getAsJsonObject().getAsJsonArray("data") @@ -143,6 +152,7 @@ public class BilibiliSpiderTask { } decompressData = outputStream.toByteArray(); } catch (Exception e) { + log.error("解压数据失败", e); } finally { outputStream.close(); } diff --git a/src/main/java/com/flyingpig/bilibilispider/task/DataAnalysisTask.java b/src/main/java/com/flyingpig/bilibilispider/task/DataAnalysisTask.java index 2ba0046..1be96eb 100644 --- a/src/main/java/com/flyingpig/bilibilispider/task/DataAnalysisTask.java +++ b/src/main/java/com/flyingpig/bilibilispider/task/DataAnalysisTask.java @@ -38,6 +38,7 @@ public class DataAnalysisTask { } + public static HashMap getTop8BarrageListAboutAI() { System.out.println(aiKeywords.size()); @@ -125,4 +126,6 @@ public class DataAnalysisTask { return new HashMap<>(sortedMap); } + + }