develop v1.1
flying_pig 3 months ago
parent 8664e817af
commit ae9b2ef89f

@ -1,2 +1,38 @@
# bilibili-spider
## 一.主任务要求
**1.数据获取**
利用爬虫爬取B站所需弹幕数据：搜索关键词“2024巴黎奥运会”，爬取综合排序前300的所有视频弹幕。
**2.数据统计**
统计AI技术应用方面的每种弹幕数量，并输出数量排名前8的弹幕。
将统计的数据利用编程工具或开发包自动写入Excel表中。
**3.数据可视化**
对采集的数据集进行可视化表示,制作词云图,越美观越好。
**4.数据结论**
通过统计数据得出当前B站用户对于2024巴黎奥运会应用AI技术的主流看法。
## 二.附加题
爬取福州大学的通知、文件系统
地址：https://info22.fzu.edu.cn/lm_list.jsp?wbtreeid=1460【要开校园网访问】
包含发布时间,作者,标题以及正文。
可自动翻页(爬虫可以自动对后续页面进行爬取,而不需要我们指定第几页)
指定爬取范围如2020年1月1号 - 2021年9月1号
## 三.使用技术栈
* Http请求OkHttp Jsoup
* Json解析Gson
* Excel写入EasyExcel
* 词云生成Kumo
* 日志打印Slf4j

@ -14,14 +14,6 @@
<spring-boot.version>2.6.13</spring-boot.version>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
@ -36,14 +28,16 @@
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</dependency>
<!-- Logback Classic -->
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>4.10.0</version>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.2.10</version>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-inline</artifactId>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>4.10.0</version>
</dependency>
<dependency>
@ -64,12 +58,6 @@
<version>1.27</version>
</dependency>
<!--Trip数等数据结构相关依赖-->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-collections4</artifactId>
<version>4.4</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>easyexcel</artifactId>

@ -1,6 +1,7 @@
package com.flyingpig.bilibilispider.additionalWork;
import com.flyingpig.bilibilispider.constant.HeaderConstant;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@ -22,7 +23,7 @@ public class FzuNewsCrawler {
private static final String BASE_URL = "https://info22.fzu.edu.cn/lm_list.jsp?";
private static final String NEWS_URL = BASE_URL + "totalpage=948&PAGENUM=%d&urltype=tree.TreeTempUrl&wbtreeid=1460";
private static final String CONTENT_URL = "https://info22.fzu.edu.cn/content.jsp?";
// 爬取福大通知和文件系统
public List<FzuNews> crawlFzuNotificationsAndFileSystems(String newsBeginTime, String newsEndTime) throws Exception {
List<FzuNews> fzuNewsList = new ArrayList<>(); // 保存新闻列表
@ -61,7 +62,7 @@ public class FzuNewsCrawler {
// 获取新闻的作者、标题和正文链接
String author = newsElement.getElementsByTag("a").eq(0).text();
String title = newsElement.getElementsByTag("a").eq(1).text();
String textHref = BASE_URL + newsElement.getElementsByTag("a").get(1).attr("href");
String textHref = CONTENT_URL + newsElement.getElementsByTag("a").get(1).attr("href");
String text = fetchNewsText(textHref);
// 将新闻信息写入文件中
@ -89,7 +90,8 @@ public class FzuNewsCrawler {
// 获取新闻正文内容
private String fetchNewsText(String textHref) throws Exception {
StringBuilder textBuilder = new StringBuilder(); // 使用StringBuilder拼接正文内容
Document document = Jsoup.connect(textHref).get();
Document document = Jsoup.connect(textHref)
.header("User-Agent", HeaderConstant.USER_AGENT).get();
// 获取所有<p>标签中的正文
Elements paragraphs = document.getElementsByTag("p");

@ -6,4 +6,5 @@ package com.flyingpig.bilibilispider.constant;
/**
 * HTTP request header constants shared by the spider's crawling tasks.
 *
 * <p>Non-instantiable constants holder: the class is {@code final} and the
 * constructor is {@code private} so it can only be used via its static fields.
 *
 * <p>SECURITY NOTE(review): {@link #COOKIE} embeds live Bilibili session
 * credentials (SESSDATA, bili_jct, DedeUserID). Committing auth tokens to
 * source control exposes the account; these should be loaded from an
 * environment variable or untracked config file instead — flagged, value
 * left unchanged here.
 */
public final class HeaderConstant {
    // Full logged-in Bilibili cookie string sent with API requests
    // (required for endpoints that reject anonymous callers).
    public static final String COOKIE = "buvid3=06742E53-942C-670A-D496-6E2A79F196FF52832infoc; b_nut=1694915352; i-wanna-go-back=-1; b_ut=7; _uuid=F744F108A-8F810-A165-2EA4-7B4956ACF910C53152infoc; buvid4=8D3A427A-5715-145B-3AA8-9ACC6E30889D54861-023091709-5h4N7ejh5A4ENQhvWFdRwQ%3D%3D; hit-new-style-dyn=1; hit-dyn-v2=1; header_theme_version=CLOSE; rpdid=0zbfAGEiSg|12slY8UuG|pS|3w1QHGXk; LIVE_BUVID=AUTO6216955654982035; buvid_fp_plain=undefined; enable_web_push=DISABLE; dy_spec_agreed=1; is-2022-channel=1; CURRENT_BLACKGAP=0; DedeUserID=398014090; DedeUserID__ckMd5=da87c9926c73fac5; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; SESSDATA=618de7e5%2C1726673190%2C8e231%2A32CjAB8DOA4RwqxG7hQy82813je1HM5n0r08KRLyQfyA9zqBalG7QRrNNsuqvI7RejMHwSVnloTURqNE5qWmZmV0M3b0hqS0dTU21ES3NEbU9BX1JwNTNHU0VYMXc2YXNYRU9aOTJ2Q1ZXZkV4aWUyMzQ1VFo0eWpOMGVxUVQydVFfVmVGWjNoY1d3IIEC; bili_jct=51d42647c6b3da22794e018c68e239ba; CURRENT_FNVAL=4048; CURRENT_QUALITY=80; fingerprint=86d24a6d98af903f094c42f9498dfc3d; PVID=3; buvid_fp=86d24a6d98af903f094c42f9498dfc3d; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjQ4ODkxMDUsImlhdCI6MTcyNDYyOTg0NSwicGx0IjotMX0.SuRMicGtcUVVh2xh7DrFQCJReFKXavWzf07sThaixyU; bili_ticket_expires=1724889045; home_feed_column=5; browser_resolution=1440-641; sid=7oephj0f; bp_t_offset_398014090=970621780936884224; b_lsid=410A8B269_1919809748E; bsource=search_google; xxoo-tmp=zhHans";

    // Desktop Chrome User-Agent so target sites serve the normal HTML
    // instead of blocking/redirecting an unidentified client.
    public static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3";

    /** Prevents instantiation of this constants-only class. */
    private HeaderConstant() {
    }
}

@ -4,8 +4,12 @@ import com.flyingpig.bilibilispider.constant.FileName;
import com.google.gson.JsonArray;
import com.google.gson.JsonParser;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.util.UriComponentsBuilder;
import java.io.*;
import okhttp3.HttpUrl;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
@ -13,8 +17,10 @@ import java.util.concurrent.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.Inflater;
import static com.flyingpig.bilibilispider.constant.UrlConstant.*;
import static com.flyingpig.bilibilispider.util.RequestUtil.*;
import static com.flyingpig.bilibilispider.util.RequestUtil.requestToGetBodyBytes;
import static com.flyingpig.bilibilispider.util.RequestUtil.requesttToGetBodyString;
@Slf4j
public class BilibiliSpiderTask {
@ -31,11 +37,13 @@ public class BilibiliSpiderTask {
final int page = j;
Future<List<Long>> future = executor.submit(() -> {
List<Long> pageCidList = new ArrayList<>();
String searchUrl = UriComponentsBuilder.fromHttpUrl(BILIBILI_SEARCH_URL)
.queryParam("keyword", keyword)
.queryParam("search_type", "video")
.queryParam("page", page)
.queryParam("page_size", 50).toUriString();
String searchUrl = HttpUrl.parse(BILIBILI_SEARCH_URL).newBuilder()
.addQueryParameter("keyword", keyword)
.addQueryParameter("search_type", "video")
.addQueryParameter("page", String.valueOf(page))
.addQueryParameter("page_size", String.valueOf(50))
.build().toString();
log.info("爬取第 {} 页", page);
@ -45,11 +53,12 @@ public class BilibiliSpiderTask {
for (int i = 0; i < searchResultArray.size(); i++) {
String bvid = searchResultArray.get(i).getAsJsonObject().get("bvid").getAsString();
log.info("视频bvid: {}", bvid);
String getCidUrl = UriComponentsBuilder.fromHttpUrl(BILIBILI_GETCID_URL)
.queryParam("bvid", bvid)
.queryParam("jsonp", "jsonp").toUriString();
String getCidUrl = HttpUrl.parse(BILIBILI_GETCID_URL).newBuilder()
.addQueryParameter("bvid", bvid)
.addQueryParameter("jsonp", "jsonp")
.build()
.toString();
Long cid = JsonParser.parseString(requesttToGetBodyString(getCidUrl))
.getAsJsonObject().getAsJsonArray("data")
@ -143,6 +152,7 @@ public class BilibiliSpiderTask {
}
decompressData = outputStream.toByteArray();
} catch (Exception e) {
log.error("解压数据失败", e);
} finally {
outputStream.close();
}

@ -38,6 +38,7 @@ public class DataAnalysisTask {
}
public static HashMap<String, Integer> getTop8BarrageListAboutAI() {
System.out.println(aiKeywords.size());
@ -125,4 +126,6 @@ public class DataAnalysisTask {
return new HashMap<>(sortedMap);
}
}

Loading…
Cancel
Save