feat: multithreaded code optimization && bonus exercise implementation

develop
flying_pig 3 months ago
parent 536c8ef470
commit d8509a5551

@@ -41,7 +41,11 @@
             <artifactId>okhttp</artifactId>
             <version>4.10.0</version>
         </dependency>
+        <dependency>
+            <groupId>org.mockito</groupId>
+            <artifactId>mockito-inline</artifactId>
+            <version>4.10.0</version>
+        </dependency>
         <dependency>
             <groupId>com.ibm.icu</groupId>
             <artifactId>icu4j</artifactId>
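Note: the mockito-inline artifact added above is what makes Mockito.mockStatic(...) usable in the rewritten tests at the bottom of this commit; with the Mockito 4.x line used here, static mocking needs the inline mock maker that this artifact provides. A minimal sketch of the pattern the tests rely on (illustrative only, not part of the commit):

    import com.flyingpig.bilibilispider.task.DataAnalysisTask;
    import org.mockito.MockedStatic;
    import org.mockito.Mockito;
    import java.util.HashMap;
    import java.util.Map;

    class StaticMockSketch {
        void demo() {
            // With mockito-inline on the classpath, a static method can be stubbed inside a scoped block.
            try (MockedStatic<DataAnalysisTask> mocked = Mockito.mockStatic(DataAnalysisTask.class)) {
                mocked.when(DataAnalysisTask::getTop8BarrageListAboutAI)
                        .thenReturn(new HashMap<>(Map.of("AI", 1)));
                Map<String, Integer> stubbed = DataAnalysisTask.getTop8BarrageListAboutAI(); // returns the stub
            } // the static mock is released when the try block closes
        }
    }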
@@ -98,18 +102,25 @@
         </configuration>
       </plugin>
       <plugin>
-        <groupId>org.springframework.boot</groupId>
-        <artifactId>spring-boot-maven-plugin</artifactId>
-        <version>${spring-boot.version}</version>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <version>3.3.0</version>
         <configuration>
-          <mainClass>com.flyingpig.bilibilispider.BilibiliSpiderApplication</mainClass>
-          <skip>true</skip>
+          <archive>
+            <manifest>
+              <mainClass>com.flyingpig.bilibilispider.BilibiliSpiderApplication</mainClass>
+            </manifest>
+          </archive>
+          <descriptorRefs>
+            <descriptorRef>jar-with-dependencies</descriptorRef>
+          </descriptorRefs>
         </configuration>
         <executions>
           <execution>
-            <id>repackage</id>
+            <id>make-assembly</id>
+            <phase>package</phase>
             <goals>
-              <goal>repackage</goal>
+              <goal>single</goal>
             </goals>
           </execution>
         </executions>
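With the spring-boot-maven-plugin repackaging replaced by maven-assembly-plugin and the jar-with-dependencies descriptor, mvn package should now produce a self-contained runnable jar (by default named <artifactId>-<version>-jar-with-dependencies.jar under target/) whose manifest main class is BilibiliSpiderApplication; the exact file name depends on the project's artifactId and version, which are outside this hunk.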

@@ -5,13 +5,16 @@ import com.flyingpig.bilibilispider.task.DataAnalysisTask;
 import com.flyingpig.bilibilispider.task.ExcelWriteTask;
 import com.flyingpig.bilibilispider.util.WordCloudUtil;
 import lombok.extern.slf4j.Slf4j;
 import javax.xml.parsers.ParserConfigurationException;
 import java.util.List;
 import java.util.Map;
 @Slf4j
 public class BilibiliSpiderApplication {
     public static void main(String[] args) throws ParserConfigurationException {
         log.info("爬取启动!!!");
@@ -26,7 +29,7 @@ public class BilibiliSpiderApplication {
         // 生成词云
         WordCloudUtil.generateWordCloud(wordCountMap);
-        log.trace("爬取结束!!!");
+        log.info("爬取结束!!!");
     }
 }

@@ -0,0 +1,113 @@
package com.flyingpig.bilibilispider.additionalWork;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.List;
@Slf4j
public class FzuNewsCrawler {
// 常量URL 和 Cookie
private static final String BASE_URL = "https://info22.fzu.edu.cn/lm_list.jsp?";
private static final String NEWS_URL = BASE_URL + "totalpage=948&PAGENUM=%d&urltype=tree.TreeTempUrl&wbtreeid=1460";
// 爬取福大通知和文件系统
public List<FzuNews> crawlFzuNotificationsAndFileSystems(String newsBeginTime, String newsEndTime) throws Exception {
List<FzuNews> fzuNewsList = new ArrayList<>(); // 保存新闻列表
int pageNumber = 0; // 当前页码
boolean continueCrawling = true; // 是否继续爬取标志
FileWriter writer = new FileWriter("news.txt");
// 循环爬取页面,直到时间范围超出设定
while (continueCrawling) {
pageNumber++;
String pageUrl = String.format(NEWS_URL, pageNumber);
try {
// 获取当前页的文档
Document pageDocument = fetchPage(pageUrl);
// 获取新闻列表
Elements newsElements = pageDocument.getElementsByClass("clearfloat");
for (Element newsElement : newsElements) {
String time = newsElement.getElementsByTag("span").eq(0).text();
// 如果新闻时间早于指定开始时间,停止爬取
if (time.compareTo(newsBeginTime) < 0) {
continueCrawling = false;
break;
}
// 如果新闻时间晚于结束时间,跳过这条新闻
if (time.compareTo(newsEndTime) > 0) {
continue;
}
// 获取新闻的作者、标题和正文链接
String author = newsElement.getElementsByTag("a").eq(0).text();
String title = newsElement.getElementsByTag("a").eq(1).text();
String textHref = BASE_URL + newsElement.getElementsByTag("a").get(1).attr("href");
String text = fetchNewsText(textHref);
// 将新闻信息写入文件中
writer.write("Author: " + author + "\n");
writer.write("Title: " + title + "\n");
writer.write("Link: " + textHref + "\n");
writer.write("Text: " + text + "\n");
log.info("News written to file successfully!");
}
} catch (Exception e) {
// 打印错误信息并继续爬取
log.error("抓取或解析页面时出错: " + pageUrl + "。错误信息: " + e.getMessage());
}
}
return fzuNewsList; // 返回新闻列表
}
// 获取指定页面的HTML内容
private Document fetchPage(String url) throws Exception {
return Jsoup.connect(url).get();
}
// 获取新闻正文内容
private String fetchNewsText(String textHref) throws Exception {
StringBuilder textBuilder = new StringBuilder(); // 使用StringBuilder拼接正文内容
Document document = Jsoup.connect(textHref).get();
// 获取所有<p>标签中的正文
Elements paragraphs = document.getElementsByTag("p");
for (Element paragraph : paragraphs) {
textBuilder.append(paragraph.text()).append("\n");
}
return textBuilder.toString(); // 返回拼接好的正文
}
// FzuNews类用于封装新闻数据
@Data
@NoArgsConstructor
@AllArgsConstructor
public static class FzuNews {
private String time; // 新闻发布时间
private String author; // 新闻作者
private String title; // 新闻标题
private String text; // 新闻正文
}
}
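For reference, the crawler added above is invoked the same way the new test at the bottom of this commit does; a minimal standalone sketch (the dates are example values only):

    import com.flyingpig.bilibilispider.additionalWork.FzuNewsCrawler;

    public class FzuNewsCrawlerDemo {
        public static void main(String[] args) throws Exception {
            FzuNewsCrawler crawler = new FzuNewsCrawler();
            // Crawls notices published between the two dates and writes matching
            // entries to news.txt in the working directory.
            crawler.crawlFzuNotificationsAndFileSystems("2023-01-01", "2023-02-01");
        }
    }

Note that, as committed, the method writes its output to news.txt but never closes or flushes the FileWriter and never adds anything to fzuNewsList, so callers get an empty list back.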

@@ -1,8 +1,10 @@
 package com.flyingpig.bilibilispider.constant;
 public class FileName {
-    public static String WORDCLOUD = System.getProperty("user.dir") + "\\src\\main\\resources\\wordCloud.png";
-    public static String KEYWORD = System.getProperty("user.dir") + "\\src\\main\\resources\\keyword.txt";
-    public static String BARRAGE = System.getProperty("user.dir") + "\\src\\main\\resources\\barrage.txt";
-    public static String WORD_COUNT = System.getProperty("user.dir") + "\\src\\main\\resources\\wordCount.xlsx";
+    public static String WORDCLOUD = "wordCloud.png";
+    public static String BARRAGE = "barrage.txt";
+    public static String WORD_COUNT = "wordCount.xlsx";
+    public static String KEYWORD = "keyword.txt";
 }
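A side effect of dropping the user.dir-based absolute paths: keyword.txt is now read from the classpath (see the DataAnalysisTask change below), while names that are written to, such as barrage.txt and wordCount.xlsx, resolve against the process working directory. A quick illustrative check, not part of the commit:

    public class PathCheck {
        public static void main(String[] args) {
            // A bare relative name resolves against the current working directory (user.dir).
            System.out.println(new java.io.File("barrage.txt").getAbsolutePath());
        }
    }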

@@ -1,5 +1,8 @@
 package com.flyingpig.bilibilispider.constant;
+/*
+*/
 public class HeaderConstant {
     public static final String COOKIE = "buvid3=06742E53-942C-670A-D496-6E2A79F196FF52832infoc; b_nut=1694915352; i-wanna-go-back=-1; b_ut=7; _uuid=F744F108A-8F810-A165-2EA4-7B4956ACF910C53152infoc; buvid4=8D3A427A-5715-145B-3AA8-9ACC6E30889D54861-023091709-5h4N7ejh5A4ENQhvWFdRwQ%3D%3D; hit-new-style-dyn=1; hit-dyn-v2=1; header_theme_version=CLOSE; rpdid=0zbfAGEiSg|12slY8UuG|pS|3w1QHGXk; LIVE_BUVID=AUTO6216955654982035; buvid_fp_plain=undefined; enable_web_push=DISABLE; dy_spec_agreed=1; is-2022-channel=1; CURRENT_BLACKGAP=0; DedeUserID=398014090; DedeUserID__ckMd5=da87c9926c73fac5; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; SESSDATA=618de7e5%2C1726673190%2C8e231%2A32CjAB8DOA4RwqxG7hQy82813je1HM5n0r08KRLyQfyA9zqBalG7QRrNNsuqvI7RejMHwSVnloTURqNE5qWmZmV0M3b0hqS0dTU21ES3NEbU9BX1JwNTNHU0VYMXc2YXNYRU9aOTJ2Q1ZXZkV4aWUyMzQ1VFo0eWpOMGVxUVQydVFfVmVGWjNoY1d3IIEC; bili_jct=51d42647c6b3da22794e018c68e239ba; CURRENT_FNVAL=4048; CURRENT_QUALITY=80; fingerprint=86d24a6d98af903f094c42f9498dfc3d; PVID=3; buvid_fp=86d24a6d98af903f094c42f9498dfc3d; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjQ4ODkxMDUsImlhdCI6MTcyNDYyOTg0NSwicGx0IjotMX0.SuRMicGtcUVVh2xh7DrFQCJReFKXavWzf07sThaixyU; bili_ticket_expires=1724889045; home_feed_column=5; browser_resolution=1440-641; sid=7oephj0f; bp_t_offset_398014090=970621780936884224; b_lsid=410A8B269_1919809748E; bsource=search_google; xxoo-tmp=zhHans";

@@ -1,5 +1,8 @@
 package com.flyingpig.bilibilispider.constant;
+/*
+    URL
+*/
 public class UrlConstant {
     public static final String BILIBILI_SEARCH_URL = "https://api.bilibili.com/x/web-interface/search/type";

@@ -4,67 +4,77 @@ import com.flyingpig.bilibilispider.constant.FileName;
 import com.google.gson.JsonArray;
 import com.google.gson.JsonParser;
 import lombok.extern.slf4j.Slf4j;
-import org.springframework.stereotype.Component;
 import org.springframework.web.util.UriComponentsBuilder;
 import java.io.*;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
+import java.util.concurrent.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.zip.Inflater;
 import static com.flyingpig.bilibilispider.constant.UrlConstant.*;
 import static com.flyingpig.bilibilispider.util.RequestUtil.*;
-@Component
 @Slf4j
 public class BilibiliSpiderTask {
     public static List<Long> SearchVideoCidListByKeyWord(String keyword) {
         log.info("搜索获得cid任务开始");
-        //每次爬取上限为50条所以要分6次爬取
-        List<Long> cidList = new ArrayList<>();
-        for (int j = 1; j <= 6; j++) {
-            // 搜索URL
-            String searchUrl = UriComponentsBuilder.fromHttpUrl(BILIBILI_SEARCH_URL)
-                    .queryParam("keyword", keyword)
-                    .queryParam("search_type", "video")
-                    .queryParam("page", j)
-                    .queryParam("page_size", 50).toUriString();
-            log.info("爬取第 {} 页", j);
-            // 获取搜索结果中的seid再根据seid获取cid封装成视频的cid集合
-            JsonArray searchResultArray = JsonParser.parseString(requesttToGetBodyString(searchUrl))
-                    .getAsJsonObject().getAsJsonObject("data")
-                    .getAsJsonArray("result");
-            for (int i = 0; i < searchResultArray.size(); i++) {
-                // 获取搜索结果的bvid
-                String bvid = searchResultArray.get(i).getAsJsonObject().get("bvid").getAsString();
-                log.info("视频bvid: {}", bvid);
-                // 根据bvid获取cid
-                String getCidUrl = UriComponentsBuilder.fromHttpUrl(BILIBILI_GETCID_URL)
-                        .queryParam("bvid", bvid)
-                        .queryParam("jsonp", "jsonp").toUriString();
-                Long cid = JsonParser.parseString(requesttToGetBodyString(getCidUrl)).
-                        getAsJsonObject().getAsJsonArray("data")
-                        .get(0).getAsJsonObject().get("cid").getAsLong();
-                cidList.add(cid);
-                log.info("视频cid: {}", cid);
-            }
-        }
+        List<Long> cidList = Collections.synchronizedList(new ArrayList<>());
+        ExecutorService executor = Executors.newFixedThreadPool(6);
+        try {
+            List<Future<List<Long>>> futures = new ArrayList<>();
+            for (int j = 1; j <= 6; j++) {
+                final int page = j;
+                Future<List<Long>> future = executor.submit(() -> {
+                    List<Long> pageCidList = new ArrayList<>();
+                    String searchUrl = UriComponentsBuilder.fromHttpUrl(BILIBILI_SEARCH_URL)
+                            .queryParam("keyword", keyword)
+                            .queryParam("search_type", "video")
+                            .queryParam("page", page)
+                            .queryParam("page_size", 50).toUriString();
+                    log.info("爬取第 {} 页", page);
+                    JsonArray searchResultArray = JsonParser.parseString(requesttToGetBodyString(searchUrl))
+                            .getAsJsonObject().getAsJsonObject("data")
+                            .getAsJsonArray("result");
+                    for (int i = 0; i < searchResultArray.size(); i++) {
+                        String bvid = searchResultArray.get(i).getAsJsonObject().get("bvid").getAsString();
+                        log.info("视频bvid: {}", bvid);
+                        String getCidUrl = UriComponentsBuilder.fromHttpUrl(BILIBILI_GETCID_URL)
+                                .queryParam("bvid", bvid)
+                                .queryParam("jsonp", "jsonp").toUriString();
+                        Long cid = JsonParser.parseString(requesttToGetBodyString(getCidUrl))
+                                .getAsJsonObject().getAsJsonArray("data")
+                                .get(0).getAsJsonObject().get("cid").getAsLong();
+                        pageCidList.add(cid);
+                        log.info("视频cid: {}", cid);
+                    }
+                    return pageCidList;
+                });
+                futures.add(future);
+            }
+            for (Future<List<Long>> future : futures) {
+                try {
+                    cidList.addAll(future.get());
+                } catch (InterruptedException | ExecutionException e) {
+                    log.error("获取cid时发生错误", e);
+                }
+            }
+        } finally {
+            executor.shutdown();
+        }
         log.info("搜索任务结束");
         return cidList;
     }
@@ -78,26 +88,46 @@ public class BilibiliSpiderTask {
         if (file.exists()) {
             file.delete();
         }
+        // 创建一个线程池
+        ExecutorService executorService = Executors.newFixedThreadPool(5);
         for (Long cid : cidList) {
-            try {
-                byte[] bytes = requestToGetBodyBytes(DM_URL + cid + ".xml");//获取字节码数据
-                bytes = decompress(bytes);//解压数据
-                List<String> barriageList = extractDTagContents(new String(bytes));
-                // 将弹幕写入文件,如果文件不存在则创建,如果存在则追加
-                try (FileWriter fileWriter = new FileWriter(fileName, true)) {
-                    for (String barrage : barriageList) {
-                        fileWriter.write(barrage + "\n");
-                    }
-                }
-                log.info("已经爬取cid为 {} 的弹幕", cid);
-            } catch (Exception e) {
-                log.error("获取弹幕数据失败", e);
-            }
+            executorService.submit(() -> {
+                try {
+                    byte[] bytes = requestToGetBodyBytes(DM_URL + cid + ".xml"); // 获取字节码数据
+                    bytes = decompress(bytes); // 解压数据
+                    List<String> barriageList = extractDTagContents(new String(bytes));
+                    // 将弹幕写入文件, 如果文件不存在则创建,如果存在则追加
+                    synchronized (BilibiliSpiderTask.class) {
+                        try (FileWriter fileWriter = new FileWriter(fileName, true)) {
+                            for (String barrage : barriageList) {
+                                fileWriter.write(barrage + "\n");
+                            }
+                        }
+                    }
+                    log.info("已经爬取cid为 {} 的弹幕", cid);
+                } catch (Exception e) {
+                    log.error("获取弹幕数据失败", e);
+                }
+            });
         }
+        // 关闭线程池
+        executorService.shutdown();
+        try {
+            // 等待所有任务完成
+            executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
+        } catch (InterruptedException e) {
+            log.error("线程池等待终止失败", e);
+            Thread.currentThread().interrupt();
+        }
         log.info("爬取弹幕任务结束");
     }
     // 解压数据
     private static byte[] decompress(byte[] data) throws IOException {
         byte[] decompressData = null;
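Both rewritten methods in this file use the same executor shape: submit independent page or cid tasks, then either collect the Futures or shutdown() and awaitTermination() before touching the combined result. A self-contained sketch of that shape, with a stand-in computation instead of the HTTP calls:

    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.*;

    public class PoolPatternSketch {
        public static void main(String[] args) throws InterruptedException, ExecutionException {
            ExecutorService pool = Executors.newFixedThreadPool(4);
            List<Future<Integer>> futures = new ArrayList<>();
            try {
                for (int page = 1; page <= 6; page++) {
                    final int p = page;
                    futures.add(pool.submit(() -> p * p)); // stand-in for "crawl page p"
                }
                int total = 0;
                for (Future<Integer> f : futures) {
                    total += f.get(); // blocks until that task finishes
                }
                System.out.println("total = " + total);
            } finally {
                pool.shutdown();
                pool.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
            }
        }
    }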

@@ -2,18 +2,19 @@ package com.flyingpig.bilibilispider.task;
 import com.flyingpig.bilibilispider.constant.FileName;
 import lombok.extern.slf4j.Slf4j;
-import org.apache.commons.collections4.Trie;
-import org.apache.commons.collections4.trie.PatriciaTrie;
+import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.time.Duration;
 import java.time.LocalDateTime;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
+import java.util.concurrent.*;
+import java.util.stream.Collectors;
 @Slf4j
 public class DataAnalysisTask {
@@ -22,30 +23,34 @@ public class DataAnalysisTask {
     private static List<String> aiKeywords = new ArrayList<>();
     static {
-        // 定义文件路径
-        Path filePath = Paths.get(FileName.KEYWORD);
-        try {
-            // 读取文件的所有行并存储到aiKeyWordsList中
-            aiKeywords = Files.readAllLines(filePath);
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
+        // 使用类加载器获取资源文件的输入流
+        InputStream inputStream = DataAnalysisTask.class.getClassLoader().getResourceAsStream(FileName.KEYWORD);
+        if (inputStream != null) {
+            // 读取文件内容
+            aiKeywords = new BufferedReader(new InputStreamReader(inputStream))
+                    .lines()
+                    .collect(Collectors.toList());
+        } else {
+            System.err.println("Resource not found: keyword.txt");
+        }
     }
-    // 原来的写法
     public static HashMap<String, Integer> getTop8BarrageListAboutAI() {
+        System.out.println(aiKeywords.size());
         log.info("开始统计弹幕中关于AI技术应用的关键词出现次数");
         // 从文件中读取弹幕集合
         Path filePath = Paths.get(FileName.BARRAGE);
-        List<String> barrageList = new ArrayList<>();
+        List<String> barrageList;
         try {
             barrageList = Files.readAllLines(filePath);
         } catch (IOException e) {
             e.printStackTrace();
+            return new HashMap<>();
         }
         // 记录初始时间
@@ -53,39 +58,71 @@ public class DataAnalysisTask {
         // 初始化Map集合
         HashMap<String, Integer> wordMap = new HashMap<>();
-        // 将关键词加入Trie树并初始化出现次数为0
         for (String keyword : aiKeywords) {
             wordMap.put(keyword, 0);
         }
-        // 遍历弹幕列表并统计关键词出现次数
-        for (String barrage : barrageList) {
-            for (String keyword : aiKeywords) {
-                if (barrage.contains(keyword)) {
-                    wordMap.put(keyword, wordMap.get(keyword) + 1);
-                }
-            }
-        }
+        // 创建线程池
+        ExecutorService executorService = Executors.newFixedThreadPool(5);
+        // 分割弹幕列表
+        int chunkSize = (int) Math.ceil((double) barrageList.size() / 5);
+        List<Future<HashMap<String, Integer>>> futures = new ArrayList<>();
+        for (int i = 0; i < barrageList.size(); i += chunkSize) {
+            final int start = i;
+            final int end = Math.min(i + chunkSize, barrageList.size());
+            Future<HashMap<String, Integer>> future = executorService.submit(() -> {
+                HashMap<String, Integer> localMap = new HashMap<>(wordMap);
+                for (String barrage : barrageList.subList(start, end)) {
+                    for (String keyword : aiKeywords) {
+                        if (barrage.contains(keyword)) {
+                            localMap.put(keyword, localMap.get(keyword) + 1);
+                        }
+                    }
+                }
+                return localMap;
+            });
+            futures.add(future);
+        }
+        // 合并结果
+        try {
+            for (Future<HashMap<String, Integer>> future : futures) {
+                HashMap<String, Integer> localMap = future.get();
+                for (Map.Entry<String, Integer> entry : localMap.entrySet()) {
+                    wordMap.put(entry.getKey(), wordMap.getOrDefault(entry.getKey(), 0) + entry.getValue());
+                }
+            }
+        } catch (InterruptedException | ExecutionException e) {
+            log.error("任务执行失败", e);
+        } finally {
+            executorService.shutdown();
+            try {
+                executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
+            } catch (InterruptedException e) {
+                log.error("线程池等待终止失败", e);
+                Thread.currentThread().interrupt();
+            }
+        }
-        // 将Trie内容转换为Map并排序
-        HashMap<String, Integer> sortedMap = new HashMap<>();
-        wordMap.entrySet().stream()
+        // 将Map内容排序并获取前8个
+        HashMap<String, Integer> sortedMap = wordMap.entrySet().stream()
                 .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
                 .limit(8)
-                .forEachOrdered(entry -> sortedMap.put(entry.getKey(), entry.getValue()));
+                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e1, LinkedHashMap::new));
         // 输出前8个关键词及其出现次数
         for (Map.Entry<String, Integer> entry : sortedMap.entrySet()) {
             log.info(entry.getKey() + " : " + entry.getValue());
         }
-        log.info("统计弹幕中关于AI技术应用的关键词出现次数任务结束, 耗时: {}ms", LocalDateTime.now().getNano() - startTime.getNano());
-        // 返回统计结果
-        return wordMap;
+        log.info("统计弹幕中关于AI技术应用的关键词出现次数任务结束, 耗时: {}ms",
+                Duration.between(startTime, LocalDateTime.now()).toMillis());
+        return new HashMap<>(sortedMap);
     }
 }
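The new top-8 selection depends on Collectors.toMap(..., LinkedHashMap::new) to keep the sorted order; collecting into a plain HashMap would lose it, and note that the method as committed then copies sortedMap into a new HashMap on return, which discards that ordering again for the caller. A minimal illustration of the idiom:

    import java.util.LinkedHashMap;
    import java.util.Map;
    import java.util.stream.Collectors;

    public class TopNSketch {
        public static void main(String[] args) {
            Map<String, Integer> counts = Map.of("AI", 5, "LLM", 9, "GPT", 7, "CV", 1);
            Map<String, Integer> top3 = counts.entrySet().stream()
                    .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
                    .limit(3)
                    // LinkedHashMap preserves the insertion order produced by sorted()+limit()
                    .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue,
                            (a, b) -> a, LinkedHashMap::new));
            System.out.println(top3); // {LLM=9, GPT=7, AI=5}
        }
    }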

@@ -1,393 +0,0 @@
package com.flyingpig.bilibilispider.task;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class DoubleArrayTrie {
private final static int BUF_SIZE = 16384;
private final static int UNIT_SIZE = 8; // size of int + int
private static class Node {
int code;
int depth;
int left;
int right;
};
private int check[];
private int base[];
private boolean used[];
private int size;
private int allocSize;
private List<String> key;
private int keySize;
private int length[];
private int value[];
private int progress;
private int nextCheckPos;
// boolean no_delete_;
int error_;
// int (*progressfunc_) (size_t, size_t);
// inline _resize expanded
private int resize(int newSize) {
int[] base2 = new int[newSize];
int[] check2 = new int[newSize];
boolean used2[] = new boolean[newSize];
if (allocSize > 0) {
System.arraycopy(base, 0, base2, 0, allocSize);
System.arraycopy(check, 0, check2, 0, allocSize);
System.arraycopy(used2, 0, used2, 0, allocSize);
}
base = base2;
check = check2;
used = used2;
return allocSize = newSize;
}
private int fetch(Node parent, List<Node> siblings) {
if (error_ < 0)
return 0;
int prev = 0;
for (int i = parent.left; i < parent.right; i++) {
if ((length != null ? length[i] : key.get(i).length()) < parent.depth)
continue;
String tmp = key.get(i);
int cur = 0;
if ((length != null ? length[i] : tmp.length()) != parent.depth)
cur = (int) tmp.charAt(parent.depth) + 1;
if (prev > cur) {
error_ = -3;
return 0;
}
if (cur != prev || siblings.size() == 0) {
Node tmp_node = new Node();
tmp_node.depth = parent.depth + 1;
tmp_node.code = cur;
tmp_node.left = i;
if (siblings.size() != 0)
siblings.get(siblings.size() - 1).right = i;
siblings.add(tmp_node);
}
prev = cur;
}
if (siblings.size() != 0)
siblings.get(siblings.size() - 1).right = parent.right;
return siblings.size();
}
private int insert(List<Node> siblings) {
if (error_ < 0)
return 0;
int begin = 0;
int pos = ((siblings.get(0).code + 1 > nextCheckPos) ? siblings.get(0).code + 1
: nextCheckPos) - 1;
int nonzero_num = 0;
int first = 0;
if (allocSize <= pos)
resize(pos + 1);
outer: while (true) {
pos++;
if (allocSize <= pos)
resize(pos + 1);
if (check[pos] != 0) {
nonzero_num++;
continue;
} else if (first == 0) {
nextCheckPos = pos;
first = 1;
}
begin = pos - siblings.get(0).code;
if (allocSize <= (begin + siblings.get(siblings.size() - 1).code)) {
// progress can be zero
double l = (1.05 > 1.0 * keySize / (progress + 1)) ? 1.05 : 1.0
* keySize / (progress + 1);
resize((int) (allocSize * l));
}
if (used[begin])
continue;
for (int i = 1; i < siblings.size(); i++)
if (check[begin + siblings.get(i).code] != 0)
continue outer;
break;
}
// -- Simple heuristics --
// if the percentage of non-empty contents in check between the
// index
// 'next_check_pos' and 'check' is greater than some constant value
// (e.g. 0.9),
// new 'next_check_pos' index is written by 'check'.
if (1.0 * nonzero_num / (pos - nextCheckPos + 1) >= 0.95)
nextCheckPos = pos;
used[begin] = true;
size = (size > begin + siblings.get(siblings.size() - 1).code + 1) ? size
: begin + siblings.get(siblings.size() - 1).code + 1;
for (int i = 0; i < siblings.size(); i++)
check[begin + siblings.get(i).code] = begin;
for (int i = 0; i < siblings.size(); i++) {
List<Node> new_siblings = new ArrayList<Node>();
if (fetch(siblings.get(i), new_siblings) == 0) {
base[begin + siblings.get(i).code] = (value != null) ? (-value[siblings
.get(i).left] - 1) : (-siblings.get(i).left - 1);
if (value != null && (-value[siblings.get(i).left] - 1) >= 0) {
error_ = -2;
return 0;
}
progress++;
// if (progress_func_) (*progress_func_) (progress,
// keySize);
} else {
int h = insert(new_siblings);
base[begin + siblings.get(i).code] = h;
}
}
return begin;
}
public DoubleArrayTrie() {
check = null;
base = null;
used = null;
size = 0;
allocSize = 0;
// no_delete_ = false;
error_ = 0;
}
// no deconstructor
// set_result omitted
// the search methods returns (the list of) the value(s) instead
// of (the list of) the pair(s) of value(s) and length(s)
// set_array omitted
// array omitted
void clear() {
// if (! no_delete_)
check = null;
base = null;
used = null;
allocSize = 0;
size = 0;
// no_delete_ = false;
}
public int getUnitSize() {
return UNIT_SIZE;
}
public int getSize() {
return size;
}
public int getTotalSize() {
return size * UNIT_SIZE;
}
public int getNonzeroSize() {
int result = 0;
for (int i = 0; i < size; i++)
if (check[i] != 0)
result++;
return result;
}
public int build(List<String> key) {
return build(key, null, null, key.size());
}
public int build(List<String> _key, int _length[], int _value[],
int _keySize) {
if (_keySize > _key.size() || _key == null)
return 0;
// progress_func_ = progress_func;
key = _key;
length = _length;
keySize = _keySize;
value = _value;
progress = 0;
resize(65536 * 32);
base[0] = 1;
nextCheckPos = 0;
Node root_node = new Node();
root_node.left = 0;
root_node.right = keySize;
root_node.depth = 0;
List<Node> siblings = new ArrayList<Node>();
fetch(root_node, siblings);
insert(siblings);
// size += (1 << 8 * 2) + 1; // ???
// if (size >= allocSize) resize (size);
used = null;
key = null;
return error_;
}
public void open(String fileName) throws IOException {
File file = new File(fileName);
size = (int) file.length() / UNIT_SIZE;
check = new int[size];
base = new int[size];
DataInputStream is = null;
try {
is = new DataInputStream(new BufferedInputStream(
new FileInputStream(file), BUF_SIZE));
for (int i = 0; i < size; i++) {
base[i] = is.readInt();
check[i] = is.readInt();
}
} finally {
if (is != null)
is.close();
}
}
public void save(String fileName) throws IOException {
DataOutputStream out = null;
try {
out = new DataOutputStream(new BufferedOutputStream(
new FileOutputStream(fileName)));
for (int i = 0; i < size; i++) {
out.writeInt(base[i]);
out.writeInt(check[i]);
}
out.close();
} finally {
if (out != null)
out.close();
}
}
public int exactMatchSearch(String key) {
return exactMatchSearch(key, 0, 0, 0);
}
public int exactMatchSearch(String key, int pos, int len, int nodePos) {
if (len <= 0)
len = key.length();
if (nodePos <= 0)
nodePos = 0;
int result = -1;
char[] keyChars = key.toCharArray();
int b = base[nodePos];
int p;
for (int i = pos; i < len; i++) {
p = b + (int) (keyChars[i]) + 1;
if (b == check[p])
b = base[p];
else
return result;
}
p = b;
int n = base[p];
if (b == check[p] && n < 0) {
result = -n - 1;
}
return result;
}
public List<Integer> commonPrefixSearch(String key) {
return commonPrefixSearch(key, 0, 0, 0);
}
public List<Integer> commonPrefixSearch(String key, int pos, int len,
int nodePos) {
if (len <= 0)
len = key.length();
if (nodePos <= 0)
nodePos = 0;
List<Integer> result = new ArrayList<Integer>();
char[] keyChars = key.toCharArray();
int b = base[nodePos];
int n;
int p;
for (int i = pos; i < len; i++) {
p = b;
n = base[p];
if (b == check[p] && n < 0) {
result.add(-n - 1);
}
p = b + (int) (keyChars[i]) + 1;
if (b == check[p])
b = base[p];
else
return result;
}
p = b;
n = base[p];
if (b == check[p] && n < 0) {
result.add(-n - 1);
}
return result;
}
// debug
public void dump() {
for (int i = 0; i < size; i++) {
System.err.println("i: " + i + " [" + base[i] + ", " + check[i]
+ "]");
}
}
}

@@ -12,6 +12,7 @@ import java.util.Map;
 @Slf4j
 public class ExcelWriteTask {
     public static void writeBarrageListToExcel(Map<String, Integer> barrageCountMap) {
         log.info("开始将统计结果写入Excel文件");

File diff suppressed because it is too large

Binary file not shown.

Before: 139 KiB

Binary file not shown.

@@ -1,24 +1,118 @@
 package com.flyingpig.bilibilispider;
+import com.flyingpig.bilibilispider.additionalWork.FzuNewsCrawler;
+import com.flyingpig.bilibilispider.task.BilibiliSpiderTask;
+import com.flyingpig.bilibilispider.task.DataAnalysisTask;
+import com.flyingpig.bilibilispider.task.ExcelWriteTask;
+import com.flyingpig.bilibilispider.util.WordCloudUtil;
 import org.junit.jupiter.api.Test;
-import org.springframework.beans.factory.annotation.Autowired;
-import org.springframework.boot.test.context.SpringBootTest;
-import org.springframework.web.client.RestTemplate;
+import org.mockito.Mock;
+import org.mockito.MockedStatic;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.mockito.Mockito.*;
-@SpringBootTest
 class BilibiliSpiderApplicationTests {
-    @Autowired
-    RestTemplate restTemplate;
+    @Mock
+    private ExcelWriteTask excelWriteTask;
@Test
public void testSearchVideoCidListByKeyWord() {
List<Long> cidList = List.of(123L, 456L);
// Mock static methods
try (MockedStatic<BilibiliSpiderTask> mockedStatic = mockStatic(BilibiliSpiderTask.class)) {
// Set up the mock behavior
mockedStatic.when(() -> BilibiliSpiderTask.SearchVideoCidListByKeyWord("2024巴黎奥运会")).thenReturn(cidList);
// Call the method under test
List<Long> resultCidList = BilibiliSpiderTask.SearchVideoCidListByKeyWord("2024巴黎奥运会");
// Verify interactions
mockedStatic.verify(() -> BilibiliSpiderTask.SearchVideoCidListByKeyWord("2024巴黎奥运会"));
// Check the result
assertEquals(cidList, resultCidList);
}
}
@Test
public void testSearchBarrageListByCidList() throws Exception {
List<Long> cidList = List.of(123L, 456L);
try (MockedStatic<BilibiliSpiderTask> mockedStatic = mockStatic(BilibiliSpiderTask.class)) {
mockedStatic.when(() -> BilibiliSpiderTask.SearchBarrageListByCidList(cidList)).thenAnswer(invocation -> {
return null;
});
assertDoesNotThrow(() -> BilibiliSpiderTask.SearchBarrageListByCidList(cidList));
// Verify interactions with the static method
mockedStatic.verify(() -> BilibiliSpiderTask.SearchBarrageListByCidList(cidList));
}
}
@Test
public void testGetTop8BarrageListAboutAI() throws Exception {
Map<String, Integer> wordCountMap = new HashMap<>();
wordCountMap.put("AI", 50);
wordCountMap.put("Machine Learning", 30);
try (MockedStatic<DataAnalysisTask> mockedStatic = mockStatic(DataAnalysisTask.class)) {
mockedStatic.when(DataAnalysisTask::getTop8BarrageListAboutAI).thenReturn(wordCountMap);
Map<String, Integer> resultMap = DataAnalysisTask.getTop8BarrageListAboutAI();
mockedStatic.verify(DataAnalysisTask::getTop8BarrageListAboutAI);
assertEquals(wordCountMap, resultMap);
}
}
@Test
public void testWriteBarrageListToExcel() throws Exception {
Map<String, Integer> wordCountMap = Map.of("AI", 50, "Machine Learning", 30);
doNothing().when(excelWriteTask).writeBarrageListToExcel(wordCountMap);
ExcelWriteTask.writeBarrageListToExcel(wordCountMap);
verify(excelWriteTask).writeBarrageListToExcel(wordCountMap);
}
@Test
public void testGenerateWordCloud() {
Map<String, Integer> wordCountMap = Map.of("AI", 50, "Machine Learning", 30);
try (MockedStatic<WordCloudUtil> mockedStatic = mockStatic(WordCloudUtil.class)) {
mockedStatic.when(() -> WordCloudUtil.generateWordCloud(wordCountMap)).thenAnswer(invocation -> {
return null;
});
WordCloudUtil.generateWordCloud(wordCountMap);
mockedStatic.verify(() -> WordCloudUtil.generateWordCloud(wordCountMap));
}
}
+    // 注意要开校园网进行爬取!
     @Test
-    void testSearchInterface() {
-        restTemplate.exchange("https://api.bilibili.com/x/web-interface/search/type?keyword=2024巴黎奥运会&search_type=video",
-                null, null, String.class);
-        String searchResult = restTemplate.getForObject(
-                "https://api.bilibili.com/x/web-interface/search/type?keyword=2024巴黎奥运会&search_type=video", String.class);
-        System.out.println(searchResult);
+    void testFzuNewsCraw() throws Exception{
+        FzuNewsCrawler fzuNewsCrawler = new FzuNewsCrawler();
+        fzuNewsCrawler.crawlFzuNotificationsAndFileSystems("2023-01-01","2023-02-01");
     }
 }
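One caveat on testWriteBarrageListToExcel above: writeBarrageListToExcel is a static method, and the test class as shown has no @ExtendWith(MockitoExtension.class) or openMocks(...) call, so the @Mock field is never initialized and the doNothing().when(...) stubbing will not behave as intended. A sketch of the same intent using the static-mocking style the other tests already use (illustrative, not part of the commit):

    @Test
    void testWriteBarrageListToExcelWithStaticMock() {
        Map<String, Integer> wordCountMap = Map.of("AI", 50, "Machine Learning", 30);
        try (MockedStatic<ExcelWriteTask> mocked = mockStatic(ExcelWriteTask.class)) {
            ExcelWriteTask.writeBarrageListToExcel(wordCountMap); // no-op while mocked
            mocked.verify(() -> ExcelWriteTask.writeBarrageListToExcel(wordCountMap));
        }
    }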
