Initial commit

develop
flying_pig 3 months ago
parent d4c73b83b7
commit 536c8ef470

33
.gitignore vendored

@ -0,0 +1,33 @@
HELP.md
target/
!.mvn/wrapper/maven-wrapper.jar
!**/src/main/**/target/
!**/src/test/**/target/
### STS ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans
.sts4-cache
### IntelliJ IDEA ###
.idea
*.iws
*.iml
*.ipr
### NetBeans ###
/nbproject/private/
/nbbuild/
/dist/
/nbdist/
/.nb-gradle/
build/
!**/src/main/**/build/
!**/src/test/**/build/
### VS Code ###
.vscode/

@ -0,0 +1,120 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.flyingpig</groupId>
<artifactId>bilibili-spider</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>bilibili-spider</name>
<description>bilibili-spider</description>
<properties>
<java.version>1.8</java.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<spring-boot.version>2.6.13</spring-boot.version>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.10.1</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</dependency>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>4.10.0</version>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>69.1</version>
</dependency>
<!--词云生成相关依赖-->
<dependency>
<groupId>com.kennycason</groupId>
<artifactId>kumo-core</artifactId>
<version>1.27</version>
</dependency>
<dependency>
<groupId>com.kennycason</groupId>
<artifactId>kumo-tokenizers</artifactId>
<version>1.27</version>
</dependency>
<!--Trie树等数据结构相关依赖-->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-collections4</artifactId>
<version>4.4</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>easyexcel</artifactId>
<version>3.2.0</version> <!-- 请根据需要选择具体版本 -->
</dependency>
</dependencies>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-dependencies</artifactId>
<version>${spring-boot.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
</dependencyManagement>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<configuration>
<source>11</source>
<target>11</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<version>${spring-boot.version}</version>
<configuration>
<mainClass>com.flyingpig.bilibilispider.BilibiliSpiderApplication</mainClass>
<skip>true</skip>
</configuration>
<executions>
<execution>
<id>repackage</id>
<goals>
<goal>repackage</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

@ -0,0 +1,32 @@
package com.flyingpig.bilibilispider;
import com.flyingpig.bilibilispider.task.BilibiliSpiderTask;
import com.flyingpig.bilibilispider.task.DataAnalysisTask;
import com.flyingpig.bilibilispider.task.ExcelWriteTask;
import com.flyingpig.bilibilispider.util.WordCloudUtil;
import lombok.extern.slf4j.Slf4j;
import javax.xml.parsers.ParserConfigurationException;
import java.util.List;
import java.util.Map;
/**
 * Command-line entry point that runs the whole crawl-and-analyse pipeline once:
 * search videos, download their bullet comments ("barrage"), count keyword
 * occurrences, export the counts to Excel and render a word cloud.
 */
@Slf4j
public class BilibiliSpiderApplication {

    /**
     * Runs every pipeline step in order.
     *
     * @param args command-line arguments; not read by this method
     * @throws ParserConfigurationException kept from the original signature; none of the
     *                                      calls visible in this method declares it
     */
    public static void main(String[] args) throws ParserConfigurationException {
        log.info("爬取启动!!!");

        // Search videos for the "2024 Paris Olympics" keyword and collect their cids.
        List<Long> cidList = BilibiliSpiderTask.SearchVideoCidListByKeyWord("2024巴黎奥运会");

        // Download the barrage of every cid and append it to the barrage file.
        BilibiliSpiderTask.SearchBarrageListByCidList(cidList);

        // Count the AI-related keywords found in the barrage file (the top 8 are logged by the task).
        Map<String, Integer> wordCountMap = DataAnalysisTask.getTop8BarrageListAboutAI();

        // Write the counts to the Excel workbook.
        ExcelWriteTask.writeBarrageListToExcel(wordCountMap);

        // Render the word cloud image from the same counts.
        WordCloudUtil.generateWordCloud(wordCountMap);

        // Logged at INFO like the start message so that both ends of the run are visible
        // under the same logger configuration (the original used TRACE here).
        log.info("爬取结束!!!");
    }
}

@ -0,0 +1,8 @@
package com.flyingpig.bilibilispider.constant;
public class FileName {
public static String WORDCLOUD = System.getProperty("user.dir") + "\\src\\main\\resources\\wordCloud.png";
public static String KEYWORD = System.getProperty("user.dir") + "\\src\\main\\resources\\keyword.txt";
public static String BARRAGE = System.getProperty("user.dir") + "\\src\\main\\resources\\barrage.txt";
public static String WORD_COUNT = System.getProperty("user.dir") + "\\src\\main\\resources\\wordCount.xlsx";
}

@ -0,0 +1,6 @@
package com.flyingpig.bilibilispider.constant;
/**
 * HTTP header values sent with every request.
 */
public class HeaderConstant {

    /**
     * Cookie used when no override is supplied.
     *
     * <p>NOTE(review): this literal embeds a personal session cookie (SESSDATA, bili_jct, ...).
     * Credentials should not live in source control; it is kept only so existing behaviour
     * is unchanged. Prefer supplying the cookie through the {@code BILIBILI_COOKIE}
     * environment variable and removing this value.
     */
    private static final String DEFAULT_COOKIE = "buvid3=06742E53-942C-670A-D496-6E2A79F196FF52832infoc; b_nut=1694915352; i-wanna-go-back=-1; b_ut=7; _uuid=F744F108A-8F810-A165-2EA4-7B4956ACF910C53152infoc; buvid4=8D3A427A-5715-145B-3AA8-9ACC6E30889D54861-023091709-5h4N7ejh5A4ENQhvWFdRwQ%3D%3D; hit-new-style-dyn=1; hit-dyn-v2=1; header_theme_version=CLOSE; rpdid=0zbfAGEiSg|12slY8UuG|pS|3w1QHGXk; LIVE_BUVID=AUTO6216955654982035; buvid_fp_plain=undefined; enable_web_push=DISABLE; dy_spec_agreed=1; is-2022-channel=1; CURRENT_BLACKGAP=0; DedeUserID=398014090; DedeUserID__ckMd5=da87c9926c73fac5; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; SESSDATA=618de7e5%2C1726673190%2C8e231%2A32CjAB8DOA4RwqxG7hQy82813je1HM5n0r08KRLyQfyA9zqBalG7QRrNNsuqvI7RejMHwSVnloTURqNE5qWmZmV0M3b0hqS0dTU21ES3NEbU9BX1JwNTNHU0VYMXc2YXNYRU9aOTJ2Q1ZXZkV4aWUyMzQ1VFo0eWpOMGVxUVQydVFfVmVGWjNoY1d3IIEC; bili_jct=51d42647c6b3da22794e018c68e239ba; CURRENT_FNVAL=4048; CURRENT_QUALITY=80; fingerprint=86d24a6d98af903f094c42f9498dfc3d; PVID=3; buvid_fp=86d24a6d98af903f094c42f9498dfc3d; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjQ4ODkxMDUsImlhdCI6MTcyNDYyOTg0NSwicGx0IjotMX0.SuRMicGtcUVVh2xh7DrFQCJReFKXavWzf07sThaixyU; bili_ticket_expires=1724889045; home_feed_column=5; browser_resolution=1440-641; sid=7oephj0f; bp_t_offset_398014090=970621780936884224; b_lsid=410A8B269_1919809748E; bsource=search_google; xxoo-tmp=zhHans";

    /**
     * Value of the {@code Cookie} request header: the {@code BILIBILI_COOKIE} environment
     * variable when it is set and non-blank, otherwise the built-in default.
     */
    public static final String COOKIE = resolveCookie(System.getenv("BILIBILI_COOKIE"));

    /**
     * Chooses between an externally supplied cookie and the built-in default.
     *
     * @param override externally supplied cookie; may be {@code null} or blank
     * @return {@code override} when it has non-whitespace content, otherwise the default cookie
     */
    static String resolveCookie(String override) {
        if (override == null || override.trim().isEmpty()) {
            return DEFAULT_COOKIE;
        }
        return override;
    }
}

@ -0,0 +1,9 @@
package com.flyingpig.bilibilispider.constant;
/**
 * Endpoint URLs used by the crawler.
 */
public class UrlConstant {

    /** Base URL of the barrage XML endpoint; callers append {@code <cid>.xml}. */
    public static final String DM_URL = "https://comment.bilibili.com/";

    /** Page-list endpoint, queried with a {@code bvid} to obtain a video's cid. */
    public static final String BILIBILI_GETCID_URL = "https://api.bilibili.com/x/player/pagelist";

    /** Typed search endpoint, queried with keyword, search type and paging parameters. */
    public static final String BILIBILI_SEARCH_URL = "https://api.bilibili.com/x/web-interface/search/type";
}

@ -0,0 +1,143 @@
package com.flyingpig.bilibilispider.task;
import com.flyingpig.bilibilispider.constant.FileName;
import com.google.gson.JsonArray;
import com.google.gson.JsonParser;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import org.springframework.web.util.UriComponentsBuilder;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.Inflater;
import static com.flyingpig.bilibilispider.constant.UrlConstant.*;
import static com.flyingpig.bilibilispider.util.RequestUtil.*;
@Component
@Slf4j
public class BilibiliSpiderTask {
public static List<Long> SearchVideoCidListByKeyWord(String keyword) {
log.info("搜索获得cid任务开始");
//每次爬取上限为50条所以要分6次爬取
List<Long> cidList = new ArrayList<>();
for (int j = 1; j <= 6; j++) {
// 搜索URL
String searchUrl = UriComponentsBuilder.fromHttpUrl(BILIBILI_SEARCH_URL)
.queryParam("keyword", keyword)
.queryParam("search_type", "video")
.queryParam("page", j)
.queryParam("page_size", 50).toUriString();
log.info("爬取第 {} 页", j);
// 获取搜索结果中的seid再根据seid获取cid封装成视频的cid集合
JsonArray searchResultArray = JsonParser.parseString(requesttToGetBodyString(searchUrl))
.getAsJsonObject().getAsJsonObject("data")
.getAsJsonArray("result");
for (int i = 0; i < searchResultArray.size(); i++) {
// 获取搜索结果的bvid
String bvid = searchResultArray.get(i).getAsJsonObject().get("bvid").getAsString();
log.info("视频bvid: {}", bvid);
// 根据bvid获取cid
String getCidUrl = UriComponentsBuilder.fromHttpUrl(BILIBILI_GETCID_URL)
.queryParam("bvid", bvid)
.queryParam("jsonp", "jsonp").toUriString();
Long cid = JsonParser.parseString(requesttToGetBodyString(getCidUrl)).
getAsJsonObject().getAsJsonArray("data")
.get(0).getAsJsonObject().get("cid").getAsLong();
cidList.add(cid);
log.info("视频cid: {}", cid);
}
}
log.info("搜索任务结束");
return cidList;
}
public static void SearchBarrageListByCidList(List<Long> cidList) {
log.info("爬取弹幕任务开始");
String fileName = FileName.BARRAGE;
// 先删除之前的弹幕文件
File file = new File(fileName);
if (file.exists()) {
file.delete();
}
for (Long cid : cidList) {
try {
byte[] bytes = requestToGetBodyBytes(DM_URL + cid + ".xml");//获取字节码数据
bytes = decompress(bytes);//解压数据
List<String> barriageList = extractDTagContents(new String(bytes));
// 将弹幕写入文件,如果文件不存在则创建,如果存在则追加
try (FileWriter fileWriter = new FileWriter(fileName, true)) {
for (String barrage : barriageList) {
fileWriter.write(barrage + "\n");
}
}
log.info("已经爬取cid为 {} 的弹幕", cid);
} catch (Exception e) {
log.error("获取弹幕数据失败", e);
}
}
log.info("爬取弹幕任务结束");
}
// 解压数据
private static byte[] decompress(byte[] data) throws IOException {
byte[] decompressData = null;
Inflater decompressor = new Inflater(true);
decompressor.reset();
decompressor.setInput(data);
ByteArrayOutputStream outputStream = new ByteArrayOutputStream(data.length);
try {
byte[] buf = new byte[1024];
while (!decompressor.finished()) {
int i = decompressor.inflate(buf);
outputStream.write(buf, 0, i);
}
decompressData = outputStream.toByteArray();
} catch (Exception e) {
} finally {
outputStream.close();
}
decompressor.end();
return decompressData;
}
// 提取D标签内容
private static List<String> extractDTagContents(String xmlContent) {
List<String> result = new ArrayList<>();
String regex = "<d[^>]*>(.*?)</d>";
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(xmlContent);
while (matcher.find()) {
result.add(matcher.group(1));
}
return result;
}
}

@ -0,0 +1,91 @@
package com.flyingpig.bilibilispider.task;
import com.flyingpig.bilibilispider.constant.FileName;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.Trie;
import org.apache.commons.collections4.trie.PatriciaTrie;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@Slf4j
public class DataAnalysisTask {
private static List<String> aiKeywords = new ArrayList<>();
static {
// 定义文件路径
Path filePath = Paths.get(FileName.KEYWORD);
try {
// 读取文件的所有行并存储到aiKeyWordsList中
aiKeywords = Files.readAllLines(filePath);
} catch (IOException e) {
e.printStackTrace();
}
}
// 原来的写法
public static HashMap<String, Integer> getTop8BarrageListAboutAI() {
log.info("开始统计弹幕中关于AI技术应用的关键词出现次数");
// 从文件中读取弹幕集合
Path filePath = Paths.get(FileName.BARRAGE);
List<String> barrageList = new ArrayList<>();
try {
barrageList = Files.readAllLines(filePath);
} catch (IOException e) {
e.printStackTrace();
}
// 记录初始时间
LocalDateTime startTime = LocalDateTime.now();
// 初始化Map集合
HashMap<String, Integer> wordMap = new HashMap<>();
// 将关键词加入Trie树并初始化出现次数为0
for (String keyword : aiKeywords) {
wordMap.put(keyword, 0);
}
// 遍历弹幕列表并统计关键词出现次数
for (String barrage : barrageList) {
for (String keyword : aiKeywords) {
if (barrage.contains(keyword)) {
wordMap.put(keyword, wordMap.get(keyword) + 1);
}
}
}
// 将Trie内容转换为Map并排序
HashMap<String, Integer> sortedMap = new HashMap<>();
wordMap.entrySet().stream()
.sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
.limit(8)
.forEachOrdered(entry -> sortedMap.put(entry.getKey(), entry.getValue()));
// 输出前8个关键词及其出现次数
for (Map.Entry<String, Integer> entry : sortedMap.entrySet()) {
log.info(entry.getKey() + " : " + entry.getValue());
}
log.info("统计弹幕中关于AI技术应用的关键词出现次数任务结束, 耗时: {}ms", LocalDateTime.now().getNano() - startTime.getNano());
// 返回统计结果
return wordMap;
}
}

@ -0,0 +1,393 @@
package com.flyingpig.bilibilispider.task;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Double-array trie over {@code String} keys.
 *
 * <p>The trie is stored in two parallel {@code int} arrays, {@code base} and {@code check}.
 * It is built once from a sorted key list with {@link #build(List)} and then queried with
 * {@link #exactMatchSearch(String)} or {@link #commonPrefixSearch(String)}. The arrays can be
 * persisted with {@link #save(String)} and restored with {@link #open(String)}.
 */
public class DoubleArrayTrie {
    private final static int BUF_SIZE = 16384;
    private final static int UNIT_SIZE = 8; // size of int + int

    /** A trie node during construction: a character code plus the key range [left, right) it covers. */
    private static class Node {
        int code;
        int depth;
        int left;
        int right;
    }

    private int check[];
    private int base[];
    // Marks "begin" offsets that are already taken; only needed while building.
    private boolean used[];
    private int size;
    private int allocSize;
    private List<String> key;
    private int keySize;
    private int length[];
    private int value[];
    private int progress;
    private int nextCheckPos;
    // boolean no_delete_;
    // 0 on success; negative codes are set by fetch (-3) and insert (-2).
    int error_;

    // int (*progressfunc_) (size_t, size_t);

    /**
     * Grows the three working arrays to {@code newSize}, keeping the first
     * {@code allocSize} entries of each.
     *
     * @return the new allocation size
     */
    private int resize(int newSize) {
        int[] base2 = new int[newSize];
        int[] check2 = new int[newSize];
        boolean used2[] = new boolean[newSize];
        if (allocSize > 0) {
            System.arraycopy(base, 0, base2, 0, allocSize);
            System.arraycopy(check, 0, check2, 0, allocSize);
            // Copy from the OLD array; the previous self-copy of used2 dropped every flag on growth.
            System.arraycopy(used, 0, used2, 0, allocSize);
        }

        base = base2;
        check = check2;
        used = used2;

        return allocSize = newSize;
    }

    /**
     * Collects the distinct child nodes of {@code parent} into {@code siblings}.
     * Sets {@code error_ = -3} when the keys are not in ascending order.
     *
     * @return the number of siblings found, or 0 on error
     */
    private int fetch(Node parent, List<Node> siblings) {
        if (error_ < 0)
            return 0;

        int prev = 0;

        for (int i = parent.left; i < parent.right; i++) {
            if ((length != null ? length[i] : key.get(i).length()) < parent.depth)
                continue;

            String tmp = key.get(i);

            // Code 0 marks "key ends here"; real characters are shifted by one.
            int cur = 0;
            if ((length != null ? length[i] : tmp.length()) != parent.depth)
                cur = (int) tmp.charAt(parent.depth) + 1;

            if (prev > cur) {
                error_ = -3;
                return 0;
            }

            if (cur != prev || siblings.size() == 0) {
                Node tmp_node = new Node();
                tmp_node.depth = parent.depth + 1;
                tmp_node.code = cur;
                tmp_node.left = i;
                if (siblings.size() != 0)
                    siblings.get(siblings.size() - 1).right = i;

                siblings.add(tmp_node);
            }

            prev = cur;
        }

        if (siblings.size() != 0)
            siblings.get(siblings.size() - 1).right = parent.right;

        return siblings.size();
    }

    /**
     * Finds a free "begin" offset for {@code siblings}, records them in {@code check},
     * and recurses into their children. {@code siblings} must not be empty.
     *
     * @return the chosen begin offset, or 0 on error
     */
    private int insert(List<Node> siblings) {
        if (error_ < 0)
            return 0;

        int begin = 0;
        int pos = ((siblings.get(0).code + 1 > nextCheckPos) ? siblings.get(0).code + 1
                : nextCheckPos) - 1;
        int nonzero_num = 0;
        int first = 0;

        if (allocSize <= pos)
            resize(pos + 1);

        outer: while (true) {
            pos++;

            if (allocSize <= pos)
                resize(pos + 1);

            if (check[pos] != 0) {
                nonzero_num++;
                continue;
            } else if (first == 0) {
                nextCheckPos = pos;
                first = 1;
            }

            begin = pos - siblings.get(0).code;
            if (allocSize <= (begin + siblings.get(siblings.size() - 1).code)) {
                // progress can be zero
                double l = (1.05 > 1.0 * keySize / (progress + 1)) ? 1.05 : 1.0
                        * keySize / (progress + 1);
                resize((int) (allocSize * l));
            }

            if (used[begin])
                continue;

            for (int i = 1; i < siblings.size(); i++)
                if (check[begin + siblings.get(i).code] != 0)
                    continue outer;

            break;
        }

        // -- Simple heuristics --
        // if the percentage of non-empty contents in check between the
        // index
        // 'next_check_pos' and 'check' is greater than some constant value
        // (e.g. 0.9),
        // new 'next_check_pos' index is written by 'check'.
        if (1.0 * nonzero_num / (pos - nextCheckPos + 1) >= 0.95)
            nextCheckPos = pos;

        used[begin] = true;
        size = (size > begin + siblings.get(siblings.size() - 1).code + 1) ? size
                : begin + siblings.get(siblings.size() - 1).code + 1;

        for (int i = 0; i < siblings.size(); i++)
            check[begin + siblings.get(i).code] = begin;

        for (int i = 0; i < siblings.size(); i++) {
            List<Node> new_siblings = new ArrayList<Node>();

            if (fetch(siblings.get(i), new_siblings) == 0) {
                // Leaf: store the value (or the key index) encoded as a negative number.
                base[begin + siblings.get(i).code] = (value != null) ? (-value[siblings
                        .get(i).left] - 1) : (-siblings.get(i).left - 1);

                if (value != null && (-value[siblings.get(i).left] - 1) >= 0) {
                    error_ = -2;
                    return 0;
                }

                progress++;
                // if (progress_func_) (*progress_func_) (progress,
                // keySize);
            } else {
                int h = insert(new_siblings);
                base[begin + siblings.get(i).code] = h;
            }
        }
        return begin;
    }

    /** Creates an empty trie; call {@link #build(List)} or {@link #open(String)} before searching. */
    public DoubleArrayTrie() {
        check = null;
        base = null;
        used = null;
        size = 0;
        allocSize = 0;
        // no_delete_ = false;
        error_ = 0;
    }

    // no deconstructor

    // set_result omitted
    // the search methods returns (the list of) the value(s) instead
    // of (the list of) the pair(s) of value(s) and length(s)

    // set_array omitted
    // array omitted

    /** Drops the arrays and resets the sizes. */
    void clear() {
        // if (! no_delete_)
        check = null;
        base = null;
        used = null;
        allocSize = 0;
        size = 0;
        // no_delete_ = false;
    }

    /** @return the number of bytes one array slot occupies when saved (base + check) */
    public int getUnitSize() {
        return UNIT_SIZE;
    }

    /** @return the number of array slots in use */
    public int getSize() {
        return size;
    }

    /** @return {@link #getSize()} multiplied by {@link #getUnitSize()} */
    public int getTotalSize() {
        return size * UNIT_SIZE;
    }

    /** @return the number of slots within {@link #getSize()} whose check entry is non-zero */
    public int getNonzeroSize() {
        int result = 0;
        for (int i = 0; i < size; i++)
            if (check[i] != 0)
                result++;
        return result;
    }

    /**
     * Builds the trie from all keys; each key's value is its index in the list.
     *
     * @param key keys in ascending {@code char} order
     * @return 0 on success, a negative error code otherwise
     */
    public int build(List<String> key) {
        return build(key, null, null, key.size());
    }

    /**
     * Builds the trie from the first {@code _keySize} keys.
     *
     * @param _key     keys in ascending {@code char} order; unsorted keys yield error -3
     * @param _length  optional per-key lengths, or {@code null} to use {@code String.length()}
     * @param _value   optional per-key values, or {@code null} to use the key index;
     *                 a negative value yields error -2
     * @param _keySize number of keys to insert
     * @return 0 on success or when the arguments are rejected ({@code _key} is {@code null}
     *         or {@code _keySize} exceeds the list size); a negative error code otherwise
     */
    public int build(List<String> _key, int _length[], int _value[],
            int _keySize) {
        // Null must be tested first; the previous order dereferenced _key before checking it.
        if (_key == null || _keySize > _key.size())
            return 0;

        // Start from a clean state so a second build on the same instance does not
        // inherit arrays or an error code from an earlier one.
        clear();
        error_ = 0;

        // progress_func_ = progress_func;
        key = _key;
        length = _length;
        keySize = _keySize;
        value = _value;
        progress = 0;

        resize(65536 * 32);

        base[0] = 1;
        nextCheckPos = 0;

        Node root_node = new Node();
        root_node.left = 0;
        root_node.right = keySize;
        root_node.depth = 0;

        List<Node> siblings = new ArrayList<Node>();
        fetch(root_node, siblings);
        // With no keys there is nothing to insert; insert() requires at least one sibling.
        if (!siblings.isEmpty())
            insert(siblings);

        // size += (1 << 8 * 2) + 1; // ???
        // if (size >= allocSize) resize (size);

        used = null;
        key = null;

        return error_;
    }

    /**
     * Loads the arrays from a file written by {@link #save(String)}.
     *
     * @param fileName path of the file to read
     * @throws IOException if the file cannot be read
     */
    public void open(String fileName) throws IOException {
        File file = new File(fileName);
        size = (int) file.length() / UNIT_SIZE;
        check = new int[size];
        base = new int[size];

        try (DataInputStream is = new DataInputStream(new BufferedInputStream(
                new FileInputStream(file), BUF_SIZE))) {
            for (int i = 0; i < size; i++) {
                base[i] = is.readInt();
                check[i] = is.readInt();
            }
        }
    }

    /**
     * Writes the used part of the arrays as interleaved (base, check) int pairs.
     *
     * @param fileName path of the file to write
     * @throws IOException if the file cannot be written
     */
    public void save(String fileName) throws IOException {
        try (DataOutputStream out = new DataOutputStream(new BufferedOutputStream(
                new FileOutputStream(fileName)))) {
            for (int i = 0; i < size; i++) {
                out.writeInt(base[i]);
                out.writeInt(check[i]);
            }
        }
    }

    /**
     * Looks up a whole key.
     *
     * @return the key's value (its index when built without values), or -1 if absent
     */
    public int exactMatchSearch(String key) {
        return exactMatchSearch(key, 0, 0, 0);
    }

    /**
     * Looks up {@code key[pos, len)} starting at trie node {@code nodePos}.
     *
     * @param key     text to look up
     * @param pos     index of the first character to use
     * @param len     end index (exclusive); values {@code <= 0} mean {@code key.length()}
     * @param nodePos node to start from; values {@code <= 0} mean the root
     * @return the stored value, or -1 if there is no exact match
     */
    public int exactMatchSearch(String key, int pos, int len, int nodePos) {
        if (len <= 0)
            len = key.length();
        if (nodePos <= 0)
            nodePos = 0;

        int result = -1;

        char[] keyChars = key.toCharArray();

        int b = base[nodePos];
        int p;

        for (int i = pos; i < len; i++) {
            p = b + (int) (keyChars[i]) + 1;
            // A transition past the end of the arrays cannot exist: treat it as "no match"
            // instead of throwing (possible e.g. after open(), where arrays are sized exactly).
            if (p < check.length && b == check[p])
                b = base[p];
            else
                return result;
        }

        p = b;
        int n = base[p];
        if (b == check[p] && n < 0) {
            result = -n - 1;
        }
        return result;
    }

    /**
     * Finds every stored key that is a prefix of {@code key}.
     *
     * @return the values of the matching keys, shortest prefix first; empty if none
     */
    public List<Integer> commonPrefixSearch(String key) {
        return commonPrefixSearch(key, 0, 0, 0);
    }

    /**
     * Finds every stored key that is a prefix of {@code key[pos, len)}, starting at
     * trie node {@code nodePos}.
     *
     * @param key     text to scan
     * @param pos     index of the first character to use
     * @param len     end index (exclusive); values {@code <= 0} mean {@code key.length()}
     * @param nodePos node to start from; values {@code <= 0} mean the root
     * @return the values of the matching keys, shortest prefix first; empty if none
     */
    public List<Integer> commonPrefixSearch(String key, int pos, int len,
            int nodePos) {
        if (len <= 0)
            len = key.length();
        if (nodePos <= 0)
            nodePos = 0;

        List<Integer> result = new ArrayList<Integer>();

        char[] keyChars = key.toCharArray();

        int b = base[nodePos];
        int n;
        int p;

        for (int i = pos; i < len; i++) {
            // Does a key end at the current node?
            p = b;
            n = base[p];

            if (b == check[p] && n < 0) {
                result.add(-n - 1);
            }

            p = b + (int) (keyChars[i]) + 1;
            // Same bounds guard as in exactMatchSearch.
            if (p < check.length && b == check[p])
                b = base[p];
            else
                return result;
        }

        p = b;
        n = base[p];

        if (b == check[p] && n < 0) {
            result.add(-n - 1);
        }

        return result;
    }

    // debug
    /** Prints every used slot as {@code i: <index> [<base>, <check>]} to standard error. */
    public void dump() {
        for (int i = 0; i < size; i++) {
            System.err.println("i: " + i + " [" + base[i] + ", " + check[i]
                    + "]");
        }
    }
}

@ -0,0 +1,45 @@
package com.flyingpig.bilibilispider.task;
import com.alibaba.excel.EasyExcel;
import com.flyingpig.bilibilispider.constant.FileName;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Exports keyword counts to the Excel workbook at {@link FileName#WORD_COUNT}.
 */
@Slf4j
public class ExcelWriteTask {

    /**
     * Writes one row per map entry (word, count) to sheet "Sheet1".
     *
     * @param barrageCountMap counts keyed by word; rows follow the map's iteration order
     */
    public static void writeBarrageListToExcel(Map<String, Integer> barrageCountMap) {
        log.info("开始将统计结果写入Excel文件");

        // Turn every entry into a row object.
        List<WordCount> rows = new ArrayList<>();
        barrageCountMap.forEach((word, count) -> {
            WordCount row = new WordCount();
            row.word = word;
            row.count = count;
            rows.add(row);
        });

        // Hand the rows to EasyExcel, targeting the configured output file.
        String targetFile = FileName.WORD_COUNT;
        EasyExcel.write(targetFile, WordCount.class).sheet("Sheet1").doWrite(rows);

        log.info("统计结果写入Excel文件任务结束");
    }

    /** Row model: a word and how often it occurred. */
    @Data
    @NoArgsConstructor
    @AllArgsConstructor
    private static class WordCount {
        private String word;
        private Integer count;
    }
}

@ -0,0 +1,46 @@
package com.flyingpig.bilibilispider.util;
import com.flyingpig.bilibilispider.constant.HeaderConstant;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
/**
 * Minimal HTTP GET helpers that send the configured cookie with every request.
 */
public class RequestUtil {

    /**
     * Single client shared by all requests so connections and threads are reused;
     * the previous code created a new client (with its own pool) per call.
     */
    private static final OkHttpClient CLIENT = new OkHttpClient();

    /**
     * Performs a GET request and returns the response body as text.
     *
     * @param url absolute URL to fetch
     * @return the body as a string, or {@code null} if the request fails or has no body
     */
    public static String requesttToGetBodyString(String url) {
        try (Response response = execute(url)) {
            if (response.body() != null) {
                return response.body().string();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Performs a GET request and returns the raw response body.
     *
     * @param url absolute URL to fetch
     * @return the body bytes, or an empty array if the request fails or has no body
     */
    public static byte[] requestToGetBodyBytes(String url) {
        try (Response response = execute(url)) {
            if (response.body() != null) {
                return response.body().bytes();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return new byte[0];
    }

    /**
     * Builds and executes the GET request with the {@code Cookie} header set.
     * The caller owns the returned response and must close it.
     */
    private static Response execute(String url) throws java.io.IOException {
        Request request = new Request.Builder()
                .url(url)
                .addHeader("Cookie", HeaderConstant.COOKIE)
                .build();
        return CLIENT.newCall(request).execute();
    }
}

@ -0,0 +1,61 @@
package com.flyingpig.bilibilispider.util;
import com.flyingpig.bilibilispider.constant.FileName;
import com.kennycason.kumo.CollisionMode;
import com.kennycason.kumo.WordCloud;
import com.kennycason.kumo.WordFrequency;
import com.kennycason.kumo.bg.CircleBackground;
import com.kennycason.kumo.font.KumoFont;
import com.kennycason.kumo.font.scale.SqrtFontScalar;
import com.kennycason.kumo.nlp.FrequencyAnalyzer;
import com.kennycason.kumo.nlp.tokenizers.ChineseWordTokenizer;
import com.kennycason.kumo.palette.LinearGradientColorPalette;
import java.awt.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/*
*/
/**
 * Renders a word cloud image from word counts.
 */
public class WordCloudUtil {

    /**
     * Draws a circular word cloud and writes it to {@link FileName#WORDCLOUD}.
     *
     * <p>The counts are used directly as word frequencies. The {@code FrequencyAnalyzer} and
     * {@code ChineseWordTokenizer} that were previously constructed here were never used
     * and have been removed.
     *
     * @param wordCountMap counts keyed by word
     */
    public static void generateWordCloud(Map<String, Integer> wordCountMap) {
        // One WordFrequency per entry; the count is the weight.
        List<WordFrequency> wordFrequencies = new ArrayList<>();
        for (Map.Entry<String, Integer> entry : wordCountMap.entrySet()) {
            wordFrequencies.add(new WordFrequency(entry.getKey(), entry.getValue()));
        }

        // Explicit font; per the original author's note, Chinese text renders garbled without it.
        // Font.ITALIC is the named form of the style value 2 used before.
        java.awt.Font font = new java.awt.Font("STSong-Light", java.awt.Font.ITALIC, 18);

        // Image size in pixels.
        Dimension dimension = new Dimension(500, 500);

        WordCloud wordCloud = new WordCloud(dimension, CollisionMode.PIXEL_PERFECT);
        wordCloud.setPadding(2);
        // Circular layout; the argument is the circle radius.
        wordCloud.setBackground(new CircleBackground(255));
        wordCloud.setFontScalar(new SqrtFontScalar(12, 42));
        // Three-colour gradient; per the original author's note, earlier colours are used
        // for more frequent words.
        wordCloud.setColorPalette(new LinearGradientColorPalette(Color.RED, Color.BLUE, Color.GREEN, 30, 30));
        wordCloud.setKumoFont(new KumoFont(font));
        wordCloud.setBackgroundColor(new Color(255, 255, 255));

        // Lay out the words, then write the image.
        wordCloud.build(wordFrequencies);
        wordCloud.writeToFile(FileName.WORDCLOUD);
    }
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,102 @@
AI
ai
GPT
人工智能
智能
gpt
ChatGPT
chatgpt
机器人
Robot
Artificial Intelligence
artificial intelligence
机器学习
machine learning
Deep Learning
deep learning
Neural Network
neural network
自然语言处理
Natural Language Processing
NLP
nlp
Computer Vision
computer vision
自动驾驶
Autonomous Driving
Reinforcement Learning
reinforcement learning
Algorithm
algorithm
Big Data
big data
Speech Recognition
speech recognition
Data Mining
data mining
Pattern Recognition
pattern recognition
Smart Contracts
smart contracts
Virtual Assistant
virtual assistant
Cloud Computing
cloud computing
Image Processing
image processing
Predictive Analytics
predictive analytics
Drone
drone
Recommendation System
recommendation system
Data Science
data science
Generative Model
generative model
Quantum Computing
quantum computing
Transfer Learning
transfer learning
Augmented Reality
augmented reality
Virtual Reality
virtual reality
Internet of Things
IoT
iot
Edge Computing
edge computing
Intelligent Transportation
intelligent transportation
Smart Agriculture
smart agriculture
Image Recognition
image recognition
Machine Translation
machine translation
Multimodal
multimodal
Unsupervised Learning
unsupervised learning
Generative Adversarial Network
GAN
gan
Image Classification
image classification
Logistic Regression
logistic regression
Support Vector Machine
support vector machine
Random Forest
random forest
Convolutional Neural Network
CNN
cnn
Recurrent Neural Network
RNN
rnn
Knowledge Graph
knowledge graph
Semantic Segmentation
semantic segmentation

Binary file not shown.

After

Width:  |  Height:  |  Size: 139 KiB

Binary file not shown.

@ -0,0 +1,24 @@
package com.flyingpig.bilibilispider;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.web.client.RestTemplate;
/**
 * Smoke test for the search endpoint.
 *
 * <p>NOTE(review): this relies on a Spring context providing a {@code RestTemplate} bean;
 * no such bean definition is visible alongside this test, so confirm one exists.
 */
@SpringBootTest
class BilibiliSpiderApplicationTests {

    @Autowired
    RestTemplate restTemplate;

    /** Fetches the first search page for the keyword and prints the raw response. */
    @Test
    void testSearchInterface() {
        // The earlier extra exchange(url, null, null, ...) call passed a null HttpMethod, which
        // is rejected before any request is sent, so the test failed before reaching this call.
        // Its result was discarded anyway, so it has been removed.
        String searchResult = restTemplate.getForObject(
                "https://api.bilibili.com/x/web-interface/search/type?keyword=2024巴黎奥运会&search_type=video", String.class);
        System.out.println(searchResult);
    }
}
Loading…
Cancel
Save