You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
83 lines
3.1 KiB
83 lines
3.1 KiB
// src/main/java/com/llm/analysis/MainApplication.java
|
|
package com.llm.analysis;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
|
|
public class MainApplication {
|
|
|
|
private static final String SEARCH_KEYWORD = "大语言模型";
|
|
private static final int TOP_N_WORDS = 8;
|
|
private static final String OUTPUT_FILENAME = "llm_analysis_top8.xlsx";
|
|
|
|
public static void main(String[] args) {
|
|
System.out.println("--- LLM 应用弹幕分析项目启动 ---");
|
|
System.out.println("目标关键词:" + SEARCH_KEYWORD);
|
|
|
|
// 1. 获取前 300 视频列表 (BVID 和 Title)
|
|
List<Map<String, String>> topVideos = BiliBiliSearchCrawler.getTopVideos(SEARCH_KEYWORD);
|
|
|
|
if (topVideos.isEmpty()) {
|
|
System.err.println("未获取到任何视频数据,程序终止。");
|
|
return;
|
|
}
|
|
|
|
List<String> allDanmus = new ArrayList<>();
|
|
int videoCount = 0;
|
|
|
|
// 2. 遍历视频列表,为每个视频获取 CID 并处理弹幕
|
|
for (Map<String, String> video : topVideos) {
|
|
videoCount++;
|
|
String bvid = video.get("bvid");
|
|
String title = video.get("title");
|
|
|
|
// 【关键步骤】动态获取 CID
|
|
String cid = BiliBiliCrawler.getCidFromBvid(bvid);
|
|
|
|
System.out.printf("处理中 (%d/%d): %s (BVID: %s)\n", videoCount, topVideos.size(), title, bvid);
|
|
|
|
if (cid == null) {
|
|
System.out.println(" 跳过此视频,无法获取 CID。");
|
|
continue;
|
|
}
|
|
|
|
// 获取弹幕 XML
|
|
String xmlContent = BiliBiliCrawler.getDanmuXml(cid);
|
|
if (xmlContent == null) {
|
|
System.out.println(" 跳过此视频,无法获取弹幕。");
|
|
continue;
|
|
}
|
|
|
|
// 解析和清洗弹幕
|
|
List<String> danmus = DanmuParser.parseAndCleanDanmu(xmlContent);
|
|
allDanmus.addAll(danmus);
|
|
|
|
// 礼貌暂停,减缓对服务器的压力
|
|
try {
|
|
Thread.sleep(500);
|
|
} catch (InterruptedException e) {
|
|
Thread.currentThread().interrupt();
|
|
}
|
|
}
|
|
|
|
System.out.println("\n🎉 所有视频处理完毕!总计弹幕数:" + allDanmus.size());
|
|
|
|
if (allDanmus.isEmpty()) {
|
|
System.out.println("没有可分析的弹幕,程序终止。");
|
|
return;
|
|
}
|
|
|
|
System.out.println("开始中文分词和词频统计...");
|
|
List<Map.Entry<String, Long>> topWords =
|
|
WordAnalyzer.analyzeAndGetTopN(allDanmus, TOP_N_WORDS);
|
|
|
|
System.out.println("\n--- 最终结果 (Top " + TOP_N_WORDS + ") ---");
|
|
topWords.forEach(entry ->
|
|
System.out.printf("词语: %-15s | 频次: %d\n", entry.getKey(), entry.getValue())
|
|
);
|
|
|
|
ExcelExporter.exportToXLSX(topWords, OUTPUT_FILENAME);
|
|
System.out.println("\n--- 项目执行完毕 ---");
|
|
}
|
|
} |