parent
e19bfa2ff7
commit
38ed99b928
@ -0,0 +1,83 @@
|
||||
// src/main/java/com/llm/analysis/MainApplication.java
|
||||
package com.llm.analysis;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class MainApplication {
|
||||
|
||||
private static final String SEARCH_KEYWORD = "大语言模型";
|
||||
private static final int TOP_N_WORDS = 8;
|
||||
private static final String OUTPUT_FILENAME = "llm_analysis_top8.xlsx";
|
||||
|
||||
public static void main(String[] args) {
|
||||
System.out.println("--- LLM 应用弹幕分析项目启动 ---");
|
||||
System.out.println("目标关键词:" + SEARCH_KEYWORD);
|
||||
|
||||
// 1. 获取前 300 视频列表 (BVID 和 Title)
|
||||
List<Map<String, String>> topVideos = BiliBiliSearchCrawler.getTopVideos(SEARCH_KEYWORD);
|
||||
|
||||
if (topVideos.isEmpty()) {
|
||||
System.err.println("未获取到任何视频数据,程序终止。");
|
||||
return;
|
||||
}
|
||||
|
||||
List<String> allDanmus = new ArrayList<>();
|
||||
int videoCount = 0;
|
||||
|
||||
// 2. 遍历视频列表,为每个视频获取 CID 并处理弹幕
|
||||
for (Map<String, String> video : topVideos) {
|
||||
videoCount++;
|
||||
String bvid = video.get("bvid");
|
||||
String title = video.get("title");
|
||||
|
||||
// 【关键步骤】动态获取 CID
|
||||
String cid = BiliBiliCrawler.getCidFromBvid(bvid);
|
||||
|
||||
System.out.printf("处理中 (%d/%d): %s (BVID: %s)\n", videoCount, topVideos.size(), title, bvid);
|
||||
|
||||
if (cid == null) {
|
||||
System.out.println(" 跳过此视频,无法获取 CID。");
|
||||
continue;
|
||||
}
|
||||
|
||||
// 获取弹幕 XML
|
||||
String xmlContent = BiliBiliCrawler.getDanmuXml(cid);
|
||||
if (xmlContent == null) {
|
||||
System.out.println(" 跳过此视频,无法获取弹幕。");
|
||||
continue;
|
||||
}
|
||||
|
||||
// 解析和清洗弹幕
|
||||
List<String> danmus = DanmuParser.parseAndCleanDanmu(xmlContent);
|
||||
allDanmus.addAll(danmus);
|
||||
|
||||
// 礼貌暂停,减缓对服务器的压力
|
||||
try {
|
||||
Thread.sleep(500);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("\n🎉 所有视频处理完毕!总计弹幕数:" + allDanmus.size());
|
||||
|
||||
if (allDanmus.isEmpty()) {
|
||||
System.out.println("没有可分析的弹幕,程序终止。");
|
||||
return;
|
||||
}
|
||||
|
||||
System.out.println("开始中文分词和词频统计...");
|
||||
List<Map.Entry<String, Long>> topWords =
|
||||
WordAnalyzer.analyzeAndGetTopN(allDanmus, TOP_N_WORDS);
|
||||
|
||||
System.out.println("\n--- 最终结果 (Top " + TOP_N_WORDS + ") ---");
|
||||
topWords.forEach(entry ->
|
||||
System.out.printf("词语: %-15s | 频次: %d\n", entry.getKey(), entry.getValue())
|
||||
);
|
||||
|
||||
ExcelExporter.exportToXLSX(topWords, OUTPUT_FILENAME);
|
||||
System.out.println("\n--- 项目执行完毕 ---");
|
||||
}
|
||||
}
|
||||
Loading…
Reference in new issue