|
|
|
|
@ -0,0 +1,57 @@
|
|
|
|
|
// src/main/java/com/llm/analysis/DanmuParser.java
|
|
|
|
|
package com.llm.analysis;
|
|
|
|
|
|
|
|
|
|
import org.w3c.dom.Document;
|
|
|
|
|
import org.w3c.dom.Element;
|
|
|
|
|
import org.w3c.dom.NodeList;
|
|
|
|
|
import javax.xml.parsers.DocumentBuilder;
|
|
|
|
|
import javax.xml.parsers.DocumentBuilderFactory;
|
|
|
|
|
import java.io.ByteArrayInputStream;
|
|
|
|
|
import java.nio.charset.StandardCharsets;
|
|
|
|
|
import java.util.ArrayList;
|
|
|
|
|
import java.util.Arrays;
|
|
|
|
|
import java.util.List;
|
|
|
|
|
|
|
|
|
|
/**
 * Utility for turning a Bilibili danmaku (bullet-comment) XML document into a
 * list of comment strings with basic noise filtering applied.
 */
public class DanmuParser {

    // Baseline noise-word list; extend it as needed.
    private static final List<String> NOISE_WORDS = Arrays.asList(
            "666", "哈哈哈", "点赞", "马克", "awsl", "前排", "草"
    );

    /**
     * Parses a danmaku XML document, extracts the text of every {@code <d>}
     * element and applies basic noise filtering.
     *
     * <p>A comment is dropped when, after trimming, it is shorter than two
     * characters or contains any entry of {@code NOISE_WORDS} as a substring.
     *
     * <p>The XML parser is hardened against XXE: documents carrying a DOCTYPE
     * declaration are rejected, and XInclude / entity-reference expansion are
     * switched off. Any parsing failure (malformed XML, rejected DOCTYPE, ...)
     * is reported on {@code System.err} and results in an empty list rather
     * than an exception.
     *
     * @param xmlContent the danmaku XML as a string; may be {@code null}
     * @return a mutable list of cleaned comment texts in document order;
     *         empty when {@code xmlContent} is {@code null} or cannot be parsed
     */
    public static List<String> parseAndCleanDanmu(String xmlContent) {
        List<String> danmuList = new ArrayList<>();
        if (xmlContent == null) {
            return danmuList;
        }

        try {
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            // The XML comes from outside the application: refuse DOCTYPE
            // declarations altogether so that neither external entities nor
            // entity-expansion bombs can be smuggled in.
            dbf.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
            dbf.setXIncludeAware(false);
            dbf.setExpandEntityReferences(false);

            DocumentBuilder db = dbf.newDocumentBuilder();
            // Feed the string to the parser as a UTF-8 byte stream.
            Document doc = db.parse(
                    new ByteArrayInputStream(xmlContent.getBytes(StandardCharsets.UTF_8)));

            // Each comment's text lives inside a 'd' element.
            NodeList dNodes = doc.getElementsByTagName("d");

            for (int i = 0; i < dNodes.getLength(); i++) {
                Element danmuElement = (Element) dNodes.item(i);
                String danmuText = danmuElement.getTextContent().trim();

                // Empty and single-character comments are discarded.
                if (danmuText.length() <= 1) {
                    continue;
                }

                // Noise filter: substring match against every noise word.
                boolean isNoise = NOISE_WORDS.stream().anyMatch(danmuText::contains);
                if (!isNoise) {
                    danmuList.add(danmuText);
                }
            }
        } catch (Exception e) {
            // Best-effort API: report the failure and fall through to return
            // the (empty) list instead of propagating the exception.
            System.err.println("解析弹幕XML失败:" + e.getMessage());
        }
        return danmuList;
    }
}
|