You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

57 lines
2.1 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

// src/main/java/com/llm/analysis/DanmuParser.java
package com.llm.analysis;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Extracts comment text ("danmu") from a Bilibili danmu XML document and
 * applies a basic noise filter to the extracted entries.
 */
public class DanmuParser {
    // Baseline noise substrings; this list is a starting point and is meant to be extended.
    // A danmu that CONTAINS any of these (substring match, case-sensitive) is dropped.
    private static final List<String> NOISE_WORDS = Arrays.asList(
            "666", "哈哈哈", "点赞", "马克", "awsl", "前排", "草"
    );

    /**
     * Parses danmu XML, collects the text of every {@code <d>} element and
     * removes noise.
     *
     * <p>An entry is kept only if, after trimming, it is longer than one
     * character and contains none of the {@code NOISE_WORDS} substrings.
     *
     * <p>Parsing is best-effort: if the XML cannot be parsed (malformed input,
     * or input containing a DOCTYPE declaration, which is rejected to prevent
     * XXE attacks), the failure is reported on {@code System.err} and an empty
     * list is returned instead of throwing.
     *
     * @param xmlContent danmu XML as a string; may be {@code null}
     * @return a new mutable list of cleaned danmu texts in document order;
     *         empty when {@code xmlContent} is {@code null} or parsing fails
     */
    public static List<String> parseAndCleanDanmu(String xmlContent) {
        List<String> danmuList = new ArrayList<>();
        if (xmlContent == null) {
            return danmuList;
        }
        try {
            DocumentBuilder db = newHardenedDocumentBuilder();
            // Feed the string to the parser as a UTF-8 byte stream.
            Document doc = db.parse(
                    new ByteArrayInputStream(xmlContent.getBytes(StandardCharsets.UTF_8)));
            // Danmu text lives inside the 'd' elements of the document.
            NodeList dNodes = doc.getElementsByTagName("d");
            for (int i = 0; i < dNodes.getLength(); i++) {
                String danmuText = dNodes.item(i).getTextContent().trim();
                // length() > 1 also excludes the empty string, so no separate isEmpty() check.
                if (danmuText.length() > 1 && !containsNoiseWord(danmuText)) {
                    danmuList.add(danmuText);
                }
            }
        } catch (Exception e) {
            // Deliberate best-effort: callers get an empty list rather than an exception.
            System.err.println("解析弹幕XML失败" + e.getMessage());
        }
        return danmuList;
    }

    /** Returns true when the text contains at least one configured noise substring. */
    private static boolean containsNoiseWord(String text) {
        return NOISE_WORDS.stream().anyMatch(text::contains);
    }

    /**
     * Creates a DocumentBuilder hardened against XXE and entity-expansion attacks.
     * If the underlying parser does not support one of the features, the resulting
     * exception propagates so that parsing fails closed instead of running unprotected.
     */
    private static DocumentBuilder newHardenedDocumentBuilder()
            throws javax.xml.parsers.ParserConfigurationException {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
        // Danmu XML needs no DTD; rejecting DOCTYPE outright blocks XXE at the source.
        dbf.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        // Defense in depth in case DOCTYPE handling is ever relaxed.
        dbf.setFeature("http://xml.org/sax/features/external-general-entities", false);
        dbf.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        dbf.setXIncludeAware(false);
        dbf.setExpandEntityReferences(false);
        return dbf.newDocumentBuilder();
    }
}