// src/main/java/com/llm/analysis/DanmuParser.java
// Declared in package com.llm.analysis in the project tree (see the path above).

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.ByteArrayInputStream;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Extracts comment ("danmu") text from a danmaku XML document and removes noise comments.
 *
 * <p>The class holds no instance state; all functionality is exposed through
 * {@link #parseAndCleanDanmu(String)}.
 */
public class DanmuParser {

    // Baseline noise-word list; extend it as needed.
    // A comment is discarded when it CONTAINS any entry (substring match, case-sensitive).
    private static final List<String> NOISE_WORDS = Arrays.asList(
            "666", "哈哈哈", "点赞", "马克", "awsl", "前排", "草"
    );

    // Parser feature that makes the parser reject any document carrying a DOCTYPE declaration.
    private static final String DISALLOW_DOCTYPE_FEATURE =
            "http://apache.org/xml/features/disallow-doctype-decl";

    /**
     * Parses a Bilibili danmaku XML document, collects the text of every {@code <d>} element
     * and applies basic noise filtering.
     *
     * <p>A comment is kept only if, after trimming, it is longer than one character and does
     * not contain any entry of {@code NOISE_WORDS}. Document order is preserved.
     *
     * <p>This method is best-effort and does not throw: if the parser cannot be configured or
     * the input cannot be parsed (malformed XML, or XML that contains a DOCTYPE, which is
     * rejected to prevent XXE / entity-expansion attacks), a message is written to
     * {@code System.err} and whatever has been collected so far (normally nothing) is returned.
     *
     * @param xmlContent danmaku XML as a string; may be {@code null}
     * @return a new mutable list with the cleaned comment texts; empty when {@code xmlContent}
     *         is {@code null} or cannot be parsed, never {@code null}
     */
    public static List<String> parseAndCleanDanmu(String xmlContent) {
        List<String> danmuList = new ArrayList<>();
        if (xmlContent == null) {
            return danmuList;
        }

        try {
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            // The XML comes from outside the application: forbid DOCTYPE so that external
            // entities and entity-expansion bombs cannot be declared at all.
            dbf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
            dbf.setFeature(DISALLOW_DOCTYPE_FEATURE, true);
            DocumentBuilder db = dbf.newDocumentBuilder();

            // The input is already decoded text, so feed the parser characters rather than
            // re-encoded bytes; otherwise an encoding="..." pseudo-attribute that names
            // anything other than UTF-8 would make the parser mis-decode the content.
            // A character stream gets no BOM handling from the parser, so drop a leading
            // U+FEFF here to keep accepting BOM-prefixed input.
            String xml = xmlContent.startsWith("\uFEFF") ? xmlContent.substring(1) : xmlContent;
            Document doc = db.parse(new InputSource(new StringReader(xml)));

            // Each danmu comment is the text content of a 'd' element.
            NodeList dNodes = doc.getElementsByTagName("d");
            for (int i = 0; i < dNodes.getLength(); i++) {
                String danmuText = dNodes.item(i).getTextContent().trim();

                // Noise filtering: skip empty/single-character comments and noise words.
                if (danmuText.length() > 1 && !isNoise(danmuText)) {
                    danmuList.add(danmuText);
                }
            }
        } catch (Exception e) {
            // Deliberately broad: callers rely on this method never throwing.
            System.err.println("解析弹幕XML失败:" + e.getMessage());
        }
        return danmuList;
    }

    /** Returns whether the given comment text contains any configured noise word. */
    private static boolean isNoise(String danmuText) {
        return NOISE_WORDS.stream().anyMatch(danmuText::contains);
    }
}