From e19bfa2ff762b9b497ac2e5d72962f6ea2524d2b Mon Sep 17 00:00:00 2001
From: p7mpv4cbt <1352787923@qq.com>
Date: Wed, 5 Nov 2025 11:28:22 +0800
Subject: [PATCH] ADD file via upload

---
 WordAnalyzer.java | 60 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 WordAnalyzer.java

diff --git a/WordAnalyzer.java b/WordAnalyzer.java
new file mode 100644
index 0000000..fdb599b
--- /dev/null
+++ b/WordAnalyzer.java
@@ -0,0 +1,60 @@
+// src/main/java/com/llm/analysis/WordAnalyzer.java
+package com.llm.analysis;
+
+import com.hankcs.hanlp.HanLP;
+import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
+import com.hankcs.hanlp.seg.common.Term;
+import java.util.*;
+import java.util.stream.Collectors;
+
+public class WordAnalyzer {
+
+    // Target keywords used to select relevant danmu (English terms already lowercase)
+    private static final List<String> LLM_KEYWORDS = Arrays.asList(
+            "大语言模型", "大模型", "llm", "gpt", "ai", "人工智能", "chatgpt", "文心一言", "通义千问", "讯飞星火"
+    );
+
+    /**
+     * Segments, filters, and counts word frequencies for a danmu list, returning the top N results.
+     * @param danmuList the raw danmu list
+     * @param topN      the number of top entries to return
+     * @return a list of entries pairing each word with its frequency
+     */
+    public static List<Map.Entry<String, Long>> analyzeAndGetTopN(List<String> danmuList, int topN) {
+        // 1. Keep only danmu that contain an LLM keyword
+        List<String> relevantDanmus = danmuList.stream()
+                // Compare the lower-cased danmu d against each (lower-cased) keyword
+                .filter(d -> LLM_KEYWORDS.stream().anyMatch(keyword -> d.toLowerCase().contains(keyword.toLowerCase())))
+                .collect(Collectors.toList());
+
+        System.out.println("Filtered " + relevantDanmus.size() + " danmu containing LLM keywords.");
+
+        // 2. Chinese word segmentation and counting
+        Map<String, Long> wordCounts = new HashMap<>();
+        for (String danmu : relevantDanmus) {
+            // Segment with HanLP
+            List<Term> termList = HanLP.segment(danmu);
+
+            for (Term term : termList) {
+                String word = term.word.trim();
+
+                // Additional filtering:
+                // 1. word length greater than 1
+                // 2. core stop-word filtering (pass the String word, not the Term object)
+                // 3. keep only words containing at least one Chinese character, letter, or digit
+                if (word.length() > 1
+                        && !CoreStopWordDictionary.contains(word)
+                        && word.matches(".*[\\u4e00-\\u9fa5a-zA-Z0-9]+.*")) {
+
+                    wordCounts.put(word, wordCounts.getOrDefault(word, 0L) + 1);
+                }
+            }
+        }
+
+        // 3. Sort by frequency and take the top N
+        return wordCounts.entrySet().stream()
+                .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
+                .limit(topN)
+                .collect(Collectors.toList());
+    }
+}
\ No newline at end of file
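
For reference, a minimal usage sketch (not part of the patch): the WordAnalyzerDemo class and the sample danmu strings below are invented for illustration only, and it assumes the HanLP dependency (com.hankcs:hanlp) is on the classpath.

// Hypothetical caller, not included in this patch
package com.llm.analysis;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

public class WordAnalyzerDemo {
    public static void main(String[] args) {
        // Invented sample data standing in for crawled danmu
        List<String> danmuList = Arrays.asList(
                "ChatGPT 真的改变了我的工作方式",
                "大模型的幻觉问题还是很严重",
                "今天天气不错"   // contains no LLM keyword, so it is filtered out
        );

        // Print the ten most frequent words among the keyword-matching danmu
        List<Map.Entry<String, Long>> top = WordAnalyzer.analyzeAndGetTopN(danmuList, 10);
        top.forEach(e -> System.out.println(e.getKey() + " -> " + e.getValue()));
    }
}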