You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

60 lines
2.6 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

// src/main/java/com/llm/analysis/WordAnalyzer.java
package com.llm.analysis;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
import com.hankcs.hanlp.seg.common.Term;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Word-frequency analysis for bullet comments ("danmu") that mention large language models.
 *
 * <p>Comments are first filtered by a fixed keyword list, then segmented with HanLP, and the
 * surviving tokens are counted and ranked.
 */
public class WordAnalyzer {

    /**
     * Keywords used to decide whether a comment is LLM-related. Every entry is already
     * lower-case, so a comment only needs to be lower-cased once before matching.
     */
    private static final List<String> LLM_KEYWORDS = Collections.unmodifiableList(Arrays.asList(
            "大语言模型", "大模型", "llm", "gpt", "ai", "人工智能", "chatgpt", "文心一言", "通义千问", "讯飞星火"
    ));

    /**
     * Matches a token that contains at least one character in the range U+4E00..U+9FA5,
     * an ASCII letter, or an ASCII digit. Compiled once instead of on every token.
     */
    private static final Pattern COUNTABLE_WORD =
            Pattern.compile(".*[\\u4e00-\\u9fa5a-zA-Z0-9]+.*");

    /**
     * Segments the LLM-related comments, counts the resulting words and returns the most
     * frequent ones.
     *
     * <p>A comment is considered LLM-related when its lower-cased text contains any entry of
     * {@link #LLM_KEYWORDS}; matching is therefore case-insensitive ("ChatGPT" matches
     * "chatgpt"). Word counting itself is case-sensitive and uses the original comment text.
     * The number of matching comments is printed to standard output.
     *
     * @param danmuList raw comments; {@code null} is treated as an empty list and
     *                  {@code null} elements are skipped
     * @param topN      maximum number of entries to return; must not be negative
     * @return word/count entries ordered by descending count, ties broken by the natural
     *         ordering of the word so the result is deterministic
     * @throws IllegalArgumentException if {@code topN} is negative
     */
    public static List<Map.Entry<String, Long>> analyzeAndGetTopN(List<String> danmuList, int topN) {
        // Fail before doing any segmentation work; Stream.limit would reject this value anyway.
        if (topN < 0) {
            throw new IllegalArgumentException("topN must not be negative: " + topN);
        }
        List<String> source = (danmuList == null) ? Collections.<String>emptyList() : danmuList;

        // 1. Keep only the comments that mention an LLM keyword.
        List<String> relevantDanmus = source.stream()
                .filter(WordAnalyzer::mentionsLlm)
                .collect(Collectors.toList());
        System.out.println("筛选出包含LLM关键词的弹幕" + relevantDanmus.size() + " 条。");

        // 2. Segment each comment with HanLP and count the tokens worth keeping.
        Map<String, Long> wordCounts = new HashMap<>();
        for (String danmu : relevantDanmus) {
            for (Term term : HanLP.segment(danmu)) {
                String word = term.word.trim();
                if (isCountable(word)) {
                    wordCounts.merge(word, 1L, Long::sum);
                }
            }
        }

        // 3. Rank by count (highest first); the key comparator makes tie order stable.
        return wordCounts.entrySet().stream()
                .sorted(Map.Entry.<String, Long>comparingByValue().reversed()
                        .thenComparing(Map.Entry.<String, Long>comparingByKey()))
                .limit(topN)
                .collect(Collectors.toList());
    }

    /** Returns whether the comment, compared case-insensitively, contains any LLM keyword. */
    private static boolean mentionsLlm(String danmu) {
        if (danmu == null) {
            return false;
        }
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
        String lowered = danmu.toLowerCase(Locale.ROOT);
        return LLM_KEYWORDS.stream().anyMatch(lowered::contains);
    }

    /**
     * Returns whether a token should be counted: longer than one character, not in HanLP's
     * core stop-word dictionary, and not made up solely of symbols/punctuation.
     */
    private static boolean isCountable(String word) {
        return word.length() > 1
                && !CoreStopWordDictionary.contains(word)
                && COUNTABLE_WORD.matcher(word).matches();
    }
}