@@ -0,0 +1,60 @@
// src/main/java/com/llm/analysis/WordAnalyzer.java
package com.llm.analysis;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
import com.hankcs.hanlp.seg.common.Term;

import java.util.*;
import java.util.stream.Collectors;

public class WordAnalyzer {

    // Target keywords used to filter LLM-related danmu (the English keywords are stored in lowercase)
    private static final List<String> LLM_KEYWORDS = Arrays.asList(
            "大语言模型", "大模型", "llm", "gpt", "ai", "人工智能", "chatgpt", "文心一言", "通义千问", "讯飞星火"
    );

    /**
     * Segments, filters, and counts word frequencies over a list of danmu, returning the top N results.
     *
     * @param danmuList the raw danmu list
     * @param topN      the number of top entries to return
     * @return a list of map entries pairing each word with its frequency, sorted by frequency in descending order
     */
    public static List<Map.Entry<String, Long>> analyzeAndGetTopN(List<String> danmuList, int topN) {
        // 1. Keep only danmu that contain at least one LLM keyword
        List<String> relevantDanmus = danmuList.stream()
                // Lowercase the danmu before matching so that "GPT" and "gpt" are treated alike
                // (the English keywords are already lowercase)
                .filter(d -> {
                    String lower = d.toLowerCase();
                    return LLM_KEYWORDS.stream().anyMatch(lower::contains);
                })
                .collect(Collectors.toList());

        System.out.println("Danmu containing LLM keywords: " + relevantDanmus.size());

        // 2. Chinese word segmentation and counting
        Map<String, Long> wordCounts = new HashMap<>();
        for (String danmu : relevantDanmus) {
            // Segment the danmu with HanLP
            List<Term> termList = HanLP.segment(danmu);

            for (Term term : termList) {
                String word = term.word.trim();

                // Further filtering:
                // 1. keep only words longer than one character
                // 2. drop core stop words (FIXED: pass the String word, not the Term)
                // 3. keep only tokens containing at least one Chinese character, letter, or digit
                if (word.length() > 1
                        && !CoreStopWordDictionary.contains(word) // fix: pass the String word
                        && word.matches(".*[\\u4e00-\\u9fa5a-zA-Z0-9]+.*")) {

                    wordCounts.put(word, wordCounts.getOrDefault(word, 0L) + 1);
                }
            }
        }

        // 3. Sort by frequency and take the top N
        return wordCounts.entrySet().stream()
                .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
                .limit(topN)
                .collect(Collectors.toList());
    }
}
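For context, here is a minimal, hypothetical driver sketching how analyzeAndGetTopN might be called. The Main class name and the sample danmu strings are illustrative assumptions and not part of the file above; a real run would feed in danmu loaded from the actual data source, with HanLP already on the classpath.

// src/main/java/com/llm/analysis/Main.java (hypothetical usage sketch)
package com.llm.analysis;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

public class Main {
    public static void main(String[] args) {
        // Illustrative danmu only; a real run would load them from the project's data source
        List<String> danmuList = Arrays.asList(
                "GPT写代码真的强",
                "大模型还是会一本正经地胡说八道",
                "今晚吃什么",
                "文心一言和通义千问哪个更好用?"
        );

        // Print the 10 most frequent words among LLM-related danmu
        for (Map.Entry<String, Long> entry : WordAnalyzer.analyzeAndGetTopN(danmuList, 10)) {
            System.out.println(entry.getKey() + " -> " + entry.getValue());
        }
    }
}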