/**
 * Text analysis utilities
 * Provides term frequency statistics, trend analysis, collocate analysis, and more
 */

// import _ from 'lodash'

// Chinese stop words (common function words, particles, etc.)
const chineseStopWords = [
  '的', '了', '和', '是', '在', '我', '有', '就', '不', '人', '都',
  '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会',
  '着', '没有', '看', '好', '自己', '这', '那', '这个', '那个', '啊',
  '吧', '呢', '啦', '呀', '吗', '哦', '哪', '对', '可以', '他', '她',
  '它', '这些', '那些', '把', '让', '向', '往', '是否', '什么', '怎么',
  '如何', '为', '为了', '依', '从', '当', '来', '被'
]

// English stop words
const englishStopWords = [
  'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
  'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
  'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
  'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
  'theirs', 'themselves', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
  'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do',
  'does', 'did', 'doing', 'would', 'should', 'could', 'ought', 'i\'m', 'you\'re',
  'he\'s', 'she\'s', 'it\'s', 'we\'re', 'they\'re', 'i\'ve', 'you\'ve', 'we\'ve',
  'they\'ve', 'i\'d', 'you\'d', 'he\'d', 'she\'d', 'we\'d', 'they\'d', 'i\'ll',
  'you\'ll', 'he\'ll', 'she\'ll', 'we\'ll', 'they\'ll', 'isn\'t', 'aren\'t',
  'wasn\'t', 'weren\'t', 'hasn\'t', 'haven\'t', 'hadn\'t', 'doesn\'t', 'don\'t',
  'didn\'t', 'won\'t', 'wouldn\'t', 'shan\'t', 'shouldn\'t', 'can\'t', 'cannot',
  'couldn\'t', 'mustn\'t', 'let\'s', 'that\'s', 'who\'s', 'what\'s', 'here\'s',
  'there\'s', 'when\'s', 'where\'s', 'why\'s', 'how\'s', 'of', 'on', 'at', 'in',
  'to', 'for', 'with', 'by', 'about', 'against', 'between', 'into', 'through',
  'during', 'before', 'after', 'above', 'below', 'from', 'up', 'down', 'out',
  'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there',
  'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
  'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
  'so', 'than', 'too', 'very'
]

// Combined stop word list
const stopWordsList = [...chineseStopWords, ...englishStopWords]

/**
 * Tokenizer
 * Splits on whitespace and punctuation; for Chinese text it also emits
 * overlapping 1-4 character n-grams as a lightweight stand-in for real
 * word segmentation
 *
 * @param {string} text - input text
 * @param {object} options - tokenization options
 * @returns {Array} - array of tokens
 */
export const tokenize = (text, options = {}) => {
  const defaultOptions = {
    minLength: 2, // minimum token length
    filterStopWords: true, // whether to filter stop words
    language: 'zh' // language
  }

  const opts = { ...defaultOptions, ...options }

  if (!text || typeof text !== 'string') {
    return []
  }

  // Preprocess: lowercase, replace ASCII and CJK punctuation (including
  // 。 , 、, which the original list missed) with spaces, collapse whitespace
  let processedText = text.toLowerCase()
    .replace(/[.,?!;:()[\]{}""''「」『』【】《》〈〉。,、]/g, ' ')
    .replace(/\s+/g, ' ')
    .trim()

  // Split on spaces (effective for English words and numbers)
  let tokens = processedText.split(' ')

  // For Chinese, perform additional character-level splitting
  if (opts.language === 'zh') {
    tokens = tokens.reduce((result, token) => {
      // If the current token is purely Chinese
      if (/^[\u4e00-\u9fa5]+$/.test(token)) {
        // Two-character combinations
        for (let i = 0; i < token.length - 1; i++) {
          result.push(token.substring(i, i + 2))
        }

        // Three-character combinations
        if (token.length >= 3) {
          for (let i = 0; i < token.length - 2; i++) {
            result.push(token.substring(i, i + 3))
          }
        }

        // Four-character combinations
        if (token.length >= 4) {
          for (let i = 0; i < token.length - 3; i++) {
            result.push(token.substring(i, i + 4))
          }
        }

        // Also add the single characters
        for (let i = 0; i < token.length; i++) {
          result.push(token[i])
        }
      } else {
        // Non-Chinese tokens are added as-is
        result.push(token)
      }
      return result
    }, [])
  }

  // Drop tokens shorter than the minimum length
  tokens = tokens.filter(token => token.length >= opts.minLength)

  // Optionally filter stop words
  if (opts.filterStopWords) {
    tokens = tokens.filter(token => !stopWordsList.includes(token))
  }

  return tokens
}
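
// Illustrative usage of tokenize (a sketch; with the defaults above, a mixed
// sentence yields Chinese character n-grams plus whole English words):
//   tokenize('数据分析 data analysis', { language: 'zh' })
//   // => ['数据', '据分', '分析', '数据分', '据分析', '数据分析', 'data', 'analysis']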

/**
 * Term frequency analysis
 *
 * @param {Array} segments - array of text segments
 * @param {Object} options - analysis options
 * @returns {Array} - term frequency statistics
 */
export const analyzeTermFrequency = (segments, options = {}) => {
  const defaultOptions = {
    minLength: 2,
    filterStopWords: true,
    language: 'zh',
    limit: 100 // maximum number of terms to return
  }

  const opts = { ...defaultOptions, ...options }

  if (!segments || !Array.isArray(segments) || segments.length === 0) {
    return []
  }

  // Concatenate the text of all segments
  const allText = segments.map(segment => segment.text).join(' ')

  // Tokenize
  const tokens = tokenize(allText, {
    minLength: opts.minLength,
    filterStopWords: opts.filterStopWords,
    language: opts.language
  })

  // Count token frequencies
  const frequencyMap = tokens.reduce((acc, token) => {
    acc[token] = (acc[token] || 0) + 1
    return acc
  }, {})

  // Convert to an array, sort by count, and truncate
  const sortedTerms = Object.entries(frequencyMap)
    .map(([term, count]) => ({
      term,
      count,
      percentage: count / tokens.length
    }))
    .sort((a, b) => b.count - a.count)
    .slice(0, opts.limit)

  return sortedTerms
}
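
// Illustrative usage (hypothetical segments; each segment needs a `text` field):
//   analyzeTermFrequency([{ text: '销售增长' }, { text: '销售下降' }], { limit: 3 })
//   // => [{ term: '销售', count: 2, percentage: 2 / 12 }, ...]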

/**
 * Trend analysis
 *
 * @param {Array} segments - array of text segments
 * @param {Array} terms - terms whose trends should be analyzed
 * @param {Object} options - analysis options
 * @returns {Object} - trend analysis result
 */
export const analyzeTermTrends = (segments, terms, options = {}) => {
  const defaultOptions = {
    minLength: 2,
    filterStopWords: true,
    language: 'zh',
    normalization: 'relative' // 'raw' or 'relative'
  }

  const opts = { ...defaultOptions, ...options }

  if (!segments || !Array.isArray(segments) || segments.length === 0) {
    return { terms: [], data: [] }
  }

  if (!terms || !Array.isArray(terms) || terms.length === 0) {
    return { terms: [], data: [] }
  }

  // Compute the frequency of each term within each segment
  const trendsData = segments.map(segment => {
    // Tokenize
    const tokens = tokenize(segment.text, {
      minLength: opts.minLength,
      filterStopWords: opts.filterStopWords,
      language: opts.language
    })

    // Total token count of this segment (used for relative frequency)
    const totalTokens = tokens.length

    // Frequency of each requested term within this segment
    const segmentFrequencies = {}

    // Initialize every term's frequency to 0
    terms.forEach(term => {
      segmentFrequencies[term] = 0
    })

    // Count occurrences
    tokens.forEach(token => {
      if (terms.includes(token)) {
        segmentFrequencies[token]++
      }
    })

    // For relative frequencies, divide each count by the total token count
    if (opts.normalization === 'relative' && totalTokens > 0) {
      Object.keys(segmentFrequencies).forEach(term => {
        segmentFrequencies[term] = segmentFrequencies[term] / totalTokens
      })
    }

    return {
      position: segment.position,
      id: segment.id,
      frequencies: segmentFrequencies
    }
  })

  // Shape the data for chart rendering
  return {
    terms: terms,
    data: trendsData
  }
}
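
// Illustrative usage (hypothetical segments carrying `id`, `position`, and `text`):
//   analyzeTermTrends(segments, ['销售', '成本'])
//   // => { terms: ['销售', '成本'],
//   //      data: [{ position: 0, id: 's1', frequencies: { '销售': 0.05, '成本': 0 } }, ...] }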

/**
 * Collocate analysis
 *
 * @param {Array} segments - array of text segments
 * @param {String} targetTerm - target term
 * @param {Object} options - analysis options
 * @returns {Array} - collocate analysis result
 */
export const analyzeCollocates = (segments, targetTerm, options = {}) => {
  const defaultOptions = {
    minLength: 2,
    filterStopWords: true,
    language: 'zh',
    window: 5, // context window size (in tokens)
    limit: 50 // maximum number of collocates to return
  }

  const opts = { ...defaultOptions, ...options }

  if (!segments || !Array.isArray(segments) || segments.length === 0) {
    return []
  }

  if (!targetTerm || typeof targetTerm !== 'string') {
    return []
  }

  const collocatesMap = {}

  // Iterate over every segment
  segments.forEach(segment => {
    // Tokenize
    const tokens = tokenize(segment.text, {
      minLength: 1, // set to 1 to capture every candidate token
      filterStopWords: false, // defer stop word filtering until below
      language: opts.language
    })

    // Find every position of the target term in the token array
    const targetIndices = []
    tokens.forEach((token, index) => {
      if (token === targetTerm) {
        targetIndices.push(index)
      }
    })

    // For each occurrence of the target term, collect the surrounding tokens
    targetIndices.forEach(targetIndex => {
      // Compute the bounds of the context window
      const start = Math.max(0, targetIndex - opts.window)
      const end = Math.min(tokens.length - 1, targetIndex + opts.window)

      // Collect the tokens within the window
      for (let i = start; i <= end; i++) {
        if (i !== targetIndex) { // skip the target term itself
          const collocate = tokens[i]

          // Ignore tokens shorter than the minimum length
          if (collocate.length < opts.minLength) {
            continue
          }

          // Optionally skip stop words
          if (opts.filterStopWords && stopWordsList.includes(collocate)) {
            continue
          }

          // Count the collocate
          collocatesMap[collocate] = (collocatesMap[collocate] || 0) + 1
        }
      }
    })
  })

  // Convert to an array, sort by count, and truncate
  const sortedCollocates = Object.entries(collocatesMap)
    .map(([term, count]) => ({
      term,
      count
    }))
    .sort((a, b) => b.count - a.count)
    .slice(0, opts.limit)

  return sortedCollocates
}
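
// Illustrative usage: which tokens appear within `window` tokens of '销售'?
//   analyzeCollocates(segments, '销售', { window: 5, limit: 20 })
//   // => [{ term: '增长', count: 12 }, ...] (counts invented for illustration)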

/**
 * Context (keyword-in-context) analysis
 *
 * @param {Array} segments - array of text segments
 * @param {String} targetTerm - target term
 * @param {Object} options - analysis options
 * @returns {Array} - context analysis result
 */
export const analyzeContexts = (segments, targetTerm, options = {}) => {
  const defaultOptions = {
    window: 8, // context window size (characters on each side)
    limit: 50 // maximum number of contexts to return
  }

  const opts = { ...defaultOptions, ...options }

  if (!segments || !Array.isArray(segments) || segments.length === 0) {
    return []
  }

  if (!targetTerm || typeof targetTerm !== 'string') {
    return []
  }

  const contexts = []

  // Iterate over every segment
  segments.forEach(segment => {
    const text = segment.text
    const segmentId = segment.id

    // Find every position of the target term in the text
    let position = 0
    let foundPos = text.indexOf(targetTerm, position)

    while (foundPos !== -1 && contexts.length < opts.limit) {
      // Compute the bounds of the context window
      const startPos = Math.max(0, foundPos - opts.window)
      const endPos = Math.min(text.length, foundPos + targetTerm.length + opts.window)

      // Extract the context
      const before = text.substring(startPos, foundPos)
      const after = text.substring(foundPos + targetTerm.length, endPos)

      // Append to the result list
      contexts.push({
        segmentId,
        position: foundPos,
        before,
        term: targetTerm,
        after
      })

      // Continue searching from the next position
      position = foundPos + targetTerm.length
      foundPos = text.indexOf(targetTerm, position)
    }
  })

  return contexts.slice(0, opts.limit)
}
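
// Illustrative usage (concordance-style output):
//   analyzeContexts([{ id: 's1', text: '今年销售增长明显' }], '销售')
//   // => [{ segmentId: 's1', position: 2, before: '今年', term: '销售', after: '增长明显' }]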

/**
 * Generate word cloud data
 * @param {Array} termFrequency - term frequency data with `term` and `frequency` fields
 * @param {Number} maxCount - maximum number of terms to display
 * @param {Object} options - configuration options
 * @param {Number} options.minSize - minimum font size
 * @param {Number} options.maxSize - maximum font size
 * @returns {Array} word cloud data
 */
export function generateWordCloudData(termFrequency, maxCount = 100, options = {}) {
  if (!termFrequency || termFrequency.length === 0) {
    return []
  }

  const { minSize = 12, maxSize = 32 } = options
  const sortedData = [...termFrequency].sort((a, b) => b.frequency - a.frequency)
  const slicedData = sortedData.slice(0, maxCount)

  // Find the maximum and minimum frequencies
  const maxFreq = slicedData[0].frequency
  const minFreq = slicedData[slicedData.length - 1].frequency

  // Compute a size for each term
  return slicedData.map(item => {
    // Map frequency to font size (linear interpolation)
    let size = minSize
    if (maxFreq !== minFreq) {
      size = minSize + ((item.frequency - minFreq) / (maxFreq - minFreq)) * (maxSize - minSize)
    }

    return {
      name: item.term,
      value: item.frequency,
      size: Math.round(size)
    }
  })
}
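
// Illustrative usage: the most frequent term gets maxSize, the least frequent
// gets minSize, and everything in between is interpolated linearly:
//   generateWordCloudData([{ term: '数据', frequency: 10 }, { term: '分析', frequency: 2 }])
//   // => [{ name: '数据', value: 10, size: 32 }, { name: '分析', value: 2, size: 12 }]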

/**
 * Extract keywords
 * @param {String} text - text content
 * @param {Object} options - configuration options
 * @param {Number} options.count - number of keywords to extract
 * @returns {Array} array of keywords
 */
export function extractKeywords(text, options = {}) {
  const { count = 10 } = options

  // Simplified implementation; real applications may need a more
  // sophisticated algorithm
  if (!text) return []

  // Split the text into words and count frequencies (simplified)
  const words = text.split(/\s+|,|。|;|:|!|?|,|\.|;|:|!|\?/)
    .filter(word => word.length > 1) // drop single characters and empty strings

  const wordFreq = {}
  words.forEach(word => {
    wordFreq[word] = (wordFreq[word] || 0) + 1
  })

  // Convert to an array, sort by score, and truncate
  return Object.keys(wordFreq)
    .map(term => ({ term, score: wordFreq[term] }))
    .sort((a, b) => b.score - a.score)
    .slice(0, count)
}
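
// Illustrative usage: phrases are ranked by raw occurrence count:
//   extractKeywords('销售增长,销售增长,成本下降', { count: 2 })
//   // => [{ term: '销售增长', score: 2 }, { term: '成本下降', score: 1 }]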

/**
 * Sentiment analysis
 * @param {String} text - text content
 * @returns {Object} sentiment analysis result
 */
export function analyzeSentiment(text) {
  if (!text) {
    return {
      score: 0,
      label: '中性',
      positiveCount: 0,
      negativeCount: 0,
      neutralCount: 0
    }
  }

  // Simplified sentiment lexicons
  const positiveWords = [
    '优秀', '良好', '满意', '喜欢', '赞', '好', '优化', '提高', '增长',
    '成功', '积极', '优点', '强', '高效', '精确', '合适', '顺利'
  ]

  const negativeWords = [
    '差', '失败', '缺点', '问题', '错误', '缺陷', '不良', '不足',
    '弱', '低效', '不精确', '不合适', '困难', '危险', '降低', '减少'
  ]

  // Simple counts of positive and negative word occurrences
  let positiveCount = 0
  let negativeCount = 0
  let neutralCount = 0

  // Split into rough clauses (simplified tokenization)
  const words = text.split(/\s+|,|。|;|:|!|?|,|\.|;|:|!|\?/)
    .filter(word => word.length > 0)

  words.forEach(word => {
    if (positiveWords.some(pw => word.includes(pw))) {
      positiveCount++
    } else if (negativeWords.some(nw => word.includes(nw))) {
      negativeCount++
    } else {
      neutralCount++
    }
  })

  // Sentiment score in [-1, 1]: -1 is extremely negative, 1 extremely positive
  const totalWords = positiveCount + negativeCount + neutralCount
  let score = 0

  if (totalWords > 0) {
    score = (positiveCount - negativeCount) / totalWords
  }

  // Determine the sentiment label
  let label = '中性'
  if (score > 0.6) label = '非常积极'
  else if (score > 0.2) label = '积极'
  else if (score > -0.2) label = '中性'
  else if (score > -0.6) label = '消极'
  else label = '非常消极'

  return {
    score,
    label,
    positiveCount,
    negativeCount,
    neutralCount
  }
}
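
// Illustrative usage: score = (positive - negative) / total clauses. One
// positive clause ('增长') and one negative clause ('问题') cancel out:
//   analyzeSentiment('销售增长顺利,成本问题严重')
//   // => { score: 0, label: '中性', positiveCount: 1, negativeCount: 1, neutralCount: 0 }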

/**
 * Group terms by category
 * @param {Array} termFrequency - term frequency data
 * @param {Object} categories - category dictionary: keys are category names, values are arrays of words
 * @returns {Object} terms grouped by category
 */
export function groupTermsByCategory(termFrequency, categories) {
  if (!termFrequency || !categories) {
    return {}
  }

  const result = {}

  // Initialize the result object
  Object.keys(categories).forEach(category => {
    result[category] = []
  })

  // Iterate over the term frequency data
  termFrequency.forEach(item => {
    const { term } = item

    // Check which categories the term belongs to
    Object.keys(categories).forEach(category => {
      const categoryWords = categories[category]
      // Match if the term is in the category's word list, or contains one of its words
      if (categoryWords.includes(term) || categoryWords.some(word => term.includes(word))) {
        result[category].push(item)
      }
    })
  })

  return result
}
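
// Illustrative usage (a term can land in several categories at once):
//   groupTermsByCategory(
//     [{ term: '销售增长', frequency: 5 }],
//     { 财务: ['销售', '成本'], 趋势: ['增长', '下降'] }
//   )
//   // => { 财务: [{ term: '销售增长', frequency: 5 }], 趋势: [{ term: '销售增长', frequency: 5 }] }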

/**
 * Compute text similarity
 * @param {String} text1 - first text
 * @param {String} text2 - second text
 * @returns {Object} similarity analysis result
 */
export function calculateTextSimilarity(text1, text2) {
  if (!text1 || !text2) {
    return {
      similarity: 0,
      commonWords: [],
      uniqueWords1: [],
      uniqueWords2: []
    }
  }

  // Simplified implementation: split both texts into words
  const words1 = text1.split(/\s+|,|。|;|:|!|?|,|\.|;|:|!|\?/)
    .filter(word => word.length > 1)
  const words2 = text2.split(/\s+|,|。|;|:|!|?|,|\.|;|:|!|\?/)
    .filter(word => word.length > 1)

  // Count word frequencies
  const freq1 = {}
  const freq2 = {}

  words1.forEach(word => {
    freq1[word] = (freq1[word] || 0) + 1
  })

  words2.forEach(word => {
    freq2[word] = (freq2[word] || 0) + 1
  })

  // Find shared and unique words
  const commonWords = []
  const uniqueWords1 = []
  const uniqueWords2 = []

  Object.keys(freq1).forEach(word => {
    if (freq2[word]) {
      commonWords.push({
        term: word,
        freq1: freq1[word],
        freq2: freq2[word]
      })
    } else {
      uniqueWords1.push({
        term: word,
        frequency: freq1[word]
      })
    }
  })

  Object.keys(freq2).forEach(word => {
    if (!freq1[word]) {
      uniqueWords2.push({
        term: word,
        frequency: freq2[word]
      })
    }
  })

  // Jaccard similarity: |intersection| / |union| of the two word sets
  const allWords = new Set([...Object.keys(freq1), ...Object.keys(freq2)])
  const intersection = commonWords.length
  const similarity = intersection / allWords.size

  return {
    similarity,
    commonWords,
    uniqueWords1,
    uniqueWords2
  }
}
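
// Illustrative usage: with word sets {销售, 增长} and {增长, 成本},
// the intersection has 1 word and the union has 3, so Jaccard = 1/3:
//   calculateTextSimilarity('销售 增长', '增长 成本')
//   // => { similarity: 0.333..., commonWords: [{ term: '增长', freq1: 1, freq2: 1 }], ... }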

/**
 * Convert term frequency data to CSV
 * @param {Array} termFrequency - term frequency data
 * @returns {String} data in CSV format
 */
export function exportTermFrequencyToCsv(termFrequency) {
  if (!termFrequency || !termFrequency.length) {
    return ''
  }

  // CSV header (columns: term, frequency, percentage; kept in Chinese for the UI)
  let csvContent = '术语,频率,百分比\n'

  // Total frequency across all terms
  const totalFrequency = termFrequency.reduce((sum, item) => sum + item.frequency, 0)

  // Append one row per term
  termFrequency.forEach(item => {
    const percentage = ((item.frequency / totalFrequency) * 100).toFixed(2)
    csvContent += `${item.term},${item.frequency},${percentage}%\n`
  })

  return csvContent
}
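
// Illustrative usage:
//   exportTermFrequencyToCsv([{ term: '数据', frequency: 3 }, { term: '分析', frequency: 1 }])
//   // => '术语,频率,百分比\n数据,3,75.00%\n分析,1,25.00%\n'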

/**
 * Generate mock term frequency data (for testing)
 * @param {Number} count - number of terms
 * @returns {Array} mock term frequency data
 */
export function generateMockTermFrequency(count = 50) {
  const terms = [
    '数据', '分析', '报告', '增长', '下降', '趋势', '预测', '计划',
    '指标', '目标', '完成', '销售', '产品', '市场', '客户', '服务',
    '研发', '技术', '创新', '战略', '管理', '团队', '协作', '效率',
    '质量', '成本', '收入', '利润', '预算', '投资', '回报', '风险',
    '机会', '挑战', '问题', '解决', '方案', '实施', '评估', '改进',
    '优化', '整合', '扩展', '收缩', '专注', '多元', '流程', '体系',
    '标准', '规范', '合规', '审计', '监控', '决策', '执行', '反馈',
    '调整', '变革', '稳定', '增值', '季度', '年度', '月度', '周期',
    '短期', '长期', '快速', '稳健', '领先', '落后', '核心', '边缘',
    '内部', '外部', '上游', '下游', '供应', '需求', '库存', '周转',
    '提高', '降低', '强化', '弱化', '轻量', '重量', '敏捷', '刚性',
    '柔性', '弹性', '创造', '破坏', '构建', '拆解', '增加', '减少'
  ]

  // Never request more terms than are available
  const actualCount = Math.min(count, terms.length)
  const selectedTerms = []

  // Pick terms at random
  while (selectedTerms.length < actualCount) {
    const randomIndex = Math.floor(Math.random() * terms.length)
    const term = terms[randomIndex]

    if (!selectedTerms.find(item => item.term === term)) {
      // Random frequency, biased so earlier picks tend to score higher
      // (approximating a long-tail distribution); the Math.max guard keeps
      // the bias component non-negative for large counts
      const frequency = Math.floor(Math.random() * 100) +
        Math.floor(Math.random() * Math.max(0, 100 - selectedTerms.length * 2))

      selectedTerms.push({
        term,
        frequency
      })
    }
  }

  // Sort by descending frequency
  return selectedTerms.sort((a, b) => b.frequency - a.frequency)
}
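
// Illustrative usage (output is random on every call):
//   generateMockTermFrequency(3)
//   // => e.g. [{ term: '市场', frequency: 153 }, { term: '数据', frequency: 88 }, { term: '风险', frequency: 12 }]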

/**
 * Generate term co-occurrence network data
 *
 * @param {Array} segments - array of text segments
 * @param {Array} terms - terms whose relationships should be analyzed
 * @param {Object} options - analysis options
 * @returns {Object} - network data (nodes and links)
 */
export const generateTermLinks = (segments, terms, options = {}) => {
  const defaultOptions = {
    minCooccurrence: 2, // minimum number of co-occurrences
    window: 50, // co-occurrence window size (characters within a segment)
    maxLinks: 50 // maximum number of links
  }

  const opts = { ...defaultOptions, ...options }

  if (!segments || !Array.isArray(segments) || segments.length === 0) {
    return { nodes: [], links: [] }
  }

  if (!terms || !Array.isArray(terms) || terms.length < 2) {
    return { nodes: [], links: [] }
  }

  // Initialize the co-occurrence matrix
  const cooccurrenceMatrix = {}
  terms.forEach(term1 => {
    cooccurrenceMatrix[term1] = {}
    terms.forEach(term2 => {
      if (term1 !== term2) {
        cooccurrenceMatrix[term1][term2] = 0
      }
    })
  })

  // Count term co-occurrences
  segments.forEach(segment => {
    const text = segment.text

    // For each pair of terms, check whether they co-occur within the window
    terms.forEach(term1 => {
      // Find every position of term1 in the text
      const positions1 = []
      let foundPos1 = text.indexOf(term1)
      while (foundPos1 !== -1) {
        positions1.push(foundPos1)
        foundPos1 = text.indexOf(term1, foundPos1 + term1.length)
      }

      // For each position of term1, check whether the other terms fall within the window
      terms.forEach(term2 => {
        if (term1 !== term2) {
          // Find every position of term2 in the text
          const positions2 = []
          let foundPos2 = text.indexOf(term2)
          while (foundPos2 !== -1) {
            positions2.push(foundPos2)
            foundPos2 = text.indexOf(term2, foundPos2 + term2.length)
          }

          // Count every pair of occurrences that falls within the window
          positions1.forEach(pos1 => {
            positions2.forEach(pos2 => {
              if (Math.abs(pos1 - pos2) <= opts.window) {
                cooccurrenceMatrix[term1][term2]++
              }
            })
          })
        }
      })
    })
  })

  // Build the node and link data
  const nodes = terms.map(term => ({
    name: term,
    value: 1 // could be replaced with the term's frequency
  }))

  let links = []

  // Add links (note: each qualifying pair is emitted in both directions)
  terms.forEach(term1 => {
    terms.forEach(term2 => {
      if (term1 !== term2 && cooccurrenceMatrix[term1][term2] >= opts.minCooccurrence) {
        links.push({
          source: term1,
          target: term2,
          value: cooccurrenceMatrix[term1][term2]
        })
      }
    })
  })

  // Sort by co-occurrence count and cap the number of links
  links = links.sort((a, b) => b.value - a.value).slice(0, opts.maxLinks)

  return { nodes, links }
}
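
// Illustrative usage (the { name/value } nodes and { source/target/value } links
// match the shape an ECharts graph series expects; each qualifying pair currently
// yields two directed links, A->B and B->A):
//   generateTermLinks(segments, ['销售', '成本', '利润'], { minCooccurrence: 2 })
//   // => { nodes: [{ name: '销售', value: 1 }, ...],
//   //      links: [{ source: '销售', target: '成本', value: 4 }, ...] }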