parent
af79ad9d01
commit
b747054991
|
After Width: | Height: | Size: 91 KiB |
Binary file not shown.
@ -0,0 +1,20 @@
|
||||
from tool.scrawl import Scawler
|
||||
import tool.word_filter as static
|
||||
import tool.keywords as keywords
|
||||
import tool.csv_parse as cp
|
||||
from tool.static import analyze_danmu_statistics
|
||||
from tool.cloud_show import Cloud_shower
|
||||
|
||||
if __name__ == "__main__":
    # One-off scraping step (already performed; raw danmaku is cached on disk):
    # scawler = Scawler()
    # ls = scawler.work(1, 40)
    # with open('./raw_danmu.txt', 'w', encoding='utf-8') as f:
    #     for danmu in ls:
    #         f.write(danmu + '\n')

    # Load the cached raw danmaku, one message per line, dropping blanks.
    # (Iterating the file directly avoids readlines() materializing the list twice.)
    with open('./raw_danmu.txt', 'r', encoding='utf-8') as f:
        ls = [line.strip() for line in f if line.strip()]

    # Count and filter the messages. analyze_danmu_statistics returns None
    # when its internal try/except caught an error — fail loudly here instead
    # of crashing later with an opaque AttributeError inside to_show().
    danmu_counter = analyze_danmu_statistics(ls)
    if danmu_counter is None:
        raise SystemExit("弹幕统计失败,未生成词云")

    shower = Cloud_shower()
    shower.to_show(danmu_counter)
    print("词云生成完毕,保存为 ai_danmu_stylecloud.png")
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,53 @@
|
||||
from stylecloud import gen_stylecloud
|
||||
|
||||
|
||||
class Cloud_shower:
    """Render a {word: frequency} mapping as a stylecloud PNG image."""

    def __init__(self):
        pass

    def to_show(self, sample_danmu_data_frq):
        """Expand the frequency map into weighted text and render it.

        Each word is repeated `freq` times so stylecloud's internal
        counter reproduces the supplied frequencies.
        """
        expanded = " ".join(
            word
            for word, freq in sample_danmu_data_frq.items()
            for _ in range(freq)
        )
        gen_stylecloud(
            text=expanded,                          # pre-expanded text
            size=1024,                              # image size; larger = sharper
            font_path='msyh.ttc',                   # CJK-capable font (Microsoft YaHei)
            output_name='ai_danmu_stylecloud.png',  # output file name
            icon_name='fas fa-question-circle',
            custom_stopwords=['的', '了', '在', '是', '我', '有', '和', '机'],  # custom stopwords
            palette='colorbrewer.qualitative.Set1_8',  # preset palette
            # background_color='white',             # background color
            gradient='horizontal',                  # gradient direction
            max_font_size=200,                      # largest glyph size
            max_words=500,                          # word-count cap
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import tool.csv_parse as cp

    # Usage example: load {content: frequency} from CSV and render the cloud.
    data_map = cp.read_csv_to_list('./result.csv')
    print(data_map)

    shower = Cloud_shower()
    # NOTE: a hard-coded sample_danmu_data list used to sit here but was
    # never passed anywhere (data_map is what gets rendered) — removed.
    shower.to_show(data_map)
    print("词云生成完毕,保存为 ai_danmu_stylecloud.png")
|
||||
@ -0,0 +1,61 @@
|
||||
import csv
from typing import Dict, List, Tuple

import palettable
||||
|
||||
|
||||
def read_csv_to_list(csv_file: str) -> Dict[str, int]:
    """
    Read (content, frequency) rows from a CSV file into a dict.

    Args:
        csv_file: path to the CSV file; the first row is treated as a
            header and skipped.

    Returns:
        Dict[str, int]: mapping of content -> frequency. Empty when the
        file is missing; possibly partial when a row is malformed
        (errors are printed, not raised).

    Note:
        The previous annotation claimed ``List[Tuple[str, int]]`` but the
        function has always returned a dict; the annotation now matches
        the actual behavior.
    """
    result: Dict[str, int] = {}
    try:
        with open(csv_file, 'r', encoding='utf-8') as file:
            csv_reader = csv.reader(file)
            # Skip the header row (if any).
            next(csv_reader, None)

            for row in csv_reader:
                # Skip blank/short rows instead of letting an IndexError
                # abort the whole read via the broad except below.
                if len(row) < 2:
                    continue
                content = row[0].strip()      # content column
                result[content] = int(row[1]) # frequency column (as int)

    except FileNotFoundError:
        print(f"错误:文件 {csv_file} 未找到")
    except ValueError as e:
        print(f"错误:频率值转换失败 - {e}")
    except Exception as e:
        print(f"读取文件时发生错误:{e}")

    return result
|
||||
|
||||
|
||||
def trans_list_to_csv(ls: List[Tuple[str, int]], csv_file: str) -> None:
    """
    Persist (content, frequency) pairs to *csv_file* with a header row.

    Args:
        ls: list of (content, frequency) tuples.
        csv_file: destination CSV path.
    """
    try:
        with open(csv_file, 'w', encoding='utf-8', newline='') as file:
            writer = csv.writer(file)
            # Header row first, then one row per pair. Commas inside the
            # content are mapped to underscores (original behavior).
            writer.writerow(['内容', '频率'])
            writer.writerows(
                [text.replace(",", "_"), count] for text, count in ls
            )

        print(f"数据已成功保存到 {csv_file}")

    except Exception as e:
        print(f"写入文件时发生错误:{e}")
|
||||
@ -0,0 +1,309 @@
|
||||
# AI-related keyword vocabulary used to recognize on-topic danmaku.
# NOTE: the list may contain duplicates (e.g. '文心一言' appears in two
# sections); it is deduplicated and sorted further down in this module.
keywords = [
    # Core terms
    'AI',
    '人工智能',
    '机器学习',
    '深度学习',
    '神经网络',
    '自然语言处理',
    'NLP',
    '计算机视觉',
    'CV',
    '大模型',
    'GPT',
    'ChatGPT',
    '文心一言',
    '通义千问',
    '智谱',
    '讯飞星火',
    '强化学习',
    '生成式AI',
    'AIGC',
    'LLM',
    '扩散模型',
    'Transformer',
    'BERT',
    'Stable Diffusion',
    'Midjourney',
    'DALL-E',
    'AI绘画',
    'AI写作',
    'AI编程',
    '自动驾驶',
    '智能助手',
    '语音识别',
    '图像识别',
    '目标检测',
    '语义分割',
    '数据挖掘',
    '知识图谱',
    '推荐系统',

    # Large-model related
    'GPT-3',
    'GPT-4',
    'GPT-5',
    'ChatGLM',
    'LLaMA',
    'Vicuna',
    'Alpaca',
    'Bloom',
    'T5',
    'BART',
    'ERNIE',
    'Claude',
    'Gemini',
    'PaLM',
    'LaMDA',
    'Codex',
    'Copilot',
    'InstructGPT',
    'Sparrow',
    'Gopher',
    'Chinchilla',
    'Jurassic',
    'WuDao',
    'PanGu',
    'PLUG',
    'M6',
    'CPM',
    'EVA',
    'CogView',

    # Multimodal AI
    '多模态',
    '视觉语言模型',
    'VLM',
    'DALL-E2',
    'DALL-E3',
    'Imagen',
    'Parti',
    'CogVideo',
    'Make-A-Video',
    'Phenaki',
    'NUWA',
    'CogView2',
    'CogView3',
    '文生图',
    '图生文',
    '文生视频',
    '语音合成',
    'TTS',
    'ASR',
    '语音克隆',

    # Technical architecture
    '注意力机制',
    '自注意力',
    '多头注意力',
    '编码器',
    '解码器',
    '预训练',
    '微调',
    '提示工程',
    'Prompt',
    '思维链',
    'CoT',
    '零样本学习',
    '小样本学习',
    '指令调优',
    'RLHF',
    '人类反馈强化学习',
    '对齐',
    '缩放定律',
    '涌现能力',

    # Application scenarios
    '智能客服',
    '聊天机器人',
    '虚拟人',
    '数字人',
    'AI主播',
    '内容生成',
    '代码生成',
    '智能编程',
    '低代码',
    '无代码',
    '智能文档',
    'RAG',
    '检索增强',
    'AI搜索',
    '智能问答',
    '知识库',
    '智能诊断',
    'AI制药',
    'AI金融',
    '量化交易',
    '风险控制',
    '智能投顾',
    'AI教育',
    '个性化学习',
    '智慧城市',
    '智能交通',
    '工业AI',
    '预测性维护',
    '质量检测',
    'AI农业',
    '精准农业',
    '智能家居',

    # Tools & frameworks
    'TensorFlow',
    'PyTorch',
    'Keras',
    'Hugging Face',
    'Transformers库',
    'Diffusers',
    'LangChain',
    'LlamaIndex',
    'AutoGPT',
    'BabyAGI',
    'OpenAI',
    'Anthropic',
    'Google AI',
    'Microsoft AI',
    'Meta AI',
    '百度AI',
    '阿里云',
    '腾讯云',
    '华为云',
    '讯飞开放平台',

    # Technical concepts
    '监督学习',
    '无监督学习',
    '半监督学习',
    '自监督学习',
    '迁移学习',
    '元学习',
    '联邦学习',
    '图神经网络',
    'GNN',
    '胶囊网络',
    '神经图灵机',
    '生成对抗网络',
    'GAN',
    '变分自编码器',
    'VAE',
    '流模型',
    '标准化流',
    '贝叶斯深度学习',
    '可解释AI',
    'XAI',
    '因果推断',
    '鲁棒性',
    '公平性',

    # Emerging directions
    '具身智能',
    'AGI',
    '通用人工智能',
    '超级智能',
    'AI安全',
    '对齐问题',
    'AI伦理',
    'AI治理',
    'AI法规',
    'AI for Science',
    '科学智能',
    'AlphaFold',
    '天气预报',
    '气候模拟',
    '蛋白质设计',
    '材料发现',
    'AI辅助创作',

    # Chinese-market models, vendors and labs
    '文心一言',
    '通义千问',
    '讯飞星火',
    '腾讯混元',
    '字节豆包',
    '智谱AI',
    '月之暗面',
    '深度求索',
    '零一万物',
    '百川智能',
    '昆仑万维',
    '幻方AI',
    '上海AI实验室',
    '北京智源',
    '之江实验室'
]
|
||||
# Smaller keyword list kept below for quick validation (intentionally commented out)
|
||||
# # ai_keywords = [
|
||||
# 'AI', '人工智能', '机器学习', '深度学习', '神经网络', '自然语言处理', 'NLP', '计算机视觉', 'CV', '大模型', 'GPT', 'ChatGPT', '文心一言', '通义千问',
|
||||
# '智谱', '讯飞星火', '强化学习', '生成式AI', 'AIGC', 'LLM', '扩散模型', 'Transformer', 'BERT', 'Stable Diffusion', 'Midjourney',
|
||||
# 'DALL-E', 'AI绘画', 'AI写作', 'AI编程', '自动驾驶', '智能助手', '语音识别', '图像识别', '目标检测', '语义分割', '数据挖掘', '知识图谱', '推荐系统'
|
||||
# ]
|
||||
|
||||
# Regex fragments matching colloquial/filler danmaku expressions
|
||||
colloquial_patterns = [
    # Question particles
    # FIX: a missing trailing comma here used to merge this entry with
    # r'是不是' via implicit string concatenation ('我\?*是不是'),
    # silently destroying both patterns.
    r'我\?*',
    r'是不是',
    r'吗\?*',
    r'呢\?*',
    r'吧\?*',
    r'啊\?*',
    r'呀\?*',
    r'啦\?*',
    # Exclamation particles
    r'啊!*',
    r'呀!*',
    r'啦!*',
    r'哇!*',
    r'哦!*',
    r'哟!*',
    r'诶!*',
    # Colloquial fillers
    r'^啊\s+',
    r'^呃\s+',
    r'^嗯\s+',
    r'^哼\s+',
    r'^哈\s+',
    r'^嘿\s+',
    r'牢大',
    r'hhh+',
    r'哈哈+',
    r'嘻嘻+',
    r'嘿嘿+',
    r'呵呵+',
    r'太NB了',
    r'牛啊',  # literal duplicate removed (list is deduplicated below anyway)
    r'太厉害了吧',
    r'卧槽',
    r'我靠',
    r'啥\?*',
    r'咋\s+',
    r'啥意思',
    r'怎么回事',
    # Other colloquial expressions
    r'我的妈呀',
    r'天啊',
    r'上帝',
    r'佛祖',
    r'妈呀',
    r'我晕',
    r'我吐了',
    r'我裂开了',
    r'我傻了',
    r'我惊了',
    r'救命',
    r'要命',
    r'完蛋',
    r'糟糕',
    r'该死'
]
|
||||
# Deduplicate and sort both vocabularies. sorted() accepts any iterable,
# so the intermediate list() wrapper was redundant (ruff C414).
keywords = sorted(set(keywords))
colloquial_patterns = sorted(set(colloquial_patterns))
|
||||
|
||||
if __name__ == "__main__":
    # Persist the keyword vocabulary, one keyword per line.
    with open('keywords.txt', 'w', encoding='utf-8') as f:
        f.writelines(keyword + '\n' for keyword in keywords)

    print(f"已创建关键词文件,包含 {len(keywords)} 个AI相关关键词")
|
||||
@ -0,0 +1,61 @@
|
||||
import pandas as pd
|
||||
from collections import Counter
|
||||
import logging
|
||||
|
||||
from tool.word_filter import DanmakuFilter
|
||||
|
||||
|
||||
def analyze_danmu_statistics(danmu_list, top_n=8, output_file='danmu_statistics.xlsx'):
    """
    Count danmaku messages and export the most frequent ones to Excel.

    参数:
        danmu_list: list of danmaku strings.
        top_n: how many of the most frequent messages to report.
        output_file: destination Excel file name.

    Returns:
        The filtered Counter on success, or None if any step failed
        (the error is printed, not raised).
    """
    try:
        # Count raw messages, then drop colloquial noise via the project filter.
        filtered_counter = DanmakuFilter().filter_danmaku(Counter(danmu_list))

        # One report row per ranked message (rank starts at 1).
        rows = [
            {'排名': rank, '弹幕内容': text, '出现次数': hits}
            for rank, (text, hits) in enumerate(filtered_counter.most_common(top_n), 1)
        ]

        report = pd.DataFrame(rows)
        print(report.to_string(index=False))
        report.to_excel(output_file, index=False)
        return filtered_counter

    except Exception as e:
        print(f"统计过程中发生错误: {e}")
        return None
|
||||
|
||||
|
||||
# Usage example
if __name__ == "__main__":
    # Sample data — replace with real scraped danmaku.
    sample_danmu_data = [
        "AI技术真厉害", "大模型应用广泛", "深度学习", "神经网络", "机器学习", "AI技术真厉害", "自然语言处理", "计算机视觉", "大模型应用广泛", "强化学习", "AI技术真厉害",
        "生成式AI", "深度学习", "大模型应用广泛", "Transformer", "AI技术真厉害"
    ]

    # Basic statistics run.
    print("=== 基本统计功能 ===")
    result = analyze_danmu_statistics(sample_danmu_data, top_n=8)

    if result is not None:
        print("\n统计结果预览:")
        print(result)
        # Detailed report step.
        print("\n=== 生成详细分析报告 ===")
|
||||
@ -0,0 +1,11 @@
|
||||
{
|
||||
"folders": [
|
||||
{
|
||||
"path": ".."
|
||||
},
|
||||
{
|
||||
"path": "../.."
|
||||
}
|
||||
],
|
||||
"settings": {}
|
||||
}
|
||||
Loading…
Reference in new issue