import os
import random
import re
import time
from typing import Dict, List, Optional

import pandas as pd
import requests


class BilibiliDanmuCrawler:
    """Build and filter mock danmu (bullet comments) about LLM applications.

    The methods visible here do not perform any network requests: the
    ``requests`` session is created in ``__init__`` but only
    ``generate_mock_data`` (synthetic data) and ``filter_noise`` (text
    filter) are implemented in this class.
    """

    # Regexes describing low-information danmu. They are applied with
    # ``re.match`` and ``re.IGNORECASE`` to the stripped text.
    DEFAULT_NOISE_PATTERNS = (
        r'^666+$', r'^[0-9]+$', r'^点赞$', r'^前排$', r'^沙发$',
        r'^哈哈哈+$', r'^233+$', r'^awsl$', r'^爷青回$',
    )

    # Inclusive length bounds (in characters) for a danmu to be kept.
    MIN_DANMU_LENGTH = 2
    MAX_DANMU_LENGTH = 50

    # Characters used for the random part of a synthetic video id.
    BVID_ALPHABET = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    # Search keywords; one is attached at random to every generated row.
    KEYWORDS = ('大语言模型', '大模型', 'LLM')

    # Long-form danmu about LLM applications. The original code built this
    # list but never sampled from it; it is only used when
    # ``generate_mock_data(include_long_form=True)`` is requested.
    LLM_APPLICATIONS = (
        "大语言模型在编程辅助方面真的很强,代码生成效率高",
        "ChatGPT改变了我的工作方式,写作效率提升明显",
        "LLM在医疗领域的应用很有前景,能辅助诊断",
        "大模型的训练成本还是太高了,中小企业用不起",
        "国产大模型越来越好了,比如文心一言和通义千问",
        "提示工程很重要,好的提示词能大幅提升效果",
        "AI写作助手节省了很多时间,特别是写报告",
        "语言模型在教育应用很棒,能个性化辅导学生",
        "担心AI会取代一些初级程序员的工作",
        "大模型的伦理问题需要更多关注和监管",
        "多模态大模型是未来趋势,能理解图片和文字",
        "本地部署大模型很有必要,保护数据隐私",
        "AI绘画配合大语言模型很强大,创意工作更高效",
        "企业级大模型应用越来越多,降本增效明显",
        "大语言模型的数据安全问题需要重视",
        "代码自动补全功能太实用了,开发效率翻倍",
        "智能客服应用成熟,24小时在线服务",
        "机器翻译质量大幅提升,接近人工水平",
        "内容创作领域AI应用广泛,自媒体人的利器",
        "数据分析结合LLM,洞察发现更快捷",
    )

    # Short danmu grouped by application domain (domain name -> texts).
    APPLICATIONS = {
        '编程开发': [
            "代码生成太方便了", "编程助手很好用", "debug效率提升", "自动补全智能",
            "程序员必备工具", "开发效率大幅提升", "代码审查助手",
        ],
        '内容创作': [
            "写作助手真棒", "内容生成快速", "文案创作神器", "自媒体好帮手",
            "创意写作辅助", "营销文案生成",
        ],
        '教育培训': [
            "学习辅导不错", "教育应用前景广", "个性化教学", "智能答疑系统",
            "在线教育革新",
        ],
        '医疗健康': [
            "医疗诊断辅助", "健康咨询AI", "病历分析助手", "药物研发应用",
        ],
        '商业办公': [
            "办公自动化", "企业智能助手", "数据分析工具", "商业决策支持",
            "客户服务优化",
        ],
        '智能客服': [
            "客服效率提升", "24小时在线服务", "智能问答准确", "用户服务体验好",
        ],
        '翻译理解': [
            "多语言翻译强", "语义理解准确", "跨语言交流便利", "翻译质量高",
        ],
        '创意设计': [
            "AI绘画惊艳", "创意设计辅助", "艺术创作伙伴", "设计灵感来源",
        ],
    }

    # User opinions; each is joined with one of the matching suffixes below.
    OPINIONS_POSITIVE = (
        "效果超出预期", "用户体验很好", "技术发展迅速", "应用价值高",
        "工作效率提升", "学习成本低", "界面友好易用",
    )
    OPINIONS_CONCERNS = (
        "成本还是偏高", "数据隐私担忧", "技术不够稳定", "回答有时不准",
        "需要网络连接", "企业应用成本高", "依赖国外技术",
    )
    POSITIVE_SUFFIXES = ('推荐使用', '值得尝试', '会继续使用')
    CONCERN_SUFFIXES = ('需要改进', '希望优化', '期待更好')

    def __init__(self):
        """Create an HTTP session with browser-like headers and the noise list."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://www.bilibili.com',
        })
        # Per-instance copy so callers can extend the list without touching
        # the class-level defaults.
        self.noise_patterns = list(self.DEFAULT_NOISE_PATTERNS)

    def filter_noise(self, danmu: str) -> bool:
        """Return True when *danmu* should be kept, False when it is noise.

        A danmu is rejected when it is not a string, when its stripped
        length is outside ``MIN_DANMU_LENGTH..MAX_DANMU_LENGTH``, or when it
        matches any regex in ``self.noise_patterns`` (case-insensitively).
        """
        if not isinstance(danmu, str):
            return False

        text = danmu.strip()
        if not self.MIN_DANMU_LENGTH <= len(text) <= self.MAX_DANMU_LENGTH:
            return False

        return not any(
            re.match(pattern, text, re.IGNORECASE)
            for pattern in self.noise_patterns
        )

    def generate_mock_data(
        self,
        *,
        num_videos: int = 20,
        seed: Optional[int] = None,
        include_long_form: bool = False,
    ) -> pd.DataFrame:
        """Generate a DataFrame of synthetic danmu.

        Each synthetic video receives between 10 and 20 danmu. Every danmu is
        drawn uniformly from the available kinds (application remark,
        positive opinion, concern, and optionally a long-form remark) and is
        kept only if it passes ``filter_noise``. The ``keyword`` column is
        picked at random from ``KEYWORDS`` independently of the danmu text.

        Args:
            num_videos: Number of synthetic video ids to generate.
            seed: When given, a private ``random.Random(seed)`` is used so the
                output is reproducible; when ``None`` the module-level
                ``random`` state is used, as in the original implementation.
            include_long_form: When True, ``LLM_APPLICATIONS`` is added as a
                fourth kind of danmu. Off by default so the default output
                distribution is unchanged.

        Returns:
            A DataFrame with the columns ``bvid``, ``danmu`` and ``keyword``
            (present even when no rows are produced).

        Raises:
            ValueError: If *num_videos* is negative.
        """
        if num_videos < 0:
            raise ValueError(f"num_videos must be non-negative, got {num_videos}")

        print("生成模拟弹幕数据...")

        # ``random`` (the module) and ``random.Random`` expose the same
        # choice/choices/randint API, so either can serve as the generator.
        rng = random if seed is None else random.Random(seed)

        danmu_types = ['application', 'opinion_positive', 'opinion_concern']
        if include_long_form:
            danmu_types.append('llm_application')
        app_categories = list(self.APPLICATIONS)

        # Synthetic ids: "BV1" followed by 10 random alphanumeric characters.
        bvids = [
            'BV1' + ''.join(rng.choices(self.BVID_ALPHABET, k=10))
            for _ in range(num_videos)
        ]

        rows = []
        for bvid in bvids:
            for _ in range(rng.randint(10, 20)):
                danmu_type = rng.choice(danmu_types)

                if danmu_type == 'application':
                    category = rng.choice(app_categories)
                    danmu = rng.choice(self.APPLICATIONS[category])
                elif danmu_type == 'opinion_positive':
                    danmu = f"{rng.choice(self.OPINIONS_POSITIVE)},{rng.choice(self.POSITIVE_SUFFIXES)}"
                elif danmu_type == 'opinion_concern':
                    danmu = f"{rng.choice(self.OPINIONS_CONCERNS)},{rng.choice(self.CONCERN_SUFFIXES)}"
                else:
                    danmu = rng.choice(self.LLM_APPLICATIONS)

                if self.filter_noise(danmu):
                    rows.append({
                        'bvid': bvid,
                        'danmu': danmu,
                        'keyword': rng.choice(self.KEYWORDS),
                    })

        # Explicit columns keep the schema stable even for an empty result.
        return pd.DataFrame(rows, columns=['bvid', 'danmu', 'keyword'])


def main(output_path: str = 'data/raw/danmu_raw.csv') -> pd.DataFrame:
    """Generate mock danmu data, save it as CSV and print a short preview.

    Args:
        output_path: Destination CSV file. Its parent directory is created
            when missing. Defaults to the path the script originally
            hard-coded.

    Returns:
        The generated DataFrame (the same data that was written to disk).
    """
    crawler = BilibiliDanmuCrawler()

    print("开始生成弹幕数据...")
    df = crawler.generate_mock_data()

    # Make sure the target directory exists; a bare filename has an empty
    # directory part, which os.makedirs would reject.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # utf-8-sig prepends a BOM to the file (presumably so spreadsheet tools
    # detect the encoding of the Chinese text — confirm with consumers).
    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"数据生成完成,共 {len(df)} 条弹幕")
    print(f"数据保存至: {output_path}")

    # Show the first few rows.
    print("\n前5条数据预览:")
    print(df.head())

    return df


# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()