You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
102301535/crawler.py

153 lines
6.5 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import pandas as pd
import time
import re
import random
from typing import List, Dict
import os
class BilibiliDanmuCrawler:
    """Generate mock Bilibili danmu (bullet-comment) data about LLM applications.

    Despite the name, no HTTP request is issued by the methods below: all
    danmu records are fabricated locally.  The `requests.Session` is only
    prepared (browser-like headers) for a potential future real crawler.
    """

    def __init__(self):
        # Session kept for future real crawling; headers mimic a browser so
        # Bilibili would not reject the requests outright.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://www.bilibili.com'
        })
        # Regex patterns that mark a danmu as low-information noise
        # (spam numbers, "first!", laughter, meme spam, ...).
        self.noise_patterns = [
            r'^666+$', r'^[0-9]+$', r'^点赞$', r'^前排$', r'^沙发$',
            r'^哈哈哈+$', r'^233+$', r'^awsl$', r'^爷青回$'
        ]
        # Compile once here; filter_noise() previously recompiled (via the
        # re cache lookup) for every danmu tested.
        self._noise_regexes = [re.compile(p, re.IGNORECASE)
                               for p in self.noise_patterns]

    def filter_noise(self, danmu: str) -> bool:
        """Return True when *danmu* is worth keeping (i.e. not noise).

        A danmu is rejected when, after stripping surrounding whitespace, it
        is shorter than 2 or longer than 50 characters, or when it matches
        any of the noise patterns (case-insensitively).
        """
        danmu = danmu.strip()
        if not 2 <= len(danmu) <= 50:
            return False
        return not any(rx.match(danmu) for rx in self._noise_regexes)

    def generate_mock_data(self, num_videos: int = 20, seed=None) -> pd.DataFrame:
        """Generate a DataFrame of mock danmu records.

        Args:
            num_videos: number of fake BV ids to create; each receives a
                random 10-20 danmus (before noise filtering).
            seed: optional int for reproducible output.  ``None`` (default)
                keeps the original fully-random behavior.  A private
                ``random.Random`` is used so the global random state is
                never disturbed.

        Returns:
            DataFrame with columns ``bvid``, ``danmu`` and ``keyword``.
        """
        print("生成模拟弹幕数据...")
        rng = random.Random(seed)
        # Full-sentence danmus about LLM application scenarios.
        # NOTE: this list existed before but was never drawn from (dead
        # data); it is now wired in as its own danmu type below.
        llm_applications = [
            "大语言模型在编程辅助方面真的很强,代码生成效率高",
            "ChatGPT改变了我的工作方式写作效率提升明显",
            "LLM在医疗领域的应用很有前景能辅助诊断",
            "大模型的训练成本还是太高了,中小企业用不起",
            "国产大模型越来越好了,比如文心一言和通义千问",
            "提示工程很重要,好的提示词能大幅提升效果",
            "AI写作助手节省了很多时间特别是写报告",
            "语言模型在教育应用很棒,能个性化辅导学生",
            "担心AI会取代一些初级程序员的工作",
            "大模型的伦理问题需要更多关注和监管",
            "多模态大模型是未来趋势,能理解图片和文字",
            "本地部署大模型很有必要,保护数据隐私",
            "AI绘画配合大语言模型很强大创意工作更高效",
            "企业级大模型应用越来越多,降本增效明显",
            "大语言模型的数据安全问题需要重视",
            "代码自动补全功能太实用了,开发效率翻倍",
            "智能客服应用成熟24小时在线服务",
            "机器翻译质量大幅提升,接近人工水平",
            "内容创作领域AI应用广泛自媒体人的利器",
            "数据分析结合LLM洞察发现更快捷"
        ]
        # Short danmus grouped by application domain.
        applications = {
            '编程开发': [
                "代码生成太方便了", "编程助手很好用", "debug效率提升", "自动补全智能",
                "程序员必备工具", "开发效率大幅提升", "代码审查助手"
            ],
            '内容创作': [
                "写作助手真棒", "内容生成快速", "文案创作神器", "自媒体好帮手",
                "创意写作辅助", "营销文案生成"
            ],
            '教育培训': [
                "学习辅导不错", "教育应用前景广", "个性化教学", "智能答疑系统",
                "在线教育革新"
            ],
            '医疗健康': [
                "医疗诊断辅助", "健康咨询AI", "病历分析助手", "药物研发应用"
            ],
            '商业办公': [
                "办公自动化", "企业智能助手", "数据分析工具", "商业决策支持",
                "客户服务优化"
            ],
            '智能客服': [
                "客服效率提升", "24小时在线服务", "智能问答准确", "用户服务体验好"
            ],
            '翻译理解': [
                "多语言翻译强", "语义理解准确", "跨语言交流便利", "翻译质量高"
            ],
            '创意设计': [
                "AI绘画惊艳", "创意设计辅助", "艺术创作伙伴", "设计灵感来源"
            ]
        }
        # Fragments for opinion-style danmus (positive vs. concerned).
        opinions_positive = [
            "效果超出预期", "用户体验很好", "技术发展迅速", "应用价值高",
            "工作效率提升", "学习成本低", "界面友好易用"
        ]
        opinions_concerns = [
            "成本还是偏高", "数据隐私担忧", "技术不够稳定", "回答有时不准",
            "需要网络连接", "企业应用成本高", "依赖国外技术"
        ]
        all_danmus = []
        bvid_charset = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        bvids = [f"BV1{''.join(rng.choices(bvid_charset, k=10))}"
                 for _ in range(num_videos)]
        for bvid in bvids:
            # 10-20 danmus per video, each of a random type.
            for _ in range(rng.randint(10, 20)):
                danmu_type = rng.choice(
                    ['llm', 'application', 'opinion_positive', 'opinion_concern'])
                if danmu_type == 'llm':
                    danmu = rng.choice(llm_applications)
                elif danmu_type == 'application':
                    app_category = rng.choice(list(applications.keys()))
                    danmu = rng.choice(applications[app_category])
                elif danmu_type == 'opinion_positive':
                    danmu = f"{rng.choice(opinions_positive)}{rng.choice(['推荐使用', '值得尝试', '会继续使用'])}"
                else:
                    danmu = f"{rng.choice(opinions_concerns)}{rng.choice(['需要改进', '希望优化', '期待更好'])}"
                if self.filter_noise(danmu):
                    all_danmus.append({
                        'bvid': bvid,
                        'danmu': danmu,
                        'keyword': rng.choice(['大语言模型', '大模型', 'LLM'])
                    })
        return pd.DataFrame(all_danmus)
def main(output_path='data/raw/danmu_raw.csv'):
    """Generate mock danmu data and persist it as a CSV file.

    Args:
        output_path: destination CSV file; parent directories are created as
            needed.  Defaults to the original hard-coded location so existing
            callers are unaffected.  (Previously the path was repeated in
            three places and could silently drift apart.)

    Returns:
        The generated DataFrame.
    """
    crawler = BilibiliDanmuCrawler()
    print("开始生成弹幕数据...")
    df = crawler.generate_mock_data()
    # Create the target directory; dirname is '' for a bare filename,
    # in which case there is nothing to create.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # utf-8-sig adds a BOM so Excel opens the Chinese text correctly.
    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"数据生成完成,共 {len(df)} 条弹幕")
    print(f"数据保存至: {output_path}")
    # Preview the first few rows for a quick sanity check.
    print("\n前5条数据预览:")
    print(df.head())
    return df


if __name__ == "__main__":
    main()