import html
import json
import os
import random
import re
import time
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import numpy as np
import openpyxl
import pandas as pd
import requests
from matplotlib.font_manager import FontProperties
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from wordcloud import WordCloud


class BilibiliDanmuAnalyzer:
    """Crawl, clean, analyse and visualise Bilibili danmaku (bullet comments).

    The pipeline is: search videos by keyword -> download each video's
    danmaku XML -> drop noise comments -> count frequencies -> export to
    Excel and render a word cloud.

    Attributes:
        danmu_data: Placeholder list initialised empty (not written by any
            method of this class).
        filtered_danmu: Cleaned danmaku; assigned by the caller and read by
            ``save_to_excel`` for the sample sheet.
        cookie: Cookie header value sent with search requests ('' = none).
        verify_ssl: Whether ``requests`` verifies TLS certificates.
    """

    # Patterns for comments that carry no information ("666", "打卡", ...).
    _NOISE_PATTERNS = tuple(re.compile(p) for p in (
        r'^[6]{3,}$',      # 666
        r'^[0-9]{1,3}$',   # short pure numbers
        r'^赞+$',          # "like" spam
        r'^前排$',
        r'^第一第一$',
        r'^打卡$',
        r'^签到$',
        r'^来了$',
        r'^.\.$',          # a single character followed by a period
    ))

    # Extracts the text of every <d p="...">text</d> element.
    _DANMU_PATTERN = re.compile(r'<d p=".*?">(.*?)</d>')

    # Matches tokens that contain no character in the U+4E00..U+9FA5 range.
    _NON_CHINESE_PATTERN = re.compile(r'^[^\u4e00-\u9fa5]+$')

    _STOP_WORDS = frozenset([
        '的', '了', '在', '是', '我', '有', '和', '就',
        '不', '人', '都', '一', '一个', '上', '也', '很',
        '到', '说', '要', '去', '你', '会', '着', '没有',
        '看', '好', '自己', '这个',
    ])

    def __init__(self, cookie=None, verify_ssl=True):
        """Create an analyzer.

        Args:
            cookie: Optional Bilibili cookie string for the search API. When
                omitted, the ``BILIBILI_COOKIE`` environment variable is used
                (empty if unset). Credentials are deliberately not embedded
                in the source.
            verify_ssl: Passed as ``verify=`` to every ``requests`` call.
                Set to ``False`` only if certificate verification must be
                skipped (e.g. behind an intercepting proxy).
        """
        self.danmu_data = []
        self.filtered_danmu = []
        if cookie is None:
            cookie = os.environ.get('BILIBILI_COOKIE', '')
        self.cookie = cookie
        self.verify_ssl = verify_ssl

    def fix_encoding_problem(self, text):
        """Repair text that was wrongly decoded as Latin-1 (mojibake).

        If every character of ``text`` fits in Latin-1, the original bytes
        are recovered and re-decoded. UTF-8 is tried first because it is
        strict (it rejects almost all non-UTF-8 byte sequences), then GBK
        (a superset of GB2312) and Big5.

        Args:
            text: The possibly garbled string.

        Returns:
            The repaired string, or ``text`` unchanged when it is not a
            string, cannot be Latin-1 encoded, or no candidate encoding
            decodes it.
        """
        if not isinstance(text, str):
            return text
        try:
            raw = text.encode('latin-1')
        except UnicodeEncodeError:
            # Contains characters outside Latin-1: already properly decoded.
            return text
        for enc in ('utf-8', 'gbk', 'big5'):
            try:
                return raw.decode(enc)
            except UnicodeDecodeError:
                continue
        return text

    def crawl_bilibili_videos(self):
        """Collect videos related to large language models.

        Returns:
            A list of ``{'bvid': ..., 'title': ...}`` dicts from the search
            API, or the built-in sample list when the API yields nothing.
        """
        print("开始爬取B站大语言模型相关视频...")

        # Option 1: query the search API directly.
        videos = self._crawl_by_api()

        # Option 2: fall back to predefined video IDs.
        if not videos:
            print("使用备用方案...")
            videos = self._get_sample_videos()

        return videos

    @staticmethod
    def _collect_new_videos(video_list, seen_bvids):
        """Return the not-yet-seen videos of one search result page.

        Args:
            video_list: Iterable of result dicts from the search API.
            seen_bvids: Set of BV ids already collected; updated in place.

        Returns:
            A list of ``{'bvid', 'title'}`` dicts for entries whose ``bvid``
            starts with ``'BV'`` and was not in ``seen_bvids``.
        """
        fresh = []
        for video in video_list:
            bvid = video.get('bvid')
            if bvid and bvid.startswith('BV') and bvid not in seen_bvids:
                seen_bvids.add(bvid)
                fresh.append({'bvid': bvid, 'title': video.get('title', '未知标题')})
        return fresh

    def _crawl_by_api(self):
        """Search several keywords and return the de-duplicated video list.

        Errors for an individual keyword are printed and that keyword is
        skipped, so the method always returns a (possibly empty) list.

        Returns:
            A list of ``{'bvid', 'title'}`` dicts accumulated over all
            keywords, unique by ``bvid``.
        """
        keywords = ["大语言模型", "大模型", "LLM", "人工智能", "ChatGPT"]
        url = "https://api.bilibili.com/x/web-interface/search/type"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://www.bilibili.com',
        }
        if self.cookie:
            headers['cookie'] = self.cookie

        bv_set = set()   # BV ids seen so far, for de-duplication
        bv_list = []     # accumulated across ALL keywords

        for keyword in keywords:
            try:
                params = {
                    'search_type': 'video',
                    'keyword': keyword,
                    'order': 'totalrank',  # default comprehensive ranking
                    'page_size': 30,
                }
                response = requests.get(url, params=params, headers=headers,
                                        timeout=10, verify=self.verify_ssl)

                if response.status_code == 200:
                    data = response.json()
                    # 'data' / 'result' may be present but null on API errors.
                    video_list = (data.get('data') or {}).get('result') or []
                    new_videos = self._collect_new_videos(video_list, bv_set)
                    bv_list.extend(new_videos)
                    print(f"关键词 '{keyword}' 找到 {len(video_list)} 个视频(去重后保留 {len(new_videos)} 个)")
                else:
                    print(f"关键词 '{keyword}' 请求失败,状态码: {response.status_code}")

                # Throttle requests to reduce the chance of being blocked.
                time.sleep(random.uniform(1, 3))

            except Exception as e:
                # Best-effort crawl: report and move on to the next keyword.
                print(f"爬取关键词 '{keyword}' 时出错: {e}")
                continue

        print(f"总计获取到 {len(bv_list)} 个唯一的BV号")
        return bv_list

    def _get_sample_videos(self):
        """Return a fixed list of sample videos used as a fallback."""
        sample_videos = [
            {'bvid': 'BV1KYCKBTERZ', 'title': 'Gemini 3.0 彻底杀疯了!GPT-5.1 还有活路吗?'},
            {'bvid': 'BV1Gu41137rP', 'title': 'LLM技术发展现状'},
            {'bvid': 'BV1Nu411379z', 'title': 'ChatGPT背后的秘密'},
        ]
        return sample_videos

    def crawl_danmu_data(self, videos):
        """Download danmaku for every video.

        Args:
            videos: Iterable of dicts with at least a ``'bvid'`` key and,
                for logging, a ``'title'`` key.

        Returns:
            A flat list of danmaku strings from all videos. Videos that
            fail are reported and skipped.
        """
        all_danmus = []

        for video in videos:
            try:
                danmus = self._get_danmu_by_cid(video['bvid'])
                all_danmus.extend(danmus)
                print(f"视频 {video['title']} 爬取了 {len(danmus)} 条弹幕")
                # Pause between videos to keep the request rate low.
                time.sleep(random.uniform(2, 4))
            except Exception as e:
                print(f"爬取视频 {video.get('title', '')} 弹幕失败: {e}")
                continue

        return all_danmus

    @classmethod
    def _parse_danmu_xml(cls, xml_content):
        """Extract danmaku texts from the danmaku XML document.

        Args:
            xml_content: The XML text containing ``<d p="...">text</d>``
                elements.

        Returns:
            A list of comment strings with XML entities (``&amp;``,
            ``&lt;`` ...) converted back to the characters they stand for.
        """
        return [html.unescape(body) for body in cls._DANMU_PATTERN.findall(xml_content)]

    def _get_danmu_by_cid(self, bvid):
        """Fetch the danmaku of a video identified by its BV id.

        The video's ``cid`` is looked up first, then the danmaku XML for
        that ``cid`` is downloaded and parsed.

        Args:
            bvid: The video's BV id.

        Returns:
            A list of danmaku strings; empty on any HTTP or parsing failure
            (the error is printed, not raised).
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Referer': 'https://www.bilibili.com/',
                'Origin': 'https://www.bilibili.com',
            }

            # Step 1: resolve the video's cid.
            response = requests.get("https://api.bilibili.com/x/web-interface/view",
                                    params={'bvid': bvid}, headers=headers,
                                    timeout=10, verify=self.verify_ssl)
            if response.status_code != 200:
                return []
            cid = response.json()['data']['cid']

            # Step 2: download the danmaku XML for that cid.
            danmu_response = requests.get("https://api.bilibili.com/x/v1/dm/list.so",
                                          params={'oid': cid}, headers=headers,
                                          timeout=10, verify=self.verify_ssl)
            if danmu_response.status_code == 200:
                # Decode as UTF-8 regardless of the declared charset;
                # undecodable bytes become U+FFFD instead of raising.
                xml_content = danmu_response.content.decode('utf-8', errors='replace')
                return self._parse_danmu_xml(xml_content)

        except Exception as e:
            print(f"获取弹幕失败: {e}")

        return []

    def filter_noise_danmu(self, danmus):
        """Remove low-information danmaku.

        A comment is dropped when, after stripping whitespace, it has two
        characters or fewer or matches one of the noise patterns.

        Args:
            danmus: Iterable of danmaku strings.

        Returns:
            A list of the stripped comments that were kept, in input order.
        """
        filtered = []
        for danmu in danmus:
            content = danmu.strip()
            if len(content) <= 2:
                continue
            if any(pattern.match(content) for pattern in self._NOISE_PATTERNS):
                continue
            filtered.append(content)
        return filtered

    def analyze_danmu_frequency(self, danmus, top_n=20):
        """Compute the most frequent words across all danmaku.

        Text is segmented with jieba; single-character tokens, stop words
        and tokens without any Chinese character are discarded.

        Args:
            danmus: Iterable of danmaku strings.
            top_n: Number of most common words to return (default 20).

        Returns:
            A list of ``(word, count)`` tuples, most frequent first.
        """
        text = ' '.join(danmus)
        words = jieba.cut(text)

        # NOTE(review): the non-Chinese filter also removes Latin tokens
        # such as "LLM" or "ChatGPT" - confirm this is intended.
        filtered_words = [
            word for word in words
            if len(word) > 1
            and word not in self._STOP_WORDS
            and not self._NON_CHINESE_PATTERN.match(word)
        ]

        word_freq = Counter(filtered_words)
        return word_freq.most_common(top_n)

    def save_to_excel(self, top_danmus, word_freq, filename='danmu_analysis.xlsx'):
        """Write the analysis results to an Excel workbook.

        Three sheets are written: the top danmaku, the word frequencies and
        the first 100 entries of ``self.filtered_danmu``.

        Args:
            top_danmus: Sequence of ``(danmaku, count)`` pairs.
            word_freq: Sequence of ``(word, count)`` pairs.
            filename: Output path of the workbook.
        """
        with pd.ExcelWriter(filename, engine='openpyxl') as writer:
            # Most repeated danmaku.
            df_top = pd.DataFrame(top_danmus, columns=['弹幕内容', '出现次数'])
            df_top.to_excel(writer, sheet_name='热门弹幕TOP8', index=False)

            # Word-frequency statistics.
            df_freq = pd.DataFrame(word_freq, columns=['词语', '出现次数'])
            df_freq.to_excel(writer, sheet_name='词频统计TOP20', index=False)

            # Sample of the cleaned raw data.
            sample_df = pd.DataFrame({'弹幕样本': self.filtered_danmu[:100]})
            sample_df.to_excel(writer, sheet_name='弹幕样本', index=False)

        print(f"数据已保存到 {filename}")

    def find_chinese_font(self):
        """Locate a font file with Chinese glyphs on the current system.

        Returns:
            The first existing path from a list of well-known Windows,
            macOS and Linux font locations, or ``None`` if none exists.
        """
        possible_paths = [
            # Windows
            'C:/Windows/Fonts/simhei.ttf',   # SimHei
            'C:/Windows/Fonts/simsun.ttc',   # SimSun
            'C:/Windows/Fonts/msyh.ttc',     # Microsoft YaHei
            # macOS
            '/System/Library/Fonts/PingFang.ttc',
            '/System/Library/Fonts/STHeiti Light.ttc',
            # Linux
            '/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf',
            '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',
            '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc',
        ]

        for path in possible_paths:
            if os.path.exists(path):
                print(f"找到中文字体: {path}")
                return path

        print("警告: 未找到系统中文字体,将使用英文显示")
        return None

    def generate_wordcloud(self, danmus, output_file='wordcloud.png'):
        """Render a word cloud of the danmaku, save it and show it.

        Args:
            danmus: Iterable of danmaku strings.
            output_file: Path of the PNG image to write.

        Returns:
            None. When ``danmus`` contains no text, a message is printed
            and nothing is rendered (WordCloud raises on empty input).
        """
        text = ' '.join(danmus)
        if not text.strip():
            print("没有可用的弹幕文本,跳过词云生成")
            return

        # Use a font found on this machine rather than a hard-coded file
        # name; None makes WordCloud fall back to its bundled font.
        font_path = self.find_chinese_font()

        # NOTE(review): the text is passed unsegmented, so tokens come from
        # WordCloud's own tokenizer rather than jieba - confirm intended.
        wc = WordCloud(
            font_path=font_path,
            width=1200,
            height=800,
            background_color='white',
            max_words=200,
            colormap='viridis',
            contour_width=1,
            contour_color='steelblue',
        )
        wc.generate(text)

        plt.figure(figsize=(15, 12))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        # Render the title with the located font when one was found, so
        # the Chinese characters are drawn with a font that has them.
        if font_path:
            title_style = {'fontproperties': FontProperties(fname=font_path, size=16)}
        else:
            title_style = {'fontsize': 16}
        plt.title('B站大语言模型相关视频弹幕词云图', pad=20, **title_style)
        plt.tight_layout()
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.show()

        print(f"词云图已保存为 {output_file}")


def main():
    """Run the full pipeline: crawl, clean, analyse, export and visualise.

    Stops after the crawling stage when no usable danmaku were obtained,
    because the statistics would be empty and a word cloud cannot be built
    from empty text.
    """
    analyzer = BilibiliDanmuAnalyzer()

    # Stage 1: data acquisition.
    print("=" * 50)
    print("阶段1: 数据获取")
    print("=" * 50)

    videos = analyzer.crawl_bilibili_videos()
    print(f"成功获取 {len(videos)} 个相关视频")

    raw_danmus = analyzer.crawl_danmu_data(videos)
    print(f"原始弹幕数量: {len(raw_danmus)}")

    # Drop noise comments.
    analyzer.filtered_danmu = analyzer.filter_noise_danmu(raw_danmus)
    print(f"过滤后有效弹幕数量: {len(analyzer.filtered_danmu)}")

    if not analyzer.filtered_danmu:
        # Nothing to analyse (e.g. network failure); exit cleanly instead
        # of failing later during word-cloud generation.
        print("未获取到有效弹幕,终止后续统计分析与可视化")
        return

    # Stage 2: statistics.
    print("\n" + "=" * 50)
    print("阶段2: 数据统计分析")
    print("=" * 50)

    # Count identical danmaku directly.
    danmu_counter = Counter(analyzer.filtered_danmu)
    top_danmus = danmu_counter.most_common(8)

    print("弹幕数量排名前8:")
    for i, (danmu, count) in enumerate(top_danmus, 1):
        print(f"{i}. {danmu}: {count}次")

    # Word-frequency analysis (already limited to the top 20 words).
    word_freq = analyzer.analyze_danmu_frequency(analyzer.filtered_danmu)
    print("\n词语频率排名前20:")
    for i, (word, count) in enumerate(word_freq, 1):
        print(f"{i}. {word}: {count}次")

    # Export to Excel.
    analyzer.save_to_excel(top_danmus, word_freq)

    # Stage 3: visualisation.
    print("\n" + "=" * 50)
    print("阶段3: 数据可视化")
    print("=" * 50)

    analyzer.generate_wordcloud(analyzer.filtered_danmu)


# Run the pipeline only when the file is executed as a script.
if __name__ == "__main__":
    main()