You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

355 lines
14 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import time
import os
import random
import re
from collections import Counter
import pandas as pd
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import openpyxl
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json
class BilibiliDanmuAnalyzer:
    """Collect and analyse danmu (bullet comments) of Bilibili videos about LLMs.

    Pipeline: search for videos, download each video's danmu, drop
    low-information comments, compute frequencies, then export an Excel
    workbook and a word-cloud image.

    Attributes:
        danmu_data: Initialised to an empty list; not written by any method
            of this class.
        filtered_danmu: Danmu kept after noise filtering. Assigned by the
            caller and read by ``save_to_excel``.
        verify_ssl: Whether HTTPS certificates are verified on API calls.
    """

    # Keywords sent to the search API, one request per keyword.
    SEARCH_KEYWORDS = ("大语言模型", "大模型", "LLM", "人工智能", "ChatGPT")

    # Name of the environment variable that may hold a Bilibili cookie string.
    # Credentials must never be committed to source control.
    COOKIE_ENV_VAR = "BILIBILI_COOKIE"

    # Extracts the text of every <d p="...">...</d> element of the danmu XML.
    _DANMU_PATTERN = re.compile(r'<d p=".*?">(.*?)</d>')

    # Danmu matching any of these carry no analysable content.
    _NOISE_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            r'^[6]{3,}$',       # "666..."
            r'^[0-9]{1,3}$',    # short pure numbers
            r'^赞+$',
            r'^前排$',
            r'^第一第一$',
            r'^打卡$',
            r'^签到$',
            r'^来了$',
            r'^.\.$',           # a single character followed by a period
        )
    )

    def __init__(self, verify_ssl=True):
        """Create an analyzer.

        Args:
            verify_ssl: Verify TLS certificates on HTTPS requests. Pass
                ``False`` only when a trusted intercepting proxy makes
                verification impossible.
        """
        self.danmu_data = []
        self.filtered_danmu = []
        self.verify_ssl = verify_ssl

    def fix_encoding_problem(self, text: str) -> str:
        """Try to repair text whose bytes were wrongly decoded as Latin-1.

        The text is re-encoded as Latin-1 to recover the raw bytes, which are
        then decoded with the candidate encodings in order. UTF-8 is tried
        first because it rejects most byte sequences that are not UTF-8,
        whereas the GB/Big5 codecs accept a much wider range of input.

        This is a heuristic: genuine Latin-1 text that happens to form a
        valid multi-byte sequence will be converted as well.

        Args:
            text: Possibly garbled text.

        Returns:
            The repaired text, or ``text`` unchanged when it is not a string,
            cannot be represented in Latin-1, or no candidate encoding fits.
        """
        if not isinstance(text, str):
            return text
        try:
            raw = text.encode('latin-1')
        except UnicodeEncodeError:
            # Contains characters outside Latin-1, so it is not Latin-1 mojibake.
            return text
        for enc in ('utf-8', 'gb2312', 'gbk', 'big5'):
            try:
                return raw.decode(enc)
            except UnicodeDecodeError:
                continue
        return text

    def crawl_bilibili_videos(self):
        """Return LLM-related videos as a list of ``{'bvid', 'title'}`` dicts.

        The search API is tried first; when it yields nothing, a built-in
        sample list is returned instead.
        """
        print("开始爬取B站大语言模型相关视频...")
        videos = self._crawl_by_api()
        if not videos:
            print("使用备用方案...")
            videos = self._get_sample_videos()
        return videos

    def _crawl_by_api(self):
        """Search every keyword and return the de-duplicated videos found.

        Results of all keywords are accumulated into one list. A failing
        keyword is reported and skipped, so the method returns an empty list
        (rather than raising) when every request fails.

        Returns:
            A list of ``{'bvid': str, 'title': str}`` dicts, unique by BV id.
        """
        url = "https://api.bilibili.com/x/web-interface/search/type"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://www.bilibili.com',
        }
        # The session cookie is supplied through the environment instead of
        # being hard-coded; without it the request is sent anonymously.
        cookie = os.environ.get(self.COOKIE_ENV_VAR)
        if cookie:
            headers['cookie'] = cookie

        seen_bvids = set()
        bv_list = []
        for keyword in self.SEARCH_KEYWORDS:
            params = {
                'search_type': 'video',
                'keyword': keyword,
                'order': 'totalrank',   # overall ranking
                'page_size': 30,
            }
            try:
                response = requests.get(
                    url,
                    params=params,
                    headers=headers,
                    timeout=10,
                    verify=self.verify_ssl,
                )
                if response.status_code == 200:
                    payload = response.json()
                    # 'data' or 'result' may be missing or null in the reply.
                    video_list = (payload.get('data') or {}).get('result') or []
                    added = 0
                    for video in video_list:
                        bvid = video.get('bvid')
                        title = video.get('title', '未知标题')
                        if bvid and bvid.startswith('BV') and bvid not in seen_bvids:
                            seen_bvids.add(bvid)
                            bv_list.append({'bvid': bvid, 'title': title})
                            added += 1
                    print(f"关键词 '{keyword}' 找到 {len(video_list)} 个视频(去重后保留 {added} 个)")
                else:
                    print(f"关键词 '{keyword}' 请求失败,状态码: {response.status_code}")
                # Throttle requests to lower the risk of being blocked.
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                # Best effort: report the failure and move on to the next keyword.
                print(f"爬取关键词 '{keyword}' 时出错: {e}")
                continue
        print(f"总计获取到 {len(bv_list)} 个唯一的BV号")
        return bv_list

    def _get_sample_videos(self):
        """Return the built-in fallback list of ``{'bvid', 'title'}`` dicts."""
        sample_videos = [
            {'bvid': 'BV1KYCKBTERZ', 'title': 'Gemini 3.0 彻底杀疯了GPT-5.1 还有活路吗?'},
            {'bvid': 'BV1Gu41137rP', 'title': 'LLM技术发展现状'},
            {'bvid': 'BV1Nu411379z', 'title': 'ChatGPT背后的秘密'},
        ]
        return sample_videos

    def crawl_danmu_data(self, videos):
        """Download the danmu of every video and return them as one flat list.

        Args:
            videos: Iterable of dicts with at least a ``'bvid'`` key and,
                for progress output, a ``'title'`` key.

        Returns:
            All danmu strings collected; videos that fail are skipped.
        """
        all_danmus = []
        for video in videos:
            try:
                danmus = self._get_danmu_by_cid(video['bvid'])
                all_danmus.extend(danmus)
                print(f"视频 {video['title']} 爬取了 {len(danmus)} 条弹幕")
                time.sleep(random.uniform(2, 4))
            except Exception as e:
                print(f"爬取视频 {video.get('title', '')} 弹幕失败: {e}")
                continue
        return all_danmus

    def _get_danmu_by_cid(self, bvid):
        """Fetch the danmu of the video identified by ``bvid``.

        The video's ``cid`` is looked up first, then the danmu XML for that
        ``cid`` is downloaded and parsed.

        Returns:
            A list of danmu strings; always a list, empty on any HTTP error
            or exception.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://www.bilibili.com/',
            'Origin': 'https://www.bilibili.com',
        }
        try:
            info_url = f"https://api.bilibili.com/x/web-interface/view?bvid={bvid}"
            response = requests.get(
                info_url, headers=headers, timeout=10, verify=self.verify_ssl
            )
            if response.status_code != 200:
                return []
            cid = response.json()['data']['cid']

            danmu_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
            danmu_response = requests.get(
                danmu_url, headers=headers, timeout=10, verify=self.verify_ssl
            )
            if danmu_response.status_code != 200:
                return []
            # Decode the body as UTF-8 rather than relying on charset guessing.
            danmu_response.encoding = 'utf-8'
            return self._parse_danmu_xml(danmu_response.text)
        except Exception as e:
            print(f"获取弹幕失败: {e}")
            return []

    @staticmethod
    def _parse_danmu_xml(xml_text):
        """Extract danmu texts from danmu XML, decoding XML entities."""
        from html import unescape

        return [
            unescape(raw)
            for raw in BilibiliDanmuAnalyzer._DANMU_PATTERN.findall(xml_text)
        ]

    def filter_noise_danmu(self, danmus):
        """Drop low-information danmu and return the rest, stripped.

        A danmu is noise when, after stripping surrounding whitespace, it is
        at most two characters long or matches one of ``_NOISE_PATTERNS``.

        Args:
            danmus: Iterable of danmu strings.

        Returns:
            The stripped danmu that are not noise, in their original order.
        """
        filtered = []
        for danmu in danmus:
            content = danmu.strip()
            if len(content) <= 2:
                continue
            if any(pattern.match(content) for pattern in self._NOISE_PATTERNS):
                continue
            filtered.append(content)
        return filtered

    def analyze_danmu_frequency(self, danmus, top_n=20):
        """Return the most frequent words across ``danmus``.

        The danmu are joined and segmented with jieba. Tokens are discarded
        when they are a single character, a stop word, or contain no
        character in the range U+4E00-U+9FA5 (so purely Latin or numeric
        tokens are not counted).

        Args:
            danmus: Iterable of danmu strings.
            top_n: Number of entries to return.

        Returns:
            A list of ``(word, count)`` tuples, most frequent first.
        """
        text = ' '.join(danmus)
        # Tokens of length <= 1 are removed below, so only multi-character
        # stop words are needed; the original set's empty-string entries
        # could never match and are omitted.
        stop_words = {'一个', '没有', '自己', '这个'}
        no_chinese = re.compile(r'^[^\u4e00-\u9fa5]+$')
        filtered_words = [
            word
            for word in jieba.cut(text)
            if len(word) > 1
            and word not in stop_words
            and not no_chinese.match(word)
        ]
        return Counter(filtered_words).most_common(top_n)

    def save_to_excel(self, top_danmus, word_freq, filename='danmu_analysis.xlsx'):
        """Write the analysis results to an Excel workbook with three sheets.

        Args:
            top_danmus: ``(danmu, count)`` pairs for the most common danmu.
            word_freq: ``(word, count)`` pairs for the most common words.
            filename: Path of the workbook to create.
        """
        with pd.ExcelWriter(filename, engine='openpyxl') as writer:
            df_top = pd.DataFrame(top_danmus, columns=['弹幕内容', '出现次数'])
            df_top.to_excel(writer, sheet_name='热门弹幕TOP8', index=False)

            df_freq = pd.DataFrame(word_freq, columns=['词语', '出现次数'])
            df_freq.to_excel(writer, sheet_name='词频统计TOP20', index=False)

            # At most the first 100 filtered danmu are kept as a sample.
            sample_df = pd.DataFrame({'弹幕样本': self.filtered_danmu[:100]})
            sample_df.to_excel(writer, sheet_name='弹幕样本', index=False)
        print(f"数据已保存到 {filename}")

    def find_chinese_font(self):
        """Return the path of the first known Chinese font file that exists.

        Returns:
            A font file path, or ``None`` when none of the candidates exist.
        """
        possible_paths = [
            # Windows
            'C:/Windows/Fonts/simhei.ttf',
            'C:/Windows/Fonts/simsun.ttc',
            'C:/Windows/Fonts/msyh.ttc',
            # macOS
            '/System/Library/Fonts/PingFang.ttc',
            '/System/Library/Fonts/STHeiti Light.ttc',
            # Linux
            '/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf',
            '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',
            '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc',
            '/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc',
        ]
        for path in possible_paths:
            if os.path.exists(path):
                print(f"找到中文字体: {path}")
                return path
        print("警告: 未找到系统中文字体,将使用英文显示")
        return None

    def generate_wordcloud(self, danmus, output_file='wordcloud.png'):
        """Render a word cloud of ``danmus``, save it and display it.

        The font is located with ``find_chinese_font``; when no Chinese font
        is found, WordCloud's default font is used. Nothing is generated
        when there is no text to draw.

        Args:
            danmus: Iterable of danmu strings.
            output_file: Path of the image file to write.
        """
        text = ' '.join(danmus)
        if not text.strip():
            # WordCloud cannot build a cloud from empty text.
            print("没有可用于生成词云的弹幕,已跳过词云生成")
            return

        # NOTE(review): the text is passed unsegmented, so tokenisation is
        # left to WordCloud; confirm whether jieba segmentation is wanted here.
        wc = WordCloud(
            font_path=self.find_chinese_font(),
            width=1200,
            height=800,
            background_color='white',
            max_words=200,
            colormap='viridis',
            contour_width=1,
            contour_color='steelblue',
        )
        wc.generate(text)

        fig = plt.figure(figsize=(15, 12))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.title('B站大语言模型相关视频弹幕词云图', fontsize=16, pad=20)
        plt.tight_layout()
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
        print(f"词云图已保存为 {output_file}")
def main():
    """Run the whole pipeline: crawl, filter, analyse, export and visualise."""

    def banner(title, lead=""):
        # Print a stage title framed by two 50-character rules.
        rule = "=" * 50
        print(lead + rule)
        print(title)
        print(rule)

    analyzer = BilibiliDanmuAnalyzer()

    # Stage 1: data acquisition
    banner("阶段1: 数据获取")
    videos = analyzer.crawl_bilibili_videos()
    print(f"成功获取 {len(videos)} 个相关视频")
    raw_danmus = analyzer.crawl_danmu_data(videos)
    print(f"原始弹幕数量: {len(raw_danmus)}")

    # Remove noise before any statistics are computed.
    analyzer.filtered_danmu = analyzer.filter_noise_danmu(raw_danmus)
    print(f"过滤后有效弹幕数量: {len(analyzer.filtered_danmu)}")

    # Stage 2: statistics
    banner("阶段2: 数据统计分析", lead="\n")

    # Frequency of whole danmu strings.
    top_danmus = Counter(analyzer.filtered_danmu).most_common(8)
    print("弹幕数量排名前8:")
    for i, (danmu, count) in enumerate(top_danmus, 1):
        print(f"{i}. {danmu}: {count}")

    # Frequency of individual words.
    word_freq = analyzer.analyze_danmu_frequency(analyzer.filtered_danmu)
    print("\n词语频率排名前20:")
    for i, (word, count) in enumerate(word_freq[:20], 1):
        print(f"{i}. {word}: {count}")

    # Persist both rankings to Excel.
    analyzer.save_to_excel(top_danmus, word_freq)

    # Stage 3: visualisation
    banner("阶段3: 数据可视化", lead="\n")
    analyzer.generate_wordcloud(analyzer.filtered_danmu)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()