"""Collect bullet comments (danmaku) for Bilibili videos matching a keyword.

Bilibili applies anti-crawling measures, so every request made by ``main`` is
followed by a short randomised delay to reduce the chance of being banned.

``searchWord`` (set in ``main``) is the search keyword and ``savePath`` is the
xlsx file the collected comments are written to.
"""
import itertools
import os
import random
import re
import time

import requests
from openpyxl import Workbook

# Seconds to wait for a single HTTP response before giving up, so one stalled
# connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10

# Number of search result pages to walk and the maximum number of videos whose
# comments are collected.
PAGE_COUNT = 15
MAX_VIDEOS = 300

# NOTE(review): the original search-page pattern was not recoverable from the
# source text (its markup had been stripped). This reconstruction captures
# protocol-relative video links, which is what main() expects because it
# prepends 'https:' to each result. Confirm against the current page markup.
_VIDEO_URL_RE = re.compile(r'href="(//www\.bilibili\.com/video/[^"]+)"')

# NOTE(review): reconstructed on the assumption that the comment XML wraps
# each comment as <d p="...">text</d>; confirm against a live response.
_BARRAGE_RE = re.compile(r'<d p=".*?">(.*?)</d>')


def _fetch_text(url, headers):
    """Return the body of ``url`` decoded as UTF-8.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    return response.text


def GetAllSearchVideoUrl(url, headers):
    """Return the video link addresses found on one search result page.

    Args:
        url: Address of the search result page.
        headers: HTTP headers sent with the request.

    Returns:
        A list of protocol-relative video URLs (starting with ``//``); empty
        when the page contains no matching link.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    return _VIDEO_URL_RE.findall(_fetch_text(url, headers))


def GetVideoCid(url, headers):
    """Return the cid (comment-interface id) embedded in a video page.

    Args:
        url: Address of the video page.
        headers: HTTP headers sent with the request.

    Returns:
        The text following ``"cid":`` inside the fragment located between the
        ``"dynamic"`` and ``"dimension"`` fields of the page data.

    Raises:
        ValueError: if the page does not contain the expected fragment.
        requests.RequestException: on connection failure or timeout.
    """
    html_data = _fetch_text(url, headers)
    content = re.findall('"dynamic":.*?,(.*?),"dimension":.*?', html_data)
    if not content:
        raise ValueError(f'no "dynamic"/"dimension" fragment found in {url}')
    # Keep everything after the first '"cid":' marker of the fragment.
    _, marker, cid = content[0].partition('"cid":')
    if not marker:
        raise ValueError(f'no "cid" field found in {url}')
    return cid


def GetVideoBarrage(url, headers):
    """Return the comment texts served by one comment-interface address.

    Args:
        url: Address of the comment interface for a single cid.
        headers: HTTP headers sent with the request.

    Returns:
        A list with the text of every comment element in the response; empty
        when there is none.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    return _BARRAGE_RE.findall(_fetch_text(url, headers))


def main():
    """Search for videos, collect their comments and save them to an xlsx file.

    Each video occupies one worksheet column: a header cell in row 1 followed
    by one comment per row. Pages or videos that fail to download or parse are
    reported and skipped instead of aborting the whole run.
    """
    savePath = './docs/barrage.xlsx'
    searchWord = '2024巴黎奥运会'
    headers = {
        'Cookie': '',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.95 Safari/537.36'
    }

    content_list = []
    for page in range(1, PAGE_COUNT + 1):
        # NOTE(review): the offset is kept as in the original (page * 30);
        # confirm whether the site expects (page - 1) * 30 instead.
        o = page * 30
        url = f'https://search.bilibili.com/all?keyword={searchWord}&from_source=webtop_search&spm_id_from=333.934&search_source=5&page={page}&o={o}'
        try:
            content_list.append(GetAllSearchVideoUrl(url, headers))
            print(f'现在正在获取第{page}页的视频数据')
        except requests.RequestException as err:
            print(f'第{page}页的视频数据获取失败,已跳过: {err}')
        time.sleep(1 + random.random())

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # same search results always yield the same selection of videos. Slicing
    # (rather than indexing a fixed range) copes with fewer than MAX_VIDEOS.
    unique_urls = dict.fromkeys(itertools.chain.from_iterable(content_list))
    video_urls = list(unique_urls)[:MAX_VIDEOS]

    cid_list = []
    for i, video_url in enumerate(video_urls):
        url = 'https:' + video_url
        try:
            cid_list.append(GetVideoCid(url, headers))
            print(f'现在正在获取第{i+1}个视频弹幕接口cid信息')
        except (requests.RequestException, ValueError) as err:
            print(f'第{i+1}个视频的cid信息获取失败,已跳过: {err}')
        time.sleep(1 + random.random())

    wb = Workbook()
    ws = wb.active
    for column, cid in enumerate(cid_list, start=1):
        url = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + cid
        try:
            barrage_list = GetVideoBarrage(url, headers)
            print(f'现在正在获取第{column}个视频的弹幕信息')
        except requests.RequestException as err:
            print(f'第{column}个视频的弹幕信息获取失败,已跳过: {err}')
            barrage_list = []
        time.sleep(1 + random.random())
        ws.cell(1, column, f'视频{column}')
        for row, barrage in enumerate(barrage_list, start=2):
            ws.cell(row, column, barrage)

    # Create the target directory first so a long crawl is not lost to a
    # missing folder at save time.
    save_dir = os.path.dirname(savePath)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    wb.save(savePath)
    print(cid_list)


if __name__ == '__main__':
    main()