|
|
|
|
'''
|
|
|
|
|
获取相关视频弹幕信息
|
|
|
|
|
由于B站反爬虫机制,对每一次数据爬取都手动规定了一定的随机延时,防止被ban
|
|
|
|
|
|
|
|
|
|
searchWord是搜索关键词
|
|
|
|
|
savePath是保存弹幕信息的xlsx文件路径
|
|
|
|
|
'''
|
|
|
|
|
|
|
|
|
|
import html
import itertools
import os
import random
import re
import time
from urllib.parse import quote

import requests
from openpyxl import Workbook
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
|
|
|
|
|
|
|
|
|
|
# 获取当前页码的视频链接地址
|
|
|
|
|
def GetAllSearchVideoUrl(url, headers, timeout=10):
    """Download one search-result page and extract the video links on it.

    Args:
        url: Full URL of the search-result page to fetch.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        list[str]: The ``href`` value of every anchor matching the
        result-card pattern, in page order. The list is empty when nothing
        matches.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 so the page text is decoded consistently.
    response.encoding = 'utf-8'
    # NOTE(review): the pattern hard-codes the attribute "data-v-4caf9c8c";
    # if the site markup changes this silently returns [] - verify against
    # the live page when no links are found.
    return re.findall(
        '<a href="(.*?)" .*? target="_blank" data-v-4caf9c8c><div class=".*?" data-v-4caf9c8c>',
        response.text,
    )
|
|
|
|
|
|
|
|
|
|
# 获取当前视频的弹幕接口cid地址
|
|
|
|
|
def GetVideoCid(url, headers, timeout=10):
    """Download a video page and extract the cid used by the danmaku API.

    The page text is searched for the span between the ``"dynamic":`` and
    ``"dimension":`` fields, and the number following ``"cid":`` inside that
    span is returned.

    Args:
        url: Full URL of the video page.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        str: The cid as a string of digits.

    Raises:
        ValueError: If the page does not contain the expected fields (for
            example the link is not a regular video page, or an anti-bot
            page was served instead).
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.encoding = 'utf-8'
    html_data = response.text

    # First occurrence of: "dynamic":<value>,<captured span>,"dimension":
    section = re.search('"dynamic":.*?,(.*?),"dimension":', html_data)
    if section is None:
        raise ValueError(
            f'no "dynamic" ... "dimension" section found in page {url}'
        )

    # Capture only the digits so that any extra field sitting between "cid"
    # and "dimension" cannot leak into the returned value.
    cid_match = re.search(r'"cid":\s*(\d+)', section.group(1))
    if cid_match is None:
        raise ValueError(f'no numeric "cid" field found in page {url}')
    return cid_match.group(1)
|
|
|
|
|
|
|
|
|
|
# 获取当前cid地址下的视频弹幕数据
|
|
|
|
|
def GetVideoBarrage(url, headers, timeout=10):
    """Download a danmaku XML document and return the comment texts.

    Args:
        url: Full URL of the danmaku list endpoint for one cid.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[str]: The text of every ``<d p="...">...</d>`` element, in
        document order, with character/entity references decoded.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.encoding = 'utf-8'
    raw_texts = re.findall('<d p=".*?">(.*?)</d>', response.text)
    # The regex yields the raw markup between the tags, so references such
    # as "&amp;" or "&lt;" must be decoded to recover what the user typed.
    return [html.unescape(text) for text in raw_texts]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main(searchWord='2024巴黎奥运会', savePath='./docs/barrage.xlsx',
         page_count=15, max_videos=300):
    """Crawl danmaku for the videos found by a keyword search and save them.

    Steps: collect video links from the search-result pages, resolve each
    link to its cid, download the danmaku for each cid, and write one
    worksheet column per video (row 1 is the header, comments start at
    row 2). A random 1-2 second pause follows every request.

    Args:
        searchWord: Search keyword.
        savePath: Path of the .xlsx file to write.
        page_count: Number of search-result pages to read.
        max_videos: Upper bound on the number of videos processed.

    A video whose page cannot be fetched or parsed is reported and skipped
    instead of aborting the whole run.
    """
    headers = {
        'Cookie': '',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.95 Safari/537.36'
    }

    # Create the output directory up front: discovering that it is missing
    # only at wb.save() would throw away the whole crawl.
    save_dir = os.path.dirname(savePath)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Percent-encode the keyword so characters such as '&' or '#' cannot
    # corrupt the query string.
    keyword = quote(searchWord, safe='')

    content_list = []
    for page in range(1, page_count + 1):
        # NOTE(review): 'o' is presumably the result offset (30 per page) -
        # confirm against the site.
        o = page * 30
        url = f'https://search.bilibili.com/all?keyword={keyword}&from_source=webtop_search&spm_id_from=333.934&search_source=5&page={page}&o={o}'
        try:
            content_list.extend(GetAllSearchVideoUrl(url, headers))
        except requests.RequestException as exc:
            print(f'第{page}页的视频数据获取失败,已跳过: {exc}')
        else:
            print(f'现在正在获取第{page}页的视频数据')
        time.sleep(1+random.random())

    # De-duplicate while keeping page order (a set would make the selection
    # nondeterministic), then cap at max_videos. Slicing also copes with
    # fewer links than the cap.
    video_links = list(dict.fromkeys(content_list))[:max_videos]

    cid_list = []
    for index, link in enumerate(video_links, start=1):
        # Prepend the scheme, as the extracted links are used without one.
        url = 'https:' + link
        try:
            cid = GetVideoCid(url, headers)
        except (requests.RequestException, ValueError,
                IndexError, AttributeError) as exc:
            # Network failure, or the page did not contain a cid.
            print(f'第{index}个视频的cid获取失败,已跳过: {exc}')
        else:
            cid_list.append(cid)
            print(f'现在正在获取第{index}个视频弹幕接口cid信息')
        time.sleep(1+random.random())

    wb = Workbook()
    ws = wb.active
    for column, cid in enumerate(cid_list, start=1):
        url = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + cid
        try:
            barrage_list = GetVideoBarrage(url, headers)
        except requests.RequestException as exc:
            # Keep the column (header only) so numbering stays aligned
            # with cid_list.
            barrage_list = []
            print(f'第{column}个视频的弹幕信息获取失败: {exc}')
        else:
            print(f'现在正在获取第{column}个视频的弹幕信息')
        time.sleep(1+random.random())

        ws.cell(1, column, f'视频{column}')
        for row, text in enumerate(barrage_list, start=2):
            # openpyxl rejects control characters in cell values; strip
            # them so one odd comment cannot abort the export.
            ws.cell(row, column, ILLEGAL_CHARACTERS_RE.sub('', text))

    wb.save(savePath)

    print(cid_list)
|
|
|
|
|
|
|
|
|
|
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|