You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

93 lines
2.9 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

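"""Fetch danmaku (bullet-comment) data for Bilibili videos matching a search keyword.

Because of Bilibili's anti-scraping measures, a random delay is inserted
manually after every request to avoid getting banned.

searchWord is the search keyword.
savePath is the path of the xlsx file the danmaku data is saved to.
"""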
import re
import time
import random
import requests
import itertools
from openpyxl import Workbook
# Get the video link addresses listed on the current search-results page
def GetAllSearchVideoUrl(url, headers, timeout=10):
    """Return the video link hrefs found on one search-results page.

    Args:
        url: Address of the search-results page to download.
        headers: HTTP request headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        A list with the captured ``href`` value of every anchor matching the
        pattern below, in page order; an empty list when nothing matches.

    Raises:
        requests.exceptions.RequestException: On network failure, including
            ``Timeout`` when the server does not answer within ``timeout``.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode the body as UTF-8 regardless of what the response declares.
    response.encoding = 'utf-8'
    # NOTE(review): ``data-v-4caf9c8c`` looks like a build-specific attribute;
    # if the page markup changes this pattern will silently match nothing and
    # the function will return [] -- confirm against the live page.
    return re.findall(
        r'<a href="(.*?)" .*? target="_blank" data-v-4caf9c8c>'
        r'<div class=".*?" data-v-4caf9c8c>',
        response.text,
    )
# Get the cid used by the danmaku API for the given video
def GetVideoCid(url, headers, timeout=10):
    """Return the danmaku cid embedded in a video page.

    The page is searched for the text between the first comma following
    ``"dynamic":`` and the next ``,"dimension":``; everything after the
    ``"cid":`` marker inside that fragment is returned.

    Args:
        url: Address of the video page to download.
        headers: HTTP request headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        The cid as a string (the text following ``"cid":`` in the fragment).

    Raises:
        ValueError: If the page does not contain the expected fragment or
            the fragment has no ``"cid":`` marker. Previously these cases
            surfaced as a bare ``IndexError`` / ``AttributeError`` with no
            hint about which page failed.
        requests.exceptions.RequestException: On network failure, including
            ``Timeout`` when the server does not answer within ``timeout``.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode the body as UTF-8 regardless of what the response declares.
    response.encoding = 'utf-8'

    # Only the first occurrence is used, so stop scanning at the first match.
    fragment = re.search(r'"dynamic":.*?,(.*?),"dimension":.*?', response.text)
    if fragment is None:
        raise ValueError(f'no "dynamic"/"dimension" fragment found in page: {url}')

    # Keep everything that follows the first '"cid":' marker in the fragment.
    _, marker, cid = fragment.group(1).partition('"cid":')
    if not marker:
        raise ValueError(f'no "cid" field found in page: {url}')
    return cid
# Get the danmaku data of the video behind the given cid address
def GetVideoBarrage(url, headers, timeout=10):
    """Return the danmaku texts contained in one danmaku-list response.

    Args:
        url: Address of the danmaku list to download.
        headers: HTTP request headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        A list with the inner text of every ``<d p="...">...</d>`` element
        found in the response, in document order; empty when none is found.

    Raises:
        requests.exceptions.RequestException: On network failure, including
            ``Timeout`` when the server does not answer within ``timeout``.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode the body as UTF-8 regardless of what the response declares.
    response.encoding = 'utf-8'
    return re.findall(r'<d p=".*?">(.*?)</d>', response.text)
def main(searchWord='2024巴黎奥运会', savePath='./docs/barrage.xlsx',
         max_pages=15, max_videos=300):
    """Scrape danmaku for videos matching a keyword and save them to xlsx.

    Walks the search-result pages for ``searchWord``, resolves the cid of
    each video found, downloads the danmaku of each cid and writes one
    worksheet column per video (header in row 1, danmaku from row 2 down).
    A random 1-2 second pause follows every request.

    Args:
        searchWord: Search keyword.
        savePath: Path of the xlsx file to write.
        max_pages: Number of search-result pages to walk (starting at 1).
        max_videos: Upper bound on the number of videos processed. Fewer are
            processed when the search yields fewer unique links; the
            original fixed count of 300 raised ``IndexError`` in that case.
    """
    # Local import keeps this block self-contained.
    from pathlib import Path

    # Create the output directory up front so a missing folder is detected
    # before the long scraping run instead of at save time.
    Path(savePath).parent.mkdir(parents=True, exist_ok=True)

    headers = {
        'Cookie': '',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.95 Safari/537.36'
    }

    # Step 1: collect the video links of every search-result page.
    page_results = []
    for page in range(1, max_pages + 1):
        o = page * 30
        url = f'https://search.bilibili.com/all?keyword={searchWord}&from_source=webtop_search&spm_id_from=333.934&search_source=5&page={page}&o={o}'
        page_results.append(GetAllSearchVideoUrl(url, headers))
        print(f'现在正在获取第{page}页的视频数据')
        time.sleep(1 + random.random())

    # Drop duplicates while keeping first-seen order (a set would shuffle the
    # videos between runs), then cap at max_videos.
    unique_links = dict.fromkeys(itertools.chain.from_iterable(page_results))
    video_links = list(unique_links)[:max_videos]

    # Step 2: resolve the danmaku cid of every video.
    cid_list = []
    for index, link in enumerate(video_links, start=1):
        # The scraped hrefs are prefixed with the scheme only, as before.
        cid_list.append(GetVideoCid('https:' + link, headers))
        print(f'现在正在获取第{index}个视频弹幕接口cid信息')
        time.sleep(1 + random.random())

    # Step 3: download the danmaku and fill one column per video.
    wb = Workbook()
    ws = wb.active
    for column, cid in enumerate(cid_list, start=1):
        url = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + cid
        barrage_list = GetVideoBarrage(url, headers)
        print(f'现在正在获取第{column}个视频的弹幕信息')
        time.sleep(1 + random.random())
        ws.cell(1, column, f'视频{column}')
        for row, barrage in enumerate(barrage_list, start=2):
            ws.cell(row, column, barrage)

    wb.save(savePath)
    print(cid_list)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()