|
|
import requests
|
|
|
import re
|
|
|
|
|
|
count = 0 # running total of video cids collected so far; incremented by get_cid
|
|
|
|
|
|
def get_page_url(n):
    """Build the URLs of the first ``n`` Bilibili search-result pages.

    The search keyword is fixed in the URL (the percent-encoded Chinese
    phrase for "2024 Paris Olympic Games"). The first page uses the bare
    search URL; every later page appends ``page`` (1-based page number) and
    ``o`` (zero-based page index times 36, presumably the result offset).

    Args:
        n: Number of result pages to generate URLs for.

    Returns:
        A list with one URL per page, in page order. Empty when ``n <= 0``.
    """
    base_url = "https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5"

    page_url_list = []
    for i in range(n):
        if i == 0:
            # The first page is addressed without paging parameters.
            page_url_list.append(base_url)
        else:
            page_url_list.append(f"{base_url}&page={i + 1}&o={i * 36}")
    return page_url_list
|
|
|
|
|
|
# Request headers. To get past Bilibili's anti-scraping checks, requests are disguised as coming from a browser.
|
|
|
# Passed as ``headers=`` to every requests.get call made by get_cid and get_danmu.
header = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"}
|
|
|
|
|
|
def get_cid(page_url_list, max_videos=300):
    """Collect the cids of the videos linked from the given search pages.

    Each page is downloaded and the video ids (bvids) linked from it are
    extracted; every bvid is then resolved to a cid through the player
    ``pagelist`` endpoint. Collection stops as soon as the module-level
    ``count`` reaches ``max_videos``.

    Args:
        page_url_list: Search-result page URLs, visited in the given order.
        max_videos: Upper bound for the running total ``count``. Defaults
            to 300.

    Returns:
        A list of cid strings, in the order the videos appear on the pages.

    Raises:
        requests.exceptions.RequestException: If a request fails or does not
            answer within the timeout.

    Note:
        ``count`` is a module-level running total that this function never
        resets, so a later call continues from the previous call's total.
    """
    global count

    request_timeout = 10  # seconds; without it a stalled connection hangs forever
    cid_list = []

    for page_url in page_url_list:
        if count >= max_videos:
            break

        # Step 1: find the bvids linked from this result page.
        response = requests.get(url=page_url, headers=header, timeout=request_timeout)
        response.encoding = 'utf-8'
        found = re.findall('"//www.bilibili.com/video/(.*?)/"', response.text)
        # The same bvid is linked several times per page. dict.fromkeys drops
        # the repeats while keeping first-seen order (a set would shuffle the
        # page's ranking order).
        bvids = list(dict.fromkeys(found))

        # Step 2: resolve each bvid to the cid of its first listed part.
        for bvid in bvids:
            url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
            response = requests.get(url=url, headers=header, timeout=request_timeout)
            response.encoding = 'utf-8'
            cids = re.findall('{"cid":(.*?),', response.text)
            if not cids:
                # The response carried no cid (e.g. an error payload); skip
                # this video instead of failing on an empty match list.
                continue

            cid_list.append(cids[0])
            count += 1
            if count >= max_videos:
                break

    return cid_list
|
|
|
|
|
|
def get_danmu(cid_list):
    """Download the danmaku (bullet comments) of every given cid.

    For each cid the comment XML at ``https://comment.bilibili.com/{cid}.xml``
    is fetched and the text of every ``<d p="...">...</d>`` element is
    extracted with a regular expression.

    Args:
        cid_list: cids whose comment XML should be fetched, in order.

    Returns:
        A flat list of danmaku strings, grouped by cid in request order. The
        text is captured verbatim from the XML; no entity unescaping is
        performed.

    Raises:
        requests.exceptions.RequestException: If a request fails or does not
            answer within the timeout.
    """
    request_timeout = 10  # seconds; without it a stalled connection hangs forever
    danmu_list = []

    for cid in cid_list:
        url = f"https://comment.bilibili.com/{cid}.xml"
        response = requests.get(url=url, headers=header, timeout=request_timeout)
        response.encoding = 'utf-8'
        # Each <d> element holds one danmaku; its "p" attribute is ignored.
        danmu_list.extend(re.findall('<d p=".*?">(.*?)</d>', response.text))

    return danmu_list
|
|
|
|
|
|
# Scrape the first 10 search-result pages, resolve the listed videos to cids,
# then download the danmaku of each of those videos.
cid_list = get_cid(get_page_url(10))
danmu_list = get_danmu(cid_list)

# Append one danmaku per line. The file is opened once for the whole batch
# rather than being reopened for every single line.
with open('danmu.txt', 'a', encoding='utf-8') as f:
    for danmu in danmu_list:
        f.write(danmu + '\n')
|