You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

80 lines
3.1 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import re
count = 0  # Running total of videos whose cid has been collected (capped at 300 in get_cid)
def get_page_url(n):
    """Build the Bilibili search-result page URLs for the first *n* pages.

    Page 1 uses the plain search URL; page i (i >= 2) appends
    ``&page={i}&o={(i-1)*36}`` (the search shows 36 results per page,
    ``o`` being the result offset).

    Args:
        n: number of result pages to generate URLs for.

    Returns:
        A list of ``n`` page URLs, in page order.
    """
    # Keyword is the URL-encoded form of "2024巴黎奥运会" (2024 Paris Olympics).
    base = ("https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E"
            "%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search"
            "&spm_id_from=333.1007&search_source=5")
    # NOTE: the original body also executed ``i += 1`` inside the for loop;
    # that is dead code (``for i in range(n)`` rebinds ``i`` each pass), so
    # it is dropped here.
    return [base if i == 0 else f"{base}&page={i + 1}&o={i * 36}"
            for i in range(n)]
# Request headers: spoof a desktop-browser User-Agent so Bilibili's
# anti-crawler checks accept our requests.
header = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"}
def get_cid(page_url_list):
    """Collect the cid of each video found on the given search-result pages.

    For every page URL (in order) this scrapes the bvid of each listed
    video, then resolves each bvid to its first cid via the pagelist API.
    Collection stops once 300 cids have been gathered, tracked through the
    module-level ``count`` counter.

    Args:
        page_url_list: search-result page URLs, in ranking order.

    Returns:
        A list of cid strings (at most 300).
    """
    global count
    cid_list = []  # collected cids
    for page_url in page_url_list:  # crawl pages in ranking order
        if count >= 300:
            break
        # Scrape the bvids of the videos listed on this page.
        response = requests.get(url=page_url, headers=header, timeout=10)
        response.encoding = 'utf-8'
        bvids = re.findall('"//www.bilibili.com/video/(.*?)/"', response.text)
        # De-duplicate while preserving on-page order.  The original used a
        # set() round-trip, which scrambled the ranking order the docstring
        # promises; dict.fromkeys keeps first-seen order (Python 3.7+).
        bvids = list(dict.fromkeys(bvids))
        # Resolve each bvid to its cid through the pagelist API.
        for bvid in bvids:
            api_url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
            response = requests.get(url=api_url, headers=header, timeout=10)
            response.encoding = 'utf-8'
            cids = re.findall('{"cid":(.*?),', response.text)
            if not cids:
                # The original indexed [0] unconditionally and crashed with
                # IndexError on an unexpected API response; skip instead.
                continue
            cid_list.append(cids[0])
            count += 1
            if count >= 300:
                break
    return cid_list
def get_danmu(cid_list):
    """Fetch the danmaku (bullet comments) for every cid in *cid_list*.

    Downloads each video's comment XML from comment.bilibili.com and pulls
    the text out of every ``<d>`` element with a regular expression.

    Args:
        cid_list: cid strings identifying the videos.

    Returns:
        A flat list of all danmaku strings, in download order.
    """
    all_danmu = []
    for cid in cid_list:
        resp = requests.get(url=f"https://comment.bilibili.com/{cid}.xml",
                            headers=header)
        resp.encoding = 'utf-8'
        # Each danmaku lives in a <d p="..."> element of the XML payload.
        all_danmu.extend(re.findall('<d p=".*?">(.*?)</d>', resp.text))
    return all_danmu
# --- Script entry: scrape danmaku from the first 10 result pages. ---
cid_list = get_cid(get_page_url(10))
danmu_list = get_danmu(cid_list)  # fetch the danmaku for every cid
# Open the output file once and append everything in one pass; the
# original re-opened (and closed) 'danmu.txt' for every single danmaku.
with open('danmu.txt', 'a', encoding='utf-8') as f:
    for danmu in danmu_list:
        f.write(danmu + '\n')