You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

54 lines
4.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests # 引入第三方模块
import re # 引入正则表达式模块
# 1. Walk the 10 search-result pages (30 videos each) and collect every video's bvid
# Module-level accumulators filled in by the scraping steps further down:
# bvid_list receives video bvid identifiers, data_list receives danmaku texts.
bvid_list, data_list = [], []

# HTTP request headers passed to the requests.get calls in this script.
# NOTE(review): the Cookie value embeds what looks like a logged-in session
# (SESSDATA / bili_jct) -- confirm whether it should be committed to source.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203"
    ),
    "Referer": "https://www.bilibili.com/",
    "Cookie": "buvid4=FC27190B-D90C-E609-47DE-FCE8294B68AE44346-023081514-0BHF8D1MZTfSgSMUgteQMw%3D%3D; PVID=1; CURRENT_FNVAL=4048; rpdid=|(J~|uR)l~|)0J'u~k||muum); fingerprint=eb51304bd5d1e23f26077c2233a272f3; buvid_fp_plain=undefined; buvid_fp=eb51304bd5d1e23f26077c2233a272f3; buvid3=F96F85E3-8EB8-C8B1-3E9F-49964E980CA003674infoc; b_nut=1726193603; _uuid=B7610558C-BAC3-63610-C593-6442FF105DB6A21171infoc; enable_web_push=DISABLE; header_theme_version=CLOSE; bsource=search_bing; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3MzA5MDUsImlhdCI6MTcyNjQ3MTY0NSwicGx0IjotMX0.RIWqo495Jb57LSsuhZ4Fe7qi_f7sQk7U7EjtgaQINm0; bili_ticket_expires=1726730845; home_feed_column=5; browser_resolution=1540-770; SESSDATA=c3911aec%2C1742024002%2C82dd8%2A91CjBXEWYuhRf0DtASU3eEabP6HihugMTwYsQBDaoKhLCIu63wetB3GBM3uhNRO_mRMV4SVlRRbXNWQkdlZGF5Rl9IWTRyWF91WVhBaml5VjhWTVhwS24yZ0h6UEJFSjh1V2xIZTl5OFlRWVdseUhhYlQ1TFJCNVNDamItOU5iQUdYbENOaHhQaE9nIIEC; bili_jct=02656c0c3e1791d362c477cfc8046475; DedeUserID=536625628; DedeUserID__ckMd5=1808f3fda2b83419; sid=4x0075lx; bp_t_offset_536625628=977694801799413760; b_lsid=E6A96A9E_191FA0E906F",
}
# Collect video bvids from search-result pages 1..10.
for page1 in range(1, 11):
    if page1 == 1:
        # The first page's URL has no page parameter (and a different vt value).
        url = 'https://search.bilibili.com/all?vt=89796154&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5'
    else:
        url = f'https://search.bilibili.com/all?vt=89621480&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page={page1}'
    try:
        # Without a timeout a single stalled connection would hang the run forever.
        response = requests.get(url=url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # Best effort: report the failed page and carry on with the others.
        print(f"search page {page1} failed: {exc}")
        continue
    response.encoding = 'utf-8'  # force UTF-8 decoding of the body
    # Pull every bvid out of the data embedded in the page text.
    content_list = re.findall('bvid:"(.*?)",title:', response.text)
    bvid_list.extend(content_list)
# Remove duplicates while keeping first-seen order, so the later processing
# order is the same on every run (a plain set() would shuffle it).
bvid_list = list(dict.fromkeys(bvid_list))
# 2. For every collected video, load its page and extract the cid value(s)
#    that identify its danmaku (bullet-comment) feed.
# NOTE(review): the source's indentation was lost; step 3 is nested inside this
# loop because otherwise only the last video's cids would ever be fetched.
for bvid in bvid_list:
    url = f'https://www.bilibili.com/video/{bvid}/?spm_id_from=333.337.search-card.all.click&vd_source=7b3a711b171cc28773d66f1f7ca6e4bc'
    try:
        # Timeout so one unresponsive video page cannot stall the whole crawl.
        response = requests.get(url=url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"video page {bvid} failed: {exc}")
        continue
    response.encoding = 'utf-8'  # force UTF-8 decoding of the body
    # The cid sits in the "embedPlayer" JSON fragment embedded in the page.
    oid = re.findall('"embedPlayer":{"p":.*?,"aid":.*?,"bvid":".*?","cid":(.*?),', response.text)
    print(oid)  # progress output: the cid(s) found for this video

    # 3. For each cid, download the danmaku XML and keep the comment texts.
    for cid in oid:
        url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
        try:
            response = requests.get(url=url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            print(f"danmaku list {cid} failed: {exc}")
            continue
        response.encoding = 'utf-8'  # force UTF-8 decoding of the body
        # Each danmaku is the text content of a <d p="..."> element.
        content_list = re.findall('<d p=".*?">(.*?)</d>', response.text)
        print(content_list)  # progress output: danmaku found for this cid
        data_list.extend(content_list)
# 4. Append every collected danmaku, one per line, to both output files.
# Each file is opened once instead of once per line; the bytes written and the
# print order (all items for the first file, then all again for the second)
# are unchanged.
# NOTE(review): "全弹幕.xls" receives plain UTF-8 text, not a real Excel workbook.
if data_list:  # with nothing collected no file is created, as before
    for output_path in ("全弹幕.txt", "全弹幕.xls"):
        with open(output_path, mode="a", encoding="utf-8") as f:
            for index in data_list:
                print(index)  # echo each danmaku for visibility
                f.write(index)
                f.write('\n')