import os
import re  # regular expressions

import requests  # third-party HTTP client

# Step 1: walk the search-result pages and collect the bvid of every video
# (per the original author's note: 10 pages, about 30 videos per page).

bvid_list = []  # video ids (bvid) found on the search pages
data_list = []  # text of every danmaku (bullet comment) scraped so far

# SECURITY NOTE(review): this literal embeds a logged-in session (SESSDATA,
# bili_jct). Treat it as leaked: rotate it, and supply a fresh value through
# the BILIBILI_COOKIE environment variable instead of committing it. The
# literal is kept only as a backward-compatible fallback.
_FALLBACK_COOKIE = "buvid4=FC27190B-D90C-E609-47DE-FCE8294B68AE44346-023081514-0BHF8D1MZTfSgSMUgteQMw%3D%3D; PVID=1; CURRENT_FNVAL=4048; rpdid=|(J~|uR)l~|)0J'u~k||muum); fingerprint=eb51304bd5d1e23f26077c2233a272f3; buvid_fp_plain=undefined; buvid_fp=eb51304bd5d1e23f26077c2233a272f3; buvid3=F96F85E3-8EB8-C8B1-3E9F-49964E980CA003674infoc; b_nut=1726193603; _uuid=B7610558C-BAC3-63610-C593-6442FF105DB6A21171infoc; enable_web_push=DISABLE; header_theme_version=CLOSE; bsource=search_bing; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3MzA5MDUsImlhdCI6MTcyNjQ3MTY0NSwicGx0IjotMX0.RIWqo495Jb57LSsuhZ4Fe7qi_f7sQk7U7EjtgaQINm0; bili_ticket_expires=1726730845; home_feed_column=5; browser_resolution=1540-770; SESSDATA=c3911aec%2C1742024002%2C82dd8%2A91CjBXEWYuhRf0DtASU3eEabP6HihugMTwYsQBDaoKhLCIu63wetB3GBM3uhNRO_mRMV4SVlRRbXNWQkdlZGF5Rl9IWTRyWF91WVhBaml5VjhWTVhwS24yZ0h6UEJFSjh1V2xIZTl5OFlRWVdseUhhYlQ1TFJCNVNDamItOU5iQUdYbENOaHhQaE9nIIEC; bili_jct=02656c0c3e1791d362c477cfc8046475; DedeUserID=536625628; DedeUserID__ckMd5=1808f3fda2b83419; sid=4x0075lx; bp_t_offset_536625628=977694801799413760; b_lsid=E6A96A9E_191FA0E906F"

# HTTP request headers shared by the crawler's requests.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203",
    "Referer": "https://www.bilibili.com/",
    # An unset or empty BILIBILI_COOKIE falls back to the literal above.
    "Cookie": os.environ.get("BILIBILI_COOKIE") or _FALLBACK_COOKIE,
}

# Each search hit is matched in the page source as `bvid:"<id>",title:`.
# Compile the pattern once instead of re-parsing it for every page.
_BVID_PATTERN = re.compile(r'bvid:"(.*?)",title:')

for page1 in range(1, 11):  # search-result pages 1 through 10
    if page1 == 1:
        # The first page uses its own URL, without a `page` query parameter.
        url = 'https://search.bilibili.com/all?vt=89796154&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5'
    else:
        url = f'https://search.bilibili.com/all?vt=89621480&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page={page1}'

    # Without a timeout a single stalled connection would hang the crawl
    # forever; with one, requests raises requests.exceptions.Timeout instead.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.encoding = 'utf-8'  # force the body to be decoded as UTF-8
    bvid_list.extend(_BVID_PATTERN.findall(response.text))

# Remove duplicates while keeping first-seen order, so the list order is the
# same from run to run (a set would reorder the ids unpredictably).
bvid_list = list(dict.fromkeys(bvid_list))

# Step 2: for every video, load its page and extract the id(s) of its
# danmaku interface (the `cid` field of the embedded player data).
_CID_PATTERN = re.compile(r'"embedPlayer":{"p":.*?,"aid":.*?,"bvid":".*?","cid":(.*?),')
# Each danmaku in the XML response is matched as <d p="...">text</d>.
_DANMAKU_PATTERN = re.compile(r'<d p=".*?">(.*?)</d>')

for bvid in bvid_list:
    url = f'https://www.bilibili.com/video/{bvid}/?spm_id_from=333.337.search-card.all.click&vd_source=7b3a711b171cc28773d66f1f7ca6e4bc'
    # The timeout keeps one unresponsive video page from blocking the crawl.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.encoding = 'utf-8'  # force the body to be decoded as UTF-8
    oid = _CID_PATTERN.findall(response.text)
    print(oid)  # progress output: the id(s) found for this video

    # Step 3: for every id found on this page, download that video's danmaku.
    # This loop has to run inside the per-video loop because `oid` is
    # overwritten on every iteration of the outer loop.
    for cid in oid:
        url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
        response = requests.get(url=url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        # NOTE: the text is captured verbatim from the response, so any
        # XML entities in it stay escaped.
        content_list = _DANMAKU_PATTERN.findall(response.text)
        print(content_list)  # progress output: danmaku of this video
        data_list.extend(content_list)  # accumulate for the output step

# Step 4: write the scraped danmaku to disk, one danmaku per line.

for danmaku in data_list:
    print(danmaku)  # echo each danmaku once as progress output

# Open each output file a single time instead of once per danmaku.
# Append mode is kept, so repeated runs keep adding to the existing files.
# NOTE: "全弹幕.xls" receives the same plain UTF-8 text as the .txt file;
# no spreadsheet format is written.
for output_path in ("全弹幕.txt", "全弹幕.xls"):
    with open(output_path, mode="a", encoding="utf-8") as f:
        f.writelines(f"{danmaku}\n" for danmaku in data_list)