"""Scrape Bilibili danmaku (bullet comments) for videos matching a search.

Pipeline:
    1. Walk search-result pages 1-10 and collect the BV id of every video.
    2. Open each video page and extract the cid(s) of its embedded player.
    3. Download the danmaku XML for every cid and extract the comment text.
    4. Append every comment, one per line, to the output files.
"""

import os
import re

import requests  # third-party HTTP client

# NOTE(security): this is a logged-in session cookie (SESSDATA / bili_jct)
# committed to source. It is kept as the default only so the script behaves as
# before; prefer supplying a fresh value through the BILIBILI_COOKIE
# environment variable and removing this literal from version control.
_DEFAULT_COOKIE = (
    "buvid4=FC27190B-D90C-E609-47DE-FCE8294B68AE44346-023081514-0BHF8D1MZTfSgSMUgteQMw%3D%3D; "
    "PVID=1; CURRENT_FNVAL=4048; rpdid=|(J~|uR)l~|)0J'u~k||muum); "
    "fingerprint=eb51304bd5d1e23f26077c2233a272f3; buvid_fp_plain=undefined; "
    "buvid_fp=eb51304bd5d1e23f26077c2233a272f3; "
    "buvid3=F96F85E3-8EB8-C8B1-3E9F-49964E980CA003674infoc; b_nut=1726193603; "
    "_uuid=B7610558C-BAC3-63610-C593-6442FF105DB6A21171infoc; "
    "enable_web_push=DISABLE; header_theme_version=CLOSE; bsource=search_bing; "
    "bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3MzA5MDUsImlhdCI6MTcyNjQ3MTY0NSwicGx0IjotMX0.RIWqo495Jb57LSsuhZ4Fe7qi_f7sQk7U7EjtgaQINm0; "
    "bili_ticket_expires=1726730845; home_feed_column=5; browser_resolution=1540-770; "
    "SESSDATA=c3911aec%2C1742024002%2C82dd8%2A91CjBXEWYuhRf0DtASU3eEabP6HihugMTwYsQBDaoKhLCIu63wetB3GBM3uhNRO_mRMV4SVlRRbXNWQkdlZGF5Rl9IWTRyWF91WVhBaml5VjhWTVhwS24yZ0h6UEJFSjh1V2xIZTl5OFlRWVdseUhhYlQ1TFJCNVNDamItOU5iQUdYbENOaHhQaE9nIIEC; "
    "bili_jct=02656c0c3e1791d362c477cfc8046475; DedeUserID=536625628; "
    "DedeUserID__ckMd5=1808f3fda2b83419; sid=4x0075lx; "
    "bp_t_offset_536625628=977694801799413760; b_lsid=E6A96A9E_191FA0E906F"
)

# Request headers sent with every HTTP call.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203",
    "Referer": "https://www.bilibili.com/",
    "Cookie": os.environ.get("BILIBILI_COOKIE", _DEFAULT_COOKIE),
}

# Search-result pages to walk (pages 1 through 10).
SEARCH_PAGES = range(1, 11)

# Seconds to wait for any single HTTP request before giving up on it.
REQUEST_TIMEOUT = 10

# Files that receive the scraped comments, one comment per line.
# NOTE: the ".xls" file is written as plain UTF-8 text, not a real Excel
# workbook; the name is kept so existing consumers still find it.
OUTPUT_PATHS = ("全弹幕.txt", "全弹幕.xls")

# BV id of each video card embedded in a search-result page.
_BVID_RE = re.compile(r'bvid:"(.*?)",title:')
# cid of the embedded player on a video page.
_CID_RE = re.compile(r'"embedPlayer":{"p":.*?,"aid":.*?,"bvid":".*?","cid":(.*?),')
# Text of one comment inside the danmaku XML: <d p="...">text</d>.
_DANMAKU_RE = re.compile(r'<d p=".*?">(.*?)</d>')


def fetch_text(url):
    """Return the body of *url* decoded as UTF-8, or "" if the request fails.

    A failed or timed-out request is reported on stdout and skipped, so a
    single bad page cannot discard everything collected so far.
    """
    try:
        response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"request failed for {url}: {exc}")
        return ""
    response.encoding = 'utf-8'  # force UTF-8 instead of the guessed charset
    return response.text


def search_url(page):
    """Return the search-result URL for *page* (1-based)."""
    # The first page has its own URL form without a ``page`` parameter.
    if page == 1:
        return 'https://search.bilibili.com/all?vt=89796154&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5'
    return f'https://search.bilibili.com/all?vt=89621480&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page={page}'


def collect_bvids(pages=SEARCH_PAGES):
    """Return the unique BV ids found on *pages*, in first-seen order."""
    found = []
    for page in pages:
        found.extend(_BVID_RE.findall(fetch_text(search_url(page))))
    # dict.fromkeys removes duplicates while keeping a deterministic order.
    return list(dict.fromkeys(found))


def collect_danmaku(bvids):
    """Return the danmaku text of every video in *bvids* as a flat list."""
    danmaku = []
    for bvid in bvids:
        video_url = f'https://www.bilibili.com/video/{bvid}/?spm_id_from=333.337.search-card.all.click&vd_source=7b3a711b171cc28773d66f1f7ca6e4bc'
        cids = _CID_RE.findall(fetch_text(video_url))
        print(cids)  # progress output: cid(s) found for this video
        for cid in cids:
            danmaku_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
            comments = _DANMAKU_RE.findall(fetch_text(danmaku_url))
            print(comments)  # progress output: comments found for this cid
            danmaku.extend(comments)
    return danmaku


def save_danmaku(danmaku, paths=OUTPUT_PATHS):
    """Append every comment in *danmaku*, one per line, to each of *paths*."""
    lines = [f"{text}\n" for text in danmaku]
    for path in paths:
        # Open each file once instead of once per comment.
        with open(path, mode="a", encoding="utf-8") as f:
            f.writelines(lines)


def main():
    """Run the full scrape and write the results to the output files."""
    bvid_list = collect_bvids()
    data_list = collect_danmaku(bvid_list)
    for text in data_list:
        print(text)  # echo each comment so progress is visible
    save_danmaku(data_list)


if __name__ == "__main__":
    main()