"""Crawl Bilibili danmaku (bullet comments) for videos matching a search keyword.

Pipeline:
  1. Collect the bvid of every video on the first 10 search-result pages.
  2. Open each video page and extract the cid(s) that key its danmaku feed.
  3. Download the danmaku XML for each cid and extract the comment text.
  4. Append every comment, one per line, to each output file.

The session cookie is read from the ``BILIBILI_COOKIE`` environment variable
so that account credentials never live in source control. When the variable
is unset the requests are sent without a ``Cookie`` header.
"""
import os  # standard library: environment lookup for the session cookie
import re  # standard library: regular expressions
import sys  # standard library: stderr for failure reports
from typing import Dict, Iterable, List

import requests  # third-party HTTP client

# Search-result pages to scan (pages 1 through 10).
SEARCH_PAGES = range(1, 11)

# Seconds to wait for a response; without a timeout requests.get() can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Every danmaku is appended to each of these files, one comment per line.
# NOTE(review): the .xls file receives plain UTF-8 text, not a real Excel
# workbook -- kept as-is because downstream consumers may rely on the name.
OUTPUT_FILES = ("全弹幕.txt", "全弹幕.xls")

# Patterns are compiled once because they are reused on every response.
BVID_RE = re.compile(r'bvid:"(.*?)",title:')
CID_RE = re.compile(
    r'"embedPlayer":{"p":.*?,"aid":.*?,"bvid":".*?","cid":(.*?),'
)
# Each danmaku in the XML feed is one <d p="...">text</d> element. A bare
# '(.*?)' pattern matches the empty string at every position and therefore
# yields only empty strings, so the surrounding tags are required.
DANMAKU_RE = re.compile(r'<d p=".*?">(.*?)</d>')


def build_headers() -> Dict[str, str]:
    """Return the request headers, adding a Cookie only if one is configured."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203",
        "Referer": "https://www.bilibili.com/",
    }
    cookie = os.environ.get("BILIBILI_COOKIE")
    if cookie:
        headers["Cookie"] = cookie
    return headers


def fetch_text(url: str, headers: Dict[str, str]) -> str:
    """Return the body of *url* decoded as UTF-8, or "" if the request fails.

    A failed request is reported on stderr and skipped so that one bad page
    does not abort the whole crawl.
    """
    try:
        response = requests.get(
            url=url, headers=headers, timeout=REQUEST_TIMEOUT
        )
    except requests.RequestException as exc:
        print(f"request failed, skipping {url}: {exc}", file=sys.stderr)
        return ""
    response.encoding = 'utf-8'  # force UTF-8 decoding of the body
    return response.text


def search_url(page: int) -> str:
    """Return the search-result URL for *page* (1-based)."""
    if page == 1:
        # The first page uses its own URL with no page parameter.
        return 'https://search.bilibili.com/all?vt=89796154&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5'
    return f'https://search.bilibili.com/all?vt=89621480&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page={page}'


def collect_bvids(headers: Dict[str, str]) -> List[str]:
    """Return the unique bvids found on all search pages, in first-seen order."""
    found: List[str] = []
    for page in SEARCH_PAGES:
        found.extend(BVID_RE.findall(fetch_text(search_url(page), headers)))
    # dict.fromkeys removes duplicates while keeping a stable order, unlike
    # set(), whose iteration order can differ between runs.
    return list(dict.fromkeys(found))


def find_cids(bvid: str, headers: Dict[str, str]) -> List[str]:
    """Return the cid value(s) embedded in the video page for *bvid*."""
    url = f'https://www.bilibili.com/video/{bvid}/?spm_id_from=333.337.search-card.all.click&vd_source=7b3a711b171cc28773d66f1f7ca6e4bc'
    return CID_RE.findall(fetch_text(url, headers))


def fetch_danmaku(cid: str, headers: Dict[str, str]) -> List[str]:
    """Return the text of every danmaku in the feed identified by *cid*."""
    url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
    return DANMAKU_RE.findall(fetch_text(url, headers))


def save_lines(path: str, lines: Iterable[str]) -> None:
    """Append *lines* to *path*, one per line, opening the file only once."""
    with open(path, mode="a", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in lines)


def main() -> None:
    """Run the crawl and append every collected danmaku to the output files."""
    headers = build_headers()
    data_list: List[str] = []

    for bvid in collect_bvids(headers):
        cids = find_cids(bvid, headers)
        print(cids)  # progress output: cid(s) found for this video
        for cid in cids:
            comments = fetch_danmaku(cid, headers)
            print(comments)  # progress output: danmaku found for this cid
            data_list.extend(comments)

    for comment in data_list:
        print(comment)  # echo each danmaku that is about to be written
    for path in OUTPUT_FILES:
        save_lines(path, data_list)


if __name__ == "__main__":
    main()