"""Collect danmaku (bullet comments) from Bilibili videos found by keyword search."""
import os
import re
from html import unescape


class reptile:
    """Scrape danmaku for the videos returned by a Bilibili keyword search.

    Typical use is ``reptile(key_word, num).get_dataset()``, which writes one
    danmaku per line to ``self.path`` and returns that path.
    """

    # Seconds to wait on any single HTTP request before giving up, so a
    # stalled connection cannot hang the whole crawl.
    REQUEST_TIMEOUT = 10
    # Environment variable consulted for the login cookie when none is passed.
    COOKIE_ENV_VAR = 'BILIBILI_COOKIE'

    _SEARCH_URL = 'https://search.bilibili.com/all'
    # Captures the first href that follows each "video-list" marker in the
    # search page. NOTE(review): depends on the current page markup - confirm
    # it still yields video detail links.
    _VIDEO_LINK_RE = re.compile(r'video-list.*?a.*?href="(.*?)"', re.S)
    # The cid must contain at least one digit; an empty capture would build
    # the bogus address ".../.xml".
    _CID_RE = re.compile(r'window\.__INITIAL_STATE__.*?"cid":([0-9]+)', re.S)
    # One danmaku is expected as <d p="...attributes...">text</d>.
    _DANMAKU_RE = re.compile(r'<d p=".*?">(.*?)</d>')

    def __init__(self, key_word, num, cookie=None):
        """Configure a crawl.

        Args:
            key_word: Bilibili search keyword.
            num: Maximum number of video links to collect.
            cookie: Optional ``Cookie`` header value. When omitted, the
                ``BILIBILI_COOKIE`` environment variable is used if set;
                otherwise requests are sent without a cookie.
        """
        # Bilibili search keyword.
        self.key_word = key_word
        # Number of video links to collect.
        self.num = num
        # Next page of the default-sorted search results to request.
        self.page = 1
        # Browser-like headers so the site does not reject the request.
        # 'br' is deliberately not advertised: requests can only decode
        # Brotli when an optional brotli package is installed, and would
        # otherwise hand back undecoded bytes as text.
        self.headers = {
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.60',
        }
        # Session cookies are credentials and must never be committed to
        # source control; they are supplied by the caller or the environment.
        if cookie is None:
            cookie = os.environ.get(self.COOKIE_ENV_VAR)
        if cookie:
            self.headers['Cookie'] = cookie
        # Scheme prepended to protocol-relative links ("//host/path").
        self.base_url = 'https:'
        # Output file that accumulates the collected danmaku.
        self.path = f"./关于b站视频{self.key_word}的弹幕.txt"

    def _fetch(self, url, params=None):
        """GET *url* with the crawler's headers and a timeout; return the response."""
        # Imported here rather than at module level so the parsing and saving
        # helpers stay importable without the third-party dependency.
        import requests

        return requests.get(
            url,
            params=params,
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )

    @classmethod
    def _parse_video_links(cls, html):
        """Return the hrefs captured from a search result page, in page order."""
        return cls._VIDEO_LINK_RE.findall(html)

    @classmethod
    def _parse_cid(cls, html):
        """Return the first danmaku-pool cid found in a video page, or None."""
        match = cls._CID_RE.search(html)
        return match.group(1) if match else None

    @classmethod
    def _parse_danmaku(cls, xml_text):
        """Return the danmaku texts of a danmaku XML document, entities decoded."""
        return [unescape(text) for text in cls._DANMAKU_RE.findall(xml_text)]

    def get_video_url(self):
        """Collect up to ``self.num`` distinct video links from the search pages.

        Pages are requested starting at ``self.page``, which is advanced after
        every successfully processed page. Collection stops early when a page
        does not answer with HTTP 200 or contributes no new link (otherwise
        the loop could never reach ``self.num``).

        Returns:
            A list of links in the order they were first seen, so the search
            ranking is preserved when the list is truncated to ``self.num``.
        """
        # A dict is used as an insertion-ordered set.
        seen = {}
        while len(seen) < self.num:
            res = self._fetch(
                self._SEARCH_URL,
                # Passing params lets requests percent-encode the keyword.
                params={
                    'keyword': self.key_word,
                    'from_source': 'webtop_search',
                    'spm_id_from': '333.1007',
                    'search_source': '2',
                    'page': self.page,
                },
            )
            # An inaccessible page ends the crawl; keep what was collected.
            if res.status_code != 200:
                print(f"搜索页请求失败(状态码 {res.status_code}),停止翻页")
                break
            known = len(seen)
            for href in self._parse_video_links(res.text):
                seen.setdefault(href, None)
            # The next iteration requests the next page.
            self.page += 1
            if len(seen) == known:
                # Nothing new on this page: results are exhausted.
                break
        return list(seen)[:self.num]

    def bullet_chat(self, url):
        """Return the danmaku texts of the video detail page at *url*.

        Args:
            url: Video page address; protocol-relative links ("//...") get
                ``self.base_url`` prepended.

        Returns:
            A list of danmaku strings; empty when no cid can be found in the
            page or the danmaku document contains no entries.
        """
        if url.startswith('//'):
            url = self.base_url + url
        page = self._fetch(url)
        # The video page embeds the cid identifying its danmaku pool.
        bullet_cid = self._parse_cid(page.text)
        if bullet_cid is None:
            return []
        # Address of the danmaku pool document.
        bullet_xml = self._fetch(f'https://comment.bilibili.com/{bullet_cid}.xml')
        # Force UTF-8 so Chinese danmaku text is decoded correctly.
        bullet_xml.encoding = 'utf-8'
        return self._parse_danmaku(bullet_xml.text)

    def save_bullet(self, bullet_list):
        """Append *bullet_list* to ``self.path``, one danmaku per line.

        Every entry is newline-terminated so that consecutive calls never
        glue the last danmaku of one video to the first of the next.
        """
        with open(self.path, 'a', encoding='utf-8') as f:
            f.writelines(f"{bullet}\n" for bullet in bullet_list)

    def get_dataset(self):
        """Run the whole crawl and return the path of the danmaku file.

        Any previous output file is overwritten. The file always exists when
        this method returns, even if nothing was collected.
        """
        if os.path.exists(self.path):
            print("文件已存在即将进行覆盖")
        # Truncate or create the output file up front.
        with open(self.path, 'w', encoding='utf-8'):
            pass
        print('正在爬取弹幕......')
        for url in self.get_video_url():
            self.save_bullet(self.bullet_chat(url))
        print("爬取弹幕成功")
        return self.path


if __name__ == '__main__':
    # Named "crawler" so the instance does not shadow the class itself.
    crawler = reptile('2024年巴黎运动会', 300)
    crawler.get_dataset()