master
xuan 5 months ago
commit 61b3bb8e24

@ -0,0 +1,94 @@
import requests
import re
import os
class reptile:
    """Crawl Bilibili search results for a keyword and dump the danmaku
    (bullet comments) of the top videos into a local text file.

    Flow: search pages -> video detail URLs -> per-video cid -> danmaku XML.
    """

    def __init__(self, key_word, num):
        # Search keyword used on Bilibili.
        self.key_word = key_word
        # Number of video links to collect.
        self.num = num
        # Current page of the "comprehensive sort" search results.
        self.page = 1
        # Browser-like headers so requests are not rejected.
        # NOTE(review): the Cookie below embeds a personal SESSDATA session
        # token — it will expire and should not be committed to source control.
        self.headers = {
            'Accept-Encoding':'gzip, deflate, br',
            'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.60',
            'Cookie':"buvid3=294A4FDA-B28B-E848-02AF-4CC85F21E60591171infoc; b_nut=1719198491; _uuid=81083F39F-D9B8-E9C6-F81E-10DC77867B7C691969infoc; enable_web_push=DISABLE; buvid4=C6405F50-B303-58B0-92EB-FC0454BFDAEF92810-024062403-BZe5%2BJpTiegiNfU4N%2B1JIg%3D%3D; buvid_fp=723bec487c6799b3457440aa4e151029; header_theme_version=CLOSE; CURRENT_FNVAL=4048; SESSDATA=c2d43fe3%2C1734787697%2Ce83be%2A61CjCuaG1-EyZeMtp2_27iesQNI-S-Cm9CylwH3d1egMq6ECCdB1iVx2W84e9xdpWrnHgSVk5mSVNJZU1ibHNqYXFiNHlCckNEWS1wSEFwZ2ZkX0lRa1NVWFBhdDQ4WExRLUJjdGdURy1YVHRudm9OTFlrb0JkSFVZMXNVSW1ZYTRndEF1ZHNkcTZBIIEC; bili_jct=f9658d85c4fff75f379bafb9d8640c84; DedeUserID=3493115639892125; DedeUserID__ckMd5=c9eb7af8c9ebf568; rpdid=|(u))kkY|mm)0J'u~umkJl~R); is-2022-channel=1; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYyMzcxNjgsImlhdCI6MTcyNTk3NzkwOCwicGx0IjotMX0.QLV_A_4RlS1JqNtyCErEjqyFnEVNNA4LpW9Ea78HJyg; bili_ticket_expires=1726237108; home_feed_column=5; browser_resolution=1653-834; bsource=search_baidu; bmg_af_switch=1; bmg_src_def_domain=i0.hdslb.com; sid=57thd76i; b_lsid=D3749D16_191E5E181D7; bp_t_offset_3493115639892125=976252148054491136",
        }
        # Scheme prefix: search results give protocol-relative URLs (//www...).
        self.base_url = 'https:'
        # Output file that accumulates the collected danmaku.
        self.path = f"./关于b站视频{self.key_word}的弹幕.txt"

    def get_video_url(self):
        """Collect up to ``self.num`` deduplicated video detail-page URLs
        from the keyword search result pages.

        Returns a list of URL strings (possibly shorter than ``self.num``
        if the search fails or runs out of results).
        """
        url_list = []
        while len(url_list) < self.num:
            url = f'https://search.bilibili.com/all?keyword={self.key_word}&from_source=webtop_search&spm_id_from=333.1007&search_source=2&page={self.page}'
            res = requests.get(url=url, headers=self.headers)
            # On an HTTP error, stop and return what was collected so far.
            # (Previously the status code int itself was returned, which
            # crashed the caller when it tried to iterate it.)
            if res.status_code != 200:
                break
            # Extract the detail-page hrefs from the result list markup.
            result = re.findall(r'video-list.*?a.*?href="(.*?)"', res.text, re.S)
            # Merge and deduplicate.
            before = len(url_list)
            url_list = list(set(url_list) | set(result))
            # If this page contributed nothing new, further pages won't
            # either — bail out instead of looping forever.
            if len(url_list) == before:
                break
            # Next iteration fetches the next result page.
            self.page += 1
        return url_list[:self.num]

    def bullet_chat(self, url):
        """Fetch the danmaku list for one video detail page.

        Returns a list of danmaku text strings; empty list if the page's
        cid could not be located.
        """
        # Search results use protocol-relative URLs; prepend the scheme.
        if self.base_url not in url:
            url = self.base_url + url
        res = requests.get(url, headers=self.headers)
        # The danmaku pool id (cid) is embedded in the page's initial state.
        cids = re.findall(r'window.__INITIAL_STATE__.*?"cid":([0-9]*)', res.text, re.S)
        # Guard: previously an IndexError was raised when no cid matched.
        if not cids:
            return []
        # The raw danmaku XML endpoint for this cid.
        bullet_url = f'https://comment.bilibili.com/{cids[0]}.xml'
        bullet_xml = requests.get(bullet_url, headers=self.headers)
        # Force UTF-8 so Chinese text decodes correctly.
        bullet_xml.encoding = 'utf-8'
        # Each danmaku is the text content of a <d> element.
        return re.findall(r'<d p=.*?>(.*?)</d>', bullet_xml.text)

    def save_bullet(self, bullet_list):
        """Append the danmaku strings to the output file, one per line."""
        # Nothing to write (e.g. bullet_chat found no cid).
        if not bullet_list:
            return
        with open(self.path, 'a', encoding='utf-8') as f:
            # Trailing newline keeps batches from different videos from
            # being glued together on one line.
            f.write('\n'.join(bullet_list) + '\n')

    def get_dataset(self):
        """Run the full crawl and return the path of the danmaku file."""
        # Start from a clean file if a previous run left one behind.
        if os.path.exists(self.path):
            print("文件已存在即将进行覆盖")
            os.remove(self.path)
        print('正在爬取弹幕......')
        url_list = self.get_video_url()
        for url in url_list:
            bullet_list = self.bullet_chat(url)
            self.save_bullet(bullet_list)
        print("爬取弹幕成功")
        return self.path
if __name__ == '__main__':
    # Entry point: crawl danmaku for the keyword, up to 300 video links.
    crawler = reptile('2024年巴黎运动会', 300)
    crawler.get_dataset()
Loading…
Cancel
Save