You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

94 lines
4.7 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import re
import os
class reptile:
    """Collect bullet comments (danmaku) from Bilibili videos found by keyword.

    The workflow is: search bilibili.com for ``key_word``, gather up to
    ``num`` video detail-page links, download the bullet-comment XML of each
    video, and append the comment texts to a single text file.
    """

    # Seconds to wait for any single HTTP request before giving up, so a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, key_word, num):
        """Store the search settings and build the request headers.

        Args:
            key_word: Search keyword used on the Bilibili search page.
            num: Maximum number of video links to collect.
        """
        # Bilibili search keyword.
        self.key_word = key_word
        # Number of video links that should be crawled.
        self.num = num
        # Current page number of the search results ("comprehensive" sort).
        self.page = 1
        # Browser-like headers so the requests are not rejected.
        # NOTE(review): the Cookie below embeds a real session credential
        # (SESSDATA / bili_jct). It is kept unchanged to preserve behavior,
        # but it should be rotated and loaded from configuration instead of
        # being committed to source control.
        self.headers = {
            'Accept-Encoding':'gzip, deflate, br',
            'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.60',
            'Cookie':"buvid3=294A4FDA-B28B-E848-02AF-4CC85F21E60591171infoc; b_nut=1719198491; _uuid=81083F39F-D9B8-E9C6-F81E-10DC77867B7C691969infoc; enable_web_push=DISABLE; buvid4=C6405F50-B303-58B0-92EB-FC0454BFDAEF92810-024062403-BZe5%2BJpTiegiNfU4N%2B1JIg%3D%3D; buvid_fp=723bec487c6799b3457440aa4e151029; header_theme_version=CLOSE; CURRENT_FNVAL=4048; SESSDATA=c2d43fe3%2C1734787697%2Ce83be%2A61CjCuaG1-EyZeMtp2_27iesQNI-S-Cm9CylwH3d1egMq6ECCdB1iVx2W84e9xdpWrnHgSVk5mSVNJZU1ibHNqYXFiNHlCckNEWS1wSEFwZ2ZkX0lRa1NVWFBhdDQ4WExRLUJjdGdURy1YVHRudm9OTFlrb0JkSFVZMXNVSW1ZYTRndEF1ZHNkcTZBIIEC; bili_jct=f9658d85c4fff75f379bafb9d8640c84; DedeUserID=3493115639892125; DedeUserID__ckMd5=c9eb7af8c9ebf568; rpdid=|(u))kkY|mm)0J'u~umkJl~R); is-2022-channel=1; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYyMzcxNjgsImlhdCI6MTcyNTk3NzkwOCwicGx0IjotMX0.QLV_A_4RlS1JqNtyCErEjqyFnEVNNA4LpW9Ea78HJyg; bili_ticket_expires=1726237108; home_feed_column=5; browser_resolution=1653-834; bsource=search_baidu; bmg_af_switch=1; bmg_src_def_domain=i0.hdslb.com; sid=57thd76i; b_lsid=D3749D16_191E5E181D7; bp_t_offset_3493115639892125=976252148054491136",
        }
        # Scheme prefix added to links that come without one.
        self.base_url = 'https:'
        # Path of the text file that accumulates the collected comments.
        self.path = f"./关于b站视频{self.key_word}的弹幕.txt"

    def get_video_url(self):
        """Collect video detail-page links from the search result pages.

        Pages are fetched starting at ``self.page`` (which is advanced as
        pages are consumed) until ``self.num`` distinct links are found, a
        request returns a non-200 status, or a page adds no new link.

        Returns:
            A list of at most ``self.num`` distinct links, in the order they
            were first seen. May be shorter (or empty) when the search
            cannot supply enough results.
        """
        url_list = []
        search_url = 'https://search.bilibili.com/all'
        # Regex that pulls link targets out of the search page HTML.
        reg_exp = 'video-list.*?a.*?href="(.*?)"'
        while len(url_list) < self.num:
            # Passing the query via ``params`` lets requests URL-encode the
            # keyword, so keywords containing '&', '#' or spaces stay intact.
            params = {
                'keyword': self.key_word,
                'from_source': 'webtop_search',
                'spm_id_from': '333.1007',
                'search_source': '2',
                'page': self.page,
            }
            res = requests.get(
                url=search_url,
                params=params,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            # Stop on an inaccessible page but keep what was collected so
            # far, so the return value is always a list.
            if res.status_code != 200:
                print(f"搜索页面请求失败,状态码: {res.status_code}")
                break
            result = re.findall(reg_exp, res.text, re.S)
            # Order-preserving de-duplication.
            merged = list(dict.fromkeys(url_list + result))
            # The next iteration visits the next page.
            self.page += 1
            # A page that contributes nothing new means the results are
            # exhausted; stop instead of looping forever.
            if len(merged) == len(url_list):
                break
            url_list = merged
        return url_list[0:self.num]

    def bullet_chat(self, url):
        """Fetch the bullet comments of one video detail page.

        Args:
            url: Link to the video detail page; ``self.base_url`` is
                prepended when the link does not already contain it.

        Returns:
            A list of comment strings; empty when no cid can be found in the
            page or the comment XML holds no comments.
        """
        if self.base_url not in url:
            url = self.base_url + url
        res = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        # The page embeds the cid that identifies the comment pool.
        reg_find_bullet_cid = 'window.__INITIAL_STATE__.*?"cid":([0-9]*)'
        cid_match = re.search(reg_find_bullet_cid, res.text, re.S)
        # The pattern may be absent, or match an empty digit run; either way
        # there is no comment pool to download.
        if cid_match is None or not cid_match.group(1):
            return []
        bullet_cid = cid_match.group(1)
        # Address of the comment pool XML.
        bullet_url = f'https://comment.bilibili.com/{bullet_cid}.xml'
        bullet_xml = requests.get(
            bullet_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        # Force UTF-8 so the Chinese comment text decodes correctly.
        bullet_xml.encoding = 'utf-8'
        # Extract the text content of every <d> element.
        reg_find_bullet_content = '<d p=.*?>(.*?)</d>'
        return re.findall(reg_find_bullet_content, bullet_xml.text)

    def save_bullet(self, bullet_list):
        """Append comments to ``self.path``, one comment per line.

        Every comment is terminated by a newline so that successive calls do
        not glue the last comment of one batch to the first of the next.

        Args:
            bullet_list: Comment strings to append; may be empty.
        """
        with open(self.path, 'a', encoding='utf-8') as f:
            f.writelines(f"{bullet}\n" for bullet in bullet_list)

    def get_dataset(self):
        """Run the whole crawl and return the path of the output file.

        An existing output file is removed first so the result only holds
        comments from this run.

        Returns:
            ``self.path``.
        """
        if os.path.exists(self.path):
            print("文件已存在即将进行覆盖")
            os.remove(self.path)
        print('正在爬取弹幕......')
        for url in self.get_video_url():
            self.save_bullet(self.bullet_chat(url))
        print("爬取弹幕成功")
        return self.path
if __name__ == '__main__':
    # Use a distinct name for the instance so the ``reptile`` class is not
    # overwritten at module level.
    crawler = reptile('2024年巴黎运动会AI应用', 300)
    crawler.get_dataset()