import re, requests, bs4, time
class Scawler:
    """Scrape Bilibili search results for videos and collect their danmaku
    (bullet comments) via the public danmaku XML endpoint."""

    def __init__(self):
        # Search page used as the Referer so requests look like in-site navigation.
        self.url_ref = 'https://search.bilibili.com/all?vt=83547368&keyword=LLM'
        self.headers = {
            # Referer (anti-hotlink): tells the server which page the request came from.
            "Referer": self.url_ref,
            # User-Agent: basic browser/device identity.
            "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        }
        # Danmaku XML endpoint, keyed by the video's cid.
        self.url_interface_cid = "https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        # Search result page template; page is 0-based and offset = page * 30.
        self.url_page_base = "https://search.bilibili.com/all?vt=85151086&keyword=LLM&page={page}&o={offset}"

    @staticmethod
    def _extract_cid(text):
        """Return the first cid embedded in a video page's HTML, or None.

        The cid appears inside the page's inlined JSON state as '"cid":<digits>'.
        """
        match = re.search(r'"cid":(\d+)', text)
        return match.group(1) if match else None

    def get_cid(self, url):
        """Fetch a video page and return its cid as a string, or None if absent."""
        response = requests.get(url, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        return self._extract_cid(response.text)

    @staticmethod
    def _parse_danmaku_xml(text):
        """Parse a danmaku XML payload and return the list of comment texts.

        Each danmaku element looks like <d p="attr1,attr2,...">comment</d>,
        where the p attribute packs 8+ comma-separated fields.
        """
        danmaku_list = []
        # NOTE: pattern reconstructed — the original literal had its XML tags
        # stripped; two capture groups (attributes, text) are required below.
        pattern = r'<d p="([^"]*)">([^<]*)</d>'
        for attrs, content in re.findall(pattern, text):
            params = attrs.split(',')
            danmaku = {
                'time': float(params[0]),     # appearance time in the video (s)
                'type': int(params[1]),       # danmaku type (scroll/top/bottom...)
                'size': int(params[2]),       # font size
                'color': int(params[3]),      # colour as decimal RGB
                'timestamp': int(params[4]),  # wall-clock send time
                'pool': int(params[5]),       # danmaku pool id
                'uid': params[6],             # sender id (hashed)
                'id': params[7],              # danmaku id
                'text': content,              # comment text
            }
            danmaku_list.append(danmaku)
        return [d['text'] for d in danmaku_list]

    def get_from_cid(self, cid):
        """Download and parse the danmaku list for a given cid.

        Returns a list of comment text strings.
        """
        url_cid = self.url_interface_cid.format(cid=cid)
        response = requests.get(url_cid, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        return self._parse_danmaku_xml(response.text)

    def get_html(self, page):
        """Fetch one search-result page (1-based page number) and return its HTML."""
        page -= 1  # the site's page/offset parameters are 0-based
        url_base = self.url_page_base.format(page=page, offset=page * 30)
        response = requests.get(url_base, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        return response.text

    def parse_html(self, html, num):
        """Extract up to `num` video links from search HTML and collect their danmaku.

        Returns the combined list of danmaku texts from all matched videos.
        """
        soup = bs4.BeautifulSoup(html, 'html.parser')
        danmaku_list = []
        for i in range(num):
            # Position-based selector for the i-th video card in the result list.
            selector = f"#i_cecream > div > div:nth-child(2) > div.search-content.search-content--gray > div > div > div > div.video.i_wrapper.search-all-list > div > div:nth-child({i+1}) > div > div.bili-video-card__wrap > a"
            element = soup.select_one(selector)
            if element:
                link_url = element.get('href')
                link_text = element.get_text(strip=True)  # strip leading/trailing whitespace
                # Protocol-relative URL ("//www...") -> absolute https URL.
                # (A plain replace("//", ...) would also corrupt any later "//".)
                if link_url.startswith("//"):
                    link_url = "https:" + link_url
                print("提取结果:")
                print(f"链接地址: {link_url}")
                print(f"链接文本: {link_text}")
                cid = self.get_cid(link_url)
                print(f"得到CID: {cid}")
                # Skip videos whose cid could not be found instead of
                # formatting "oid=None" into the danmaku URL.
                if cid is not None:
                    danmaku_list.extend(self.get_from_cid(cid))
            else:
                print("未找到匹配的元素。")
            time.sleep(0.5)  # throttle requests to be polite to the server
        print("弹幕获取完成")
        return danmaku_list

    def work(self, page, num):
        """Scrape search page `page`, first `num` results; return danmaku texts."""
        html = self.get_html(page)
        return self.parse_html(html, num)
if __name__ == "__main__":
# import cProfile as pf
# from pstats import SortKey as sk
# import pstats
# p = pstats.Stats("profiler_stats")
# p.strip_dirs().sort_stats(sk.TIME).print_stats(30)
scrawl = Scawler()
scrawl.work(1, 2)
# pf.run("scrawl.work(1, 2)", "profiler_stats")
pass