You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

120 lines
4.6 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import re, requests, bs4, time
class Scawler:
    """Scrape Bilibili search results for the keyword "LLM" and collect
    danmaku (bullet-comment) texts.

    Pipeline: search-result page HTML -> per-video links -> each video's
    cid -> danmaku XML from the comment API -> list of comment strings.
    """

    def __init__(self):
        # Search page used as the Referer (anti-hotlinking) header value.
        self.url_ref = 'https://search.bilibili.com/all?vt=83547368&keyword=LLM'
        self.headers = {
            # Referer: tells the server which page the request came from.
            "Referer": self.url_ref,
            # User-Agent: basic browser/device identity string.
            "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        }
        # Danmaku XML endpoint, keyed by a video's cid.
        self.url_interface_cid = "https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        # Search-result page template; `page`/`o` are 0-based in the query string.
        self.url_page_base = "https://search.bilibili.com/all?vt=85151086&keyword=LLM&page={page}&o={offset}"

    def get_cid(self, url):
        """Fetch a video page and extract its cid.

        Returns the cid as a string, or None when no `"cid":<digits>`
        pattern is present in the page source.
        """
        response = requests.get(url, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        match = re.search(r'"cid":(\d+)', response.text)
        return match.group(1) if match else None

    @staticmethod
    def _parse_danmaku_xml(text):
        """Parse danmaku XML (`<d p="...">text</d>` elements) and return
        the list of danmaku text bodies, in document order.

        Raises ValueError if a `p` attribute's numeric fields are malformed.
        """
        danmaku_list = []
        pattern = r'<d p="([^"]*)">([^<]*)</d>'
        for attrs, body in re.findall(pattern, text):
            params = attrs.split(',')
            danmaku_list.append({
                'time': float(params[0]),     # appearance time (seconds)
                'type': int(params[1]),       # danmaku type
                'size': int(params[2]),       # font size
                'color': int(params[3]),      # color
                'timestamp': int(params[4]),  # send time
                'pool': int(params[5]),       # danmaku pool
                'uid': params[6],             # sender id (hashed)
                'id': params[7],              # danmaku id
                'text': body,                 # danmaku content
            })
        return [d['text'] for d in danmaku_list]

    def get_from_cid(self, cid):
        """Fetch the danmaku XML for *cid* via the Bilibili comment API and
        return the list of danmaku texts."""
        self.url_cid = self.url_interface_cid.format(cid=cid)
        response = requests.get(self.url_cid, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        return self._parse_danmaku_xml(response.text)

    def get_html(self, page):
        """Return the HTML of search-result page *page* (1-based)."""
        page -= 1  # the site's query parameters are 0-based
        url_base = self.url_page_base.format(page=page, offset=page * 30)
        response = requests.get(url_base, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        return response.text

    def parse_html(self, html, num):
        """Extract up to *num* video links from search-page *html*, fetch
        each video's danmaku, and return the combined text list."""
        soup = bs4.BeautifulSoup(html, 'html.parser')
        danmaku_list = []
        for i in range(num):
            # Position-based CSS path to the (i+1)-th result card's link.
            selector = f"#i_cecream > div > div:nth-child(2) > div.search-content.search-content--gray > div > div > div > div.video.i_wrapper.search-all-list > div > div:nth-child({i+1}) > div > div.bili-video-card__wrap > a"
            element = soup.select_one(selector)
            if element:
                link_url = element.get('href')
                link_text = element.get_text(strip=True)  # strip=True trims surrounding whitespace
                # hrefs are protocol-relative ("//www.bilibili.com/...");
                # only prepend the scheme, never rewrite interior "//".
                if link_url.startswith("//"):
                    link_url = "https:" + link_url
                print("提取结果:")
                print(f"链接地址: {link_url}")
                print(f"链接文本: {link_text}")
                cid = self.get_cid(link_url)
                print(f"得到CID: {cid}")
                if cid is not None:  # skip videos whose cid could not be found
                    danmaku_list.extend(self.get_from_cid(cid))
            else:
                print("未找到匹配的元素。")
            time.sleep(0.5)  # throttle requests to be polite to the server
        print("弹幕获取完成")
        return danmaku_list

    def work(self, page, num):
        """Scrape danmaku for the first *num* videos on result page *page*."""
        html = self.get_html(page)
        return self.parse_html(html, num)
if __name__ == "__main__":
    # Scrape search-result page 1 and collect danmaku from the first 2 videos.
    scrawl = Scawler()
    scrawl.work(1, 2)