import re
import time

import bs4
import requests
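
# Scrapes Bilibili danmaku (bullet comments): pulls video links off a
# search-results page, resolves each video's cid from its page HTML, then
# downloads the danmaku XML for that cid from the list.so endpoint.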


class Crawler:
    def __init__(self):
        # Search-results page URL; reused as the Referer for every request.
        self.url_ref = 'https://search.bilibili.com/all?vt=83547368&keyword=LLM'
        self.headers = {
            # Referer: anti-hotlinking check; tells the server which page the
            # request was navigated from.
            # "Referer": "https://www.bilibili.com/video/BV1454y187Er/",
            "Referer": self.url_ref,
            # User-Agent: basic identity of the browser/device making the request.
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        }
        # Danmaku XML endpoint, keyed by the video's cid.
        self.url_interface_cid = "https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        # Search-results page template; `o` is the result offset (30 per page).
        self.url_page_base = "https://search.bilibili.com/all?vt=85151086&keyword=LLM&page={page}&o={offset}"

    # Fetch a video page and extract its cid from the inline player config.
    def get_cid(self, url):
        response = requests.get(url, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        text = response.text

        # The <title> element carries the video title (handy for debugging).
        match = re.search(r'<title>(.*?)</title>', text)
        if match:
            title_content = match.group(1)
            # print(f"title_content: {title_content}")

        # The page embeds the cid as `"cid":<digits>` in its inline JSON.
        match = re.search(r'"cid":(\d+)', text)
        if match:
            return match.group(1)
        return None
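
    # Hedged alternative sketch: Bilibili also exposes a JSON "pagelist"
    # endpoint that returns the cid of each video part, avoiding the regex
    # scrape above. The endpoint and response shape here are assumptions from
    # common community usage and may change without notice.
    def get_cid_via_api(self, bvid):
        url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}"
        response = requests.get(url, headers=self.headers, timeout=10)
        data = response.json()
        # code == 0 signals success; data lists the video's parts, P1 first.
        if data.get("code") == 0 and data.get("data"):
            return str(data["data"][0]["cid"])
        return None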

    # Fetch danmaku text for a cid via the Bilibili list.so endpoint.
    def get_from_cid(self, cid):
        url_cid = self.url_interface_cid.format(cid=cid)
        response = requests.get(url_cid, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        text = response.text
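
        # Shape of the list.so payload the parser below expects, as a minimal
        # illustrative sketch (values are made up, not real data):
        #   <?xml version="1.0" encoding="UTF-8"?>
        #   <i>
        #     <d p="12.3,1,25,16777215,1700000000,0,abcdef12,9876543210">example text</d>
        #   </i>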

        # Parse every <d> element; the comma-separated `p` attribute packs the
        # metadata fields documented inline below.
        def get_parse_list(text):
            danmaku_list = []
            pattern = r'<d p="([^"]*)">([^<]*)</d>'
            matches = re.findall(pattern, text)
            for match in matches:
                params = match[0].split(',')
                danmaku = {
                    'time': float(params[0]),     # appearance time (seconds into the video)
                    'type': int(params[1]),       # danmaku type (display mode)
                    'size': int(params[2]),       # font size
                    'color': int(params[3]),      # color (decimal RGB)
                    'timestamp': int(params[4]),  # send time (Unix timestamp)
                    'pool': int(params[5]),       # danmaku pool
                    'uid': params[6],             # sender ID (hashed)
                    'id': params[7],              # danmaku ID
                    'text': match[1],             # danmaku text
                }
                danmaku_list.append(danmaku)
            return [i['text'] for i in danmaku_list]

        # print(get_parse_list(text))
        danmaku_list = get_parse_list(text)
        return danmaku_list

    # Fetch the HTML of one search-results page.
    def get_html(self, page):
        # Convert to a 0-based page index; each page holds 30 results.
        page -= 1
        url_base = self.url_page_base.format(page=page, offset=page * 30)
        response = requests.get(url_base, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        return response.text
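
    # For example, get_html(1) requests page=0 with o=0, get_html(2) requests
    # page=1 with o=30, and so on.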

    # Parse the search-results HTML, extract video links, and collect danmaku.
    def parse_html(self, html, num):
        soup = bs4.BeautifulSoup(html, 'html.parser')
        danmaku_list = []
        for i in range(num):
            # Position-based selector for the (i+1)-th video card; brittle
            # against layout changes (see the class-based sketch after this
            # method).
            selector = f"#i_cecream > div > div:nth-child(2) > div.search-content.search-content--gray > div > div > div > div.video.i_wrapper.search-all-list > div > div:nth-child({i+1}) > div > div.bili-video-card__wrap > a"
            element = soup.select_one(selector)
            if element:
                # Extract the link target (href attribute).
                link_url = element.get('href')
                # Extract the element's text content, trimming surrounding whitespace.
                link_text = element.get_text(strip=True)
                # hrefs here are protocol-relative ("//www.bilibili.com/...");
                # prepend the scheme rather than doing a blanket
                # replace("//", ...), which would corrupt any other "//" in the URL.
                if link_url.startswith("//"):
                    link_url = "https:" + link_url
                print("Extracted result:")
                print(f"Link URL: {link_url}")
                print(f"Link text: {link_text}")
                cid = self.get_cid(link_url)
                print(f"Got cid: {cid}")
                # Skip videos whose cid could not be resolved.
                if cid:
                    danmaku_list.extend(self.get_from_cid(cid))
            else:
                print("No matching element found.")
            # Throttle requests to be polite to the server.
            time.sleep(0.5)
        print("Danmaku collection finished.")
        return danmaku_list
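
    # Hedged, less brittle alternative: select every video-card anchor by the
    # card's CSS class (taken from the selector above) instead of by position,
    # and return the first `num` hrefs. A sketch only; the class names are
    # upstream markup and may change.
    def get_video_links(self, html, num):
        soup = bs4.BeautifulSoup(html, 'html.parser')
        anchors = soup.select("div.bili-video-card__wrap > a")[:num]
        return [a.get('href') for a in anchors if a.get('href')]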

    # Convenience wrapper: fetch one search page and scrape `num` videos from it.
    def work(self, page, num):
        html = self.get_html(page)
        ls = self.parse_html(html, num)
        return ls


if __name__ == "__main__":
    # import cProfile as pf
    # from pstats import SortKey as sk
    # import pstats
    # p = pstats.Stats("profiler_stats")
    # p.strip_dirs().sort_stats(sk.TIME).print_stats(30)

    crawler = Crawler()
    crawler.work(1, 2)
    # pf.run("crawler.work(1, 2)", "profiler_stats")