import re
import time

import bs4
import requests


class Scawler:
    """Scrape danmaku (bullet comments) from Bilibili search results.

    Workflow: fetch a search-result page, extract video links with CSS
    selectors, resolve each video's cid from its page HTML, then download
    and parse the danmaku XML via Bilibili's comment API.
    """

    def __init__(self):
        self.url_ref = 'https://search.bilibili.com/all?vt=83547368&keyword=LLM'
        self.headers = {
            # Referer (anti-hotlinking): tells the server which page the
            # request was navigated from.
            # "Referer": "https://www.bilibili.com/video/BV1454y187Er/",
            "Referer": self.url_ref,
            # User-Agent: basic browser/device identity.
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        }
        # Danmaku XML endpoint, keyed by the video's cid.
        self.url_interface_cid = "https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        # Search-result page template (0-based page index, 30 items per page).
        self.url_page_base = "https://search.bilibili.com/all?vt=85151086&keyword=LLM&page={page}&o={offset}"

    def get_cid(self, url):
        """Fetch a video page and extract its cid (comment id).

        Returns the cid as a string, or None when no cid is found.
        """
        response = requests.get(url, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        text = response.text
        # FIX: the original pattern r'(.*?)' had its HTML tags stripped;
        # restore the <title> match it was clearly written for.
        match = re.search(r'<title[^>]*>(.*?)</title>', text)
        if match:
            title_content = match.group(1)
            # print(f"title_content: {title_content}")
        # The cid is embedded in the page's inline JSON state.
        match = re.search(r'"cid":(\d+)', text)
        if match:
            return match.group(1)
        return None

    def get_from_cid(self, cid):
        """Download the danmaku XML for *cid* and return the comment texts."""
        # Local variable instead of the original throwaway self.url_cid:
        # nothing else read that attribute, and mutating instance state
        # per call was an accident waiting to happen.
        url_cid = self.url_interface_cid.format(cid=cid)
        response = requests.get(url_cid, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        return self._parse_danmaku_xml(response.text)

    @staticmethod
    def _parse_danmaku_xml(text):
        """Parse danmaku XML into a list of comment text strings.

        Each danmaku is `<d p="...">text</d>` where the p attribute packs
        8 comma-separated fields: time,type,size,color,timestamp,pool,uid,id.
        FIX: the original pattern r'([^<]*)' had its tags stripped; with a
        single group, findall returns bare strings and match[0].split(',')
        would have indexed characters instead of groups.
        """
        danmaku_list = []
        for p_attr, content in re.findall(r'<d p="(.*?)">([^<]*)</d>', text):
            params = p_attr.split(',')
            danmaku = {
                'time': float(params[0]),     # appearance time (seconds)
                'type': int(params[1]),       # danmaku type
                'size': int(params[2]),       # font size
                'color': int(params[3]),      # color
                'timestamp': int(params[4]),  # send time (epoch seconds)
                'pool': int(params[5]),       # danmaku pool
                'uid': params[6],             # user id (hashed)
                'id': params[7],              # danmaku id
                'text': content,              # comment text
            }
            danmaku_list.append(danmaku)
        return [d['text'] for d in danmaku_list]

    def get_html(self, page):
        """Fetch the HTML of search-result page *page* (1-based)."""
        page -= 1  # the site's page/offset parameters are 0-based
        url_base = self.url_page_base.format(page=page, offset=page * 30)
        response = requests.get(url_base, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        return response.text

    def parse_html(self, html, num):
        """Extract up to *num* video links from search HTML and collect danmaku.

        Returns a flat list of danmaku text strings across all videos.
        """
        soup = bs4.BeautifulSoup(html, 'html.parser')
        danmaku_list = []
        for i in range(num):
            selector = f"#i_cecream > div > div:nth-child(2) > div.search-content.search-content--gray > div > div > div > div.video.i_wrapper.search-all-list > div > div:nth-child({i+1}) > div > div.bili-video-card__wrap > a"
            element = soup.select_one(selector)
            if element:
                link_url = element.get('href')
                link_text = element.get_text(strip=True)  # strip leading/trailing whitespace
                # FIX: the original replace("//", "https://") rewrote EVERY
                # "//" in the URL; only the protocol-relative prefix should
                # gain a scheme.
                if link_url.startswith("//"):
                    link_url = "https:" + link_url
                print("提取结果:")
                print(f"链接地址: {link_url}")
                print(f"链接文本: {link_text}")
                cid = self.get_cid(link_url)
                print(f"得到CID: {cid}")
                # FIX: guard against get_cid() returning None — formatting
                # None into the API URL would request oid=None.
                if cid:
                    danmaku_list.extend(self.get_from_cid(cid))
            else:
                print("未找到匹配的元素。")
            time.sleep(0.5)  # be polite: throttle requests
        print("弹幕获取完成")
        return danmaku_list

    def work(self, page, num):
        """Scrape danmaku from the first *num* results on search page *page*."""
        html = self.get_html(page)
        ls = self.parse_html(html, num)
        return ls


if __name__ == "__main__":
    scrawl = Scawler()
    scrawl.work(1, 2)