import re
import time

import bs4
import requests
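
# Scrapes Bilibili danmaku (bullet comments): pulls video links off a
# search-results page, resolves each video's cid from its page HTML, then
# downloads the danmaku XML for that cid from the list.so endpoint.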


class Crawler:
    def __init__(self):
        # Search-results page URL; reused as the Referer for every request.
        self.url_ref = 'https://search.bilibili.com/all?vt=83547368&keyword=LLM'
        self.headers = {
            # Referer: anti-hotlinking check; tells the server which page the
            # request was navigated from.
            # "Referer": "https://www.bilibili.com/video/BV1454y187Er/",
            "Referer": self.url_ref,
            # User-Agent: basic identity of the browser/device making the request.
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        }
        # Danmaku XML endpoint, keyed by the video's cid.
        self.url_interface_cid = "https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        # Search-results page template; `o` is the result offset (30 per page).
        self.url_page_base = "https://search.bilibili.com/all?vt=85151086&keyword=LLM&page={page}&o={offset}"

    # Fetch a video page and extract its cid from the inline player config.
    def get_cid(self, url):
        response = requests.get(url, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        text = response.text

        # The <title> element carries the video title (handy for debugging).
        match = re.search(r'<title>(.*?)</title>', text)
        if match:
            title_content = match.group(1)
            # print(f"title_content: {title_content}")

        # The page embeds the cid as `"cid":<digits>` in its inline JSON.
        match = re.search(r'"cid":(\d+)', text)
        if match:
            return match.group(1)
        return None
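
    # Hedged alternative sketch: Bilibili also exposes a JSON "pagelist"
    # endpoint that returns the cid of each video part, avoiding the regex
    # scrape above. The endpoint and response shape here are assumptions from
    # common community usage and may change without notice.
    def get_cid_via_api(self, bvid):
        url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}"
        response = requests.get(url, headers=self.headers, timeout=10)
        data = response.json()
        # code == 0 signals success; data lists the video's parts, P1 first.
        if data.get("code") == 0 and data.get("data"):
            return str(data["data"][0]["cid"])
        return None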

    # Fetch danmaku text for a cid via the Bilibili list.so endpoint.
    def get_from_cid(self, cid):
        url_cid = self.url_interface_cid.format(cid=cid)
        response = requests.get(url_cid, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        text = response.text
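
        # Shape of the list.so payload the parser below expects, as a minimal
        # illustrative sketch (values are made up, not real data):
        #   <?xml version="1.0" encoding="UTF-8"?>
        #   <i>
        #     <d p="12.3,1,25,16777215,1700000000,0,abcdef12,9876543210">example text</d>
        #   </i>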

        # Parse every <d> element; the comma-separated `p` attribute packs the
        # metadata fields documented inline below.
        def get_parse_list(text):
            danmaku_list = []
            pattern = r'<d p="([^"]*)">([^<]*)</d>'
            matches = re.findall(pattern, text)
            for match in matches:
                params = match[0].split(',')
                danmaku = {
                    'time': float(params[0]),     # appearance time (seconds into the video)
                    'type': int(params[1]),       # danmaku type (display mode)
                    'size': int(params[2]),       # font size
                    'color': int(params[3]),      # color (decimal RGB)
                    'timestamp': int(params[4]),  # send time (Unix timestamp)
                    'pool': int(params[5]),       # danmaku pool
                    'uid': params[6],             # sender ID (hashed)
                    'id': params[7],              # danmaku ID
                    'text': match[1],             # danmaku text
                }
                danmaku_list.append(danmaku)
            return [i['text'] for i in danmaku_list]

        # print(get_parse_list(text))
        danmaku_list = get_parse_list(text)
        return danmaku_list

    # Fetch the HTML of one search-results page.
    def get_html(self, page):
        # Convert to a 0-based page index; each page holds 30 results.
        page -= 1
        url_base = self.url_page_base.format(page=page, offset=page * 30)
        response = requests.get(url_base, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        return response.text
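
    # For example, get_html(1) requests page=0 with o=0, get_html(2) requests
    # page=1 with o=30, and so on.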

    # Parse the search-results HTML, extract video links, and collect danmaku.
    def parse_html(self, html, num):
        soup = bs4.BeautifulSoup(html, 'html.parser')
        danmaku_list = []
        for i in range(num):
            # Position-based selector for the (i+1)-th video card; brittle
            # against layout changes (see the class-based sketch after this
            # method).
            selector = f"#i_cecream > div > div:nth-child(2) > div.search-content.search-content--gray > div > div > div > div.video.i_wrapper.search-all-list > div > div:nth-child({i+1}) > div > div.bili-video-card__wrap > a"
            element = soup.select_one(selector)
            if element:
                # Extract the link target (href attribute).
                link_url = element.get('href')
                # Extract the element's text content, trimming surrounding whitespace.
                link_text = element.get_text(strip=True)
                # hrefs here are protocol-relative ("//www.bilibili.com/...");
                # prepend the scheme rather than doing a blanket
                # replace("//", ...), which would corrupt any other "//" in the URL.
                if link_url.startswith("//"):
                    link_url = "https:" + link_url
                print("Extracted result:")
                print(f"Link URL: {link_url}")
                print(f"Link text: {link_text}")
                cid = self.get_cid(link_url)
                print(f"Got cid: {cid}")
                # Skip videos whose cid could not be resolved.
                if cid:
                    danmaku_list.extend(self.get_from_cid(cid))
            else:
                print("No matching element found.")
            # Throttle requests to be polite to the server.
            time.sleep(0.5)
        print("Danmaku collection finished.")
        return danmaku_list
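
    # Hedged, less brittle alternative: select every video-card anchor by the
    # card's CSS class (taken from the selector above) instead of by position,
    # and return the first `num` hrefs. A sketch only; the class names are
    # upstream markup and may change.
    def get_video_links(self, html, num):
        soup = bs4.BeautifulSoup(html, 'html.parser')
        anchors = soup.select("div.bili-video-card__wrap > a")[:num]
        return [a.get('href') for a in anchors if a.get('href')]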

    # Convenience wrapper: fetch one search page and scrape `num` videos from it.
    def work(self, page, num):
        html = self.get_html(page)
        ls = self.parse_html(html, num)
        return ls


if __name__ == "__main__":
    # import cProfile as pf
    # from pstats import SortKey as sk
    # import pstats
    # p = pstats.Stats("profiler_stats")
    # p.strip_dirs().sort_stats(sk.TIME).print_stats(30)

    crawler = Crawler()
    crawler.work(1, 2)
    # pf.run("crawler.work(1, 2)", "profiler_stats")