提交完整的代码

11 months ago · e1cb4683bd
commit e1cb4683bd
2 changed files with 87 additions and 0 deletions
--- a/crawlerCore.py
+++ b/crawlerCore.py
@ -0,0 +1,84 @@
 import re
 from os import remove
 import requests
 import json
 from bs4 import BeautifulSoup
 # Fetch the BV ids of the videos listed in Bilibili search results.
 def get_search_result_bv(key_word, limit):
    """Collect up to ``limit`` video BV ids from Bilibili search result pages.

    Result pages are requested one after another (page 1, 2, ...) and the
    links found in the ``bili-video-card__info--right`` blocks are scanned
    for ``.../video/<bvid>`` URLs.

    Args:
        key_word: Search keyword; it is URL-encoded by ``requests``.
        limit: Maximum number of BV ids to return. A value <= 0 returns an
            empty list without issuing any request.

    Returns:
        A list of unique BV id strings in the order they were found. It may
        be shorter than ``limit`` when the search runs out of results or a
        page cannot be parsed.
    """
    bv_list = []
    seen = set()
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
        "referer": "https://www.bilibili.com/",
    }
    page_no = 1
    while len(bv_list) < limit:
        # Let requests build the query string so the keyword is escaped
        # properly; the paging parameter of the search page is "page".
        response_data = requests.get(
            url="https://search.bilibili.com/all",
            params={"keyword": key_word, "page": page_no},
            headers=head,
            timeout=10,
        )
        soup = BeautifulSoup(response_data.text, "lxml")
        results = soup("div", "bili-video-card__info--right")
        found_new = False
        try:
            for result in results:
                anchor = result.a
                if anchor is None or not anchor.get("href"):
                    # Card without a link: nothing to extract.
                    continue
                # Skip live streams (live.bilibili) that show up in results.
                info = anchor["href"].strip("/").split("/")
                if info[1] == "video" and info[2] not in seen:
                    seen.add(info[2])
                    bv_list.append(info[2])
                    found_new = True
                if len(bv_list) >= limit:
                    break
        except IndexError:
            print("Out of Index at [" + __name__ + "]! 搜索结果获取失败?")
            print("http code: {code}".format(code=response_data.status_code))
            break
        if not found_new:
            # An empty or fully repeated page means there is nothing more to
            # fetch; without this check the loop would never terminate.
            break
        page_no += 1
    return bv_list
 def get_cid(bv_list):
    """Look up the cid of the first page of each video.

    For every BV id the ``x/player/pagelist`` API is queried and the ``cid``
    of the first entry in the response's ``data`` list is collected.

    Args:
        bv_list: Iterable of BV id strings.

    Returns:
        A list of cids. Videos whose response cannot be parsed (invalid
        JSON, missing or empty ``data``) are reported on stdout and skipped,
        so the result may be shorter than ``bv_list``.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
        "referer": "https://www.bilibili.com/",
    }
    cid_list = []
    for bv in bv_list:
        response_data = requests.get(
            url="https://api.bilibili.com/x/player/pagelist",
            params={"bvid": bv},
            headers=head,
            timeout=10,
        )
        try:
            data = json.loads(response_data.text)
            cid = data["data"][0]["cid"]
        except (ValueError, KeyError, IndexError, TypeError):
            # One unavailable video must not abort the whole crawl.
            print("Failed to get cid for {bv} at [{name}]".format(bv=bv, name=__name__))
            print("http code: {code}".format(code=response_data.status_code))
            continue
        cid_list.append(cid)
    return cid_list
 def get_comments(cid_list):
    """Download the comment XML of each cid and return all comment texts.

    For every cid, ``https://comment.bilibili.com/<cid>.xml`` is fetched,
    decoded as UTF-8 and parsed; the text of every ``<d>`` element is
    collected (presumably one element per bullet comment -- verify against
    the feed format).

    Args:
        cid_list: Iterable of cids (anything ``str()`` can render).

    Returns:
        A flat list with the text of every ``<d>`` element, in request order.
    """
    # The headers do not depend on the cid, so build them once.
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
        "referer": "https://www.bilibili.com/",
    }
    comments_list = []
    for cid in cid_list:
        curr_url = "https://comment.bilibili.com/" + str(cid) + ".xml"
        # A timeout keeps one stalled connection from hanging the crawl.
        response_data = requests.get(url=curr_url, headers=head, timeout=10)
        # Force UTF-8 instead of relying on the encoding requests guesses.
        response_data.encoding = "utf-8"
        soup = BeautifulSoup(response_data.text, "xml")
        comments_list.extend(comment.text for comment in soup.find_all("d"))
    return comments_list
 def comments_filter(rules, comments_list):
    """Return the distinct comments that match at least one rule.

    Each rule is compiled case-insensitively and applied with ``match``,
    i.e. anchored at the start of the comment.

    Args:
        rules: Iterable of regular-expression strings.
        comments_list: Iterable of comment strings.

    Returns:
        A list of matching comments without duplicates, in order of first
        appearance.
    """
    patterns = [re.compile(rule, re.IGNORECASE) for rule in rules]
    res = []
    # Set-based membership keeps de-duplication O(1) per comment instead of
    # scanning the result list every time.
    seen = set()
    for comment in comments_list:
        if comment in seen:
            continue
        if any(pattern.match(comment) for pattern in patterns):
            seen.add(comment)
            res.append(comment)
    return res
--- a/main.py
+++ b/main.py
@ -0,0 +1,3 @@
 from crawlerCore import *
 # Rules are matched case-insensitively from the start of each comment.
 rules = [r'[^a-z]*AI[^a-z]*']
 # Pipeline: search results -> BV ids -> cids -> comment texts -> filtered.
 bv_ids = get_search_result_bv("2024巴黎奥运会", 300)
 cids = get_cid(bv_ids)
 all_comments = get_comments(cids)
 print(comments_filter(rules, all_comments))