commit
e1cb4683bd
@@ -0,0 +1,84 @@
import re
import json

import requests
from bs4 import BeautifulSoup


# Fetch search results: collect up to `limit` BV ids for the given keyword
def get_search_result_bv(key_word, limit):
    bv_list = []
    count = 0
    page_no = 1
    while count < limit:
        search_url = (
            "https://search.bilibili.com/all?keyword="
            + key_word
            + "&pages="
            + str(page_no)
        )
        head = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
            "referer": "https://www.bilibili.com/",
        }
        response_data = requests.get(url=search_url, headers=head)
        soup = BeautifulSoup(response_data.text, "lxml")
        results = soup.find_all("div", class_="bili-video-card__info--right")
        # Stop if a page returns no result cards, otherwise the loop would
        # keep requesting empty pages without ever reaching `limit`
        if not results:
            break
        try:
            for result in results:
                curr_href = result.a["href"].strip("/")
                # Filter live streams (live.bilibili) out of the search results
                info = curr_href.split("/")
                if info[1] == "video":
                    bv_list.append(info[2])
                    count += 1
                    if count >= limit:
                        break
        except IndexError:
            print("Out of Index at [" + __name__ + "]! Failed to fetch search results?")
            print("http code: {code}".format(code=response_data.status_code))
            break
        page_no += 1
    return bv_list


# Resolve each BV id to the cid of its first page via the pagelist API
def get_cid(bv_list):
    cid_list = []
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
        "referer": "https://www.bilibili.com/",
    }
    for bv in bv_list:
        url = "https://api.bilibili.com/x/player/pagelist?bvid=" + bv
        response_data = requests.get(url=url, headers=head)
        data = json.loads(response_data.text)
        # Take the cid of the first page (part) of each video
        cid_list.append(data["data"][0]["cid"])
    return cid_list


# Download danmaku (bullet comments) for each cid from the XML endpoint
def get_comments(cid_list):
    comments_list = []
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
        "referer": "https://www.bilibili.com/",
    }
    for cid in cid_list:
        curr_url = "https://comment.bilibili.com/" + str(cid) + ".xml"
        response_data = requests.get(url=curr_url, headers=head)
        response_data.encoding = "utf-8"
        soup = BeautifulSoup(response_data.text, "xml")
        # Each <d> element in the XML holds one danmaku comment
        comments = soup.find_all("d")
        for comment in comments:
            comments_list.append(comment.text)
    return comments_list


# Return the comments that match any of the given regex rules, deduplicated
def comments_filter(rules, comments_list):
    patterns = []
    res = []
    for rule in rules:
        patterns.append(re.compile(rule, re.IGNORECASE))
    for comment in comments_list:
        for pattern in patterns:
            if pattern.match(comment):
                # Avoid duplicates when a comment matches several rules
                if comment not in res:
                    res.append(comment)
    return res
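

# Minimal usage sketch wiring the helpers above together end to end.
# The keyword, result limit, and regex rule are illustrative placeholders,
# not values taken from this commit.
if __name__ == "__main__":
    bvs = get_search_result_bv("python", 5)
    cids = get_cid(bvs)
    comments = get_comments(cids)
    # Example rule: keep comments that start with "2333"-style laughter
    filtered = comments_filter([r"2333+"], comments)
    for c in filtered:
        print(c)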