From e1cb4683bd727d51839454498d4275e8b1d769b5 Mon Sep 17 00:00:00 2001
From: cflsxjw
Date: Wed, 18 Sep 2024 11:12:32 +0800
Subject: [PATCH] Submit the complete code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crawlerCore.py | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++
 main.py        |  3 ++
 2 files changed, 88 insertions(+)
 create mode 100644 crawlerCore.py
 create mode 100644 main.py

diff --git a/crawlerCore.py b/crawlerCore.py
new file mode 100644
index 0000000..a611b49
--- /dev/null
+++ b/crawlerCore.py
@@ -0,0 +1,85 @@
+import re
+
+import requests
+import json
+from bs4 import BeautifulSoup
+
+# Shared request headers: a browser User-Agent plus a referer, so that
+# bilibili serves the same pages a browser would see.
+HEAD = {
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
+    "referer": "https://www.bilibili.com/",
+}
+
+
+# Fetch search result pages for key_word and collect up to `limit` BV ids.
+def get_search_result_bv(key_word, limit):
+    bv_list = []
+    count = 0
+    page_no = 1
+    while count < limit:
+        search_url = (
+            "https://search.bilibili.com/all?keyword="
+            + key_word
+            + "&pages="
+            + str(page_no)
+        )
+        response_data = requests.get(url=search_url, headers=HEAD)
+        soup = BeautifulSoup(response_data.text, "lxml")
+        results = soup.find_all("div", class_="bili-video-card__info--right")
+        try:
+            for result in results:
+                curr_href = result.a["href"].strip("/")
+                # Filter out live streams (live.bilibili.com) from the search
+                # results: keep only hrefs of the form <host>/video/<BV id>.
+                info = curr_href.split("/")
+                if info[1] == "video":
+                    bv_list.append(info[2])
+                    count += 1
+                    if count >= limit:
+                        break
+        except IndexError:
+            print("IndexError at [" + __name__ + "]! Failed to parse the search results?")
+            print("http code: {code}".format(code=response_data.status_code))
+            break
+        page_no += 1
+    return bv_list
+
+
+# Resolve each BV id to the cid of its first page via the pagelist API.
+def get_cid(bv_list):
+    cid_list = []
+    for bv in bv_list:
+        url = "https://api.bilibili.com/x/player/pagelist?bvid=" + bv
+        response_data = requests.get(url=url, headers=HEAD)
+        data = json.loads(response_data.text)
+        cid_list.append(data["data"][0]["cid"])
+    return cid_list
+
+
+# Download the danmaku XML for each cid and return all comment texts.
+def get_comments(cid_list):
+    comments_list = []
+    for cid in cid_list:
+        curr_url = "https://comment.bilibili.com/" + str(cid) + ".xml"
+        response_data = requests.get(url=curr_url, headers=HEAD)
+        response_data.encoding = "utf-8"
+        soup = BeautifulSoup(response_data.text, "xml")
+        comments = soup.find_all("d")
+        for comment in comments:
+            comments_list.append(comment.text)
+    return comments_list
+
+
+# Keep the comments matching at least one rule, without duplicates.
+def comments_filter(rules, comments_list):
+    patterns = []
+    res = []
+    for rule in rules:
+        patterns.append(re.compile(rule, re.IGNORECASE))
+    for comment in comments_list:
+        for pattern in patterns:
+            if pattern.match(comment):
+                if comment not in res:
+                    res.append(comment)
+    return res
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..bfe6fc4
--- /dev/null
+++ b/main.py
@@ -0,0 +1,3 @@
+from crawlerCore import *
+rules = [r'[^a-z]*AI[^a-z]*']
+print(comments_filter(rules, get_comments(get_cid(get_search_result_bv("2024巴黎奥运会", 300)))))
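
A quick offline sanity check for the filtering step (not part of the patch; it assumes crawlerCore.py from above is on the import path, and the sample comments are made up):

    # Exercise comments_filter without any network access.
    from crawlerCore import comments_filter

    sample = ["AI太强了", "这就是AI吗", "加油中国队", "ai绘画", "AI太强了"]
    rules = [r'[^a-z]*AI[^a-z]*']

    # Pattern.match anchors at the start of the string, so a comment matches
    # only if it begins with non-letters (possibly none) followed by "AI",
    # case-insensitively.
    print(comments_filter(rules, sample))
    # -> ['AI太强了', '这就是AI吗', 'ai绘画']   (the duplicate is dropped)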
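
One follow-up worth considering (a sketch, not part of this patch): none of the requests.get calls sets a timeout, so a stalled connection hangs the crawler, and HTTP errors are only noticed once parsing fails. Something like

    response_data = requests.get(url=search_url, headers=HEAD, timeout=10)
    response_data.raise_for_status()  # fail fast on 4xx/5xx instead of parsing an error page

in each fetch would make failures explicit.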