From e1cb4683bd727d51839454498d4275e8b1d769b5 Mon Sep 17 00:00:00 2001
From: cflsxjw
Date: Wed, 18 Sep 2024 11:12:32 +0800
Subject: [PATCH] Submit the complete code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crawlerCore.py | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++
 main.py        |  3 ++
 2 files changed, 88 insertions(+)
 create mode 100644 crawlerCore.py
 create mode 100644 main.py

diff --git a/crawlerCore.py b/crawlerCore.py
new file mode 100644
index 0000000..a611b49
--- /dev/null
+++ b/crawlerCore.py
@@ -0,0 +1,85 @@
+import re
+
+import requests
+import json
+from bs4 import BeautifulSoup
+
+# Shared request headers: a browser User-Agent plus a referer, so that
+# bilibili serves the same pages a browser would see.
+HEAD = {
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
+    "referer": "https://www.bilibili.com/",
+}
+
+
+# Fetch search result pages for key_word and collect up to `limit` BV ids.
+def get_search_result_bv(key_word, limit):
+    bv_list = []
+    count = 0
+    page_no = 1
+    while count < limit:
+        search_url = (
+            "https://search.bilibili.com/all?keyword="
+            + key_word
+            + "&pages="
+            + str(page_no)
+        )
+        response_data = requests.get(url=search_url, headers=HEAD)
+        soup = BeautifulSoup(response_data.text, "lxml")
+        results = soup.find_all("div", class_="bili-video-card__info--right")
+        try:
+            for result in results:
+                curr_href = result.a["href"].strip("/")
+                # Filter out live streams (live.bilibili.com) from the search
+                # results: keep only hrefs of the form <host>/video/<BV id>.
+                info = curr_href.split("/")
+                if info[1] == "video":
+                    bv_list.append(info[2])
+                    count += 1
+                    if count >= limit:
+                        break
+        except IndexError:
+            print("IndexError at [" + __name__ + "]! Failed to parse the search results?")
+            print("http code: {code}".format(code=response_data.status_code))
+            break
+        page_no += 1
+    return bv_list
+
+
+# Resolve each BV id to the cid of its first page via the pagelist API.
+def get_cid(bv_list):
+    cid_list = []
+    for bv in bv_list:
+        url = "https://api.bilibili.com/x/player/pagelist?bvid=" + bv
+        response_data = requests.get(url=url, headers=HEAD)
+        data = json.loads(response_data.text)
+        cid_list.append(data["data"][0]["cid"])
+    return cid_list
+
+
+# Download the danmaku XML for each cid and return all comment texts.
+def get_comments(cid_list):
+    comments_list = []
+    for cid in cid_list:
+        curr_url = "https://comment.bilibili.com/" + str(cid) + ".xml"
+        response_data = requests.get(url=curr_url, headers=HEAD)
+        response_data.encoding = "utf-8"
+        soup = BeautifulSoup(response_data.text, "xml")
+        comments = soup.find_all("d")
+        for comment in comments:
+            comments_list.append(comment.text)
+    return comments_list
+
+
+# Keep the comments matching at least one rule, without duplicates.
+def comments_filter(rules, comments_list):
+    patterns = []
+    res = []
+    for rule in rules:
+        patterns.append(re.compile(rule, re.IGNORECASE))
+    for comment in comments_list:
+        for pattern in patterns:
+            if pattern.match(comment):
+                if comment not in res:
+                    res.append(comment)
+    return res
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..bfe6fc4
--- /dev/null
+++ b/main.py
@@ -0,0 +1,3 @@
+from crawlerCore import *
+rules = [r'[^a-z]*AI[^a-z]*']
+print(comments_filter(rules, get_comments(get_cid(get_search_result_bv("2024巴黎奥运会", 300)))))
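
A quick offline sanity check for the filtering step (not part of the patch; it assumes crawlerCore.py from above is on the import path, and the sample comments are made up):

    # Exercise comments_filter without any network access.
    from crawlerCore import comments_filter

    sample = ["AI太强了", "这就是AI吗", "加油中国队", "ai绘画", "AI太强了"]
    rules = [r'[^a-z]*AI[^a-z]*']

    # Pattern.match anchors at the start of the string, so a comment matches
    # only if it begins with non-letters (possibly none) followed by "AI",
    # case-insensitively.
    print(comments_filter(rules, sample))
    # -> ['AI太强了', '这就是AI吗', 'ai绘画']   (the duplicate is dropped)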
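
One follow-up worth considering (a sketch, not part of this patch): none of the requests.get calls sets a timeout, so a stalled connection hangs the crawler, and HTTP errors are only noticed once parsing fails. Something like

    response_data = requests.get(url=search_url, headers=HEAD, timeout=10)
    response_data.raise_for_status()  # fail fast on 4xx/5xx instead of parsing an error page

in each fetch would make failures explicit.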