提交完整的代码

master
cflsxjw 2 months ago
commit e1cb4683bd

@ -0,0 +1,84 @@
import re
from os import remove
import requests
import json
from bs4 import BeautifulSoup
# Fetch search results
def get_search_result_bv(key_word, limit):
    """Collect up to ``limit`` video ids from Bilibili search result pages.

    Result pages are requested one after another (page 1, 2, ...) and every
    ``div.bili-video-card__info--right`` card is inspected. For links of the
    form ``//host/video/<id>/`` the ``<id>`` path segment is collected.

    Args:
        key_word: Search keyword; it is URL-encoded by ``requests``.
        limit: Maximum number of ids to return. ``0`` or a negative value
            returns an empty list without any network access.

    Returns:
        A list of distinct video ids in the order they were encountered.
        It may be shorter than ``limit`` when the search runs out of results
        or a request fails.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
        "referer": "https://www.bilibili.com/",
    }
    bv_list = []
    seen = set()
    page_no = 1
    while len(bv_list) < limit:
        try:
            response_data = requests.get(
                url="https://search.bilibili.com/all",
                # The pagination parameter is "page"; the keyword is passed
                # through params so characters like "&" or "#" are escaped.
                params={"keyword": key_word, "page": page_no},
                headers=head,
                timeout=10,
            )
        except requests.RequestException as err:
            print("Search request failed at [{name}]: {err}".format(name=__name__, err=err))
            break

        soup = BeautifulSoup(response_data.text, "lxml")
        added = 0
        for result in soup("div", "bili-video-card__info--right"):
            link = result.a
            href = link.get("href") if link is not None else None
            if not href:
                continue
            info = href.strip("/").split("/")
            # Filter out non-video results such as live rooms (live.bilibili)
            # and ids that an earlier card or page already produced.
            if len(info) < 3 or info[1] != "video" or info[2] in seen:
                continue
            seen.add(info[2])
            bv_list.append(info[2])
            added += 1
            if len(bv_list) >= limit:
                break

        if added == 0:
            # A page without any new video means the results are exhausted or
            # the request was rejected; stop instead of looping forever.
            print(
                "No new search results at [{name}] (page {page})! 搜索结果获取失败?".format(
                    name=__name__, page=page_no
                )
            )
            print("http code: {code}".format(code=response_data.status_code))
            break
        page_no += 1
    return bv_list
def get_cid(bv_list):
    """Look up the cid of the first page of each video.

    For every id in ``bv_list`` the ``x/player/pagelist`` API is queried and
    ``data[0]["cid"]`` of the JSON reply is collected.

    Args:
        bv_list: Iterable of video ids (as returned by the search step).

    Returns:
        A list of cids. Videos whose lookup fails (network error, non-JSON
        reply, or a reply without a usable ``data`` list) are reported with
        ``print`` and skipped, so the result may be shorter than ``bv_list``.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
        "referer": "https://www.bilibili.com/",
    }
    cid_list = []
    for bv in bv_list:
        try:
            response_data = requests.get(
                url="https://api.bilibili.com/x/player/pagelist",
                params={"bvid": bv},
                headers=head,
                timeout=10,
            )
        except requests.RequestException as err:
            print("cid request failed for {bv}: {err}".format(bv=bv, err=err))
            continue

        try:
            data = json.loads(response_data.text)
            cid = data["data"][0]["cid"]
        except (ValueError, KeyError, IndexError, TypeError):
            # Error replies carry no usable "data" list; one bad video must
            # not abort the whole batch.
            print(
                "cid lookup failed for {bv}, http code: {code}".format(
                    bv=bv, code=response_data.status_code
                )
            )
            continue
        cid_list.append(cid)
    return cid_list
def get_comments(cid_list):
    """Download the comment XML of each cid and return all comment texts.

    For every cid, ``https://comment.bilibili.com/<cid>.xml`` is fetched,
    decoded as UTF-8 and parsed as XML; the text of every ``<d>`` element is
    collected.

    Args:
        cid_list: Iterable of cids.

    Returns:
        A flat list of comment strings, in cid order and then document order.
        A cid whose request fails is reported with ``print`` and skipped.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
        "referer": "https://www.bilibili.com/",
    }
    comments_list = []
    for cid in cid_list:
        curr_url = "https://comment.bilibili.com/" + str(cid) + ".xml"
        try:
            response_data = requests.get(url=curr_url, headers=head, timeout=10)
        except requests.RequestException as err:
            print("comment request failed for {cid}: {err}".format(cid=cid, err=err))
            continue
        # Force UTF-8 instead of relying on the encoding requests guesses
        # from the response headers.
        response_data.encoding = "utf-8"
        soup = BeautifulSoup(response_data.text, "xml")
        comments_list.extend(comment.text for comment in soup.find_all("d"))
    return comments_list
def comments_filter(rules, comments_list):
    """Return the distinct comments that match at least one rule.

    Each rule is compiled case-insensitively and applied with ``match``, so a
    rule has to match starting at the first character of the comment (it is
    not searched for anywhere inside the text).

    Args:
        rules: Iterable of regular-expression strings.
        comments_list: Iterable of comment strings.

    Returns:
        A list of matching comments without duplicates, in order of first
        appearance in ``comments_list``.
    """
    patterns = [re.compile(rule, re.IGNORECASE) for rule in rules]
    res = []
    # Set-based membership keeps de-duplication O(1) per comment instead of
    # rescanning the result list each time.
    seen = set()
    for comment in comments_list:
        if comment in seen:
            continue
        if any(pattern.match(comment) for pattern in patterns):
            seen.add(comment)
            res.append(comment)
    return res

@ -0,0 +1,3 @@
from crawlerCore import *
# Regular-expression rules handed to comments_filter.
rules = [r"[^a-z]*AI[^a-z]*"]

# Pipeline: search keyword -> video ids -> cids -> comments -> filtered comments.
bv_ids = get_search_result_bv("2024巴黎奥运会", 300)
cids = get_cid(bv_ids)
all_comments = get_comments(cids)
print(comments_filter(rules, all_comments))
Loading…
Cancel
Save