import re
import requests
import json
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba

# Shared request headers; Bilibili rejects requests without a browser UA and referer
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
    "referer": "https://www.bilibili.com/",
}


# Fetch search result pages and collect up to `limit` video BV ids
def get_search_result_bv(key_word, limit):
    bv_list = []
    count = 0
    page_no = 1
    while count < limit:
        search_url = (
            "https://search.bilibili.com/all?keyword="
            + key_word
            + "&pages="
            + str(page_no)
        )
        response_data = requests.get(url=search_url, headers=HEADERS)
        soup = BeautifulSoup(response_data.text, "lxml")
        results = soup("div", "bili-video-card__info--right")
        if not results:
            # No more result cards on this page: stop instead of looping forever
            break
        try:
            for result in results:
                curr_href = result.a["href"].strip("/")
                # Filter live rooms (live.bilibili) out of the search results
                info = curr_href.split("/")
                if info[1] == "video":
                    bv_list.append(info[2])
                    count += 1
                    if count >= limit:
                        break
        except IndexError:
            print("Index out of range at [" + __name__ + "]! Failed to parse search results?")
            print("http code: {code}".format(code=response_data.status_code))
            break
        page_no += 1
    return bv_list


# Resolve each BV id to the cid of its first page via the pagelist API
def get_cid(bv_list):
    cid_list = []
    for bv in bv_list:
        url = "https://api.bilibili.com/x/player/pagelist?bvid=" + bv
        response_data = requests.get(url=url, headers=HEADERS)
        data = json.loads(response_data.text)
        cid_list.append(data["data"][0]["cid"])
    return cid_list


# Download the danmaku XML for each cid and collect the comment texts (<d> tags)
def get_comments(cid_list):
    comments_list = []
    for cid in cid_list:
        curr_url = "https://comment.bilibili.com/" + str(cid) + ".xml"
        response_data = requests.get(url=curr_url, headers=HEADERS)
        response_data.encoding = "utf-8"
        soup = BeautifulSoup(response_data.text, "xml")
        comments = soup.find_all("d")
        for comment in comments:
            comments_list.append(comment.text)
    return comments_list


# Keep only comments matching at least one regex rule, without duplicates
def comments_filter(rules, comments_list):
    patterns = []
    res = []
    for rule in rules:
        patterns.append(re.compile(rule, re.IGNORECASE))
    for comment in comments_list:
        for pattern in patterns:
            if pattern.match(comment):
                if comment not in res:
                    res.append(comment)
    return res


def gen_wordcloud(filtered_text):
    source = []
    for word in filtered_text:
        source += jieba.lcut(word)
    source = ' '.join(source)
    wordcloud = WordCloud(font_path="./font.ttf", background_color='white', scale=1.5).generate(source)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
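

# A minimal usage sketch (not part of the original script): chains the
# functions above into search -> cid -> danmaku -> filter -> word cloud.
# The keyword, result limit, and filter rules below are placeholder
# assumptions; adjust them for a real run.
if __name__ == "__main__":
    bvs = get_search_result_bv("python", 5)  # assumed example keyword and limit
    cids = get_cid(bvs)
    danmaku = get_comments(cids)
    # ".*" keeps every comment; swap in stricter regexes as needed
    filtered = comments_filter([".*"], danmaku)
    gen_wordcloud(filtered)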