import json
import re

import jieba
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud

# Shared request headers: a desktop UA plus a bilibili referer, so the
# endpoints below treat the requests like an ordinary browser session.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
    "referer": "https://www.bilibili.com/",
}

# Fetch search results: walk the Bilibili search pages for a keyword and
# collect BV ids until `limit` videos have been gathered.
def get_search_result_bv(key_word, limit):
    bv_list = []
    count = 0
    page_no = 1
    while count < limit:
        search_url = (
            "https://search.bilibili.com/all?keyword="
            + key_word
            + "&pages="
            + str(page_no)
        )
        response_data = requests.get(url=search_url, headers=HEADERS)
        soup = BeautifulSoup(response_data.text, "lxml")
        results = soup.find_all("div", class_="bili-video-card__info--right")
        # Guard against an endless loop when a page yields no result cards
        # (end of pagination, or the page layout changed).
        if not results:
            print("No results on page {no}; stopping.".format(no=page_no))
            break
        try:
            for result in results:
                curr_href = result.a["href"].strip("/")
                # Filter live streams (live.bilibili) out of the search
                # results: only /video/BV... links carry a BV id.
                info = curr_href.split("/")
                if info[1] == "video":
                    bv_list.append(info[2])
                    count += 1
                    if count >= limit:
                        break
        except IndexError:
            print("Out of index at [" + __name__ + "]! Failed to fetch the search results?")
            print("http code: {code}".format(code=response_data.status_code))
            break
        page_no += 1
    return bv_list
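
# Example (illustrative) of what the function above returns: a list of
# BV ids, one per video hit, e.g.
#   get_search_result_bv("python", 20) -> ["BV1xx4...", "BV1yy4...", ...]
# (the ids shown are placeholders, not real videos).
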
def get_cid(bv_list):
    # Resolve each BV id to a cid via the pagelist API. Multi-part videos
    # return one entry per part; only the first part's cid is kept.
    cid_list = []
    for bv in bv_list:
        url = "https://api.bilibili.com/x/player/pagelist?bvid=" + bv
        response_data = requests.get(url=url, headers=HEADERS)
        data = json.loads(response_data.text)
        cid_list.append(data["data"][0]["cid"])
    return cid_list
def get_comments(cid_list):
    # Download the danmaku XML for each cid; every <d> element holds one
    # on-screen comment.
    comments_list = []
    for cid in cid_list:
        curr_url = "https://comment.bilibili.com/" + str(cid) + ".xml"
        response_data = requests.get(url=curr_url, headers=HEADERS)
        response_data.encoding = "utf-8"
        soup = BeautifulSoup(response_data.text, "xml")
        comments = soup.find_all("d")
        for comment in comments:
            comments_list.append(comment.text)
    return comments_list
def comments_filter(rules, comments_list):
    # Keep the comments that match at least one regex rule, without
    # duplicates. re.match anchors at the start of the string, so a rule
    # usually needs a leading ".*" to hit text in the middle of a comment.
    patterns = []
    res = []
    for rule in rules:
        patterns.append(re.compile(rule, re.IGNORECASE))
    for comment in comments_list:
        for pattern in patterns:
            if pattern.match(comment):
                if comment not in res:
                    res.append(comment)
    return res
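
# Example (illustrative) rules for the filter above: re.IGNORECASE is
# already applied, and matching is anchored at the start, so wrap the
# pattern in ".*" to match anywhere:
#   comments_filter([r".*233.*", r".*(好|赞).*"], comments_list)
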
def gen_wordcloud(filtered_text):
    # Segment each comment with jieba, join the tokens with spaces (the
    # format WordCloud expects), and render the cloud. font_path must point
    # to a font that has CJK glyphs, or Chinese words render as boxes.
    source = []
    for word in filtered_text:
        source += jieba.lcut(word)
    source = " ".join(source)
    wordcloud = WordCloud(font_path="./font.ttf", background_color="white", scale=1.5).generate(source)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
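

# A minimal end-to-end sketch wiring the functions above together. The
# keyword, result limit, and filter rule are illustrative placeholders,
# and ./font.ttf must exist as gen_wordcloud assumes.
if __name__ == "__main__":
    bv_list = get_search_result_bv("python", 20)
    cid_list = get_cid(bv_list)
    all_comments = get_comments(cid_list)
    filtered = comments_filter([r".*"], all_comments)
    gen_wordcloud(filtered)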