import json
import re

import jieba
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud


# Fetch search results: collect up to `limit` BV ids for a keyword.
def get_search_result_bv(key_word, limit):
    bv_list = []
    count = 0
    page_no = 1
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
        "referer": "https://www.bilibili.com/",
    }
    while count < limit:
        search_url = (
            "https://search.bilibili.com/all?keyword="
            + key_word
            + "&pages="
            + str(page_no)
        )
        response_data = requests.get(url=search_url, headers=head)
        soup = BeautifulSoup(response_data.text, "lxml")
        results = soup.find_all("div", class_="bili-video-card__info--right")
        if not results:
            # No results on this page: stop instead of looping forever.
            break
        try:
            for result in results:
                curr_href = result.a["href"].strip("/")
                # Skip non-video results (e.g. live.bilibili.com livestreams).
                info = curr_href.split("/")
                if info[1] == "video":
                    bv_list.append(info[2])
                    count += 1
                if count >= limit:
                    break
        except IndexError:
            print("IndexError in [" + __name__ + "]: failed to fetch search results?")
            print("http code: {code}".format(code=response_data.status_code))
            break
        page_no += 1
    return bv_list


# Resolve each BV id to the cid of its first page via the pagelist API.
def get_cid(bv_list):
    cid_list = []
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
        "referer": "https://www.bilibili.com/",
    }
    for bv in bv_list:
        url = "https://api.bilibili.com/x/player/pagelist?bvid=" + bv
        response_data = requests.get(url=url, headers=head)
        data = json.loads(response_data.text)
        cid_list.append(data["data"][0]["cid"])
    return cid_list


# Download the danmaku (bullet-comment) XML for each cid and extract the text
# of every <d> element.
def get_comments(cid_list):
    comments_list = []
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
        "referer": "https://www.bilibili.com/",
    }
    for cid in cid_list:
        curr_url = "https://comment.bilibili.com/" + str(cid) + ".xml"
        response_data = requests.get(url=curr_url, headers=head)
        response_data.encoding = "utf-8"
        soup = BeautifulSoup(response_data.text, "xml")
        for comment in soup.find_all("d"):
            comments_list.append(comment.text)
    return comments_list


# Keep only comments matching at least one regex rule (case-insensitive),
# dropping duplicates.
def comments_filter(rules, comments_list):
    patterns = [re.compile(rule, re.IGNORECASE) for rule in rules]
    res = []
    for comment in comments_list:
        for pattern in patterns:
            if pattern.match(comment) and comment not in res:
                res.append(comment)
    return res


# Tokenize the filtered comments with jieba and render a word cloud.
def gen_wordcloud(filtered_text):
    source = []
    for word in filtered_text:
        source += jieba.lcut(word)
    source = " ".join(source)
    wordcloud = WordCloud(
        font_path="./font.ttf", background_color="white", scale=1.5
    ).generate(source)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
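

# Usage sketch (not part of the original script): a minimal, hedged example of
# how the pipeline chains together. The keyword, limit, and regex rules below
# are hypothetical placeholders; gen_wordcloud also expects a CJK-capable font
# at ./font.ttf.
if __name__ == "__main__":
    bv_list = get_search_result_bv("python", limit=5)  # hypothetical keyword
    cid_list = get_cid(bv_list)       # BV id -> first-page cid
    comments = get_comments(cid_list) # danmaku text for each video
    # Hypothetical rules: keep danmaku mentioning "教程" or "tutorial" anywhere
    # (the leading ".*" matters because re.match anchors at the string start).
    filtered = comments_filter([r".*教程", r".*tutorial"], comments)
    gen_wordcloud(filtered)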