import json
import re

import jieba
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud

# Shared request headers: a desktop UA plus a bilibili referer, so the
# endpoints below treat the requests like an ordinary browser session.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
    "referer": "https://www.bilibili.com/",
}

# Fetch search results: walk the Bilibili search pages for a keyword and
# collect BV ids until `limit` videos have been gathered.
def get_search_result_bv(key_word, limit):
    bv_list = []
    count = 0
    page_no = 1
    while count < limit:
        search_url = (
            "https://search.bilibili.com/all?keyword="
            + key_word
            + "&pages="
            + str(page_no)
        )
        response_data = requests.get(url=search_url, headers=HEADERS)
        soup = BeautifulSoup(response_data.text, "lxml")
        results = soup.find_all("div", class_="bili-video-card__info--right")
        # Guard against an endless loop when a page yields no result cards
        # (end of pagination, or the page layout changed).
        if not results:
            print("No results on page {no}; stopping.".format(no=page_no))
            break
        try:
            for result in results:
                curr_href = result.a["href"].strip("/")
                # Filter live streams (live.bilibili) out of the search
                # results: only /video/BV... links carry a BV id.
                info = curr_href.split("/")
                if info[1] == "video":
                    bv_list.append(info[2])
                    count += 1
                    if count >= limit:
                        break
        except IndexError:
            print("Out of index at [" + __name__ + "]! Failed to fetch the search results?")
            print("http code: {code}".format(code=response_data.status_code))
            break
        page_no += 1
    return bv_list
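
# Example (illustrative) of what the function above returns: a list of
# BV ids, one per video hit, e.g.
#   get_search_result_bv("python", 20) -> ["BV1xx4...", "BV1yy4...", ...]
# (the ids shown are placeholders, not real videos).
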
def get_cid(bv_list):
    # Resolve each BV id to a cid via the pagelist API. Multi-part videos
    # return one entry per part; only the first part's cid is kept.
    cid_list = []
    for bv in bv_list:
        url = "https://api.bilibili.com/x/player/pagelist?bvid=" + bv
        response_data = requests.get(url=url, headers=HEADERS)
        data = json.loads(response_data.text)
        cid_list.append(data["data"][0]["cid"])
    return cid_list
def get_comments(cid_list):
    # Download the danmaku XML for each cid; every <d> element holds one
    # on-screen comment.
    comments_list = []
    for cid in cid_list:
        curr_url = "https://comment.bilibili.com/" + str(cid) + ".xml"
        response_data = requests.get(url=curr_url, headers=HEADERS)
        response_data.encoding = "utf-8"
        soup = BeautifulSoup(response_data.text, "xml")
        comments = soup.find_all("d")
        for comment in comments:
            comments_list.append(comment.text)
    return comments_list
def comments_filter(rules, comments_list):
    # Keep the comments that match at least one regex rule, without
    # duplicates. re.match anchors at the start of the string, so a rule
    # usually needs a leading ".*" to hit text in the middle of a comment.
    patterns = []
    res = []
    for rule in rules:
        patterns.append(re.compile(rule, re.IGNORECASE))
    for comment in comments_list:
        for pattern in patterns:
            if pattern.match(comment):
                if comment not in res:
                    res.append(comment)
    return res
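
# Example (illustrative) rules for the filter above: re.IGNORECASE is
# already applied, and matching is anchored at the start, so wrap the
# pattern in ".*" to match anywhere:
#   comments_filter([r".*233.*", r".*(好|赞).*"], comments_list)
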
def gen_wordcloud(filtered_text):
    # Segment each comment with jieba, join the tokens with spaces (the
    # format WordCloud expects), and render the cloud. font_path must point
    # to a font that has CJK glyphs, or Chinese words render as boxes.
    source = []
    for word in filtered_text:
        source += jieba.lcut(word)
    source = " ".join(source)
    wordcloud = WordCloud(font_path="./font.ttf", background_color="white", scale=1.5).generate(source)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
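

# A minimal end-to-end sketch wiring the functions above together. The
# keyword, result limit, and filter rule are illustrative placeholders,
# and ./font.ttf must exist as gen_wordcloud assumes.
if __name__ == "__main__":
    bv_list = get_search_result_bv("python", 20)
    cid_list = get_cid(bv_list)
    all_comments = get_comments(cid_list)
    filtered = comments_filter([r".*"], all_comments)
    gen_wordcloud(filtered)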