"""Scrape Bilibili danmaku (bullet comments) for a fixed search keyword.

Pipeline, as driven by the ``__main__`` section at the bottom:

1. ``findbv`` collects BV ids from the search result pages.
2. ``getcid`` resolves each BV id to the cid(s) of its video parts.
3. ``get_information`` downloads the danmaku text for each cid.
4. The danmaku are appended to a text file, rendered as a word cloud and
   exported to an Excel sheet.
"""
import datetime
import io
import json
import os
import random
import re
import sys
import time

import jieba
import numpy
import openpyxl
import pandas
import pylint
import requests
import wordcloud
from bs4 import BeautifulSoup

information_list = []  # scraped danmaku text
# Every video has its own BV id; once the BV ids are known, each video can be
# processed individually.
bvnum_list = []  # BV ids; danmaku are exported by looping over these
cid_list = []  # cid numbers

# Search result pages for the keyword "2024巴黎奥运会" (URL-encoded in the query).
SEARCH_URL_TEMPLATE = 'https://search.bilibili.com/video?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3&page={page}'
PAGELIST_URL_TEMPLATE = 'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
DANMAKU_URL_TEMPLATE = 'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'

MAX_VIDEOS = 300  # default cap on how many ids each stage processes
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection hangs forever
REQUEST_DELAY = 0.5  # seconds to pause between danmaku requests

# NOTE(security): this is a logged-in session cookie (SESSDATA / bili_jct) that
# was pasted into the source three times. It is kept as the default so existing
# runs behave the same, but it should not live in version control -- set the
# BILIBILI_COOKIE environment variable to override it.
_DEFAULT_COOKIE = "i-wanna-go-back=-1; rpdid=|(RYkm|Ykul0J'uY~YmYlYl~; DedeUserID=474817892; DedeUserID__ckMd5=2ad2b4cb2126ef63; buvid4=9AE18FFB-B9C4-8389-DF6A-72C0E6BCC34D43412-023022115-vdGT0WAv%2FuJ7%2FRzFm7Eg7A%3D%3D; buvid_fp_plain=undefined; LIVE_BUVID=AUTO4716769855903121; CURRENT_BLACKGAP=0; enable_web_push=DISABLE; buvid3=913910D1-D049-0F77-02E0-21B1093ACF0601725infoc; b_nut=1708668801; b_ut=5; _uuid=65739108E-2B75-9642-2E59-826F71099D7E203422infoc; header_theme_version=CLOSE; hit-dyn-v2=1; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; SESSDATA=709ab107%2C1729957738%2C3372c%2A42CjB0lzRiXKvwOi-Op4_kAeXHcFSeDW2GtWNhiW33Y4bLFvKzOUP5fUvm7CyuyD9Y1OISVkVDUkozNXlxUGIwLVZiNTNrTE9RMHhyeW1ycEJBZWZoa2VJYjd0WnN6UmNKTFVFWDZYWUI4NmpuYXNIT0k0T1ozNjFQVzBRbEdCaDliWGtFRjdFTkd3IIEC; bili_jct=717efb0472a4b30b96a2e6a470bdec42; sid=6a6r71lf; is-2022-channel=1; CURRENT_FNVAL=4048; fingerprint=254d2c4cf86e9c4fe76c3c00ffd0de23; CURRENT_QUALITY=80; buvid_fp=e440b4a9c5e0b9af9ad8950245b201f7; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY1Nzc5NjIsImlhdCI6MTcyNjMxODcwMiwicGx0IjotMX0.ruBBQK0YAJ7TyuLBgdfVijbpzBhfwGTglNFMMAmhv-c; bili_ticket_expires=1726577902; PVID=6; home_feed_column=5; browser_resolution=1707-837; b_lsid=610579158_191F9ED6DE3; bp_t_offset_474817892=977706466930589696"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
    "Cookie": os.environ.get("BILIBILI_COOKIE", _DEFAULT_COOKIE),
}

BVID_PATTERN = re.compile(r'bvid:"(.*?)"')
CID_PATTERN = re.compile(r'"cid":(.*?),')
# The danmaku endpoint is documented as returning XML with one
# <d p="attributes">text</d> element per comment. The original pattern was the
# bare '(.*?)', which only ever matches empty strings; the tags around the
# capture group appear to have been stripped, so they are restored here.
DANMAKU_PATTERN = re.compile(r'<d p=".*?">(.*?)</d>')

# Raw string: same path as before, without the invalid "\p" escape sequence.
DANMAKU_TXT_PATH = r"E:\python1\requirement1.txt"


def findbv():
    """Collect BV ids from search result pages 1 through 9.

    Returns:
        A list of BV id strings in page order. Duplicates are not removed.
        A page whose request fails is reported on stdout and skipped.
    """
    bvids = []
    for page in range(1, 10):
        url = SEARCH_URL_TEMPLATE.format(page=page)
        try:
            response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"An error occurred: {e}")
            continue
        response.encoding = "utf-8"
        bvids.extend(BVID_PATTERN.findall(response.text))
    return bvids


# Next: fetch the cids.
def getcid(list, limit=MAX_VIDEOS):
    """Resolve BV ids to cid strings via the player pagelist API.

    Args:
        list: BV id strings (parameter name kept for backward compatibility,
            although it shadows the builtin).
        limit: Maximum number of BV ids to look up. Fewer ids than ``limit``
            is fine; the original indexed a fixed ``range(300)`` and raised
            IndexError in that case.

    Returns:
        A list of cid strings exactly as captured from the response text.
        One BV id can contribute several cids.
    """
    cids = []
    for bvid in list[:limit]:
        url = PAGELIST_URL_TEMPLATE.format(bvid=bvid)
        # The try is per request so one failure no longer drops every
        # remaining video.
        try:
            response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"An error occurred: {e}")
            continue
        response.encoding = 'utf-8'
        if response.status_code == 200:
            cids.extend(CID_PATTERN.findall(response.text))
    return cids


def get_information(list, limit=MAX_VIDEOS):
    """Download the danmaku text for each cid.

    Args:
        list: cid strings (parameter name kept for backward compatibility).
        limit: Maximum number of cids to query; shorter input is accepted.

    Returns:
        A flat list with the text of every danmaku found, in request order.

    Side effects:
        Re-wraps ``sys.stdout`` as UTF-8 (as the original did) and sleeps
        ``REQUEST_DELAY`` seconds after every request.
    """
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
    danmaku = []
    for cid in list[:limit]:
        url = DANMAKU_URL_TEMPLATE.format(cid=cid)
        try:
            response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            response.encoding = 'utf-8'
            danmaku.extend(DANMAKU_PATTERN.findall(response.text))
        except requests.RequestException as e:
            print(f"An error occurred: {e}")
        # NOTE(review): the mangled original does not show whether this pause
        # was per request or only after an error; per request is assumed.
        time.sleep(REQUEST_DELAY)
    return danmaku


def _save_danmaku(lines, path=DANMAKU_TXT_PATH):
    """Append one danmaku per line to ``path``, opening the file only once."""
    # Append mode is kept from the original, so repeated runs accumulate.
    with open(path, "a", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in lines)


def _build_wordcloud(path=DANMAKU_TXT_PATH, output="wordcloud.png"):
    """Segment the saved danmaku with jieba and render a word cloud image."""
    with open(path, encoding="utf-8") as f:
        txt = f.read()
    string = ' '.join(jieba.lcut(txt))
    if not string.strip():
        # WordCloud.generate raises ValueError when there are no words.
        print("No danmaku text available; skipping word cloud.")
        return
    wc = wordcloud.WordCloud(
        width=500,
        height=500,
        background_color='white',
        scale=10,
        font_path='msyh.ttc',
    )
    wc.generate(string)
    wc.to_file(output)


if __name__ == "__main__":
    bvnum_list = findbv()
    cid_list = getcid(bvnum_list)
    information_list = get_information(cid_list)

    _save_danmaku(information_list)
    _build_wordcloud()

    df = pandas.DataFrame(information_list)
    df.to_excel('spider1.xlsx', index=False)