You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

102 lines
7.3 KiB

2 months ago
import datetime
import io
import json
import os
import random
import re
import sys
import time

import jieba
import numpy
import openpyxl
import pandas
import pylint
import requests
import wordcloud
from bs4 import BeautifulSoup
# Module-level result containers for the crawl.
# Every video has its own BV id, so the BV ids are collected first and the
# later steps then work on each video through its id.
bvnum_list, cid_list = list(), list()  # BV ids (looped over to export danmaku) / cid values
information_list = list()  # crawled danmaku (bullet comments)
def findbv(max_pages=9):
    """Collect BV ids from Bilibili's video search result pages.

    Requests result pages 1..max_pages for the hard-coded, percent-encoded
    search keyword (Chinese for "2024 Paris Olympics") and extracts every
    ``bvid:"..."`` value found in the returned page text.

    Args:
        max_pages: Number of result pages to request, starting at page 1.
            Defaults to 9, i.e. pages 1 through 9.

    Returns:
        A list of the matched BV id strings in page order. Duplicates are
        kept. Pages whose request fails are skipped.

    Environment:
        BILIBILI_COOKIE: optional value for the ``Cookie`` request header.
            When unset or empty, no cookie is sent.
    """
    request_timeout = 10  # seconds; requests waits indefinitely without one

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
    }
    # Session credentials are read from the environment so that no account
    # cookie has to live in the source file.
    cookie = os.environ.get("BILIBILI_COOKIE")
    if cookie:
        headers["Cookie"] = cookie

    bvid_pattern = re.compile(r'bvid:"(.*?)"')
    found = []
    for page in range(1, max_pages + 1):
        url = (
            'https://search.bilibili.com/video'
            '?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A'
            '&from_source=webtop_search&spm_id_from=333.1007&search_source=3'
            f'&page={page}'
        )
        try:
            result = requests.get(url=url, headers=headers, timeout=request_timeout)
        except requests.RequestException as e:
            # One unreachable page should not discard the pages already read.
            print(f"An error occurred: {e}")
            continue
        result.encoding = "utf-8"
        found.extend(bvid_pattern.findall(result.text))
    return found
# Next, fetch the cid values
def getcid(list, limit=300):
    """Look up the cid values for a sequence of BV ids.

    For each BV id, queries the ``x/player/pagelist`` endpoint and extracts
    every ``"cid":...,`` value from the response text of HTTP 200 replies.

    Args:
        list: Sliceable sequence of BV id strings. The name shadows the
            builtin but is kept so existing keyword callers keep working.
        limit: Maximum number of BV ids to process from the front of the
            sequence. Defaults to 300. Shorter inputs are processed in full
            instead of raising ``IndexError``.

    Returns:
        A list of cid values as strings (exactly as matched by the regex),
        in request order. BV ids whose request fails or returns a non-200
        status contribute nothing.

    Environment:
        BILIBILI_COOKIE: optional value for the ``Cookie`` request header.
            When unset or empty, no cookie is sent.
    """
    request_timeout = 10  # seconds; requests waits indefinitely without one

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
    }
    # Session credentials are read from the environment so that no account
    # cookie has to live in the source file.
    cookie = os.environ.get("BILIBILI_COOKIE")
    if cookie:
        headers["Cookie"] = cookie

    cid_pattern = re.compile(r'"cid":(.*?),')
    cids = []
    for bvid in list[:limit]:
        url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
        try:
            respone = requests.get(url=url, headers=headers, timeout=request_timeout)
        except requests.RequestException as e:
            # Handle the failure per request so one bad lookup does not
            # abandon all remaining BV ids.
            print(f"An error occurred: {e}")
            continue
        respone.encoding = 'utf-8'
        if respone.status_code == 200:
            cids.extend(cid_pattern.findall(respone.text))
    return cids
def get_information(list, limit=300):
    """Download the danmaku (bullet comments) for a sequence of cid values.

    For each cid, requests ``x/v1/dm/list.so`` and collects the text content
    of every ``<d p="...">...</d>`` element found in the response. Sleeps
    0.5 s after each request.

    Side effect: switches ``sys.stdout`` to UTF-8 so error messages
    containing non-ASCII text can be printed.

    Args:
        list: Sliceable sequence of cid values. The name shadows the builtin
            but is kept so existing keyword callers keep working.
        limit: Maximum number of cids to process from the front of the
            sequence. Defaults to 300. Shorter inputs are processed in full
            instead of raising ``IndexError``.

    Returns:
        A list of danmaku strings in request order. Text is returned exactly
        as matched; XML entities are not unescaped. Cids whose request fails
        contribute nothing.

    Environment:
        BILIBILI_COOKIE: optional value for the ``Cookie`` request header.
            When unset or empty, no cookie is sent.
    """
    request_timeout = 10  # seconds; requests waits indefinitely without one

    # Change the encoding of the existing stream in place. Wrapping
    # sys.stdout.buffer in a new TextIOWrapper on every call would stack
    # several wrappers on top of the same underlying buffer.
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(encoding="utf8")

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
    }
    # Session credentials are read from the environment so that no account
    # cookie has to live in the source file.
    cookie = os.environ.get("BILIBILI_COOKIE")
    if cookie:
        headers["Cookie"] = cookie

    danmaku_pattern = re.compile('<d p=".*?">(.*?)</d>')
    danmaku = []
    for cid in list[:limit]:
        url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
        try:
            response = requests.get(url=url, headers=headers, timeout=request_timeout)
        except requests.RequestException as e:
            print(f"An error occurred: {e}")
        else:
            response.encoding = 'utf-8'
            danmaku.extend(danmaku_pattern.findall(response.text))
        # Pause between requests, whether or not this one succeeded.
        time.sleep(0.5)
    return danmaku
# Crawl pipeline: BV ids -> cid values -> danmaku text.
bvnum_list = findbv()
cid_list = getcid(bvnum_list)
information_list = get_information(cid_list)

# Raw string so the backslashes are taken literally; a plain literal needs
# "\\r" escaped by hand (otherwise it is a carriage return) and relies on the
# invalid escape "\p" being left alone.
danmaku_path = r"E:\python1\requirement1.txt"

# Append every danmaku on its own line, opening the file once instead of
# once per line. Append mode keeps lines written by earlier runs.
with open(danmaku_path, "a", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in information_list)

# Read the whole file back (including anything appended by earlier runs)
# and close the handle as soon as the text is in memory.
with open(danmaku_path, encoding="utf-8") as f:
    txt = f.read()

# WordCloud is fed a single string, so the jieba.lcut tokens are joined
# with spaces.
string = ' '.join(jieba.lcut(txt))
wc = wordcloud.WordCloud(
    width=500,
    height=500,
    background_color='white',
    scale=10,
    font_path='msyh.ttc',  # font file used to render the words
)
wc.generate(string)
wc.to_file("wordcloud.png")

# Export only the danmaku collected in this run to a spreadsheet.
df = pandas.DataFrame(information_list)
df.to_excel('spider1.xlsx', index=False)