|
|
@ -1,2 +1,101 @@
|
|
|
|
# python
|
|
|
|
import re
|
|
|
|
|
|
|
|
import random
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
import datetime
|
|
|
|
|
|
|
|
import sys
|
|
|
|
|
|
|
|
import io
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
import openpyxl
|
|
|
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
import pandas
|
|
|
|
|
|
|
|
import wordcloud
|
|
|
|
|
|
|
|
import numpy
|
|
|
|
|
|
|
|
import jieba
|
|
|
|
|
|
|
|
import pylint
|
|
|
|
|
|
|
|
information_list = []  # holds the scraped danmaku (bullet comments)
|
|
|
|
|
|
|
|
# Work out how to scrape the BV ids: every video has its own BV id, and with each BV id we can operate on the corresponding video.
|
|
|
|
|
|
|
|
bvnum_list = []  # holds the BV ids; once we have them we loop over them to export the danmaku
|
|
|
|
|
|
|
|
cid_list = []  # holds the cid values
|
|
|
|
|
|
|
|
def findbv(page_count=9):
    """Collect the BV ids found on Bilibili video-search result pages.

    Requests result pages 1..``page_count`` of the video search whose
    keyword is percent-encoded in the URL below (it decodes to
    "2024 Paris Olympics" in Chinese) and extracts every ``bvid:"..."``
    value from the returned page text.

    Args:
        page_count: Number of result pages to request, starting at page 1.
            Defaults to 9, the range that used to be hard-coded. A value
            of 0 or less requests nothing.

    Returns:
        A list of BV id strings in the order they were found, page by
        page. Duplicates are kept. Pages that fail to download or answer
        with a non-200 status are skipped.
    """
    # NOTE(review): the Cookie below embeds a logged-in session (SESSDATA,
    # bili_jct). It is kept so behavior is unchanged, but it should be
    # loaded from the environment or a config file instead of source code.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
        "Cookie": "i-wanna-go-back=-1; rpdid=|(RYkm|Ykul0J'uY~YmYlYl~; DedeUserID=474817892; DedeUserID__ckMd5=2ad2b4cb2126ef63; buvid4=9AE18FFB-B9C4-8389-DF6A-72C0E6BCC34D43412-023022115-vdGT0WAv%2FuJ7%2FRzFm7Eg7A%3D%3D; buvid_fp_plain=undefined; LIVE_BUVID=AUTO4716769855903121; CURRENT_BLACKGAP=0; enable_web_push=DISABLE; buvid3=913910D1-D049-0F77-02E0-21B1093ACF0601725infoc; b_nut=1708668801; b_ut=5; _uuid=65739108E-2B75-9642-2E59-826F71099D7E203422infoc; header_theme_version=CLOSE; hit-dyn-v2=1; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; SESSDATA=709ab107%2C1729957738%2C3372c%2A42CjB0lzRiXKvwOi-Op4_kAeXHcFSeDW2GtWNhiW33Y4bLFvKzOUP5fUvm7CyuyD9Y1OISVkVDUkozNXlxUGIwLVZiNTNrTE9RMHhyeW1ycEJBZWZoa2VJYjd0WnN6UmNKTFVFWDZYWUI4NmpuYXNIT0k0T1ozNjFQVzBRbEdCaDliWGtFRjdFTkd3IIEC; bili_jct=717efb0472a4b30b96a2e6a470bdec42; sid=6a6r71lf; is-2022-channel=1; CURRENT_FNVAL=4048; fingerprint=254d2c4cf86e9c4fe76c3c00ffd0de23; CURRENT_QUALITY=80; buvid_fp=e440b4a9c5e0b9af9ad8950245b201f7; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY1Nzc5NjIsImlhdCI6MTcyNjMxODcwMiwicGx0IjotMX0.ruBBQK0YAJ7TyuLBgdfVijbpzBhfwGTglNFMMAmhv-c; bili_ticket_expires=1726577902; PVID=6; home_feed_column=5; browser_resolution=1707-837; b_lsid=610579158_191F9ED6DE3; bp_t_offset_474817892=977706466930589696",
    }
    # Compiled once: the same pattern is applied to every page.
    bvid_pattern = re.compile(r'bvid:"(.*?)"')

    found = []
    for page in range(1, page_count + 1):
        url = f'https://search.bilibili.com/video?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3&page={page}'
        try:
            # Without a timeout a stalled connection would block forever.
            result = requests.get(url=url, headers=headers, timeout=10)
        except requests.RequestException as e:
            # One bad page should not discard the ids already collected.
            print(f"An error occurred: {e}")
            continue
        if result.status_code != 200:
            continue
        result.encoding = "utf-8"
        found.extend(bvid_pattern.findall(result.text))
    return found
|
|
|
|
|
|
|
|
# Next, fetch the cid values
|
|
|
|
|
|
|
|
def getcid(list, limit=300):
    """Look up the cid values for a sequence of BV ids.

    For each BV id, queries the ``x/player/pagelist`` endpoint and extracts
    every ``"cid":<value>,`` occurrence from the response text.

    Args:
        list: Sliceable sequence of BV id strings. The name shadows the
            builtin and is kept only for backward compatibility.
        limit: Maximum number of leading BV ids to process. Defaults to
            300, the count that used to be hard-coded. ``None`` processes
            every id.

    Returns:
        A list of cid values as strings, in request order. A BV id can
        contribute more than one cid when the response lists several.
        Ids whose request fails or returns a non-200 status are skipped.
    """
    # NOTE(review): the Cookie below embeds a logged-in session (SESSDATA,
    # bili_jct). It is kept so behavior is unchanged, but it should be
    # loaded from the environment or a config file instead of source code.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
        'Cookie': "i-wanna-go-back=-1; rpdid=|(RYkm|Ykul0J'uY~YmYlYl~; DedeUserID=474817892; DedeUserID__ckMd5=2ad2b4cb2126ef63; buvid4=9AE18FFB-B9C4-8389-DF6A-72C0E6BCC34D43412-023022115-vdGT0WAv%2FuJ7%2FRzFm7Eg7A%3D%3D; buvid_fp_plain=undefined; LIVE_BUVID=AUTO4716769855903121; CURRENT_BLACKGAP=0; enable_web_push=DISABLE; buvid3=913910D1-D049-0F77-02E0-21B1093ACF0601725infoc; b_nut=1708668801; b_ut=5; _uuid=65739108E-2B75-9642-2E59-826F71099D7E203422infoc; header_theme_version=CLOSE; hit-dyn-v2=1; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; SESSDATA=709ab107%2C1729957738%2C3372c%2A42CjB0lzRiXKvwOi-Op4_kAeXHcFSeDW2GtWNhiW33Y4bLFvKzOUP5fUvm7CyuyD9Y1OISVkVDUkozNXlxUGIwLVZiNTNrTE9RMHhyeW1ycEJBZWZoa2VJYjd0WnN6UmNKTFVFWDZYWUI4NmpuYXNIT0k0T1ozNjFQVzBRbEdCaDliWGtFRjdFTkd3IIEC; bili_jct=717efb0472a4b30b96a2e6a470bdec42; sid=6a6r71lf; is-2022-channel=1; CURRENT_FNVAL=4048; fingerprint=254d2c4cf86e9c4fe76c3c00ffd0de23; CURRENT_QUALITY=80; buvid_fp=e440b4a9c5e0b9af9ad8950245b201f7; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY1Nzc5NjIsImlhdCI6MTcyNjMxODcwMiwicGx0IjotMX0.ruBBQK0YAJ7TyuLBgdfVijbpzBhfwGTglNFMMAmhv-c; bili_ticket_expires=1726577902; PVID=6; home_feed_column=5; browser_resolution=1707-837; b_lsid=610579158_191F9ED6DE3; bp_t_offset_474817892=977706466930589696",
    }
    # Compiled once: the same pattern is applied to every response.
    cid_pattern = re.compile(r'"cid":(.*?),')

    cids = []
    # Slicing (rather than indexing range(300)) avoids an IndexError when
    # fewer than `limit` BV ids were found.
    for bvid in list[:limit]:
        url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url=url, headers=headers, timeout=10)
        except requests.RequestException as e:
            # Handled per id so one failure does not drop the remaining ids.
            print(f"An error occurred: {e}")
            continue
        if response.status_code == 200:
            response.encoding = 'utf-8'
            cids.extend(cid_pattern.findall(response.text))
    return cids
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_information(list, limit=300):
    """Download the danmaku (bullet comments) for a sequence of cids.

    For each cid, requests ``x/v1/dm/list.so?oid=<cid>`` and extracts the
    text content of every ``<d p="...">...</d>`` element in the response.
    Pauses 0.5 seconds after each request.

    Side effect: switches ``sys.stdout`` to UTF-8 so the error messages
    printed here can contain non-ASCII text.

    Args:
        list: Sliceable sequence of cid values. The name shadows the
            builtin and is kept only for backward compatibility.
        limit: Maximum number of leading cids to process. Defaults to 300,
            the count that used to be hard-coded. ``None`` processes
            every cid.

    Returns:
        A list of danmaku strings in request order. Cids whose request
        fails are skipped.
    """
    # Prefer reconfiguring the existing stream: wrapping sys.stdout.buffer
    # in a fresh TextIOWrapper on every call orphans the previous wrapper,
    # which closes the shared buffer when it is garbage-collected.
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(encoding='utf8')
    else:
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

    # NOTE(review): the Cookie below embeds a logged-in session (SESSDATA,
    # bili_jct). It is kept so behavior is unchanged, but it should be
    # loaded from the environment or a config file instead of source code.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
        'Cookie': "i-wanna-go-back=-1; rpdid=|(RYkm|Ykul0J'uY~YmYlYl~; DedeUserID=474817892; DedeUserID__ckMd5=2ad2b4cb2126ef63; buvid4=9AE18FFB-B9C4-8389-DF6A-72C0E6BCC34D43412-023022115-vdGT0WAv%2FuJ7%2FRzFm7Eg7A%3D%3D; buvid_fp_plain=undefined; LIVE_BUVID=AUTO4716769855903121; CURRENT_BLACKGAP=0; enable_web_push=DISABLE; buvid3=913910D1-D049-0F77-02E0-21B1093ACF0601725infoc; b_nut=1708668801; b_ut=5; _uuid=65739108E-2B75-9642-2E59-826F71099D7E203422infoc; header_theme_version=CLOSE; hit-dyn-v2=1; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; SESSDATA=709ab107%2C1729957738%2C3372c%2A42CjB0lzRiXKvwOi-Op4_kAeXHcFSeDW2GtWNhiW33Y4bLFvKzOUP5fUvm7CyuyD9Y1OISVkVDUkozNXlxUGIwLVZiNTNrTE9RMHhyeW1ycEJBZWZoa2VJYjd0WnN6UmNKTFVFWDZYWUI4NmpuYXNIT0k0T1ozNjFQVzBRbEdCaDliWGtFRjdFTkd3IIEC; bili_jct=717efb0472a4b30b96a2e6a470bdec42; sid=6a6r71lf; is-2022-channel=1; CURRENT_FNVAL=4048; fingerprint=254d2c4cf86e9c4fe76c3c00ffd0de23; CURRENT_QUALITY=80; buvid_fp=e440b4a9c5e0b9af9ad8950245b201f7; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY1Nzc5NjIsImlhdCI6MTcyNjMxODcwMiwicGx0IjotMX0.ruBBQK0YAJ7TyuLBgdfVijbpzBhfwGTglNFMMAmhv-c; bili_ticket_expires=1726577902; PVID=6; home_feed_column=5; browser_resolution=1707-837; b_lsid=610579158_191F9ED6DE3; bp_t_offset_474817892=977706466930589696",
    }
    # Compiled once: the same pattern is applied to every response.
    danmaku_pattern = re.compile(r'<d p=".*?">(.*?)</d>')

    danmaku = []
    # Slicing (rather than indexing range(300)) avoids an IndexError when
    # fewer than `limit` cids are available.
    for cid in list[:limit]:
        url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url=url, headers=headers, timeout=10)
            response.encoding = 'utf-8'
            danmaku.extend(danmaku_pattern.findall(response.text))
        except requests.RequestException as e:
            print(f"An error occurred: {e}")
        # Throttle between requests.
        time.sleep(0.5)
    return danmaku
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Pipeline: search results -> BV ids -> cids -> danmaku text.
bvnum_list = findbv()
cid_list = getcid(bvnum_list)
information_list = get_information(cid_list)

# Raw string so the backslashes are literal; "\p" is not a valid escape
# sequence and newer Python versions warn about it. The resulting path is
# the same as before.
output_path = r"E:\python1\requirement.txt"

# Open the file once and write one danmaku per line.
# NOTE(review): append mode is kept from the original, so re-running the
# script accumulates lines from earlier runs, and the word cloud below is
# built from the whole file. Switch to "w" if that is not intended.
with open(output_path, "a", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in information_list)

# Read the file back; the context manager closes the handle.
with open(output_path, encoding="utf-8") as f:
    txt = f.read()

# Segment the text with jieba and join the tokens with spaces, which is
# the input passed to WordCloud.generate below.
string = ' '.join(jieba.lcut(txt))
wc = wordcloud.WordCloud(
    width=500,
    height=500,
    background_color='white',
    scale=10,
    # presumably the Microsoft YaHei font, needed for CJK glyphs; confirm
    # the file is resolvable on the target machine
    font_path='msyh.ttc',
)
wc.generate(string)
wc.to_file("wordcloud.png")

# Export the raw danmaku as a single-column spreadsheet.
df = pandas.DataFrame(information_list)
df.to_excel('spider1.xlsx', index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|