Working crawler for videos and danmaku

master
posql3f6g 2 years ago
parent bf5fbea0bf
commit a204a1aeda

@@ -0,0 +1,167 @@
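# Crawl every video from a bilibili uploader's space: walk the uploader's
# video list with selenium, download each video's separate DASH audio and
# video streams, merge them with ffmpeg, and save the danmaku (bullet
# comments) to a text file. Requires chromedriver.exe next to the script
# and ffmpeg on the PATH.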
import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
from bs4 import BeautifulSoup
import re
import os
import subprocess
import json
import openpyxl  # only used by the commented-out crawl_comment/write_excel below
import urllib.request

def crawl_vedio(vedio_url):
    """Download one video: fetch the separate DASH audio/video streams,
    merge them with ffmpeg, then grab the danmaku."""
    url = vedio_url
    headers = {
        'referer': 'https://www.bilibili.com/video/BV1jm4y167fE/?vd_source=59fecc30e7f4791084968599ca1f8b82',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    title = re.findall('<h1 title="(.*?)" class="video-title tit">', response.text)[0]
    # strip characters that are illegal in Windows file names
    title = re.sub(r'[\\/:*?"<>|]', '_', title)
    # the page embeds the DASH stream info as JSON in window.__playinfo__
    playinfo = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)[0]
    json_data = json.loads(playinfo)
    audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
    video_url = json_data['data']['dash']['video'][0]['baseUrl']
    audio_content = requests.get(url=audio_url, headers=headers).content
    vedio_content = requests.get(url=video_url, headers=headers).content
    os.makedirs('视频', exist_ok=True)
    with open('视频\\' + title + '.mp3', mode='wb') as f:
        f.write(audio_content)
    with open('视频\\' + title + '.mp4', mode='wb') as f:
        f.write(vedio_content)
    # merge the two streams; quote the paths so titles with spaces survive the shell
    COMMAND = f'ffmpeg -i "视频\\{title}.mp4" -i "视频\\{title}.mp3" -c:v copy -c:a aac -strict experimental "视频\\{title}output.mp4"'
    subprocess.run(COMMAND, shell=True)
    crawl_barrage(vedio_url, title)
    # crawl_comment(vedio_url, title)
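
# Optional sketch, not called above: passing ffmpeg its arguments as a list
# instead of one shell string sidesteps quoting entirely, so titles with
# quotes or spaces cannot break the command. Assumes the same file layout
# that crawl_vedio writes.
def merge_streams(title):
    subprocess.run([
        'ffmpeg',
        '-i', f'视频\\{title}.mp4',  # video-only DASH stream
        '-i', f'视频\\{title}.mp3',  # audio-only DASH stream
        '-c:v', 'copy',              # copy the video track without re-encoding
        '-c:a', 'aac',
        f'视频\\{title}output.mp4',
    ], check=True)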

def crawl_barrage(vedio_url, title):
    # ibilibili.com mirrors bilibili video pages and exposes a direct
    # danmaku download link; insert the "i" to rebuild the mirror URL
    base_url = vedio_url.split('https://www.')[1]
    url = 'https://www.i' + base_url
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    tree = etree.HTML(content)
    danmuurl = tree.xpath('//div[@class="btn-group"]/a[3]/@href')
    response2 = requests.get(url=danmuurl[0], headers=headers)
    response2.encoding = 'utf-8'
    # the danmaku XML stores each message as <d p="...">text</d>
    content_list = re.findall('<d p=".*?">(.*?)</d>', response2.text)
    os.makedirs('弹幕', exist_ok=True)
    # open the file once and append every message, instead of reopening per line
    with open('弹幕\\' + title + '弹幕.txt', mode='a', encoding='utf-8') as f:
        for content in content_list:
            f.write(content)
            f.write('\n')
    # crawl_comment(vedio_url, title)
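
# Optional sketch, not called above: the danmaku can also be fetched without
# the ibilibili.com detour. Assumptions, so verify before relying on it: the
# video page still embeds a "cid" field in its inline JSON, and the classic
# endpoint https://comment.bilibili.com/{cid}.xml still serves the danmaku XML.
def crawl_barrage_by_cid(page_html, title):
    cid = re.findall(r'"cid":(\d+)', page_html)[0]
    resp = requests.get(f'https://comment.bilibili.com/{cid}.xml')
    resp.encoding = 'utf-8'
    danmaku = re.findall('<d p=".*?">(.*?)</d>', resp.text)
    os.makedirs('弹幕', exist_ok=True)
    with open('弹幕\\' + title + '弹幕.txt', mode='a', encoding='utf-8') as f:
        f.write('\n'.join(danmaku) + '\n')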

# def crawl_comment(vedio_url, title):
#     url = vedio_url
#     headers = {
#         'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
#     }
#     response = requests.get(url=url, headers=headers)
#     mid_list = [i['member']['mid'] for i in response.json()['data']['replies']]
#     uname_list = [i['member']['uname'] for i in response.json()['data']['replies']]
#     sign_list = [i['member']['sign'] for i in response.json()['data']['replies']]
#     content_list = [i['content']['message'] for i in response.json()['data']['replies']]
#
#     for content in content_list:
#         with open('评论\\' + title + '评论.txt', mode='a', encoding='utf-8') as f:
#             f.write(content)
#             f.write('\n')
#
#     for mid in mid_list:
#         with open('评论\\' + title + 'Uid.txt', mode='a', encoding='utf-8') as f:
#             f.write(mid)
#             f.write('\n')
#
#     for uname in uname_list:
#         with open('评论\\' + title + '昵称.txt', mode='a', encoding='utf-8') as f:
#             f.write(uname)
#             f.write('\n')
#
#     for sign in sign_list:
#         with open('评论\\' + title + '个性签名.txt', mode='a', encoding='utf-8') as f:
#             f.write(sign)
#             f.write('\n')
#
#     comment_dict = {}
#     comment_dict["Uid"] = mid_list
#     comment_dict["uname"] = uname_list
#     comment_dict["sign"] = sign_list
#     comment_dict["content"] = content_list
#
#     write_excel(comment_dict)
#
#
# def write_excel(comment_dict):
#     work_book = openpyxl.Workbook()
#     sheet = work_book.create_sheet('评论')
#     for index, (key, value) in enumerate(comment_dict.items()):
#         sheet.cell(1, index + 1, key)
#         for i in range(len(value)):
#             sheet.cell(i + 2, index + 1, value[i])
#     work_book.save('评论.xlsx')
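
# Note: crawl_comment is disabled because it requests the video page's HTML
# and then calls response.json(), which would raise. The replies come from a
# separate API (assumption: the reply endpoint
# https://api.bilibili.com/x/v2/reply?type=1&oid=<aid>, keyed by the video's
# aid), so the function would have to target that URL instead.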
# sample uploader UIDs for testing: 480959917, 1856528671

def crawl_upinfo(url):
    """Collect the video-page links from one page of an uploader's video list."""
    path = 'chromedriver.exe'
    browser = webdriver.Chrome(service=Service(path))
    browser.get(url)
    time.sleep(2)
    detial_url_list = []
    html = BeautifulSoup(browser.page_source, 'html.parser')
    for a_label in html.find('div', id='submit-video-list').find_all('a', attrs={'target': '_blank', 'class': 'title'}):
        if a_label.get('href') is not None:
            detial_url_list.append('https:' + a_label['href'])
    browser.quit()
    # print(detial_url_list)
    return detial_url_list
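
# Optional sketch, not used above: an explicit wait is more reliable than a
# fixed time.sleep(2); it returns as soon as the video list has rendered
# (assuming a 10-second ceiling is enough for the page to load).
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_video_list(browser, timeout=10):
    WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((By.ID, 'submit-video-list'))
    )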
# https://space.bilibili.com/480959917/video?tid=0&page=2&keyword=&order=pubdate
if __name__ == '__main__':
    uid = input("Enter the uid of the uploader you want to crawl: ")
    base_url1 = 'https://space.bilibili.com/'
    base_url2 = '/video?tid=0&page='
    base_url3 = '&keyword=&order=pubdate'
    url = base_url1 + uid + base_url2 + '1' + base_url3
    path = 'chromedriver.exe'
    browser = webdriver.Chrome(service=Service(path))
    browser.get(url)
    time.sleep(2)
    html = BeautifulSoup(browser.page_source, 'html.parser')
    # the pager's second-to-last <li> holds the number of the last page
    last_page = html.find('div', id='submit-video-list').find('ul', class_='be-pager').find_all('li')[-2].find('a').text
    browser.quit()
    # print(last_page)
    upvedio_url_list = []
    # walk every page of the uploader's video list, collecting video URLs
    for i in range(1, int(last_page) + 1):
        upvedio_url = base_url1 + uid + base_url2 + str(i) + base_url3
        upvedio_url_list += crawl_upinfo(upvedio_url)
    for url in upvedio_url_list:
        crawl_vedio(url)
    # print(upvedio_url_list)