parent bf5fbea0bf
commit a204a1aeda
@@ -0,0 +1,167 @@
import requests
from lxml import etree
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import re
import os
import subprocess
import json
import pprint
import openpyxl
import urllib.request


def crawl_vedio(vedio_url):
    """Download one Bilibili video: fetch the separate audio/video streams, merge them with ffmpeg, then grab the danmaku."""
    url = vedio_url

    headers = {
        'referer': 'https://www.bilibili.com/video/BV1jm4y167fE/?vd_source=59fecc30e7f4791084968599ca1f8b82',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)

    # Pull the title and the window.__playinfo__ JSON embedded in the page source.
    title = re.findall('<h1 title="(.*?)" class="video-title tit">', response.text)[0]
    playinfo = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)[0]

    json_data = json.loads(playinfo)

    # DASH delivery: audio and video come as separate streams.
    audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
    video_url = json_data['data']['dash']['video'][0]['baseUrl']

    audio_content = requests.get(url=audio_url, headers=headers).content
    vedio_content = requests.get(url=video_url, headers=headers).content

    os.makedirs('视频', exist_ok=True)  # make sure the output directory exists
    with open('视频\\' + title + '.mp3', mode='wb') as f:
        f.write(audio_content)
    with open('视频\\' + title + '.mp4', mode='wb') as f:
        f.write(vedio_content)

    # Merge the audio and video streams into one playable file with ffmpeg.
    COMMAND = f'ffmpeg -i 视频\\{title}.mp4 -i 视频\\{title}.mp3 -c:v copy -c:a aac -strict experimental 视频\\{title}output.mp4'
    subprocess.run(COMMAND, shell=True)

    crawl_barrage(vedio_url, title)
    # crawl_comment(vedio_url, title)

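
# A minimal sketch (not called anywhere above) of the same ffmpeg merge built as an argument
# list instead of a shell string, which avoids quoting problems when the title contains spaces
# or quotes. The helper name is illustrative only.
def merge_streams_sketch(title):
    command = [
        'ffmpeg',
        '-i', f'视频\\{title}.mp4',
        '-i', f'视频\\{title}.mp3',
        '-c:v', 'copy',
        '-c:a', 'aac',
        '-strict', 'experimental',
        f'视频\\{title}output.mp4',
    ]
    subprocess.run(command)  # no shell=True: each argument is passed through verbatim
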

def crawl_barrage(vedio_url, title):
    """Fetch the danmaku (bullet comments) for a video and append them to a text file."""
    # Prefix the host with 'i' (www.ibilibili.com), a mirror page whose button group
    # links straight to the danmaku XML file.
    base_url = vedio_url.split('https://www.')[1]
    url = 'https://www.i' + base_url
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }

    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    tree = etree.HTML(content)
    danmuurl = tree.xpath('//div[@class="btn-group"]/a[3]/@href')

    # The danmaku file is XML; each <d> element holds one comment.
    response2 = requests.get(url=danmuurl[0], headers=headers)
    response2.encoding = 'utf-8'
    content_list = re.findall('<d p=".*?">(.*?)</d>', response2.text)

    os.makedirs('弹幕', exist_ok=True)  # make sure the output directory exists
    for content in content_list:
        with open('弹幕\\' + title + '弹幕.txt', mode='a', encoding='utf-8') as f:
            f.write(content)
            f.write('\n')

    # crawl_comment(vedio_url, title)

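
# A minimal sketch (illustrative, not used above) that parses the same danmaku XML with lxml
# instead of a regex, keeping the "p" attribute alongside the text. Treating the first field
# of "p" as the appearance time in seconds is an assumption about the format.
def parse_danmaku_sketch(xml_text):
    root = etree.fromstring(xml_text.encode('utf-8'))
    items = []
    for d in root.findall('d'):
        p_fields = d.get('p', '').split(',')
        items.append({
            'time': p_fields[0] if p_fields else '',
            'text': d.text or '',
        })
    return items
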

# Comment scraping (disabled). Note that, as written, it expects a JSON comment endpoint
# (it reads response.json()['data']['replies']), not the video page URL it is passed.
# def crawl_comment(vedio_url, title):
#
#     url = vedio_url
#
#     headers = {
#         'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
#     }
#
#     response = requests.get(url=url, headers=headers)
#
#     mid_list = [i['member']['mid'] for i in response.json()['data']['replies']]
#     uname_list = [i['member']['uname'] for i in response.json()['data']['replies']]
#     sign_list = [i['member']['sign'] for i in response.json()['data']['replies']]
#     content_list = [i['content']['message'] for i in response.json()['data']['replies']]
#
#     for content in content_list:
#         with open('评论\\' + title + '评论.txt', mode='a', encoding='utf-8') as f:
#             f.write(content)
#             f.write('\n')
#
#     for mid in mid_list:
#         with open('评论\\' + title + 'Uid.txt', mode='a', encoding='utf-8') as f:
#             f.write(mid)
#             f.write('\n')
#
#     for uname in uname_list:
#         with open('评论\\' + title + '昵称.txt', mode='a', encoding='utf-8') as f:
#             f.write(uname)
#             f.write('\n')
#
#     for sign in sign_list:
#         with open('评论\\' + title + '个性签名.txt', mode='a', encoding='utf-8') as f:
#             f.write(sign)
#             f.write('\n')
#
#     dict = {}
#     dict["Uid"] = mid_list
#     dict["uname"] = uname_list
#     dict["sign"] = sign_list
#     dict["content"] = content_list
#
#     write_excel(dict)
#
#
# def write_excel(dict):
#     work_book = openpyxl.Workbook()
#     sheet = work_book.create_sheet('评论')
#     for index, (key, value) in enumerate(dict.items()):
#         sheet.cell(1, index + 1, key)
#         for i in range(len(value)):
#             sheet.cell(i + 2, index + 1, value[i])
#
#     work_book.save('评论.xlsx')

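
# A minimal usage sketch for the disabled write_excel above, with made-up placeholder data,
# just to show the intended sheet layout: one column per key, header row first. Nothing here
# comes from scraped results.
def write_excel_demo():
    work_book = openpyxl.Workbook()
    sheet = work_book.create_sheet('评论')
    sample = {
        'Uid': ['10001', '10002'],
        'uname': ['user_a', 'user_b'],
        'content': ['first comment', 'second comment'],
    }
    for index, (key, value) in enumerate(sample.items()):
        sheet.cell(1, index + 1, key)           # header row
        for i, item in enumerate(value):
            sheet.cell(i + 2, index + 1, item)  # data rows under each header
    work_book.save('评论demo.xlsx')
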

# 480959917
# 1856528671
def crawl_upinfo(url):
    """Open one page of an uploader's video list in Chrome and collect the video detail URLs."""
    path = 'chromedriver.exe'
    browser = webdriver.Chrome(path)  # Selenium 3-style call: chromedriver path passed positionally
    browser.get(url)
    time.sleep(2)  # crude wait for the JS-rendered list to finish loading
    detial_url_list = []
    html = BeautifulSoup(browser.page_source, 'lxml')

    for a_label in html.find('div', id='submit-video-list').find_all('a', attrs={'target': '_blank', 'class': 'title'}):
        if a_label['href'] is not None:
            detial_url_list.append('https:' + a_label['href'])
    # print(detial_url_list)
    browser.quit()  # close this Chrome instance; a new one is opened per page
    return detial_url_list

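
# A minimal sketch (an alternative, not used above) that replaces the fixed time.sleep(2) with
# an explicit wait, so scraping only proceeds once the video list element is actually present.
# The 10-second timeout is an arbitrary choice.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_video_list(browser, timeout=10):
    WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((By.ID, 'submit-video-list'))
    )
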

# Example list-page URL: https://space.bilibili.com/480959917/video?tid=0&page=2&keyword=&order=pubdate

if __name__ == '__main__':
    uid = input('Enter the uid of the uploader you want to crawl: ')
    base_url1 = 'https://space.bilibili.com/'
    base_url2 = '/video?tid=0&page='
    base_url3 = '&keyword=&order=pubdate'
    url = base_url1 + uid + base_url2 + '1' + base_url3

    # Open the first list page to read the total page count from the pager.
    path = 'chromedriver.exe'
    browser = webdriver.Chrome(path)
    browser.get(url)
    time.sleep(2)
    html = BeautifulSoup(browser.page_source, 'lxml')
    last_page = html.find('div', id='submit-video-list').find('ul', class_='be-pager').find_all('li')[-2].find('a').text
    browser.quit()
    upvedio_url_list = []

    # print(last_page)

    # Collect every video URL from every page, then download each one.
    for i in range(1, int(last_page) + 1):
        upvedio_url = base_url1 + uid + base_url2 + str(i) + base_url3
        # print(upvedio_url)
        upvedio_url_list += crawl_upinfo(upvedio_url)

    for url in upvedio_url_list:
        crawl_vedio(url)
    # print(upvedio_url_list)
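
# A minimal sketch (not used above) of building the same paginated list URL with urllib.parse
# instead of string concatenation; the query keys mirror the ones hard-coded above.
from urllib.parse import urlencode

def build_page_url(uid, page):
    query = urlencode({'tid': 0, 'page': page, 'keyword': '', 'order': 'pubdate'})
    return f'https://space.bilibili.com/{uid}/video?{query}'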