import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
from bs4 import BeautifulSoup
import re
import os
import subprocess
import json
import openpyxl
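
# Crawl all videos listed on the first page of a bilibili uploader's space:
# download each video's audio and video streams and merge them with ffmpeg,
# then save its danmaku (bullet comments) and its comment section to text
# files plus an Excel workbook. Output goes to the 视频\ (videos),
# 弹幕\ (danmaku) and 评论\ (comments) folders.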
def crawl_video(video_url, last_page):
    url = video_url
    headers = {
        # the referer and user-agent make the request look like a normal browser visit
        'referer': 'https://www.bilibili.com/video/BV1jm4y167fE/?vd_source=59fecc30e7f4791084968599ca1f8b82',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    # pull the title and the embedded playback JSON out of the page source
    title = re.findall('<h1 title="(.*?)" class="video-title tit">', response.text)[0]
    title = re.sub(r'[\\/:*?"<>|]', '_', title)  # strip characters Windows forbids in file names
    playinfo = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)[0]
    json_data = json.loads(playinfo)
    audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
    video_stream_url = json_data['data']['dash']['video'][0]['baseUrl']
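    # bilibili serves DASH: audio and video arrive as two separate streams, and
    # the first entry of each list is typically the highest-quality variant;
    # that is why the two downloads below are merged with ffmpeg afterwards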
    audio_content = requests.get(url=audio_url, headers=headers).content
    video_content = requests.get(url=video_stream_url, headers=headers).content
    with open('视频\\' + title + '.mp3', mode='wb') as f:
        f.write(audio_content)
    with open('视频\\' + title + '.mp4', mode='wb') as f:
        f.write(video_content)
    # remux the two tracks into one playable file; passing the arguments as a
    # list avoids shell quoting problems when the title contains spaces
    command = ['ffmpeg', '-i', f'视频\\{title}.mp4', '-i', f'视频\\{title}.mp3',
               '-c:v', 'copy', '-c:a', 'aac', '-strict', 'experimental',
               f'视频\\{title}output.mp4']
    subprocess.run(command)
    crawl_barrage(video_url, title)
    crawl_comment(video_url, title, last_page)
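
# Danmaku are served as an XML document of <d p="...">text</d> elements rather
# than inside the video page's HTML, which is what the regex below matches.
# A minimal sketch, assuming the page embeds the video's cid in its state JSON
# and the classic comment.bilibili.com/{cid}.xml endpoint serves the danmaku.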
def crawl_barrage(video_url, title):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    page = requests.get(url=video_url, headers=headers)
    cid = re.findall(r'"cid":(\d+)', page.text)[0]  # assumption: cid appears in the embedded state JSON
    response = requests.get(url=f'https://comment.bilibili.com/{cid}.xml', headers=headers)
    response.encoding = 'utf-8'
    content_list = re.findall('<d p=".*?">(.*?)</d>', response.text)
    # open the output file once and append one danmaku per line
    with open('弹幕\\' + title + '弹幕.txt', mode='a', encoding='utf-8') as f:
        for content in content_list:
            f.write(content)
            f.write('\n')
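
# Comments come from the paginated reply/main API; each reply carries the
# poster's mid (uid), uname, sign (profile signature) and the message text.
# The oid query parameter is the video's numeric aid, read from the page source
# below (an assumption about the embedded state JSON); a csrf token is
# typically only needed for write actions, so this read-only request omits it.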
def crawl_comment(video_url, title, last_page):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    page_html = requests.get(url=video_url, headers=headers).text
    oid = re.findall(r'"aid":(\d+)', page_html)[0]  # assumption: aid appears in the embedded state JSON
    # accumulate every page's fields so the Excel export covers all pages
    mid_list, uname_list, sign_list, content_list = [], [], [], []
    for page in range(0, int(last_page)):
        time.sleep(1)  # throttle: one API request per second
        url = ('https://api.bilibili.com/x/v2/reply/main?mode=3&next=' + str(page)
               + '&oid=' + oid + '&plat=1&seek_rpid=&type=1')
        response = requests.get(url=url, headers=headers)
        replies = response.json()['data']['replies']
        if not replies:  # the API stops returning replies once the pages run out
            break
        mid_list += [str(i['member']['mid']) for i in replies]
        uname_list += [i['member']['uname'] for i in replies]
        sign_list += [i['member']['sign'] for i in replies]
        content_list += [i['content']['message'] for i in replies]
    # dump each field to its own text file under 评论\
    with open('评论\\' + title + '评论.txt', mode='a', encoding='utf-8') as f:
        for content in content_list:
            f.write(content)
            f.write('\n')
    with open('评论\\' + title + 'Uid.txt', mode='a', encoding='utf-8') as f:
        for mid in mid_list:
            f.write(mid)  # mids were converted to str when collected
            f.write('\n')
    with open('评论\\' + title + '昵称.txt', mode='a', encoding='utf-8') as f:
        for uname in uname_list:
            f.write(uname)
            f.write('\n')
    with open('评论\\' + title + '个性签名.txt', mode='a', encoding='utf-8') as f:
        for sign in sign_list:
            f.write(sign)
            f.write('\n')
    # gather the columns for the Excel export (named so the dict builtin is not shadowed)
    comment_data = {}
    comment_data['Uid'] = mid_list
    comment_data['uname'] = uname_list
    comment_data['sign'] = sign_list
    comment_data['content'] = content_list
    write_excel(comment_data, title)
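
# Write the collected comment fields into one worksheet, one column per field
# with the field name as the header row. The workbook is named after the video
# title so each crawled video keeps its own 评论 workbook.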
def write_excel(comment_data, title):
    work_book = openpyxl.Workbook()
    sheet = work_book.active  # reuse the default sheet instead of leaving it empty
    sheet.title = '评论'
    for index, (key, value) in enumerate(comment_data.items()):
        sheet.cell(1, index + 1, key)
        for i in range(len(value)):
            sheet.cell(i + 2, index + 1, value[i])
    work_book.save('评论\\' + title + '评论.xlsx')
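
# Driver: the uploader's space page builds its video list with JavaScript, so a
# plain requests.get would not see the links; Selenium renders the page first
# and BeautifulSoup then parses the rendered source.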
if __name__ == '__main__':
    # example uids to try: 480959917, 1856528671
    uid = input('Enter the uid of the uploader you want to crawl: ')
    base_url = 'https://space.bilibili.com'
    url = base_url + '/' + uid + '/video'
    # make sure the output folders exist before anything is downloaded
    for folder in ('视频', '弹幕', '评论'):
        os.makedirs(folder, exist_ok=True)
    path = 'chromedriver.exe'
    browser = webdriver.Chrome(service=Service(path))  # Selenium 4 style driver setup
    browser.get(url)
    time.sleep(2)  # give the page time to render
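    # The be-pager's second-to-last <li> holds the last page number of the video
    # list; crawl_comment reuses it as an upper bound on comment pages per video.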
    detail_url_list = []
    html = BeautifulSoup(browser.page_source, 'html.parser')
    last_page = html.find('div', id='submit-video-list').find('ul', class_='be-pager').find_all('li')[-2].find('a').text
    for a_label in html.find('div', id='submit-video-list').find_all('a', attrs={'target': '_blank', 'class': 'title'}):
        href = a_label.get('href')
        if href:
            detail_url_list.append('https:' + href)
    browser.quit()  # the rendered source has been captured; close the browser
    for video_url in detail_url_list:
        crawl_video(video_url, last_page)
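
# Typical run, assuming this file is saved as e.g. crawler.py:
#   python crawler.py
#   Enter the uid of the uploader you want to crawl: 480959917
# which downloads every video on the first page of that uploader's space along
# with its danmaku and comments.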