import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
from bs4 import BeautifulSoup
import re
import os
import subprocess
import json
import urllib.request
import openpyxl  # only needed by the commented-out write_excel() below


def crawl_video(page_url):
    """Download a video's DASH audio/video streams and merge them with ffmpeg."""
    headers = {
        # Bilibili's CDN rejects stream requests without a bilibili.com referer,
        # so use the video page itself instead of a hardcoded URL.
        'referer': page_url,
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    response = requests.get(url=page_url, headers=headers)
    title = re.findall('<h1 title="(.*?)"', response.text)[0]
    # Strip characters that are illegal in Windows file names.
    title = re.sub(r'[\\/:*?"<>|]', '_', title)
    # The page embeds the DASH stream metadata as a JSON blob in a script tag.
    playinfo = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)[0]
    json_data = json.loads(playinfo)
    audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
    video_url = json_data['data']['dash']['video'][0]['baseUrl']
    audio_content = requests.get(url=audio_url, headers=headers).content
    video_content = requests.get(url=video_url, headers=headers).content

    os.makedirs('视频', exist_ok=True)  # output folder ("视频" = videos)
    with open('视频\\' + title + '.mp3', mode='wb') as f:
        f.write(audio_content)
    with open('视频\\' + title + '.mp4', mode='wb') as f:
        f.write(video_content)
    # Pass the ffmpeg arguments as a list so titles containing spaces or
    # shell metacharacters cannot break the command.
    command = [
        'ffmpeg', '-i', f'视频\\{title}.mp4', '-i', f'视频\\{title}.mp3',
        '-c:v', 'copy', '-c:a', 'aac', '-strict', 'experimental',
        f'视频\\{title}output.mp4'
    ]
    subprocess.run(command)
    crawl_barrage(page_url, title)
    # crawl_comment(page_url, title)


def crawl_barrage(page_url, title):
    """Scrape the video's danmaku (bullet comments) via the ibilibili.com mirror."""
    # Rewrite www.bilibili.com/... to www.ibilibili.com/..., a mirror site
    # that exposes a direct link to the danmaku XML file.
    base_url = page_url.split('https://www.')[1]
    url = 'https://www.i' + base_url
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    tree = etree.HTML(content)
    # The third link in the button group points at the danmaku XML.
    danmu_url = tree.xpath('//div[@class="btn-group"]/a[3]/@href')
    response2 = requests.get(url=danmu_url[0], headers=headers)
    response2.encoding = 'utf-8'
    # Each danmaku entry in the XML looks like <d p="...">text</d>.
    content_list = re.findall('<d p=".*?">(.*?)</d>', response2.text)
    os.makedirs('弹幕', exist_ok=True)  # output folder ("弹幕" = danmaku)
    with open('弹幕\\' + title + '弹幕.txt', mode='a', encoding='utf-8') as f:
        for item in content_list:
            f.write(item)
            f.write('\n')
    # crawl_comment(page_url, title)


# NOTE: this draft called response.json() on the video page URL, which returns
# HTML rather than JSON, so it never worked as written. A corrected sketch is
# at the bottom of this file.
# def crawl_comment(page_url, title):
#     headers = {
#         'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
#                       '(KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
#     }
#     response = requests.get(url=page_url, headers=headers)
#     replies = response.json()['data']['replies']
#     mid_list = [str(i['member']['mid']) for i in replies]  # str(): mid is numeric
#     uname_list = [i['member']['uname'] for i in replies]
#     sign_list = [i['member']['sign'] for i in replies]
#     content_list = [i['content']['message'] for i in replies]
#
#     for content in content_list:
#         with open('评论\\' + title + '评论.txt', mode='a', encoding='utf-8') as f:
#             f.write(content)
#             f.write('\n')
#     for mid in mid_list:
#         with open('评论\\' + title + 'Uid.txt', mode='a', encoding='utf-8') as f:
#             f.write(mid)
#             f.write('\n')
#     for uname in uname_list:
#         with open('评论\\' + title + '昵称.txt', mode='a', encoding='utf-8') as f:
#             f.write(uname)
#             f.write('\n')
#     for sign in sign_list:
#         with open('评论\\' + title + '个性签名.txt', mode='a', encoding='utf-8') as f:
#             f.write(sign)
#             f.write('\n')
#
#     data = {'Uid': mid_list, 'uname': uname_list,
#             'sign': sign_list, 'content': content_list}
#     write_excel(data)
#
#
# def write_excel(data):
#     work_book = openpyxl.Workbook()
#     sheet = work_book.create_sheet('评论')
#     for index, (key, value) in enumerate(data.items()):
#         sheet.cell(1, index + 1, key)
#         for i in range(len(value)):
#             sheet.cell(i + 2, index + 1, value[i])
#     work_book.save('评论.xlsx')


# Sample uids: 480959917, 1856528671


def crawl_upinfo(url):
    """Collect the detail-page URL of every video listed on one space page."""
    path = 'chromedriver.exe'
    # Selenium 4 style: the driver path must be wrapped in a Service object.
    browser = webdriver.Chrome(service=Service(path))
    browser.get(url)
    time.sleep(2)  # give the page time to render its video list
    detail_url_list = []
    html = BeautifulSoup(browser.page_source, 'html.parser')
    for a_label in html.find('div', id='submit-video-list').find_all(
            'a', attrs={'target': '_blank', 'class': 'title'}):
        if a_label.get('href'):
            detail_url_list.append('https:' + a_label['href'])
    browser.quit()
    # print(detail_url_list)
    return detail_url_list


# Example list URL:
# https://space.bilibili.com/480959917/video?tid=0&page=2&keyword=&order=pubdate
if __name__ == '__main__':
    uid = input('Enter the uid of the uploader you want to crawl: ')
    base_url1 = 'https://space.bilibili.com/'
    base_url2 = '/video?tid=0&page='
    base_url3 = '&keyword=&order=pubdate'
    url = base_url1 + uid + base_url2 + '1' + base_url3

    # Open page 1 just to read the total page count from the pager.
    path = 'chromedriver.exe'
    browser = webdriver.Chrome(service=Service(path))
    browser.get(url)
    time.sleep(2)
    html = BeautifulSoup(browser.page_source, 'html.parser')
    # The second-to-last pager item holds the number of the last page.
    last_page = html.find('div', id='submit-video-list').find(
        'ul', class_='be-pager').find_all('li')[-2].find('a').text
    browser.quit()
    # print(last_page)

    upvideo_url_list = []
    for i in range(1, int(last_page) + 1):
        upvideo_url = base_url1 + uid + base_url2 + str(i) + base_url3
        # print(upvideo_url)
        upvideo_url_list += crawl_upinfo(upvideo_url)
    for url in upvideo_url_list:
        crawl_video(url)
    # print(upvideo_url_list)
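

# --- Corrected comment crawler (sketch) --------------------------------------
# The commented-out crawl_comment() above calls response.json() on the video
# page, which returns HTML, not JSON. Comments actually come from a separate
# reply API keyed by the video's numeric id ("aid"). The version below is a
# minimal sketch, not part of the original script: the endpoint
# https://api.bilibili.com/x/v2/reply, its parameters (type=1 for videos,
# oid=aid, pn=page number), and the aid extraction from the page source are
# assumptions based on Bilibili's publicly observable API and may change.
def crawl_comment(page_url, title):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    page = requests.get(page_url, headers=headers).text
    # The numeric video id is embedded in the page's state blob as "aid":<digits>.
    aid = re.findall(r'"aid":(\d+)', page)[0]
    params = {'type': 1, 'oid': aid, 'pn': 1}  # type=1: video comments, pn: page
    data = requests.get('https://api.bilibili.com/x/v2/reply',
                        params=params, headers=headers).json()
    replies = data['data']['replies']
    os.makedirs('评论', exist_ok=True)  # output folder ("评论" = comments)
    with open('评论\\' + title + '评论.txt', mode='a', encoding='utf-8') as f:
        for reply in replies:
            f.write(reply['content']['message'] + '\n')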