import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
from bs4 import BeautifulSoup
import re
import os
import subprocess
import json
import openpyxl


def crawl_vedio(vedio_url, last_page):
    url = vedio_url
    headers = {
        'referer': 'https://www.bilibili.com/video/BV1jm4y167fE/?vd_source=59fecc30e7f4791084968599ca1f8b82',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    # The title lives in the <h1 title="..."> tag of the video page.
    title = re.findall('<h1 title="(.*?)"', response.text)[0]
    # The DASH stream metadata is inlined as JSON in window.__playinfo__.
    playinfo = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)[0]
    json_data = json.loads(playinfo)
    audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
    video_url = json_data['data']['dash']['video'][0]['baseUrl']
    audio_content = requests.get(url=audio_url, headers=headers).content
    vedio_content = requests.get(url=video_url, headers=headers).content
    os.makedirs('视频', exist_ok=True)  # "videos" directory; open() fails if it is missing
    with open('视频\\' + title + '.mp3', mode='wb') as f:
        f.write(audio_content)
    with open('视频\\' + title + '.mp4', mode='wb') as f:
        f.write(vedio_content)
    # Merge the separate audio and video tracks with ffmpeg. The argument-list
    # form avoids shell-quoting problems when the title contains spaces.
    command = ['ffmpeg', '-i', f'视频\\{title}.mp4', '-i', f'视频\\{title}.mp3',
               '-c:v', 'copy', '-c:a', 'aac', '-strict', 'experimental',
               f'视频\\{title}output.mp4']
    subprocess.run(command)
    crawl_barrage(vedio_url, title)
    crawl_comment(vedio_url, title, last_page)


def crawl_barrage(vedio_url, title):
    url = vedio_url
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    # Danmaku entries have the form <d p="...">text</d>. The video page itself
    # usually does not inline them; see the crawl_barrage_xml sketch at the
    # end of the file for the dedicated XML endpoint.
    content_list = re.findall('<d p=".*?">(.*?)</d>', response.text)
    os.makedirs('弹幕', exist_ok=True)  # "danmaku" directory
    with open('弹幕\\' + title + '弹幕.txt', mode='a', encoding='utf-8') as f:
        for content in content_list:
            f.write(content)
            f.write('\n')


def crawl_comment(vedio_url, title, last_page):
    os.makedirs('评论', exist_ok=True)  # "comments" directory
    all_mid, all_uname, all_sign, all_content = [], [], [], []
    for page in range(0, last_page):
        time.sleep(1)
        # NOTE: oid is hard-coded, so every call fetches the comments of the
        # same video (oid=697023311) regardless of vedio_url. See the get_oid
        # sketch at the end of the file for deriving it from the page.
        url = ('https://api.bilibili.com/x/v2/reply/main?csrf=9e1a61c984801379382903865edb8344'
               '&mode=3&next=' + str(page) + '&oid=697023311&plat=1&seek_rpid=&type=1')
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
        }
        response = requests.get(url=url, headers=headers)
        replies = response.json()['data']['replies']
        mid_list = [str(i['member']['mid']) for i in replies]  # mid is numeric; str() so it can be written to a file
        uname_list = [i['member']['uname'] for i in replies]
        sign_list = [i['member']['sign'] for i in replies]
        content_list = [i['content']['message'] for i in replies]
        with open('评论\\' + title + '评论.txt', mode='a', encoding='utf-8') as f:  # comment bodies
            for content in content_list:
                f.write(content)
                f.write('\n')
        with open('评论\\' + title + 'Uid.txt', mode='a', encoding='utf-8') as f:
            for mid in mid_list:
                f.write(mid)
                f.write('\n')
        with open('评论\\' + title + '昵称.txt', mode='a', encoding='utf-8') as f:  # nicknames
            for uname in uname_list:
                f.write(uname)
                f.write('\n')
        with open('评论\\' + title + '个性签名.txt', mode='a', encoding='utf-8') as f:  # bios
            for sign in sign_list:
                f.write(sign)
                f.write('\n')
        all_mid.extend(mid_list)
        all_uname.extend(uname_list)
        all_sign.extend(sign_list)
        all_content.extend(content_list)
    # Write the workbook once with all pages accumulated, instead of
    # overwriting 评论.xlsx with only a single page's replies.
    comment_data = {'Uid': all_mid, 'uname': all_uname, 'sign': all_sign, 'content': all_content}
    write_excel(comment_data)


def write_excel(comment_data):
    work_book = openpyxl.Workbook()
    # Rename the default sheet rather than leaving an empty 'Sheet' behind.
    sheet = work_book.active
    sheet.title = '评论'
    for index, (key, value) in enumerate(comment_data.items()):
        sheet.cell(1, index + 1, key)
        for i in range(len(value)):
            sheet.cell(i + 2, index + 1, value[i])
    work_book.save('评论.xlsx')


if __name__ == '__main__':
    # Example uids: 480959917, 1856528671
    uid = input('请输入你想要看的博主的uid:')  # "Enter the uid of the uploader you want to crawl:"
    base_url = 'https://space.bilibili.com'
    url = base_url + '/' + uid + '/video'
    path = 'chromedriver.exe'
    browser = webdriver.Chrome(service=Service(path))  # Selenium 4 style; Selenium 3 took the path positionally
    browser.get(url)
    time.sleep(2)
    detial_url_list = []
    html = BeautifulSoup(browser.page_source, 'html.parser')
    # The label of the second-to-last pager item is the page count; range()
    # needs it as an int, not the str that .text returns.
    last_page = int(html.find('div', id='submit-video-list')
                        .find('ul', class_='be-pager')
                        .find_all('li')[-2].find('a').text)
    for a_label in html.find('div', id='submit-video-list').find_all(
            'a', attrs={'target': '_blank', 'class': 'title'}):
        if a_label.get('href'):
            detial_url_list.append('https:' + a_label['href'])
    browser.quit()
    for vedio_url in detial_url_list:
        crawl_vedio(vedio_url, last_page)
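

# ---------------------------------------------------------------------------
# Hedged sketch: crawl_comment above hard-codes oid=697023311, so every video
# receives the same video's comments. For type=1 (videos) the reply API's oid
# is the video's numeric aid, which the video page embeds in its inlined state
# JSON (an "aid":<number> field). The helper below is an illustrative sketch
# of extracting that aid, not part of the original script and not wired into
# the main flow; the exact page markup may change, so treat the regex as an
# assumption.
def get_oid(vedio_url, headers):
    """Sketch: derive the reply-API oid (the video's aid) from its page HTML."""
    response = requests.get(url=vedio_url, headers=headers)
    match = re.search(r'"aid":(\d+)', response.text)
    return match.group(1) if match else None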
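

# ---------------------------------------------------------------------------
# Hedged sketch: crawl_barrage above regexes the video page itself, but the
# <d p="...">...</d> danmaku entries usually are not inlined there. Bilibili
# has historically served them as XML from
# https://comment.bilibili.com/{cid}.xml, where cid is the chat id embedded in
# the page (a "cid":<number> field). This helper is an illustrative sketch
# under those assumptions, not the original script's method and not wired into
# the main flow.
def crawl_barrage_xml(vedio_url, title, headers):
    """Sketch: fetch danmaku from the XML endpoint instead of the page HTML."""
    page = requests.get(url=vedio_url, headers=headers)
    match = re.search(r'"cid":(\d+)', page.text)
    if match is None:
        return
    xml = requests.get(f'https://comment.bilibili.com/{match.group(1)}.xml',
                       headers=headers)
    xml.encoding = 'utf-8'
    content_list = re.findall('<d p=".*?">(.*?)</d>', xml.text)
    with open('弹幕\\' + title + '弹幕.txt', mode='a', encoding='utf-8') as f:
        for content in content_list:
            f.write(content + '\n')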