From a204a1aeda49889143a9fb248848b5dffc0a5f82 Mon Sep 17 00:00:00 2001
From: posql3f6g <2974352416@qq.com>
Date: Wed, 19 Apr 2023 20:18:05 +0800
Subject: [PATCH] Working crawler for Bilibili video danmaku
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 完整爬取b站视频弹幕.py | 167 ++++++++++++++++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100644 完整爬取b站视频弹幕.py

diff --git a/完整爬取b站视频弹幕.py b/完整爬取b站视频弹幕.py
new file mode 100644
index 0000000..5aa3b71
--- /dev/null
+++ b/完整爬取b站视频弹幕.py
@@ -0,0 +1,167 @@
+import requests
+from lxml import etree
+from selenium import webdriver
+import time
+from bs4 import BeautifulSoup
+import re
+import os
+import subprocess
+import json
+import pprint
+import openpyxl
+import urllib.request
+
+
+def crawl_vedio(vedio_url):
+    url = vedio_url
+
+    headers = {
+        # the referer of any Bilibili video page works here
+        'referer': 'https://www.bilibili.com/video/BV1jm4y167fE/?vd_source=59fecc30e7f4791084968599ca1f8b82',
+        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
+    }
+    response = requests.get(url=url, headers=headers)
+
+    # The video title sits in the player page's <h1 title="..."> tag
+    # (page markup may change over time).
+    title = re.findall('<h1 title="(.*?)"', response.text)[0]
+    # Strip characters that are illegal in Windows file names.
+    title = re.sub(r'[\\/:*?"<>|]', '_', title)
+    os.makedirs('视频', exist_ok=True)  # make sure the output directory exists
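+    # window.__playinfo__ is a <script> tag on the player page that embeds
+    # the DASH stream manifest as raw JSON; the regex below captures that
+    # JSON blob so the audio/video stream URLs can be read out of it.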
+    playinfo = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)[0]
+
+    json_data = json.loads(playinfo)
+
+    # Audio and video are delivered as separate DASH streams.
+    audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
+    video_url = json_data['data']['dash']['video'][0]['baseUrl']
+
+    audio_content = requests.get(url=audio_url, headers=headers).content
+    vedio_content = requests.get(url=video_url, headers=headers).content
+
+    with open('视频\\' + title + '.mp3', mode='wb') as f:
+        f.write(audio_content)
+    with open('视频\\' + title + '.mp4', mode='wb') as f:
+        f.write(vedio_content)
+
+    # Remux the separate audio and video tracks into one file with ffmpeg.
+    COMMAND = f'ffmpeg -i 视频\\{title}.mp4 -i 视频\\{title}.mp3 -c:v copy -c:a aac -strict experimental 视频\\{title}output.mp4'
+    subprocess.run(COMMAND, shell=True)
+
+    crawl_barrage(vedio_url, title)
+    # crawl_comment(vedio_url, title)
+
+
+def crawl_barrage(vedio_url, title):
+    # ibilibili.com mirrors bilibili.com and exposes a direct danmaku
+    # download link, so rewrite www.bilibili.com to www.ibilibili.com.
+    base_url = vedio_url.split('https://www.')[1]
+    url = 'https://www.i' + base_url
+    headers = {
+        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
+    }
+
+    request = urllib.request.Request(url=url, headers=headers)
+    response = urllib.request.urlopen(request)
+    content = response.read().decode('utf-8')
+    tree = etree.HTML(content)
+    # The third button in the btn-group is the danmaku download link.
+    danmuurl = tree.xpath('//div[@class="btn-group"]/a[3]/@href')
+
+    response2 = requests.get(url=danmuurl[0], headers=headers)
+    response2.encoding = 'utf-8'
+    # Each danmaku entry in the XML looks like <d p="...">text</d>.
+    content_list = re.findall('<d p=".*?">(.*?)</d>', response2.text)
+
+    os.makedirs('弹幕', exist_ok=True)  # make sure the output directory exists
+    for content in content_list:
+        with open('弹幕\\' + title + '弹幕.txt', mode='a', encoding='utf-8') as f:
+            f.write(content)
+            f.write('\n')
+
+    # crawl_comment(vedio_url, title)
+
+
+# def crawl_comment(vedio_url, title):
+#
+#     url = vedio_url
+#
+#     headers = {
+#         'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
+#     }
+#
+#     response = requests.get(url=url, headers=headers)
+#
+#     mid_list = [i['member']['mid'] for i in response.json()['data']['replies']]
+#     uname_list = [i['member']['uname'] for i in response.json()['data']['replies']]
+#     sign_list = [i['member']['sign'] for i in response.json()['data']['replies']]
+#     content_list = [i['content']['message'] for i in response.json()['data']['replies']]
+#
+#     for content in content_list:
+#         with open('评论\\' + title + '评论.txt', mode='a', encoding='utf-8') as f:
+#             f.write(content)
+#             f.write('\n')
+#
+#     for mid in mid_list:
+#         with open('评论\\' + title + 'Uid.txt', mode='a', encoding='utf-8') as f:
+#             f.write(mid)
+#             f.write('\n')
+#
+#     for uname in uname_list:
+#         with open('评论\\' + title + '昵称.txt', mode='a', encoding='utf-8') as f:
+#             f.write(uname)
+#             f.write('\n')
+#
+#     for sign in sign_list:
+#         with open('评论\\' + title + '个性签名.txt', mode='a', encoding='utf-8') as f:
+#             f.write(sign)
+#             f.write('\n')
+#
+#     dict = {}
+#     dict["Uid"] = mid_list
+#     dict["uname"] = uname_list
+#     dict["sign"] = sign_list
+#     dict["content"] = content_list
+#
+#     write_excel(dict)
+#
+#
+# def write_excel(dict):
+#     work_book = openpyxl.Workbook()
+#     sheet = work_book.create_sheet('评论')
+#     for index, (key, value) in enumerate(dict.items()):
+#         sheet.cell(1, index + 1, key)
+#         for i in range(len(value)):
+#             sheet.cell(i + 2, index + 1, value[i])
+#
+#     work_book.save('评论.xlsx')
+
+
+# Example uploader uids: 480959917, 1856528671
+def crawl_upinfo(url):
+    # Selenium 3-style construction; Selenium 4 would take a Service object.
+    path = 'chromedriver.exe'
+    browser = webdriver.Chrome(path)
+    browser.get(url)
+    time.sleep(2)  # crude wait for the video list to render
+    detial_url_list = []
+    html = BeautifulSoup(browser.page_source, 'lxml')
+    browser.quit()  # a new Chrome instance is spawned per page; close it
+
+    for a_label in html.find('div', id='submit-video-list').find_all('a', attrs={'target': '_blank', 'class': 'title'}):
+        if a_label['href'] is not None:
+            detial_url_list.append('https:' + a_label['href'])
+    # print(detial_url_list)
+    return detial_url_list
+
+
+# https://space.bilibili.com/480959917/video?tid=0&page=2&keyword=&order=pubdate
+
+if __name__ == '__main__':
+    uid = input('请输入你想要看的博主的uid:')  # prompt: enter the uploader's uid
+    base_url1 = 'https://space.bilibili.com/'
+    base_url2 = '/video?tid=0&page='
+    base_url3 = '&keyword=&order=pubdate'
+    url = base_url1 + uid + base_url2 + '1' + base_url3
+
+    path = 'chromedriver.exe'
+    browser = webdriver.Chrome(path)
+    browser.get(url)
+    time.sleep(2)
+    html = BeautifulSoup(browser.page_source, 'lxml')
+    browser.quit()
+    # The second-to-last <li> of the pager holds the last page number.
+    last_page = html.find('div', id='submit-video-list').find('ul', class_='be-pager').find_all('li')[-2].find('a').text
+    upvedio_url_list = []
+
+    # print(last_page)
+
+    for i in range(1, int(last_page) + 1):
+        upvedio_url = base_url1 + uid + base_url2 + str(i) + base_url3
+        # print(upvedio_url)
+        upvedio_url_list += crawl_upinfo(upvedio_url)
+
+    for url in upvedio_url_list:
+        crawl_vedio(url)
+    # print(upvedio_url_list)
\ No newline at end of file
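--
Side note, outside the patch: crawl_barrage depends on the third-party
ibilibili.com mirror staying online and keeping its page layout. A minimal
alternative sketch, assuming Bilibili's widely documented (but unofficial)
pagelist API and the comment.bilibili.com XML endpoint are still available;
fetch_danmaku is a hypothetical helper name, not part of the patch:

    import re
    import requests

    def fetch_danmaku(vedio_url):
        headers = {'user-agent': 'Mozilla/5.0'}
        # Pull the BV id out of a URL like
        # https://www.bilibili.com/video/BV1jm4y167fE/
        bvid = re.search(r'(BV\w+)', vedio_url).group(1)
        # The pagelist API maps a bvid to its cid (the danmaku pool id).
        page = requests.get('https://api.bilibili.com/x/player/pagelist',
                            params={'bvid': bvid}, headers=headers).json()
        cid = page['data'][0]['cid']
        # The raw danmaku XML holds one <d p="...">text</d> per comment.
        xml = requests.get(f'https://comment.bilibili.com/{cid}.xml',
                           headers=headers)
        xml.encoding = 'utf-8'
        return re.findall(r'<d p=".*?">(.*?)</d>', xml.text)

This avoids both Selenium and the mirror for the danmaku step; the rest of
the pipeline could call it in place of crawl_barrage's scrape.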