From a204a1aeda49889143a9fb248848b5dffc0a5f82 Mon Sep 17 00:00:00 2001
From: posql3f6g <2974352416@qq.com>
Date: Wed, 19 Apr 2023 20:18:05 +0800
Subject: [PATCH] Working crawler for bilibili video danmaku
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
完整爬取b站视频弹幕.py | 167 ++++++++++++++++++++++++++++++++
1 file changed, 167 insertions(+)
create mode 100644 完整爬取b站视频弹幕.py
diff --git a/完整爬取b站视频弹幕.py b/完整爬取b站视频弹幕.py
new file mode 100644
index 0000000..5aa3b71
--- /dev/null
+++ b/完整爬取b站视频弹幕.py
@@ -0,0 +1,167 @@
+import requests
+from lxml import etree
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+import time
+from bs4 import BeautifulSoup
+import re
+import os
+import subprocess
+import json
+import openpyxl  # only needed by the commented-out comment crawler below
+import urllib.request
+
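+# Overall flow:
+#   1. __main__ uses Selenium to page through an uploader's video list and
+#      collect every video page URL
+#   2. crawl_video() downloads the separate DASH audio and video streams of
+#      one video and merges them with ffmpeg
+#   3. crawl_barrage() saves the text of every danmaku on that video
+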
+def crawl_video(page_url):
+    url = page_url
+
+    headers = {
+        'referer': page_url,
+        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
+    }
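+    # bilibili appears to reject stream downloads that lack a referer header
+    # or a browser-like user-agent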
+    response = requests.get(url=url, headers=headers)
+
+    # the title sits in an <h1 title="..."> tag and the stream manifest in a
+    # <script>window.__playinfo__=...</script> block (page markup may change)
+    title = re.findall('<h1 title="(.*?)"', response.text)[0]
+    title = re.sub(r'[\\/:*?"<>|]', '_', title)  # drop characters Windows forbids in filenames
+    playinfo = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)[0]
+
+    json_data = json.loads(playinfo)
+
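+    # element 0 of each DASH list is typically the best quality available to
+    # an anonymous (not-logged-in) request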
+    audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
+    video_url = json_data['data']['dash']['video'][0]['baseUrl']
+
+    audio_content = requests.get(url=audio_url, headers=headers).content
+    video_content = requests.get(url=video_url, headers=headers).content
+
+    os.makedirs('视频', exist_ok=True)  # make sure the output folder exists
+    with open('视频\\' + title + '.mp3', mode='wb') as f:
+        f.write(audio_content)
+    with open('视频\\' + title + '.mp4', mode='wb') as f:
+        f.write(video_content)
+
+    # merge audio and video; passing the arguments as a list avoids
+    # shell-quoting problems when the title contains spaces or quotes
+    command = ['ffmpeg', '-i', f'视频\\{title}.mp4', '-i', f'视频\\{title}.mp3',
+               '-c:v', 'copy', '-c:a', 'aac', f'视频\\{title}output.mp4']
+    subprocess.run(command)
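+    # note: this assumes the ffmpeg binary is on PATH; if the merge fails,
+    # the raw .mp3/.mp4 pair is still left in the 视频 folder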
+
+    crawl_barrage(page_url, title)
+    # crawl_comment(page_url, title)
+
+def crawl_barrage(page_url, title):
+    # turning www.bilibili.com/video/... into www.ibilibili.com/video/...
+    # targets a third-party mirror that exposes a direct link to the
+    # video's danmaku XML
+    base_url = page_url.split('https://www.')[1]
+    url = 'https://www.i' + base_url
+    headers = {
+        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
+    }
+
+    request = urllib.request.Request(url=url, headers=headers)
+    response = urllib.request.urlopen(request)
+    content = response.read().decode('utf-8')
+    tree = etree.HTML(content)
+    danmuurl = tree.xpath('//div[@class="btn-group"]/a[3]/@href')
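+    # on ibilibili the third link in the btn-group block pointed at the raw
+    # danmaku XML when this was written; the XPath breaks if that layout changes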
+
+    response2 = requests.get(url=danmuurl[0], headers=headers)
+    response2.encoding = 'utf-8'
+    # each danmaku is a <d p="...">text</d> element; the p attribute holds
+    # timing/style metadata and is ignored here (pattern reconstructed)
+    content_list = re.findall('<d p=".*?">(.*?)</d>', response2.text)
+
+    os.makedirs('弹幕', exist_ok=True)  # make sure the danmaku folder exists
+    with open('弹幕\\' + title + '弹幕.txt', mode='a', encoding='utf-8') as f:
+        for content in content_list:
+            f.write(content)
+            f.write('\n')
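+    # (the same XML is also served by bilibili itself at
+    # https://comment.bilibili.com/{cid}.xml given the video's cid, which
+    # would remove the dependency on the third-party mirror)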
+
+    # crawl_comment(page_url, title)
+
+# def crawl_comment(page_url, title):
+#
+#     url = page_url
+#
+#     headers = {
+#         'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
+#     }
+#
+#     response = requests.get(url=url, headers=headers)
+#
+#     replies = response.json()['data']['replies']
+#     mid_list = [str(i['member']['mid']) for i in replies]
+#     uname_list = [i['member']['uname'] for i in replies]
+#     sign_list = [i['member']['sign'] for i in replies]
+#     content_list = [i['content']['message'] for i in replies]
+#
+#     for content in content_list:
+#         with open('评论\\' + title + '评论.txt', mode='a', encoding='utf-8') as f:
+#             f.write(content)
+#             f.write('\n')
+#
+#     for mid in mid_list:
+#         with open('评论\\' + title + 'Uid.txt', mode='a', encoding='utf-8') as f:
+#             f.write(mid)
+#             f.write('\n')
+#
+#     for uname in uname_list:
+#         with open('评论\\' + title + '昵称.txt', mode='a', encoding='utf-8') as f:
+#             f.write(uname)
+#             f.write('\n')
+#
+#     for sign in sign_list:
+#         with open('评论\\' + title + '个性签名.txt', mode='a', encoding='utf-8') as f:
+#             f.write(sign)
+#             f.write('\n')
+#
+#     data = {}
+#     data["Uid"] = mid_list
+#     data["uname"] = uname_list
+#     data["sign"] = sign_list
+#     data["content"] = content_list
+#
+#     write_excel(data)
+#
+#
+# def write_excel(data):
+#     work_book = openpyxl.Workbook()
+#     sheet = work_book.create_sheet('评论')
+#     for index, (key, value) in enumerate(data.items()):
+#         sheet.cell(1, index + 1, key)
+#         for i in range(len(value)):
+#             sheet.cell(i + 2, index + 1, value[i])
+#
+#     work_book.save('评论.xlsx')
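+# note on the disabled crawl_comment above: the video page itself is HTML, so
+# response.json() cannot work on it; the comments would have to come from
+# bilibili's reply API (api.bilibili.com/x/v2/reply with the video's oid --
+# endpoint cited from memory, verify before enabling)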
+
+
+
+# 480959917
+# 1856528671
+def crawl_upinfo(url):
+    path = 'chromedriver.exe'
+    browser = webdriver.Chrome(service=Service(path))
+    browser.get(url)
+    time.sleep(2)  # crude wait for the JavaScript-rendered video list to load
+    detail_url_list = []
+    html = BeautifulSoup(browser.page_source, 'html.parser')
+    browser.quit()
+
+    for a_label in html.find('div', id='submit-video-list').find_all('a', attrs={'target': '_blank', 'class': 'title'}):
+        if a_label.get('href') is not None:
+            detail_url_list.append('https:' + a_label['href'])
+    # print(detail_url_list)
+    return detail_url_list
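+# note: webdriver.Chrome(service=Service(path)) is the Selenium 4 form; on
+# Selenium 3 use webdriver.Chrome(executable_path=path) instead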
+
+# https://space.bilibili.com/480959917/video?tid=0&page=2&keyword=&order=pubdate
+
+if __name__ == '__main__':
+    uid = input('Enter the uid of the uploader to crawl: ')
+    base_url1 = 'https://space.bilibili.com/'
+    base_url2 = '/video?tid=0&page='
+    base_url3 = '&keyword=&order=pubdate'
+    url = base_url1 + uid + base_url2 + '1' + base_url3
+
+    # load page 1 once just to read the total page count from the pager
+    path = 'chromedriver.exe'
+    browser = webdriver.Chrome(service=Service(path))
+    browser.get(url)
+    time.sleep(2)
+    html = BeautifulSoup(browser.page_source, 'html.parser')
+    # the pager's second-to-last <li> holds the number of the last page
+    last_page = html.find('div', id='submit-video-list').find('ul', class_='be-pager').find_all('li')[-2].find('a').text
+    browser.quit()
+    up_video_url_list = []
+
+    # print(last_page)
+
+    for i in range(1, int(last_page) + 1):
+        up_video_url = base_url1 + uid + base_url2 + str(i) + base_url3
+        # print(up_video_url)
+        up_video_url_list += crawl_upinfo(up_video_url)
+
+    for url in up_video_url_list:
+        crawl_video(url)
+    # print(up_video_url_list)
\ No newline at end of file