parent
180436c39c
commit
ed2991593a
@ -0,0 +1,85 @@
|
||||
import scrapy
|
||||
from scrapy_redis.spiders import RedisSpider
|
||||
import re
|
||||
import json
|
||||
import requests
|
||||
import urllib.request
|
||||
from lxml import etree
|
||||
# from bilibili_spider.items import BilibiliSpiderItem
|
||||
|
||||
class BilibiliSpider(RedisSpider):
    """Redis-driven spider that downloads a Bilibili video's DASH audio/video
    streams to disk and scrapes its danmaku (barrage) comments.

    Start URLs are fed through the Redis list named by ``redis_key``.
    Audio/video are written as ``视频\\<title>.mp3`` / ``视频\\<title>.mp4``;
    comments are appended to ``弹幕\\<title>弹幕.txt``.
    """

    name = "bilibili"
    redis_key = 'bili23'

    def __init__(self, *args, **kwargs):
        """Accept an optional ``-a domain=a.com,b.com`` crawl restriction.

        An absent or empty ``domain`` yields an empty ``allowed_domains``
        list (no restriction). ``filter(None, ...)`` drops empty fragments
        from stray commas.
        """
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(BilibiliSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        """Extract title and stream URLs from a video page, download the
        streams, then re-request the page to scrape its barrage.

        Yields a follow-up ``scrapy.Request`` handled by
        :meth:`parse_barrage`.
        """
        # FIX: use the public Response.url, not the private ``_url``.
        url = response.url
        # Bilibili's CDN rejects stream downloads without a referer.
        headers = {
            'referer': 'https://www.bilibili.com/video/BV1jm4y167fE/?vd_source=59fecc30e7f4791084968599ca1f8b82',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
        }

        # ROBUSTNESS: skip (with a log line) instead of raising an opaque
        # IndexError when the page layout changes.
        titles = re.findall('<h1 title="(.*?)" class="video-title tit">', response.text)
        playinfos = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)
        if not titles or not playinfos:
            self.logger.warning('title/playinfo not found on %s; page layout may have changed', url)
            return
        title = titles[0]
        json_data = json.loads(playinfos[0])

        # First DASH rendition of each stream (highest-priority entry).
        audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
        video_url = json_data['data']['dash']['video'][0]['baseUrl']

        audio_content = requests.get(url=audio_url, headers=headers).content
        video_content = requests.get(url=video_url, headers=headers).content

        # NOTE(review): title is used verbatim in the filename — characters
        # illegal in filenames (e.g. '/', '?') would make open() fail.
        with open('视频\\' + title + '.mp3', mode='wb') as f:
            f.write(audio_content)
        with open('视频\\' + title + '.mp4', mode='wb') as f:
            f.write(video_content)

        headers_1 = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
        }
        # FIX: this URL was already fetched by this callback, so Scrapy's
        # duplicate filter would silently drop the request and
        # parse_barrage would never run — dont_filter=True lets it through.
        yield scrapy.Request(url, callback=self.parse_barrage,
                             meta={"meta_1": title}, headers=headers_1,
                             dont_filter=True)

    def parse_barrage(self, response):
        """Fetch the danmaku list for a video via the ibilibili.com mirror
        and append each comment as one line to ``弹幕\\<title>弹幕.txt``.
        """
        title = response.meta['meta_1']
        # FIX: public Response.url instead of private ``_url``; also avoid
        # reusing the name ``response`` for the urllib handle below.
        video_url = response.url
        # Rewrite www.bilibili.com/... -> www.ibilibili.com/... — a mirror
        # site that exposes a danmaku download link.
        base_url = video_url.split('https://www.')[1]
        url = 'https://www.i' + base_url
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
        }

        # FIX: close the urllib connection (original leaked the socket).
        mirror_request = urllib.request.Request(url=url, headers=headers)
        with urllib.request.urlopen(mirror_request) as mirror_response:
            content = mirror_response.read().decode('utf-8')
        tree = etree.HTML(content)
        # Third <a> in the button group links to the raw danmaku XML.
        danmuurl = tree.xpath('//div[@class="btn-group"]/a[3]/@href')
        if not danmuurl:
            # ROBUSTNESS: log and skip instead of IndexError on a missing link.
            self.logger.warning('danmaku link not found on %s', url)
            return

        response2 = requests.get(url=danmuurl[0], headers=headers)
        response2.encoding = 'utf-8'
        content_list = re.findall('<d p=".*?">(.*?)</d>', response2.text)

        # PERF: open the output file once instead of once per comment;
        # output bytes are identical to the per-line reopen.
        with open('弹幕\\' + title + '弹幕.txt', mode='a', encoding='utf-8') as f:
            for comment in content_list:
                f.write(comment)
                f.write('\n')
|
Loading…
Reference in new issue