You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

86 lines
3.5 KiB

import json
import os
import re
import urllib.request

import requests
import scrapy
from lxml import etree
from scrapy_redis.spiders import RedisSpider

# from bilibili_spider.items import BilibiliSpiderItem
class BilibiliSpider(RedisSpider):
    """Distributed spider that downloads a Bilibili video's separate dash
    audio/video streams and its danmaku (bullet comments).

    Start URLs are fed through Redis under ``redis_key``. ``parse`` saves
    the audio stream as ``视频/<title>.mp3`` and the video stream as
    ``视频/<title>.mp4``, then schedules ``parse_barrage``, which scrapes a
    danmaku download link from the ibilibili mirror of the same page and
    appends every comment to ``弹幕/<title>弹幕.txt``.
    """

    name = "bilibili"
    redis_key = 'bili23'

    # One shared desktop UA for every outbound request.
    USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/112.0.0.0 Safari/537.36')

    def __init__(self, *args, **kwargs):
        # Allowed domains can be passed on the command line:
        #   scrapy crawl bilibili -a domain=bilibili.com,ibilibili.com
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super().__init__(*args, **kwargs)

    @staticmethod
    def _safe_title(title):
        # Replace characters that are illegal in Windows file names so the
        # open() calls below cannot fail on titles like 'a/b: c?'.
        return re.sub(r'[\\/:*?"<>|]', '_', title).strip() or 'untitled'

    def parse(self, response):
        """Download the dash audio and video streams of one video page.

        Yields a follow-up Request to ``parse_barrage`` carrying the
        sanitized title in ``meta['meta_1']``.
        """
        url = response.url  # public accessor; response._url is private API
        headers = {
            # Bilibili validates the referer on stream downloads; the page
            # being scraped is the correct referer (the original hard-coded
            # an unrelated video URL).
            'referer': url,
            'user-agent': self.USER_AGENT,
        }
        titles = re.findall('<h1 title="(.*?)" class="video-title tit">', response.text)
        playinfos = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)
        if not titles or not playinfos:
            # Layout change or anti-bot page: skip instead of IndexError.
            self.logger.warning('title/playinfo not found on %s', url)
            return
        title = self._safe_title(titles[0])
        json_data = json.loads(playinfos[0])
        audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
        video_url = json_data['data']['dash']['video'][0]['baseUrl']
        # Timeouts so a stalled CDN connection cannot hang the spider.
        audio_content = requests.get(url=audio_url, headers=headers, timeout=30).content
        video_content = requests.get(url=video_url, headers=headers, timeout=30).content
        os.makedirs('视频', exist_ok=True)  # output dir may not exist yet
        with open(os.path.join('视频', title + '.mp3'), mode='wb') as f:
            f.write(audio_content)
        with open(os.path.join('视频', title + '.mp4'), mode='wb') as f:
            f.write(video_content)
        yield scrapy.Request(
            url,
            callback=self.parse_barrage,
            meta={"meta_1": title},
            headers={'user-agent': self.USER_AGENT},
            # The same URL was already fetched once; without this flag the
            # dupefilter silently drops the request and parse_barrage never
            # runs.
            dont_filter=True,
        )

    def parse_barrage(self, response):
        """Fetch the danmaku list via the ibilibili mirror and append each
        comment as one line to ``弹幕/<title>弹幕.txt``."""
        title = response.meta['meta_1']
        video_url = response.url
        # www.bilibili.com/... -> www.ibilibili.com/...: the mirror page
        # exposes a direct danmaku-XML download link.
        base_url = video_url.split('https://www.')[1]
        url = 'https://www.i' + base_url
        headers = {'user-agent': self.USER_AGENT}
        request = urllib.request.Request(url=url, headers=headers)
        # Context manager closes the socket; also do not shadow the scrapy
        # `response` argument as the original did.
        with urllib.request.urlopen(request) as mirror:
            content = mirror.read().decode('utf-8')
        tree = etree.HTML(content)
        danmu_urls = tree.xpath('//div[@class="btn-group"]/a[3]/@href')
        if not danmu_urls:
            # Mirror layout changed: skip instead of IndexError.
            self.logger.warning('no danmaku link found on %s', url)
            return
        danmu_resp = requests.get(url=danmu_urls[0], headers=headers, timeout=30)
        danmu_resp.encoding = 'utf-8'
        content_list = re.findall('<d p=".*?">(.*?)</d>', danmu_resp.text)
        os.makedirs('弹幕', exist_ok=True)  # output dir may not exist yet
        # Open the file once instead of re-opening it per comment.
        with open(os.path.join('弹幕', title + '弹幕.txt'), mode='a', encoding='utf-8') as f:
            for comment in content_list:
                f.write(comment)
                f.write('\n')