You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
86 lines
3.5 KiB
86 lines
3.5 KiB
import json
import os
import re
import urllib.request

import requests
import scrapy
from lxml import etree
from scrapy_redis.spiders import RedisSpider

# from bilibili_spider.items import BilibiliSpiderItem
|
|
|
|
class BilibiliSpider(RedisSpider):
    """Redis-driven spider that, for every Bilibili video URL pushed to the
    ``bili23`` redis key, downloads the audio/video DASH streams into ``视频/``
    and the danmaku (bullet-screen comments) into ``弹幕/``.

    The danmaku download link is scraped from the www.ibilibili.com mirror of
    the video page.
    """

    name = "bilibili"
    redis_key = 'bili23'

    def __init__(self, *args, **kwargs):
        # Allowed domains are supplied at crawl time,
        # e.g. ``-a domain=bilibili.com,b23.tv``; empty entries are dropped.
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(BilibiliSpider, self).__init__(*args, **kwargs)

    @staticmethod
    def _safe_filename(title):
        # Video titles may contain characters that are illegal in file names
        # (\ / : * ? " < > |); replace them so open() cannot fail on them.
        return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

    def parse(self, response):
        """Download the audio and video streams of one video page, then
        schedule :meth:`parse_barrage` for the same URL to fetch danmaku."""
        url = response.url  # public accessor, not the private ``_url``
        headers = {
            'referer': 'https://www.bilibili.com/video/BV1jm4y167fE/?vd_source=59fecc30e7f4791084968599ca1f8b82',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
        }

        # Guard against layout changes: bare [0] on findall raises IndexError.
        title_match = re.findall('<h1 title="(.*?)" class="video-title tit">', response.text)
        playinfo_match = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)
        if not title_match or not playinfo_match:
            self.logger.warning('unexpected page layout, skipping %s', url)
            return

        title = self._safe_filename(title_match[0])
        json_data = json.loads(playinfo_match[0])

        # First DASH rendition of each stream type.
        audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
        video_url = json_data['data']['dash']['video'][0]['baseUrl']

        # NOTE(review): blocking requests.get inside a Scrapy callback stalls
        # the reactor; consider yielding scrapy.Request for the media instead.
        audio_content = requests.get(url=audio_url, headers=headers).content
        video_content = requests.get(url=video_url, headers=headers).content

        os.makedirs('视频', exist_ok=True)
        with open(os.path.join('视频', title + '.mp3'), mode='wb') as f:
            f.write(audio_content)
        with open(os.path.join('视频', title + '.mp4'), mode='wb') as f:
            f.write(video_content)

        # COMMAND = f'ffmpeg -i 视频\\{title}.mp4 -i 视频\\{title}.mp3 -c:v copy -c:a aac -strict experimental 视频\\{title}output.mp4'
        # subprocess.run(COMMAND, shell=True)

        headers_1 = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
        }
        # This URL was already crawled by this very callback, so without
        # dont_filter=True the dupefilter silently drops the request and
        # parse_barrage never runs.
        yield scrapy.Request(url, callback=self.parse_barrage,
                             meta={"meta_1": title}, headers=headers_1,
                             dont_filter=True)

    def parse_barrage(self, response):
        """Fetch the danmaku XML via the www.ibilibili.com mirror and append
        each comment line to ``弹幕/<title>弹幕.txt``."""
        title = response.meta['meta_1']
        video_url = response.url
        # www.bilibili.com/... -> www.ibilibili.com/... (mirror that exposes a
        # direct danmaku download link in its button group).
        base_url = video_url.split('https://www.')[1]
        url = 'https://www.i' + base_url
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
        }

        request = urllib.request.Request(url=url, headers=headers)
        # Close the socket deterministically instead of leaking it.
        with urllib.request.urlopen(request) as mirror_response:
            content = mirror_response.read().decode('utf-8')
        tree = etree.HTML(content)
        danmuurl = tree.xpath('//div[@class="btn-group"]/a[3]/@href')
        if not danmuurl:
            self.logger.warning('no danmaku link found on %s', url)
            return

        response2 = requests.get(url=danmuurl[0], headers=headers)
        response2.encoding = 'utf-8'
        content_list = re.findall('<d p=".*?">(.*?)</d>', response2.text)

        # item = BilibiliSpiderItem()
        # item['title'] = title
        # item['barrage'] = content_list
        # yield item

        os.makedirs('弹幕', exist_ok=True)
        # Open the file once and write every comment, instead of reopening
        # it for each line inside the loop.
        with open(os.path.join('弹幕', title + '弹幕.txt'), mode='a', encoding='utf-8') as f:
            f.writelines(line + '\n' for line in content_list)