# Provenance: reconstructed from git patch
# ed2991593a030a2f3705388006ec6cd711f46400 (Sat, 22 Apr 2023 17:46:14 +0800),
# subject "分布式爬虫代码" ("distributed crawler code"), which creates bilibili.py.
"""Distributed Bilibili spider built on scrapy-redis.

For every video page URL read from the Redis key ``bili23`` the spider:

1. extracts the page title and the embedded play-info JSON,
2. downloads the first DASH audio and video streams into ``视频/``,
3. re-requests the page so that ``parse_barrage`` can fetch the danmaku
   (bullet comment) XML through the ``www.ibilibili.com`` mirror and append
   the comments to a text file under ``弹幕/``.
"""
import json
import os
import re
import urllib.request

import requests
import scrapy
from lxml import etree
from scrapy_redis.spiders import RedisSpider

# NOTE: the original file carried a commented-out
# ``from bilibili_spider.items import BilibiliSpiderItem`` and a commented-out
# item pipeline / ffmpeg merge step. They were never executed and are left out
# here; audio and video are still saved as two separate files.

USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
)

# Referer sent with the media downloads (value kept from the original code).
MEDIA_REFERER = (
    'https://www.bilibili.com/video/BV1jm4y167fE/'
    '?vd_source=59fecc30e7f4791084968599ca1f8b82'
)

VIDEO_DIR = '视频'
BARRAGE_DIR = '弹幕'

# Seconds before a blocking HTTP call gives up instead of hanging the crawler.
REQUEST_TIMEOUT = 30
# Bytes written per chunk while streaming media to disk.
CHUNK_SIZE = 64 * 1024

# NOTE(review): in the text this file was recovered from, everything that
# looked like markup had been stripped out of the three regex literals (they
# read '\n\n', '' and '(.*?)'), which makes ``parse`` fail on every page.
# The patterns below are reconstructions inferred from how the captures are
# used; confirm them against the live page / XML before relying on them.
_TITLE_RE = re.compile(r'<h1[^>]*\btitle="(.*?)"')
_PLAYINFO_RE = re.compile(r'<script>window\.__playinfo__=(.*?)</script>')
_DANMAKU_RE = re.compile(r'<d p=".*?">(.*?)</d>')

# Characters that are path separators, reserved on Windows, or control codes.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _safe_filename(title):
    """Return *title* with characters that are unsafe in a file name replaced."""
    cleaned = _UNSAFE_FILENAME_CHARS.sub('_', title).strip(' .')
    return cleaned or 'untitled'


def _mirror_url(page_url):
    """Return the ``https://www.i…`` mirror URL for *page_url*.

    Returns ``None`` when *page_url* does not contain ``https://www.``, where
    the original code raised ``IndexError``.
    """
    _, marker, rest = page_url.partition('https://www.')
    if not marker:
        return None
    return 'https://www.i' + rest


def _download(url, dest, headers):
    """Stream *url* to the file *dest*; raise on HTTP errors or timeouts."""
    with requests.get(url=url, headers=headers, stream=True,
                      timeout=REQUEST_TIMEOUT) as resp:
        # Fail loudly rather than saving an error page as a media file.
        resp.raise_for_status()
        with open(dest, mode='wb') as f:
            for chunk in resp.iter_content(chunk_size=CHUNK_SIZE):
                f.write(chunk)


class BilibiliSpider(RedisSpider):
    """Spider that saves the audio, video and danmaku of Bilibili video pages.

    Start URLs are read from the Redis key named by ``redis_key``.
    """

    name = "bilibili"
    redis_key = 'bili23'

    def __init__(self, *args, **kwargs):
        """Initialise the spider.

        Args:
            domain: optional keyword argument, a comma-separated list of
                allowed domains. Empty entries are ignored.
        """
        domain = kwargs.pop('domain', '')
        self.allowed_domains = [d for d in domain.split(',') if d]
        super(BilibiliSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        """Download the audio/video streams of a video page.

        Yields one follow-up request for the same URL, handled by
        ``parse_barrage``, carrying the page title in ``meta['meta_1']``.
        Pages without a recognisable title or play-info are logged and skipped.
        """
        url = response.url
        media_headers = {'referer': MEDIA_REFERER, 'user-agent': USER_AGENT}

        title_match = _TITLE_RE.search(response.text)
        playinfo_match = _PLAYINFO_RE.search(response.text)
        if title_match is None or playinfo_match is None:
            self.logger.warning('No title/play-info found on %s; skipping', url)
            return
        title = title_match.group(1)

        try:
            dash = json.loads(playinfo_match.group(1))['data']['dash']
            audio_url = dash['audio'][0]['baseUrl']
            video_url = dash['video'][0]['baseUrl']
        except (ValueError, KeyError, IndexError, TypeError) as exc:
            self.logger.warning('Unusable play-info on %s (%r); skipping', url, exc)
            return

        os.makedirs(VIDEO_DIR, exist_ok=True)
        stem = os.path.join(VIDEO_DIR, _safe_filename(title))
        _download(audio_url, stem + '.mp3', media_headers)
        _download(video_url, stem + '.mp4', media_headers)

        # Same URL as the request that produced this response, so bypass the
        # duplicate filter to make sure the danmaku step always runs.
        yield scrapy.Request(
            url,
            callback=self.parse_barrage,
            meta={"meta_1": title},
            headers={'user-agent': USER_AGENT},
            dont_filter=True,
        )

    def parse_barrage(self, response):
        """Fetch the danmaku XML for a video and append it to a text file.

        Only ``response.url`` and ``response.meta['meta_1']`` (the title) are
        used. The download link is taken from the third anchor of the
        ``btn-group`` div on the mirror page; each comment is written on its
        own line to ``弹幕/<title>弹幕.txt``.
        """
        title = response.meta['meta_1']
        headers = {'user-agent': USER_AGENT}

        mirror = _mirror_url(response.url)
        if mirror is None:
            self.logger.warning('Cannot derive mirror URL from %s', response.url)
            return

        request = urllib.request.Request(url=mirror, headers=headers)
        with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as page:
            content = page.read().decode('utf-8')

        tree = etree.HTML(content)
        # etree.HTML returns None for an empty document.
        hrefs = tree.xpath('//div[@class="btn-group"]/a[3]/@href') if tree is not None else []
        if not hrefs:
            self.logger.warning('No danmaku link found on %s', mirror)
            return

        xml_response = requests.get(url=hrefs[0], headers=headers,
                                    timeout=REQUEST_TIMEOUT)
        xml_response.raise_for_status()
        xml_response.encoding = 'utf-8'
        comments = _DANMAKU_RE.findall(xml_response.text)
        if not comments:
            return

        os.makedirs(BARRAGE_DIR, exist_ok=True)
        path = os.path.join(BARRAGE_DIR, _safe_filename(title) + '弹幕.txt')
        # Open once and append every comment, instead of reopening per line.
        with open(path, mode='a', encoding='utf-8') as f:
            f.writelines(comment + '\n' for comment in comments)