from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy_redis.spiders import RedisCrawlSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy import Request

from spider.items import VideoItem, BiliItem


class BiliSpider(RedisCrawlSpider):
    """Crawl bilibili.com and emit items for videos and bangumi media pages.

    Three kinds of pages are handled:

    * ``/bangumi/media/md...`` pages are scraped into a ``BiliItem``.
    * other ``/bangumi/...`` pages are only used to discover the link to
      their media page, which is then scraped into a ``BiliItem``.
    * ``/video/BV...`` pages are scraped into a ``VideoItem``.
    """

    name = 'Bili'
    redis_key = 'Bili'

    # Order matters: when several rules match the same link, CrawlSpider uses
    # only the first one. The media-page rule therefore has to precede the
    # generic bangumi rule, whose pattern also matches media-page URLs.
    rules = [
        Rule(LinkExtractor(allow=r"https://www.bilibili.com/bangumi/media/md.*?"),
             callback='parse_BiliItem', follow=True),
        Rule(LinkExtractor(allow=r"https://www.bilibili.com/bangumi/.*?"),
             callback='parse_Item', follow=True),
        Rule(LinkExtractor(allow=r"https://www.bilibili.com/video/BV.*?"),
             callback='parse_Videoitem', follow=True),
    ]

    # XPath for every BiliItem field other than the title.
    _BANGUMI_DETAIL_XPATHS = {
        'view_counts': '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[1]/em/text()',
        'attention': '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[2]/em/text()',
        'barrage': '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[3]/em/text()',
        'evaluate': '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[2]/div/div[1]/text()',
    }

    @staticmethod
    def _first(response, query, default=''):
        """Return the first result of XPath *query* on *response*.

        *default* is returned when the query matches nothing, so callers never
        have to index into a possibly empty result list.
        """
        return response.xpath(query).extract_first(default=default)

    def parse_Videoitem(self, response, **kwargs):
        """Yield a ``VideoItem`` scraped from a video page.

        Nothing is yielded when the page has no title node; such a page does
        not have the expected layout. Other missing fields are stored as ''.
        """
        title = self._first(response, '//*[@id="viewbox_report"]/h1/@title', default=None)
        if title is None:
            self.logger.warning("No video title found on %s; skipping page", response.url)
            return

        video_item = VideoItem()
        video_item['title'] = title
        # The counters live in title attributes prefixed with a Chinese label;
        # strip the label so only the number remains.
        video_item['view_counts'] = self._first(
            response, '//*[@id="viewbox_report"]/div/div/span[1]/@title'
        ).replace("总播放数", "")
        video_item['barrage'] = self._first(
            response, '//*[@id="viewbox_report"]/div/div/span[2]/@title'
        ).replace("历史累计弹幕数", "")
        # Remove real newline characters and surrounding whitespace from the
        # uploader name.
        video_item['up'] = self._first(
            response, '//*[@id="v_upinfo"]/div[2]/div[1]/a[1]/text()'
        ).replace("\n", "").strip()
        yield video_item

    def parse_BiliItem(self, response, **kwargs):
        """Yield a ``BiliItem`` scraped from a bangumi media page.

        Nothing is yielded when the page has no title node. Other missing
        fields are stored as ''.
        """
        title = self._first(
            response,
            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()',
            default=None,
        )
        if title is None:
            self.logger.warning("No bangumi title found on %s; skipping page", response.url)
            return

        bangumi_item = BiliItem()
        bangumi_item['title'] = title
        for field, query in self._BANGUMI_DETAIL_XPATHS.items():
            bangumi_item[field] = self._first(response, query)
        yield bangumi_item

    def parse_Item(self, response, **kwargs):
        """Follow a bangumi page's link to its media page.

        The media page is requested with ``parse_BiliItem`` as its callback.
        Nothing is yielded when the page carries no such link.
        """
        href = self._first(response, '//*[@id="media_module"]/div/a/@href', default=None)
        if not href:
            self.logger.debug("No media link found on %s", response.url)
            return
        # urljoin resolves protocol-relative ("//host/path"), relative and
        # absolute hrefs alike.
        yield Request(url=response.urljoin(href), callback=self.parse_BiliItem)