You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

55 lines
2.5 KiB

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy_redis.spiders import RedisCrawlSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy import Request
from spider.items import VideoItem, BiliItem
class BiliSpider(RedisCrawlSpider):
    """Redis-fed crawl spider for Bilibili video and bangumi pages.

    Start URLs are popped from the Redis list named by ``redis_key``;
    the crawl rules then follow video (``/video/BV...``) and bangumi
    links, yielding ``VideoItem`` / ``BiliItem`` instances.
    """

    name = 'Bili'
    redis_key = 'Bili'

    # NOTE: CrawlSpider applies only the FIRST rule whose pattern matches a
    # link, so the most specific pattern (bangumi/media/md...) must come
    # before the generic bangumi pattern — otherwise parse_BiliItem could
    # never be reached through the rules.  Dots in the hostname are escaped
    # so "www.bilibili.com" matches literally (LinkExtractor uses re.search,
    # so a trailing ".*?" was a no-op and is dropped).
    rules = (
        Rule(LinkExtractor(allow=r"https://www\.bilibili\.com/bangumi/media/md"),
             callback='parse_BiliItem', follow=True),
        Rule(LinkExtractor(allow=r"https://www\.bilibili\.com/bangumi/"),
             callback='parse_Item', follow=True),
        Rule(LinkExtractor(allow=r"https://www\.bilibili\.com/video/BV"),
             callback='parse_Videoitem', follow=True),
    )

    def parse_Videoitem(self, response, **kwargs):
        """Extract title, play count, danmaku count and uploader from a video page.

        Yields:
            VideoItem: with ``title``, ``view_counts``, ``barrage`` and ``up``.
        """
        video_item = VideoItem()
        video_item['title'] = response.xpath(
            '//*[@id="viewbox_report"]/h1/@title').extract()[0]
        # The @title attribute reads like "总播放数12345" — strip the label prefix.
        video_item['view_counts'] = str(
            response.xpath('//*[@id="viewbox_report"]/div/div/span[1]/@title').extract()[0]
        ).replace("总播放数", "")
        video_item['barrage'] = str(
            response.xpath('//*[@id="viewbox_report"]/div/div/span[2]/@title').extract()[0]
        ).replace("历史累计弹幕数", "")
        # Fix: the original replaced the literal two-character sequence "\\n"
        # (backslash + n), which never occurs in xpath-extracted text; real
        # newline characters do, so replace "\n" instead.
        video_item['up'] = str(
            response.xpath('//*[@id="v_upinfo"]/div[2]/div[1]/a[1]/text()').extract()[0]
        ).replace("\n", "").strip()
        yield video_item

    def parse_BiliItem(self, response, **kwargs):
        """Extract stats from a bangumi media detail page.

        Yields:
            BiliItem: with ``title``, ``view_counts``, ``attention``,
            ``barrage`` and ``evaluate``.
        """
        bangumi_item = BiliItem()
        bangumi_item['title'] = response.xpath(
            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()').extract()[0]
        bangumi_item['view_counts'] = response.xpath(
            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[1]/em/text()').extract()[0]
        bangumi_item['attention'] = response.xpath(
            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[2]/em/text()').extract()[0]
        bangumi_item['barrage'] = response.xpath(
            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[3]/em/text()').extract()[0]
        bangumi_item['evaluate'] = response.xpath(
            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[2]/div/div[1]/text()').extract()[0]
        yield bangumi_item

    def parse_Item(self, response, **kwargs):
        """Follow a bangumi episode page to its media detail page.

        The extracted ``@href`` is protocol-relative ("//www.bilibili.com/...")
        — prepend "https:" before requesting it.
        """
        url = 'https:' + response.xpath('//*[@id="media_module"]/div/a/@href').extract()[0]
        yield Request(url=url, callback=self.parse_BiliItem)