You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
55 lines
2.5 KiB
55 lines
2.5 KiB
from scrapy.crawler import CrawlerProcess
|
|
from scrapy.utils.project import get_project_settings
|
|
from scrapy_redis.spiders import RedisCrawlSpider
|
|
from scrapy.linkextractors import LinkExtractor
|
|
from scrapy.spiders import Rule
|
|
from scrapy import Request
|
|
|
|
from spider.items import VideoItem, BiliItem
|
|
|
|
|
|
class BiliSpider(RedisCrawlSpider):
    """Redis-fed crawl spider that scrapes bilibili video and bangumi pages.

    Start URLs are read from the Redis key ``redis_key``. Links found on
    crawled pages are dispatched to one of three callbacks by ``rules``:

    * ``parse_BiliItem``  - ``/bangumi/media/md...`` pages, yields a ``BiliItem``
    * ``parse_Item``      - any other ``/bangumi/...`` page, follows its link
      to the media page
    * ``parse_Videoitem`` - ``/video/BV...`` pages, yields a ``VideoItem``

    A page on which a required node is missing is logged and skipped; no
    partially filled item is ever yielded.
    """

    name = 'Bili'
    redis_key = 'Bili'

    # Order matters: CrawlSpider uses the first rule whose extractor matches
    # a link. The specific ``bangumi/media/md`` pattern therefore has to come
    # before the generic ``bangumi/`` pattern, otherwise it never fires.
    rules = [
        Rule(LinkExtractor(allow=r"https://www.bilibili.com/bangumi/media/md.*?"), callback='parse_BiliItem',
             follow=True),
        Rule(LinkExtractor(allow=r"https://www.bilibili.com/bangumi/.*?"), callback='parse_Item', follow=True),
        Rule(LinkExtractor(allow=r"https://www.bilibili.com/video/BV.*?"), callback='parse_Videoitem', follow=True),
    ]

    # Item field -> XPath of the node holding its raw value (video page).
    _VIDEO_XPATHS = {
        'title': '//*[@id="viewbox_report"]/h1/@title',
        'view_counts': '//*[@id="viewbox_report"]/div/div/span[1]/@title',
        'barrage': '//*[@id="viewbox_report"]/div/div/span[2]/@title',
        'up': '//*[@id="v_upinfo"]/div[2]/div[1]/a[1]/text()',
    }

    # Item field -> XPath of the node holding its value (bangumi media page).
    _BANGUMI_XPATHS = {
        'title': '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()',
        'view_counts': '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[1]/em/text()',
        'attention': '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[2]/em/text()',
        'barrage': '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[3]/em/text()',
        'evaluate': '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[2]/div/div[1]/text()',
    }

    def _first_match(self, response, xpath, label):
        """Return the first result of ``xpath`` on ``response``, or ``None``.

        A miss is logged with ``label`` so the failing selector can be
        identified; the caller decides how to react to ``None``.
        """
        value = response.xpath(xpath).get()
        if value is None:
            self.logger.warning("%s: no match for %r, skipping page", response.url, label)
        return value

    def _extract_fields(self, response, xpaths):
        """Evaluate every XPath in ``xpaths`` (field -> xpath) on ``response``.

        Returns a dict field -> first matched string, or ``None`` as soon as
        one XPath matches nothing.
        """
        values = {}
        for field, xpath in xpaths.items():
            value = self._first_match(response, xpath, field)
            if value is None:
                return None
            values[field] = value
        return values

    def parse_Videoitem(self, response, **kwargs):
        """Yield one ``VideoItem`` built from a ``/video/BV...`` page.

        The label prefixes in the ``title`` attributes of the counters are
        removed so only the value remains. Nothing is yielded when any of the
        expected nodes is missing.
        """
        fields = self._extract_fields(response, self._VIDEO_XPATHS)
        if fields is None:
            return

        video_item = VideoItem()
        video_item['title'] = fields['title']
        video_item['view_counts'] = fields['view_counts'].replace("总播放数", "")
        video_item['barrage'] = fields['barrage'].replace("历史累计弹幕数", "")
        # Remove real newline characters (the previous "\\n" pattern only
        # matched a literal backslash followed by "n"), then trim padding.
        video_item['up'] = fields['up'].replace("\n", "").strip()
        yield video_item

    def parse_BiliItem(self, response, **kwargs):
        """Yield one ``BiliItem`` built from a ``/bangumi/media/md...`` page.

        Field values are stored exactly as matched. Nothing is yielded when
        any of the expected nodes is missing.
        """
        fields = self._extract_fields(response, self._BANGUMI_XPATHS)
        if fields is None:
            return

        bangumi_item = BiliItem()
        for field, value in fields.items():
            bangumi_item[field] = value
        yield bangumi_item

    def parse_Item(self, response, **kwargs):
        """Follow the link in ``#media_module`` and parse it as a media page.

        Yields a ``Request`` handled by ``parse_BiliItem``; yields nothing if
        the page has no such link.
        """
        href = self._first_match(response, '//*[@id="media_module"]/div/a/@href', 'media link')
        if href is None:
            return
        # urljoin resolves protocol-relative ("//host/path"), relative and
        # absolute hrefs alike, instead of assuming the "//host/path" form.
        yield Request(url=response.urljoin(href), callback=self.parse_BiliItem)
|
|
|
|
|
|
|