@@ -50,6 +50,7 @@ class JobboleSpider(scrapy.Spider):
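             # Copy each cookie into a plain name -> value dict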
             cookie_dict[cookie['name']] = cookie['value']
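         # Request every start URL with the cookies attached; dont_filter=True skips the duplicate filter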
         for url in self.start_urls:
             yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict)
     def parse(self, response):
         # 1. Grab the article URLs from the news list page and hand them to Scrapy for download, then call the matching parse callback
         # Extract the article links; extract_first() returns the first match
@@ -57,7 +58,8 @@ class JobboleSpider(scrapy.Spider):
         for post_node in post_nodes:
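             # The extracted img src is protocol-relative ("//..."), so prepend the scheme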
             image_url = "https:" + post_node.css('.entry_summary a img::attr(src)').extract_first("")
             post_url = post_node.css('h2 a::attr(href)').extract_first("")
-            yield Request(url=parse.urljoin(response.url, post_url), meta={'front_image_url': image_url}, callback=self.parse_detail, dont_filter=True)
+            yield Request(url=parse.urljoin(response.url, post_url), meta={'front_image_url': image_url},
+                          callback=self.parse_detail, dont_filter=True)
         # 2. Grab the next-page URL and hand it to Scrapy; the downloaded page is fed back into parse to keep crawling
         next_url = response.css('div.pager a:last-child::attr(href)').extract_first("")
         yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
@@ -74,9 +76,10 @@ class JobboleSpider(scrapy.Spider):
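         # Record the page URL and the cover-image URL carried in the request meta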
         item_loader.add_value('url', response.url)
         item_loader.add_value('front_image_url', response.meta.get('front_image_url', ''))
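         # The digg/bookmark/comment counts come from a separate Ajax endpoint, so fetch it and finish the item in parse_nums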
         yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
-                      meta={'article_item': item_loader, 'url':response.url}, callback=self.parse_nums)
+                      meta={'article_item': item_loader, 'url': response.url}, callback=self.parse_nums)
     def parse_nums(self, response):
         # Extract the digg (like), bookmark, and comment counts
         j_data = json.loads(response.text)
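         # Retrieve the partially filled ItemLoader passed along in meta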
         item_loader = response.meta.get('article_item', "")
         item_loader.add_value('praise_nums', j_data['DiggCount'])