# -*- coding: utf-8 -*-
"""Scrapy-Redis spider that reads job postings from the careers.tencent.com JSON API."""
import json

import scrapy
from scrapy_redis.spiders import RedisSpider

from tencent.items import TencentItem


class HrSpider(RedisSpider):
    """Crawl the posting list endpoint, then each posting's detail endpoint.

    Flow: ``parse`` fans out to the list pages, ``parse_one`` reads every
    posting on a list page and requests its detail record, and ``parse_two``
    completes the item with the detail fields and yields it.
    """

    name = 'hr'
    allowed_domains = ['careers.tencent.com']
    # List endpoint; ``{}`` is filled with the 1-based page index.
    one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1592484674932&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
    # Detail endpoint; ``{}`` is filled with the posting id.
    two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1592484862642&postId={}&language=zh-cn'
    # NOTE(review): a RedisSpider normally takes its start URLs from
    # ``redis_key``; ``start_urls`` is kept as in the original - confirm
    # whether it is still needed.
    start_urls = [one_url.format(1)]
    redis_key = "hr"

    # Last list page requested by ``parse`` (pages 1..last_page inclusive).
    last_page = 14

    def parse(self, response):
        """Yield one request per list page, from page 1 to ``last_page``.

        The incoming ``response`` is only used as a trigger; its body is
        not read.
        """
        for page in range(1, self.last_page + 1):
            yield scrapy.Request(
                url=self.one_url.format(page),
                callback=self.parse_one,
            )

    def parse_one(self, response):
        """Read a list page and yield a detail request for every posting.

        Each request carries a partially filled ``TencentItem`` in
        ``meta['item']`` for ``parse_two`` to complete.
        """
        data = self._load_data(response)
        if data is None:
            return
        # ``Posts`` may be missing or null; treat both as an empty page
        # instead of failing on iteration.
        for job in data.get('Posts') or []:
            item = TencentItem()
            item['zh_name'] = job['RecruitPostName']  # position title
            item['zh_type'] = job['CategoryName']  # position category
            item['zh_city'] = job['LocationName']
            post_id = job['PostId']
            # Build the detail-page URL for this posting.
            detail_url = self.two_url.format(post_id)
            yield scrapy.Request(
                url=detail_url,
                meta={'item': item},
                callback=self.parse_two,
            )

    def parse_two(self, response):
        """Fill the item with the detail fields and yield it.

        If no item was passed through ``meta``, a fresh ``TencentItem`` is
        used so the detail fields are still emitted.
        """
        data = self._load_data(response)
        if data is None:
            return
        item = response.meta.get('item')
        if item is None:
            item = TencentItem()
        item['zh_yaoqiu'] = data['Requirement']
        item['zh_duty'] = data['Responsibility']
        # Overwrites the city taken from the list page with the detail value.
        item['zh_city'] = data['LocationName']
        yield item

    def _load_data(self, response):
        """Return the ``Data`` object of a JSON response, or None if unusable.

        A warning is logged when the body is not JSON or has no ``Data``
        object, so the calling callback can skip the response.
        """
        try:
            payload = json.loads(response.text)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            self.logger.warning("Non-JSON response from %s", response.url)
            return None
        data = payload.get('Data') if isinstance(payload, dict) else None
        if not isinstance(data, dict):
            self.logger.warning("No 'Data' object in response from %s", response.url)
            return None
        return data