# -*- coding: utf-8 -*-
"""Spider that collects job postings from the careers.tencent.com JSON API."""
import json

import scrapy
from scrapy_redis.spiders import RedisSpider

from tencent.items import TencentItem


class HrSpider(RedisSpider):
    """Crawl job listing pages, then each posting's detail endpoint.

    Flow: ``parse`` fans out one request per listing page, ``parse_one``
    builds a partially filled ``TencentItem`` for every posting on a page and
    requests its detail record, and ``parse_two`` completes and yields the
    item.
    """

    name = 'hr'
    allowed_domains = ['careers.tencent.com']

    # Listing endpoint; the placeholder is the 1-based page index.
    one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1592484674932&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'

    # Detail endpoint; the placeholder is the posting id.
    two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1592484862642&postId={}&language=zh-cn'

    start_urls = [one_url.format(1)]

    # NOTE(review): presumably the Redis key scrapy-redis reads start URLs
    # from -- confirm against the scrapy-redis configuration in use.
    redis_key = "hr"

    # Last listing page requested by parse(); pages 1..last_page are crawled.
    last_page = 14

    def parse(self, response):
        """Yield one listing request per page from 1 to ``last_page``.

        The incoming ``response`` is not inspected; it only triggers the
        fan-out over the listing pages.
        """
        for page in range(1, self.last_page + 1):
            yield scrapy.Request(
                url=self.one_url.format(page),
                callback=self.parse_one,
            )

    def parse_one(self, response):
        """Parse a listing page and request the detail record of each posting.

        For every posting a ``TencentItem`` is created with the job title,
        category and city, and handed to ``parse_two`` through the request's
        ``meta`` under the ``'item'`` key.
        """
        data = json.loads(response.text)

        # A page past the end of the result set may carry a null/missing
        # 'Posts' (or 'Data'); treat that as an empty page instead of
        # crashing on iteration.
        posts = (data.get('Data') or {}).get('Posts') or []
        if not posts:
            self.logger.info("No postings found in %s", response.url)
            return

        for job in posts:
            item = TencentItem()
            item['zh_name'] = job['RecruitPostName']  # job title
            item['zh_type'] = job['CategoryName']  # job category
            item['zh_city'] = job['LocationName']

            # Build the detail-page URL from the posting id.
            detail_url = self.two_url.format(job['PostId'])

            yield scrapy.Request(
                url=detail_url,
                meta={'item': item},
                callback=self.parse_two,
            )

    def parse_two(self, response):
        """Complete the item carried in ``meta`` with detail data and yield it.

        Adds the requirement and responsibility texts and overwrites the city
        with the value from the detail record.

        Raises:
            KeyError: if the request carried no ``'item'`` in ``meta`` or the
                detail payload lacks an expected key.
        """
        # Index instead of .get(): a missing item should fail loudly here
        # rather than as an opaque TypeError on the first assignment below.
        item = response.meta['item']

        detail = json.loads(response.text)['Data']

        item['zh_yaoqiu'] = detail['Requirement']
        item['zh_duty'] = detail['Responsibility']
        item['zh_city'] = detail['LocationName']

        yield item