forked from p8e7mthal/spider
Scrapy-redis
commit
ea79183c3c
@ -0,0 +1,3 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
@ -0,0 +1,11 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="TestRunnerService">
|
||||
<option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
|
||||
</component>
|
||||
</module>
|
||||
@ -0,0 +1,19 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="dataSourceStorageLocal" created-in="PY-211.7142.13">
|
||||
<data-source name="@localhost" uuid="cdb7b751-ca1b-41ce-b04e-769ad450cfd5">
|
||||
<database-info product="MySQL" version="8.0.28" jdbc-version="4.2" driver-name="MySQL Connector/J" driver-version="mysql-connector-java-8.0.25 (Revision: 08be9e9b4cba6aa115f9b27b215887af40b159e0)" dbms="MYSQL" exact-version="8.0.28" exact-driver-version="8.0">
|
||||
<extra-name-characters>#@</extra-name-characters>
|
||||
<identifier-quote-string>`</identifier-quote-string>
|
||||
</database-info>
|
||||
<case-sensitivity plain-identifiers="lower" quoted-identifiers="lower" />
|
||||
<secret-storage>master_key</secret-storage>
|
||||
<user-name>root</user-name>
|
||||
<schema-mapping>
|
||||
<introspection-scope>
|
||||
<node kind="schema" qname="@" />
|
||||
</introspection-scope>
|
||||
</schema-mapping>
|
||||
</data-source>
|
||||
</component>
|
||||
</project>
|
||||
@ -0,0 +1,12 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="DataSourceManagerImpl" format="xml" multifile-model="true">
|
||||
<data-source source="LOCAL" name="@localhost" uuid="cdb7b751-ca1b-41ce-b04e-769ad450cfd5">
|
||||
<driver-ref>mysql.8</driver-ref>
|
||||
<synchronize>true</synchronize>
|
||||
<jdbc-driver>com.mysql.cj.jdbc.Driver</jdbc-driver>
|
||||
<jdbc-url>jdbc:mysql://localhost:3306</jdbc-url>
|
||||
<working-dir>$ProjectFileDir$</working-dir>
|
||||
</data-source>
|
||||
</component>
|
||||
</project>
|
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,2 @@
|
||||
#n:information_schema
|
||||
!<md> [null, 0, null, null, -2147483648, -2147483648]
|
||||
@ -0,0 +1,2 @@
|
||||
#n:mysql
|
||||
!<md> [null, 0, null, null, -2147483648, -2147483648]
|
||||
@ -0,0 +1,2 @@
|
||||
#n:performance_schema
|
||||
!<md> [null, 0, null, null, -2147483648, -2147483648]
|
||||
@ -0,0 +1,12 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
||||
<option name="ignoredIdentifiers">
|
||||
<list>
|
||||
<option value="MySQLdb" />
|
||||
</list>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
</profile>
|
||||
</component>
|
||||
@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
||||
@ -0,0 +1,4 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
|
||||
</project>
|
||||
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/Test.iml" filepath="$PROJECT_DIR$/.idea/Test.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
||||
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,26 @@
|
||||
# Define here the models for your scraped items
|
||||
#
|
||||
# See documentation in:
|
||||
# https://docs.scrapy.org/en/latest/topics/items.html
|
||||
|
||||
# import scrapy
|
||||
#
|
||||
#
|
||||
# class TestItem(scrapy.Item):
|
||||
# # define the fields for your item here like:
|
||||
# # name = scrapy.Field()
|
||||
# pass
|
||||
|
||||
from scrapy import Item, Field
|
||||
|
||||
|
||||
class BossjobItem(Item):
    """Item holding one Boss Zhipin (zhipin.com) job listing."""

    # Target collection/table name read by downstream storage pipelines.
    collection = 'products'

    job_com = Field()      # company name
    job_name = Field()     # job title
    salary = Field()       # salary range text
    job_limit = Field()    # experience / education requirements
    job_benefit = Field()  # benefits text (not populated by the spider yet)
    job_ab = Field()       # required abilities / tags (not populated yet)
|
||||
@ -0,0 +1,4 @@
|
||||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# Please refer to the documentation for information on how to create and manage
|
||||
# your spiders.
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,85 @@
|
||||
import scrapy
|
||||
from scrapy import Request, Spider
|
||||
from Test.items import BossjobItem
|
||||
from scrapy_redis.spiders import RedisSpider
|
||||
# class BosstestSpider(scrapy.Spider):
|
||||
class BosstestSpider(RedisSpider):
    """Distributed Boss Zhipin job-listing spider driven by scrapy-redis.

    Seed URLs are pushed to the Redis list named by ``redis_key``
    (e.g. ``lpush bole_urls <search-url>``) instead of ``start_urls``.
    Each result page yields one ``BossjobItem`` per listing card and
    follows the "next page" link recursively.
    """

    name = 'bosstest'
    allowed_domains = ['www.zhipin.com']
    # scrapy-redis pops start requests from this Redis key.
    redis_key = "bole_urls"

    def parse(self, response):
        """Extract job listings from a search-result page and paginate.

        Yields ``BossjobItem`` instances for every listing on the page,
        plus follow-up ``Request`` objects for pagination.
        """
        # Parallel lists, one entry per listing card on the page.
        titles = response.xpath('//span[@class="job-name"]/a/text()').extract()
        salaries = response.xpath('//div[@class="job-limit clearfix"]/span[@class="red"]/text()').extract()
        limits = response.xpath('//div[@class="job-limit clearfix"]/p/text()').extract()
        companies = response.xpath('//div[@class="company-text"]/h3/a/text()').extract()

        # Follow the "next page" link.  response.urljoin resolves the
        # leading-'/' href correctly; naive concatenation with
        # 'http://www.zhipin.com/' produced a double-slash URL.
        for href in response.xpath("//div[@class='page']/a[@class='next']/@href").extract():
            yield Request(url=response.urljoin(href), callback=self.parse, dont_filter=True)

        # zip() stops at the shortest list, so a card missing one field no
        # longer raises IndexError as the index-based loop did.
        for title, salary, limit, company in zip(titles, salaries, limits, companies):
            item = BossjobItem()
            item['job_com'] = company
            item['job_name'] = title
            item['salary'] = salary
            item['job_limit'] = limit
            # job_benefit / job_ab fields intentionally left unset: the
            # original extraction for them was disabled (commented out).
            yield item
||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,11 @@
|
||||
# Automatically created by: scrapy startproject
|
||||
#
|
||||
# For more information about the [deploy] section see:
|
||||
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||
|
||||
[settings]
|
||||
default = Test.settings
|
||||
|
||||
[deploy]
|
||||
#url = http://localhost:6800/
|
||||
project = Test
|
||||
Loading…
Reference in new issue