forked from p8e7mthal/spider
Scrapy-redis
commit
ea79183c3c
@ -0,0 +1,3 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
@ -0,0 +1,11 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="TestRunnerService">
|
||||
<option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
|
||||
</component>
|
||||
</module>
|
||||
@ -0,0 +1,19 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="dataSourceStorageLocal" created-in="PY-211.7142.13">
|
||||
<data-source name="@localhost" uuid="cdb7b751-ca1b-41ce-b04e-769ad450cfd5">
|
||||
<database-info product="MySQL" version="8.0.28" jdbc-version="4.2" driver-name="MySQL Connector/J" driver-version="mysql-connector-java-8.0.25 (Revision: 08be9e9b4cba6aa115f9b27b215887af40b159e0)" dbms="MYSQL" exact-version="8.0.28" exact-driver-version="8.0">
|
||||
<extra-name-characters>#@</extra-name-characters>
|
||||
<identifier-quote-string>`</identifier-quote-string>
|
||||
</database-info>
|
||||
<case-sensitivity plain-identifiers="lower" quoted-identifiers="lower" />
|
||||
<secret-storage>master_key</secret-storage>
|
||||
<user-name>root</user-name>
|
||||
<schema-mapping>
|
||||
<introspection-scope>
|
||||
<node kind="schema" qname="@" />
|
||||
</introspection-scope>
|
||||
</schema-mapping>
|
||||
</data-source>
|
||||
</component>
|
||||
</project>
|
||||
@ -0,0 +1,12 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="DataSourceManagerImpl" format="xml" multifile-model="true">
|
||||
<data-source source="LOCAL" name="@localhost" uuid="cdb7b751-ca1b-41ce-b04e-769ad450cfd5">
|
||||
<driver-ref>mysql.8</driver-ref>
|
||||
<synchronize>true</synchronize>
|
||||
<jdbc-driver>com.mysql.cj.jdbc.Driver</jdbc-driver>
|
||||
<jdbc-url>jdbc:mysql://localhost:3306</jdbc-url>
|
||||
<working-dir>$ProjectFileDir$</working-dir>
|
||||
</data-source>
|
||||
</component>
|
||||
</project>
|
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,2 @@
|
||||
#n:information_schema
|
||||
!<md> [null, 0, null, null, -2147483648, -2147483648]
|
||||
@ -0,0 +1,2 @@
|
||||
#n:mysql
|
||||
!<md> [null, 0, null, null, -2147483648, -2147483648]
|
||||
@ -0,0 +1,2 @@
|
||||
#n:performance_schema
|
||||
!<md> [null, 0, null, null, -2147483648, -2147483648]
|
||||
@ -0,0 +1,12 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
||||
<option name="ignoredIdentifiers">
|
||||
<list>
|
||||
<option value="MySQLdb" />
|
||||
</list>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
</profile>
|
||||
</component>
|
||||
@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
||||
@ -0,0 +1,4 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
|
||||
</project>
|
||||
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/Test.iml" filepath="$PROJECT_DIR$/.idea/Test.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
||||
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,26 @@
|
||||
# Define here the models for your scraped items
|
||||
#
|
||||
# See documentation in:
|
||||
# https://docs.scrapy.org/en/latest/topics/items.html
|
||||
|
||||
# import scrapy
|
||||
#
|
||||
#
|
||||
# class TestItem(scrapy.Item):
|
||||
# # define the fields for your item here like:
|
||||
# # name = scrapy.Field()
|
||||
# pass
|
||||
|
||||
from scrapy import Item, Field
|
||||
|
||||
|
||||
class BossjobItem(Item):
    """Item holding one Boss Zhipin (zhipin.com) job listing."""

    # Target collection/table name read by downstream storage pipelines.
    collection = 'products'

    job_com = Field()      # company name
    job_name = Field()     # job title
    salary = Field()       # salary range text
    job_limit = Field()    # experience / education requirements
    job_benefit = Field()  # benefits text (not populated by the spider yet)
    job_ab = Field()       # required abilities / tags (not populated yet)
|
||||
@ -0,0 +1,4 @@
|
||||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# Please refer to the documentation for information on how to create and manage
|
||||
# your spiders.
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,85 @@
|
||||
import scrapy
|
||||
from scrapy import Request, Spider
|
||||
from Test.items import BossjobItem
|
||||
from scrapy_redis.spiders import RedisSpider
|
||||
# class BosstestSpider(scrapy.Spider):
|
||||
class BosstestSpider(RedisSpider):
    """Distributed Boss Zhipin job-listing spider driven by scrapy-redis.

    Seed URLs are pushed to the Redis list named by ``redis_key``
    (e.g. ``lpush bole_urls <search-url>``) instead of ``start_urls``.
    Each result page yields one ``BossjobItem`` per listing card and
    follows the "next page" link recursively.
    """

    name = 'bosstest'
    allowed_domains = ['www.zhipin.com']
    # scrapy-redis pops start requests from this Redis key.
    redis_key = "bole_urls"

    def parse(self, response):
        """Extract job listings from a search-result page and paginate.

        Yields ``BossjobItem`` instances for every listing on the page,
        plus follow-up ``Request`` objects for pagination.
        """
        # Parallel lists, one entry per listing card on the page.
        titles = response.xpath('//span[@class="job-name"]/a/text()').extract()
        salaries = response.xpath('//div[@class="job-limit clearfix"]/span[@class="red"]/text()').extract()
        limits = response.xpath('//div[@class="job-limit clearfix"]/p/text()').extract()
        companies = response.xpath('//div[@class="company-text"]/h3/a/text()').extract()

        # Follow the "next page" link.  response.urljoin resolves the
        # leading-'/' href correctly; naive concatenation with
        # 'http://www.zhipin.com/' produced a double-slash URL.
        for href in response.xpath("//div[@class='page']/a[@class='next']/@href").extract():
            yield Request(url=response.urljoin(href), callback=self.parse, dont_filter=True)

        # zip() stops at the shortest list, so a card missing one field no
        # longer raises IndexError as the index-based loop did.
        for title, salary, limit, company in zip(titles, salaries, limits, companies):
            item = BossjobItem()
            item['job_com'] = company
            item['job_name'] = title
            item['salary'] = salary
            item['job_limit'] = limit
            # job_benefit / job_ab fields intentionally left unset: the
            # original extraction for them was disabled (commented out).
            yield item
||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,11 @@
|
||||
# Automatically created by: scrapy startproject
|
||||
#
|
||||
# For more information about the [deploy] section see:
|
||||
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||
|
||||
[settings]
|
||||
default = Test.settings
|
||||
|
||||
[deploy]
|
||||
#url = http://localhost:6800/
|
||||
project = Test
|
||||
Loading…
Reference in new issue