Compare commits

..

No commits in common. 'Scrapy-redis' and 'master' have entirely different histories.

.idea/.gitignore vendored

@ -1,3 +0,0 @@
# Default ignored files
/shelf/
/workspace.xml

@ -1,11 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
</component>
</module>

@ -1,19 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="dataSourceStorageLocal" created-in="PY-211.7142.13">
<data-source name="@localhost" uuid="cdb7b751-ca1b-41ce-b04e-769ad450cfd5">
<database-info product="MySQL" version="8.0.28" jdbc-version="4.2" driver-name="MySQL Connector/J" driver-version="mysql-connector-java-8.0.25 (Revision: 08be9e9b4cba6aa115f9b27b215887af40b159e0)" dbms="MYSQL" exact-version="8.0.28" exact-driver-version="8.0">
<extra-name-characters>#@</extra-name-characters>
<identifier-quote-string>`</identifier-quote-string>
</database-info>
<case-sensitivity plain-identifiers="lower" quoted-identifiers="lower" />
<secret-storage>master_key</secret-storage>
<user-name>root</user-name>
<schema-mapping>
<introspection-scope>
<node kind="schema" qname="@" />
</introspection-scope>
</schema-mapping>
</data-source>
</component>
</project>

@ -1,12 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="DataSourceManagerImpl" format="xml" multifile-model="true">
<data-source source="LOCAL" name="@localhost" uuid="cdb7b751-ca1b-41ce-b04e-769ad450cfd5">
<driver-ref>mysql.8</driver-ref>
<synchronize>true</synchronize>
<jdbc-driver>com.mysql.cj.jdbc.Driver</jdbc-driver>
<jdbc-url>jdbc:mysql://localhost:3306</jdbc-url>
<working-dir>$ProjectFileDir$</working-dir>
</data-source>
</component>
</project>

File diff suppressed because it is too large

@ -1,2 +0,0 @@
#n:information_schema
!<md> [null, 0, null, null, -2147483648, -2147483648]

@ -1,2 +0,0 @@
#n:mysql
!<md> [null, 0, null, null, -2147483648, -2147483648]

@ -1,2 +0,0 @@
#n:performance_schema
!<md> [null, 0, null, null, -2147483648, -2147483648]

@ -1,12 +0,0 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredIdentifiers">
<list>
<option value="MySQLdb" />
</list>
</option>
</inspection_tool>
</profile>
</component>

@ -1,6 +0,0 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

@ -1,4 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
</project>

@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Test.iml" filepath="$PROJECT_DIR$/.idea/Test.iml" />
</modules>
</component>
</project>

@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

@ -0,0 +1,2 @@
# spider

@ -1,8 +0,0 @@
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/

@ -1,11 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
</component>
</module>

@ -1,29 +0,0 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="1">
<item index="0" class="java.lang.String" itemvalue="django" />
</list>
</value>
</option>
</inspection_tool>
<inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="E265" />
</list>
</option>
</inspection_tool>
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredIdentifiers">
<list>
<option value="str.close" />
</list>
</option>
</inspection_tool>
</profile>
</component>

@ -1,6 +0,0 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

@ -1,4 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
</project>

@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/RedisTest.iml" filepath="$PROJECT_DIR$/.idea/RedisTest.iml" />
</modules>
</component>
</project>

@ -1,12 +0,0 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy import Field
class RedistestItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = Field()

@ -1,103 +0,0 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class RedistestSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class RedistestDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)

@ -1,14 +0,0 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class RedistestPipeline(object):
def process_item(self, item, spider):
with open("article.txt", 'a') as file:
    file.write("title%s\n" % item['title'])
return item

@ -1,97 +0,0 @@
# Scrapy settings for RedisTest project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'RedisTest'
SPIDER_MODULES = ['RedisTest.spiders']
NEWSPIDER_MODULE = 'RedisTest.spiders'
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
REDIS_HOST = "127.0.0.1"
REDIS_PORT = 6379
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
'RedisTest.pipelines.RedistestPipeline': 300,
}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'RedisTest (+http://www.yourdomain.com)'
# Obey robots.txt rules (already disabled above for the scrapy-redis setup)
#ROBOTSTXT_OBEY = True
HTTPERROR_ALLOWED_CODES = [404]
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'RedisTest.middlewares.RedistestSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'RedisTest.middlewares.RedistestDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'RedisTest.pipelines.RedistestPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

@ -1,4 +0,0 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@ -1,13 +0,0 @@
from scrapy_redis.spiders import RedisSpider
from ..items import RedistestItem
class ArticleSpider(RedisSpider):
name = 'article'
redis_key = "bole_urls"
def parse(self, response):
article_list = response.xpath("//div[@class='post floated-thumb']")
for article in article_list:
item = RedistestItem()
item['title'] = article.xpath("div[@class='post-meta']/p[1]/a/@title").extract_first()
yield item
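Because ArticleSpider subclasses scrapy_redis's RedisSpider, it takes its start URLs from the Redis list named by redis_key instead of a start_urls attribute. A minimal seeding sketch with the redis-py client follows; the host and port mirror the REDIS_HOST/REDIS_PORT values in this project's settings, and the URL is a hypothetical placeholder rather than one taken from the repository.

import redis

# Connect to the same Redis instance the spider is configured to use (settings.py).
r = redis.Redis(host="127.0.0.1", port=6379)

# Push a seed URL onto the list the spider polls (redis_key = "bole_urls").
# The URL below is only a placeholder for the real listing page.
r.lpush("bole_urls", "http://example.com/articles/page/1")

Every worker running the same spider pops requests from this shared queue, which is what makes the crawl distributable.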

@ -1,11 +0,0 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = RedisTest.settings
[deploy]
#url = http://localhost:6800/
project = RedisTest

@ -1,26 +0,0 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
# import scrapy
#
#
# class TestItem(scrapy.Item):
# # define the fields for your item here like:
# # name = scrapy.Field()
# pass
from scrapy import Item, Field
class BossjobItem(Item):
# define the fields for your item here like:
# name = scrapy.Field()
collection = 'products'
job_com = Field()
job_name = Field()
salary = Field()
job_limit = Field()
job_benefit = Field()
job_ab = Field()

@ -1,162 +0,0 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
from scrapy.http.response.html import HtmlResponse
# build a custom Response object to return from the downloader middleware
# selenium
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
# custom
import logging
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
import time
class TestSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class TestDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
def __init__(self, timeout=25):
# initialize parameters
options = webdriver.FirefoxOptions()
# with this option the browser starts headless (no visible window)
# options.add_argument('-headless')
self.browser = webdriver.Firefox(options=options, executable_path=r"C:\Users\cookie\Desktop\Test\Test\spiders\geckodriver.exe")
profile = FirefoxProfile()
# set up a Firefox profile for site-specific configuration
self.timeout = timeout
#self.browser = webdriver.Firefox(profile)
self.browser.set_page_load_timeout(self.timeout)
# self.browser.implicitly_wait(self.timeout)
# implicit wait
# initialized for every crawl
def __del__(self):
self.browser.close()
# close the browser
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
"""
selenium 下载中间件
"""
logging.info('******WebDriver is Starting******')
print('使用selenium请求页面:{}'.format(request.url))
#page = request.meta.get('page', 1)
try:
self.browser.get(request.url)
#if page > 1:
self.browser.implicitly_wait(self.timeout)
return HtmlResponse(url=request.url, body=self.browser.page_source, request=request, encoding='utf-8',
status=200)
except TimeoutException:  # selenium raises TimeoutException on a page-load timeout
return HtmlResponse(url=request.url, status=500, request=request)
# self.browser.get(request.url)
# time.sleep(1)
# try:
# while True:
# show_more = self.browser.find_element_by_class_name('show-more')
# show_more.click()
# time.sleep(3)
# if not show_more:
# break
# except:
# pass
# # get the page source
# source = self.browser.page_source
# # build a Response object and return it
# response = HtmlResponse(url=self.browser.current_url, body=source, request=request, encoding='utf-8')
# return response
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
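WebDriverWait, expected_conditions, and By are imported at the top of this middleware but never used; the sketch below shows how an explicit wait could replace the implicitly_wait call in process_request. It is only an illustration, and the "job-name" class it waits for is an assumption borrowed from the spider's XPaths, not something guaranteed by the site.

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def wait_for_listing(browser, timeout=25):
    # Block until at least one job title element is present, then return the rendered HTML.
    WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, "job-name"))
    )
    return browser.page_source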

@ -1,87 +0,0 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy import Spider
import pymysql
import logging
# asynchronous, to keep MySQL inserts from blocking the crawl
class TestPipeline:
def process_item(self, item, spider):
# print('This is a test: ' + item['job_com'])
print('works+++++++++++++++++++++++++++++=')
return item
class MysqlPipelineTwo(object):
def __init__(self, settings, dbpool=None):
self.dbpool = dbpool
self.connect = pymysql.connect(
host=settings.get('MYSQL_HOST'),
db=settings.get('MYSQL_DBNAME'),
user=settings.get('MYSQL_USER'),
password=settings.get('MYSQL_PASSWORD'),
charset='utf8',
use_unicode=True
)
self.cursor = self.connect.cursor()
@classmethod
def from_crawler(cls, crawler):
# pass the project settings in so the MYSQL_* values from settings.py are available
return cls(crawler.settings)
#?
# @classmethod
# def from_settings(cls, settings):
# """
# establish the database connection
# :param settings: project settings
# :return: a pipeline instance
# """
# adbparams = dict(
# host=settings['MYSQL_HOST'],
# db=settings['MYSQL_DBNAME'],
# user=settings['MYSQL_USER'],
# password=settings['MYSQL_PASSWORD'],
# cursorclass=pymysql.cursors.DictCursor # specify the cursor type
# )
# # ConnectionPool connects through pymysql (or MySQLdb)
# dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
# # return the instantiated pipeline
# return cls(dbpool)
def process_item(self, item, spider):
try:
# insert
insert_sql = """
insert into %s(job_com,job_names,salary,job_limit,job_benefit)
values("%s", "%s", "%s", "%s", "%s" );
""" % (
'上海', item['job_com'], item['job_name'], item['salary'], item['job_limit'], item['job_benefit'])
self.cursor.execute(insert_sql)
self.connect.commit()
# execute
except Exception as error:
logging.error(error)
return item
# """
# 使用twisted将MySQL插入变成异步执行。通过连接池执行具体的sql操作返回一个对象
# """
# query = self.dbpool.runInteraction(self.do_insert, item) # 指定操作方法和操作数据
# # 添加异常处理
# query.addCallback(self.handle_error) # 处理异常
# def do_insert(self, cursor, item):
# # 对数据库进行插入操作并不需要committwisted会自动commit
# insert_sql = """
# insert into %s(job_com,job_names,salary,job_limit,job_benefit)
# values("%s", "%s", "%s", "%s", "%s" );
# """ % ('上海', item['job_com'], item['job_name'], item['salary'], item['job_limit'], item['job_benefit'])
# cursor.execute(insert_sql)
#
# def handle_error(self, failure):
# if failure:
# # 打印错误信息
# print(failure)
# item['job_city']
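The insert_sql above is assembled with % string formatting, so any quote character in the scraped text breaks the statement. A hedged sketch of the same insert using driver-side parameters is shown below; "bossjob" is a hypothetical table name, since the real table is not visible in this diff, and the columns simply mirror the item fields.

def process_item(self, item, spider):
    # Sketch only: parameterized insert; pymysql substitutes the values safely.
    insert_sql = (
        "insert into bossjob (job_com, job_names, salary, job_limit, job_benefit) "
        "values (%s, %s, %s, %s, %s)"
    )
    self.cursor.execute(insert_sql, (
        item['job_com'], item['job_name'], item['salary'],
        item['job_limit'], item['job_benefit'],
    ))
    self.connect.commit()
    return item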

@ -1,125 +0,0 @@
# Scrapy settings for Test project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'Test'
SPIDER_MODULES = ['Test.spiders']
NEWSPIDER_MODULE = 'Test.spiders'
KEYWORDS = ['python']
CITYCODE = ['c101020100']
MAX_PAGE = 10
# Use the scrapy_redis dupefilter so deduplication happens in Redis and every worker shares the same request fingerprints
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scrapy_redis scheduler so requests are queued in Redis and distributed across workers
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the scrapy-redis queues in Redis instead of clearing them, which allows pausing and resuming the crawl
SCHEDULER_PERSIST = True
# Redis connection parameters
REDIS_HOST = "192.168.163.128"
REDIS_PORT = 6379
HTTPERROR_ALLOWED_CODES = [301]
ROBOTSTXT_OBEY = False
MYSQL_HOST = "127.0.0.1"
MYSQL_PORT = 3306
MYSQL_DBNAME = "spidetssdb"
MYSQL_USER = "root"
MYSQL_PASSWORD = "Fengqici010515"
CHARSET = "utf8"
'''
Enabling RedisPipeline writes every item into the Redis list whose key is
"<spider.name>:items", so items can be picked up later for distributed processing.
scrapy-redis already implements this pipeline; no extra code is needed.
'''
ITEM_PIPELINES = {
# 'Test.spiders.pipelines.Test.spidersPipeline': 300,
'scrapy_redis.pipelines.RedisPipeline': 400,
# 'Test.pipelines.MysqlPipelineTwo': 302,
# 'Test.pipelines.TestPipeline': 300,
}
#
#
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Test (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'Test.middlewares.TestSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'Test.middlewares.TestDownloaderMiddleware': 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'Test.pipelines.TestPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
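With scrapy_redis.pipelines.RedisPipeline enabled in ITEM_PIPELINES above, every scraped item is serialized to JSON and pushed onto the Redis list "<spider name>:items" ("bosstest:items" for the spider in this project). The consumer sketch below, which is not part of the repository, shows one way to drain that list with redis-py; the host and port are taken from the settings in this file.

import json
import redis

# Connect to the Redis instance configured above (REDIS_HOST / REDIS_PORT).
r = redis.Redis(host="192.168.163.128", port=6379)

# RedisPipeline pushes each item as a JSON string onto "<spider name>:items".
while True:
    raw = r.lpop("bosstest:items")
    if raw is None:
        break
    item = json.loads(raw)
    print(item.get("job_name"), item.get("salary"))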

@ -1,4 +0,0 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@ -1,85 +0,0 @@
import scrapy
from scrapy import Request, Spider
from Test.items import BossjobItem
from scrapy_redis.spiders import RedisSpider
# class BosstestSpider(scrapy.Spider):
class BosstestSpider(RedisSpider):
name = 'bosstest'
allowed_domains = ['www.zhipin.com']
#start_urls = ['http://www.zhipin.com/c101020100/?query=python&page=\d&ka=page-\d']
#start_urls = ['https://www.zhipin.com/job_detail/?query=python&city=101020100&industry=&position=']
# start_urls = ['http://www.zhipin.com/']
# start_urls = ['http://www.zhipin.com/c101020100/?query=python&page=1&ka=page-&page=1']
redis_key = "bole_urls"
# def start_requests(self):
#
# for citycode in self.settings.get('CITYCODE'):
# for keyword in self.settings.get('KEYWORDS'):
# for page in range(1, self.settings.get('MAX_PAGE') + 1):
# # pagination probably has to be handled here
# # note that the listing pages are rendered dynamically
# href = "&page="+ str(page)
# ka = "&ka=page-" + href
# page_url = 'http://www.zhipin.com/' + citycode + "/?query="+keyword + href + ka
# # url = self.start_urls + quote(keyword)
# yield Request(url=page_url, callback=self.parse, meta={'page': page}, dont_filter=True)
def parse(self, response):
# url = response.url
# print(url)
# print('___________________' + url + '+++++++++++++++++++++++++')
# print('+++++++++++++++++++' + str(type(response.url)) + '——————————————————————————')
# url = response.xpath()
# start url
job_names_tmp = response.xpath('//span[@class="job-name"]/a/text()').extract()
# job title
salary_tmp = response.xpath('//div[@class="job-limit clearfix"]/span[@class="red"]/text()').extract()
# salary
job_limit_tmp = response.xpath('//div[@class="job-limit clearfix"]/p/text()').extract()
# experience requirement
job_com_tmp = response.xpath('//div[@class="company-text"]/h3/a/text()').extract()
# company name
# job_benefit = response.xpath('//div[@class="info-desc"]/text()').extract()
# benefits
# job_ab_tmp = response.xpath('//div[@class="info-append clearfix"]/div[@class="tags"]')
# /html/body/div[1]/div[3]/div/div[3]/ul/li[1]/div/div[2]/div[1]
# //div[@class="info-append clearfix"]/div[@class="tags"]
# required skills
urls = response.xpath("//div[@class='page']/a[@class='next']/@href").extract()
for url in urls:
print('___________________' + url + '+++++++++++++++++++++++++')
url = 'http://www.zhipin.com/' + str(url)
print('+++++++++++++++++++++++++' + url + '+++++++++++++++++++++++++')
yield Request(url=url, callback=self.parse, dont_filter=True)
for i in range(len(job_names_tmp)):
# job_ab_tmp2 = job_ab_tmp[i].xpath('/span[@class="tag-item"]/text()').extract()
# print(job_ab_tmp2)
# job_ab = ' '.join(job_ab_tmp2)
item = BossjobItem()
# item['job_city'] = '上海'
item['job_com'] = job_com_tmp[i]
item['job_name'] = job_names_tmp[i]
item['salary'] = salary_tmp[i]
item['job_limit'] = job_limit_tmp[i]
# item['job_benefit'] = job_benefit[i]
# item['job_ab'] = job_ab
# item['job_com'] = ''.join(job_com_tmp[i])
# item['job_name'] = ''.join(job_names_tmp[i])
# item['salary'] = ''.join(salary_tmp[i])
# item['job_limit'] = ''.join(job_limit_tmp[i])
# item['job_benefit'] = ''.join(job_benefit[i])
yield item
# urls = response.xpath("//div[@class='page']/a[@class='next']/@href").extract()
# for url in urls:
# print('___________________' + url + '+++++++++++++++++++++++++')
# url = 'http://www.zhipin.com/' + str(url)
# print('+++++++++++++++++++++++++' + url + '+++++++++++++++++++++++++')
# yield Request(url=url,callback=self.parse, dont_filter=True)
#print(response.body)
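Because BosstestSpider is a RedisSpider reading from redis_key = "bole_urls", the commented-out start_requests above never runs; the listing URLs have to be pushed into Redis by hand. The sketch below rebuilds those URLs from the KEYWORDS, CITYCODE, and MAX_PAGE settings and seeds the queue with redis-py; it is an illustration added here, not code from the repository.

import redis

# Values mirrored from Test/settings.py.
KEYWORDS = ['python']
CITYCODE = ['c101020100']
MAX_PAGE = 10

r = redis.Redis(host="192.168.163.128", port=6379)

# Recreate the page URLs the commented-out start_requests would have generated
# and push them onto the list the spider reads from (redis_key = "bole_urls").
for citycode in CITYCODE:
    for keyword in KEYWORDS:
        for page in range(1, MAX_PAGE + 1):
            href = "&page=" + str(page)
            ka = "&ka=page-" + href
            page_url = 'http://www.zhipin.com/' + citycode + "/?query=" + keyword + href + ka
            r.lpush("bole_urls", page_url)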

Binary file not shown.

File diff suppressed because it is too large

@ -0,0 +1,42 @@
## Project overview
selenium + mysql + scrapy-redis
This crawler is built on Selenium, the browser automation and testing tool,
with the goal of presenting the results in a web-based visualization.
- [x] Selenium headless downloader
- [x] MySQL storage
- [ ] scrapy-redis framework setup and its integration with Selenium
- [x] Downloader for ordinary static sites
- [ ] Front-end visualization UI
- [ ] Back-end SQL database and API integration
- [ ] Data analysis and processing
## Project layout
**Selenium crawler**
geckodriver.exe - the Firefox WebDriver binary
test.py - test script that crawls the BOSS Zhipin job site
setting - Firefox configuration (unfinished)
ip_ok.txt - text file recording usable proxy IPs
**spidetMsql**
test.py - writes the scraped data into the database

@ -1,11 +0,0 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = Test.settings
[deploy]
#url = http://localhost:6800/
project = Test