main
qweasdzxc227 6 months ago
parent 525f8a437a
commit d77bb4b53f

@@ -0,0 +1,8 @@
# Files ignored by default
/shelf/
/workspace.xml
# Editor-based HTTP client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.9 (article_spider)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.9 (article_spider)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (article_spider)" project-jdk-type="Python SDK" />
</project>

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/ArticleSpider.iml" filepath="$PROJECT_DIR$/.idea/ArticleSpider.iml" />
</modules>
</component>
</project>

@@ -0,0 +1,64 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import re
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst, Identity, Join
from scrapy.loader import ItemLoader
from ArticleSpider.models.es_types import ArticleType
from w3lib.html import remove_tags
class ArticlespiderItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
def date_convert(value):
    match_re = re.match(r'.*?(\d+.*)', value)
    if match_re:
        return match_re.group(1)
    else:
        return '1970-07-01'

class ArticleItemLoader(ItemLoader):
    # Return a single value per field instead of a list by default
    default_output_processor = TakeFirst()
class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()  # title
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert)
    )  # publish date
    url = scrapy.Field()  # article URL
    url_object_id = scrapy.Field()  # MD5 of the URL
    front_image_url = scrapy.Field(
        output_processor=Identity()
    )  # cover image URL(s)
    front_image_path = scrapy.Field()  # local path of the cover image
    praise_nums = scrapy.Field()  # praise count
    comment_nums = scrapy.Field()  # comment count
    fav_nums = scrapy.Field()  # favorite count
    tags = scrapy.Field(
        output_processor=Join(separator=',')
    )  # tags
    content = scrapy.Field()  # article content
def save_to_es(self):
article = ArticleType()
article.title = self['title']
article.create_date = self['create_date']
article.content = remove_tags(self['content'])
article.front_image_url = self['front_image_url']
if 'front_image_path' in self:
article.front_image_path = self['front_image_path']
article.praise_nums = self['praise_nums']
article.fav_nums = self['fav_nums']
article.comment_nums = self['comment_nums']
article.url = self['url']
article.tags = self['tags']
article.meta.id = self['url_object_id']
article.save()
return
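The processors wired into these fields can be exercised on their own; the following is a minimal sketch (the sample strings are made up, and it assumes the project's dependencies are importable from the project root) of what MapCompose, TakeFirst and Join do to the raw value lists an ItemLoader collects:

# Usage sketch only -- the sample values are hypothetical.
from ArticleSpider.items import date_convert
from scrapy.loader.processors import MapCompose, TakeFirst, Join

print(MapCompose(date_convert)(['Posted on 2024-05-01 10:20']))  # ['2024-05-01 10:20']
print(TakeFirst()(['first', 'second']))                          # 'first'
print(Join(separator=',')(['python', 'scrapy']))                 # 'python,scrapy'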

@@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class ArticlespiderSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class ArticlespiderDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
        # Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)

@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
__author__ = 'bobby'
from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
from elasticsearch_dsl.connections import connections
connections.create_connection(hosts=["localhost"])
class CustomAnalyzer(_CustomAnalyzer):
def get_analysis_definition(self):
return {}
ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])
class ArticleType(DocType):
    # Jobbole article document type
suggest = Completion(analyzer=ik_analyzer)
title = Text(analyzer="ik_max_word")
create_date = Date()
url = Keyword()
url_object_id = Keyword()
front_image_url = Keyword()
front_image_path = Keyword()
praise_nums = Integer()
comment_nums = Integer()
fav_nums = Integer()
tags = Text(analyzer="ik_max_word")
content = Text(analyzer="ik_max_word")
class Meta:
index = "jobbole"
doc_type = "article"
if __name__ == "__main__":
ArticleType.init()
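With the default connection registered above, the same mapping can be queried ad hoc through elasticsearch_dsl; a minimal sketch, assuming a local Elasticsearch instance with the ik analyzer plugin that this mapping targets:

# Query sketch only -- needs a running Elasticsearch on localhost.
from elasticsearch_dsl import Search

s = Search(index="jobbole").query("match", title="python")
# response = s.execute()          # runs against the default connection
# for hit in response:
#     print(hit.title, hit.url)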

@@ -0,0 +1,169 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http.request import Request
from ArticleSpider.models.es_types import ArticleType
import codecs
import json
from w3lib.html import remove_tags
from scrapy.exporters import JsonItemExporter
import MySQLdb
from twisted.enterprise import adbapi
from MySQLdb.cursors import DictCursor
class ArticlespiderPipeline(object):
def process_item(self, item, spider):
return item
class MysqlPipeline(object):
def __init__(self):
self.conn = MySQLdb.connect('127.0.0.1', 'root', 'qweasdzxc227', 'article_spider', charset="utf8",
use_unicode=True)
self.cursor = self.conn.cursor()
def process_item(self, item, spider):
insert_sql = """
insert into jobbole_article(title, url ,url_object_id, front_image_url, front_image_path, parise_nums, comment_nums, fav_nums, tags, content, create_date)
values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE parise_nums=VALUES(parise_nums)
"""
params = list()
params.append(item.get("title", ""))
params.append(item.get("url", ""))
params.append(item.get("url_object_id", ""))
front_image = ','.join(item.get("front_image_url", []))
params.append(front_image)
params.append(item.get("front_image_path", ""))
params.append(item.get("parise_nums", 0))
params.append(item.get("comment_nums", 0))
params.append(item.get("fav_nums", 0))
params.append(item.get("tags", ""))
params.append(item.get("content", ""))
params.append(item.get("create_date", "1970-07-01"))
self.cursor.execute(insert_sql, tuple(params))
self.conn.commit()
return item
class MysqlTwistedPipline(object):
def __init__(self, dbpool):
self.dbpool = dbpool
@classmethod
def from_settings(cls, settings):
dbparms = dict(
host=settings["MYSQL_HOST"],
db=settings["MYSQL_DBNAME"],
user=settings["MYSQL_USER"],
passwd=settings["MYSQL_PASSWORD"],
charset='utf8',
cursorclass=DictCursor,
use_unicode=True,
)
dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
return cls(dbpool)
    def process_item(self, item, spider):
        # Use Twisted to run the MySQL insert asynchronously
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle insert errors
        return item

    def handle_error(self, failure, item, spider):
        # Handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # Perform the actual insert
insert_sql = """
insert into jobbole_article(title, url ,url_object_id, front_image_url, front_image_path, parise_nums, comment_nums, fav_nums, tags, content, create_date)
values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE parise_nums=VALUES(parise_nums)
"""
params = list()
params.append(item.get("title", ""))
params.append(item.get("url", ""))
params.append(item.get("url_object_id", ""))
front_image = ','.join(item.get("front_image_url", []))
params.append(front_image)
params.append(item.get("front_image_path", ""))
params.append(item.get("parise_nums", 0))
params.append(item.get("comment_nums", 0))
params.append(item.get("fav_nums", 0))
params.append(item.get("tags", ""))
params.append(item.get("content", ""))
params.append(item.get("create_date", "1970-07-01"))
        # Build the SQL statement for this item and insert it into MySQL
cursor.execute(insert_sql, tuple(params))
class JsonWithEncodingPipeline(object):
    # Custom export of items to a JSON file
def __init__(self):
self.file = codecs.open('article.json', 'a', encoding="utf-8")
def process_item(self, item, spider):
lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
self.file.write(lines)
return item
def spider_closed(self, spider):
self.file.close()
class JsonExporterPipeline(object):
def __init__(self):
self.file = open('articleexport.json', 'wb')
self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
self.exporter.start_exporting()
def close_spider(self, spider):
self.exporter.finish_exporting()
self.file.close()
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class ArticleImagePipeline(ImagesPipeline):
def item_completed(self, results, item, info):
try:
if "front_image_url" in item:
image_file_path = ''
for ok, value in results:
image_file_path = value["path"]
item["front_image_path"] = image_file_path
return item
except Exception as e:
print(e)
            item['front_image_path'] = '图片不可用'  # "image unavailable"
return item
class ElasticsearchPipeline(object):
    # Write scraped items into Elasticsearch
def process_item(self, item, spider):
# article = ArticleType()
# article.title = item['title']
# article.create_date = item['create_date']
# article.content = remove_tags(item['content'])
# article.front_image_url = item['front_image_url']
# if 'front_image_path' in item:
# article.front_image_path = item['front_image_path']
# article.praise_nums = item['praise_nums']
# article.fav_nums = item['fav_nums']
# article.comment_nums = item['comment_nums']
# article.url = item['url']
# article.tags = item['tags']
# article.meta.id = item['url_object_id']
# article.save()
        # Convert the item into an ES document and save it
        item.save_to_es()
return item
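Both MySQL pipelines insert into a jobbole_article table whose column names are fixed by the INSERT statements above; the column types and lengths below are assumptions, so treat this as a sketch of a compatible schema rather than the project's actual DDL:

# Schema sketch: column names come from the pipeline code, types/lengths are assumed.
import MySQLdb

ddl = """
CREATE TABLE IF NOT EXISTS jobbole_article (
    url_object_id    VARCHAR(50)  NOT NULL PRIMARY KEY,  -- unique key used by ON DUPLICATE KEY UPDATE
    title            VARCHAR(200) NOT NULL,
    url              VARCHAR(300) NOT NULL,
    front_image_url  VARCHAR(500),
    front_image_path VARCHAR(200),
    parise_nums      INT DEFAULT 0,
    comment_nums     INT DEFAULT 0,
    fav_nums         INT DEFAULT 0,
    tags             VARCHAR(200),
    content          LONGTEXT,
    create_date      DATETIME
) DEFAULT CHARSET=utf8
"""

conn = MySQLdb.connect('127.0.0.1', 'root', 'qweasdzxc227', 'article_spider', charset="utf8")
conn.cursor().execute(ddl)
conn.commit()
conn.close()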

@@ -0,0 +1,115 @@
# Scrapy settings for ArticleSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import os
import sys
import scrapy.downloadermiddlewares.useragent
import ArticleSpider.pipelines
BOT_NAME = "ArticleSpider"
SPIDER_MODULES = ["ArticleSpider.spiders"]
NEWSPIDER_MODULE = "ArticleSpider.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = True
COOKIES_DEBUG = True
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# "ArticleSpider.middlewares.ArticlespiderSpiderMiddleware": 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
# "ArticleSpider.middlewares.ArticlespiderDownloaderMiddleware": 543,
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
# 'scrapy.pipelines.images.ImagesPipeline': 1,
# 'ArticleSpider.pipelines.JsonWithEncodingPipeline': 2,
# 'ArticleSpider.pipelines.JsonExporterPipeline': 3,
# 'ArticleSpider.pipelines.MysqlPipeline': 4,
# 'ArticleSpider.pipelines.MysqlTwistedPipline': 5,
'ArticleSpider.pipelines.ElasticsearchPipeline': 6,
'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
# IMAGES_URLS_FIELD = 'front_image_url'
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'article_spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'qweasdzxc227'
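These values are project-wide defaults; any spider can override them locally through custom_settings, which Scrapy merges on top of this file. A minimal sketch with a hypothetical spider:

# Sketch: per-spider overrides of the project settings (spider name is hypothetical).
import scrapy

class SlowSpider(scrapy.Spider):
    name = "slow_example"
    custom_settings = {
        "DOWNLOAD_DELAY": 1,      # throttle only this spider
        "COOKIES_DEBUG": False,   # silence cookie logging here
    }

    def parse(self, response):
        pass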

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@@ -0,0 +1,128 @@
import json
import re
import os
import requests
import scrapy
import pickle
import datetime
from scrapy.http import Request
from urllib import parse
from scrapy.loader import ItemLoader
from ArticleSpider.items import ArticleItemLoader
from ArticleSpider.items import JobBoleArticleItem
from ArticleSpider.utils import common
from ArticleSpider.utils.common import get_md5
from scrapy import signals
import time
from selenium import webdriver
class JobboleSpider(scrapy.Spider):
name = "jobbole"
allowed_domains = ["news.cnblogs.com"]
start_urls = ["http://news.cnblogs.com/"]
def start_requests(self):
cookies = []
if os.path.exists(r'C:\Users\10955\ArticleSpider\cookies\jobbole.cookie'):
cookies = pickle.load(open(r'C:\Users\10955\ArticleSpider\cookies\jobbole.cookie', 'rb'))
if not cookies:
            driver = webdriver.Chrome()
            driver.implicitly_wait(10)
            # Open the login page
            driver.get('https://account.cnblogs.com/signin')
            # Hide the webdriver flag so the site's automation check on the captcha is not triggered
            driver.execute_script("Object.defineProperties(navigator,{webdriver:{get:()=>undefined}})")
            # Enter the username
            driver.find_element_by_id('mat-input-0').send_keys('包包1')
            # Enter the password
            driver.find_element_by_id('mat-input-1').send_keys('qweasdzxc227')
            # Click the login button
            driver.find_element_by_css_selector('.mat-button-wrapper').click()
            # Click the captcha
            driver.find_element_by_xpath('//*[@id="Shape3"]').click()
time.sleep(5)
cookies = driver.get_cookies()
pickle.dump(cookies, open(r'C:\Users\10955\ArticleSpider\cookies\jobbole.cookie', 'wb'))
cookie_dict = {}
for cookie in cookies:
cookie_dict[cookie['name']] = cookie['value']
for url in self.start_urls:
yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict)
# cookie_dict = {cookie['name']: cookie['value'] for cookie in cookies}
# print(cookies)
# print(cookie_dict)
# yield scrapy.Request(url='https://account.cnblogs.com/signin', callback=self.parse, cookies=cookie_dict)
    def parse(self, response):
        # 1. Extract the article URLs from the news list page, hand them to Scrapy
        #    to download, and call the matching parse callback on each response.
        # extract_first() returns the first matched value
        post_nodes = response.css('#news_list .news_block')[:1]
        for post_node in post_nodes:
            image_url = "https:" + post_node.css('.entry_summary a img::attr(src)').extract_first("")
            post_url = post_node.css('h2 a::attr(href)').extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url), meta={'front_image_url': image_url}, callback=self.parse_detail, dont_filter=True)
        # 2. Extract the next-page URL, hand it to Scrapy to download, and feed the
        #    result back into parse to keep crawling.
        # next_url = response.css('div.pager a:last-child::attr(href)').extract_first("")
        # yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
def parse_detail(self, response):
match_re = re.match(".*?(\d+)", response.url)
if match_re:
post_id = match_re.group(1)
# article_item = JobBoleArticleItem()
# title = response.css('#news_title a::text').extract_first("")
# create_date = response.css('#news_info .time::text').extract_first("")
# match_re = re.match('.*?(\d+.*)', create_date)
# if match_re:
# create_date = match_re.group(1)
# # create_date = response.xpath('//*[@id="news_info"]//*[@class="time"]/text()').extract_first("")
#
# content = response.css('#news_content').extract()[0]
# tag_list = response.css('.news_tags a::text').extract()
# tags = ','.join(tag_list)
# article_item['title'] = title
# article_item['create_date'] = create_date
# article_item['content'] = content
# article_item['tags'] = tags
# article_item['url'] = response.url
# if response.meta.get('front_image_url', ""):
# article_item['front_image_url'] = [response.meta.get('front_image_url', "")]
# else:
# article_item['front_image_url'] = []
item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
item_loader.add_css('title', '#news_title a::text')
item_loader.add_css('content', '#news_content')
item_loader.add_css('tags', '.news_tags a::text')
item_loader.add_css('create_date', '#news_info .time::text')
item_loader.add_value('url', response.url)
item_loader.add_value('front_image_url', response.meta.get('front_image_url', ''))
# article_item = item_loader.load_item()
# if response.meta.get('front_image_url', ""):
# article_item['front_image_url'] = [response.meta.get('front_image_url', "")]
# else:
# article_item['front_image_url'] = []
yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
meta={'article_item': item_loader, 'url':response.url}, callback=self.parse_nums)
# praise_nums = j_data['DiggCount']
# fav_nums = j_data['TotalView']
# comment_nums = j_data['CommentCount']
# pass
def parse_nums(self, response):
j_data = json.loads(response.text)
item_loader = response.meta.get('article_item', "")
# praise_nums = j_data['DiggCount']
# fav_nums = j_data['TotalView']
# comment_nums = j_data['CommentCount']
item_loader.add_value('praise_nums', j_data['DiggCount'])
item_loader.add_value('fav_nums', j_data['TotalView'])
item_loader.add_value('comment_nums', j_data['CommentCount'])
item_loader.add_value('url_object_id', common.get_md5(response.meta.get('url', '')))
# article_item['praise_nums'] = praise_nums
# article_item['fav_nums'] = fav_nums
# article_item['comment_nums'] = comment_nums
# article_item['url_object_id'] = common.get_md5(article_item['url'])
article_item = item_loader.load_item()
yield article_item

@@ -0,0 +1,14 @@
import hashlib
# Generate an MD5 hash of a URL
def get_md5(url):
if isinstance(url, str):
url = url.encode("utf-8")
m = hashlib.md5()
m.update(url)
return m.hexdigest()
if __name__ == '__main__':
print(get_md5("https://cnblogs.com"))

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-
__author__ = 'bobby'
from scrapy.cmdline import execute
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "jobbole"])
# execute(["scrapy", "crawl", "zhihu"])
# execute(["scrapy", "crawl", "lagou"])

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = ArticleSpider.settings
[deploy]
#url = http://localhost:6800/
project = ArticleSpider