parent 525f8a437a
commit d77bb4b53f
@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.9 (article_spider)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Black">
    <option name="sdkName" value="Python 3.9 (article_spider)" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (article_spider)" project-jdk-type="Python SDK" />
</project>
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/ArticleSpider.iml" filepath="$PROJECT_DIR$/.idea/ArticleSpider.iml" />
    </modules>
  </component>
</project>
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,64 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import re

import scrapy
# MapCompose/TakeFirst/Identity/Join now live in the itemloaders package
# (scrapy.loader.processors has been deprecated since Scrapy 2.0)
from itemloaders.processors import MapCompose, TakeFirst, Identity, Join
from scrapy.loader import ItemLoader
from ArticleSpider.models.es_types import ArticleType
from w3lib.html import remove_tags


class ArticlespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


def date_convert(value):
    # Raw string so \d is not treated as an invalid escape sequence
    match_re = re.match(r'.*?(\d+.*)', value)
    if match_re:
        return match_re.group(1)
    else:
        return '1970-07-01'


class ArticleItemLoader(ItemLoader):
    default_output_processor = TakeFirst()


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()  # title
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert)
    )  # publish date
    url = scrapy.Field()  # article URL
    url_object_id = scrapy.Field()  # URL id (md5 of the URL)
    front_image_url = scrapy.Field(
        output_processor=Identity()
    )  # cover image URL
    front_image_path = scrapy.Field()  # local cover image path
    praise_nums = scrapy.Field()  # praise count
    comment_nums = scrapy.Field()  # comment count
    fav_nums = scrapy.Field()  # favorite count
    tags = scrapy.Field(
        output_processor=Join(separator=',')
    )  # tags
    content = scrapy.Field()  # content

    def save_to_es(self):
        article = ArticleType()
        article.title = self['title']
        article.create_date = self['create_date']
        article.content = remove_tags(self['content'])
        article.front_image_url = self['front_image_url']
        if 'front_image_path' in self:
            article.front_image_path = self['front_image_path']
        article.praise_nums = self['praise_nums']
        article.fav_nums = self['fav_nums']
        article.comment_nums = self['comment_nums']
        article.url = self['url']
        article.tags = self['tags']
        article.meta.id = self['url_object_id']
        article.save()
        return
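Note: the spider that populates these fields is not shown above (its diff is likely one of the suppressed files further down), so the following is only a minimal sketch of how `ArticleItemLoader` and `JobBoleArticleItem` are typically wired together in a parse callback. The spider class, CSS selectors, and the `response.meta` key are assumptions, not taken from this commit.

```python
import scrapy

from ArticleSpider.items import ArticleItemLoader, JobBoleArticleItem
from ArticleSpider.utils.common import get_md5


class JobboleSpider(scrapy.Spider):
    name = "jobbole"

    def parse_detail(self, response):
        # Hypothetical callback: all selectors below are placeholders.
        loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        loader.add_css("title", ".entry-header h1::text")
        loader.add_css("create_date", ".entry-meta::text")   # MapCompose(date_convert) cleans the raw text
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_value("front_image_url", [response.meta.get("front_image_url", "")])
        loader.add_css("tags", ".entry-meta a::text")         # Join(',') collapses the list into one string
        loader.add_css("content", ".entry")                   # HTML tags are stripped later in save_to_es()
        yield loader.load_item()                               # TakeFirst() applies to the remaining fields
```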
@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
__author__ = 'bobby'

from datetime import datetime
# Written against the older elasticsearch-dsl 5.x API (DocType, InnerObjectWrapper)
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
    analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer

from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer

from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=["localhost"])


class CustomAnalyzer(_CustomAnalyzer):
    # Return an empty definition so the analyzer is referenced by name only
    # and is expected to be provided by Elasticsearch itself (the ik plugin)
    def get_analysis_definition(self):
        return {}


ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])


class ArticleType(DocType):
    # Jobbole (伯乐在线) article document type
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    create_date = Date()
    url = Keyword()
    url_object_id = Keyword()
    front_image_url = Keyword()
    front_image_path = Keyword()
    praise_nums = Integer()
    comment_nums = Integer()
    fav_nums = Integer()
    tags = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")

    class Meta:
        index = "jobbole"
        doc_type = "article"


if __name__ == "__main__":
    ArticleType.init()
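Once `ArticleType.init()` has created the `jobbole` index, documents written by `save_to_es()` can be searched over the same connection. A minimal query sketch, assuming the elasticsearch-dsl 5.x API that `DocType` implies and an Elasticsearch node with the ik plugin running on localhost:

```python
from elasticsearch_dsl import Search
from elasticsearch_dsl.connections import connections

# Reuse the default connection registered in models/es_types.py
client = connections.get_connection()

# Full-text match against the ik_max_word-analyzed title field
s = Search(using=client, index="jobbole").query("match", title="python")
for hit in s[:10].execute():
    print(hit.meta.id, hit.title, hit.create_date)
```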
@@ -0,0 +1,169 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http.request import Request
from ArticleSpider.models.es_types import ArticleType
import codecs
import json
from w3lib.html import remove_tags
from scrapy.exporters import JsonItemExporter
import MySQLdb
from twisted.enterprise import adbapi
from MySQLdb.cursors import DictCursor


class ArticlespiderPipeline(object):
    def process_item(self, item, spider):
        return item


class MysqlPipeline(object):
    # Synchronous MySQL insert; blocks the crawl while the query runs
    def __init__(self):
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'qweasdzxc227', 'article_spider', charset="utf8",
                                    use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into jobbole_article(title, url, url_object_id, front_image_url, front_image_path, parise_nums, comment_nums, fav_nums, tags, content, create_date)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE parise_nums=VALUES(parise_nums)
        """
        params = list()
        params.append(item.get("title", ""))
        params.append(item.get("url", ""))
        params.append(item.get("url_object_id", ""))
        front_image = ','.join(item.get("front_image_url", []))
        params.append(front_image)
        params.append(item.get("front_image_path", ""))
        # The item field is praise_nums; only the DB column keeps the original "parise_nums" spelling
        params.append(item.get("praise_nums", 0))
        params.append(item.get("comment_nums", 0))
        params.append(item.get("fav_nums", 0))
        params.append(item.get("tags", ""))
        params.append(item.get("content", ""))
        params.append(item.get("create_date", "1970-07-01"))
        self.cursor.execute(insert_sql, tuple(params))
        self.conn.commit()
        return item


class MysqlTwistedPipline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)

        return cls(dbpool)

    def process_item(self, item, spider):
        # Use Twisted to turn the MySQL insert into an asynchronous call
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle insert errors
        return item

    def handle_error(self, failure, item, spider):
        # Handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # Perform the actual insert
        insert_sql = """
            insert into jobbole_article(title, url, url_object_id, front_image_url, front_image_path, parise_nums, comment_nums, fav_nums, tags, content, create_date)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE parise_nums=VALUES(parise_nums)
        """
        params = list()
        params.append(item.get("title", ""))
        params.append(item.get("url", ""))
        params.append(item.get("url_object_id", ""))
        front_image = ','.join(item.get("front_image_url", []))
        params.append(front_image)
        params.append(item.get("front_image_path", ""))
        params.append(item.get("praise_nums", 0))  # item field is praise_nums
        params.append(item.get("comment_nums", 0))
        params.append(item.get("fav_nums", 0))
        params.append(item.get("tags", ""))
        params.append(item.get("content", ""))
        params.append(item.get("create_date", "1970-07-01"))
        # Build the SQL statement for this item and insert it into MySQL
        cursor.execute(insert_sql, tuple(params))


class JsonWithEncodingPipeline(object):
    # Custom export of items to a JSON file
    def __init__(self):
        self.file = codecs.open('article.json', 'a', encoding="utf-8")

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def spider_closed(self, spider):
        self.file.close()


class JsonExporterPipeline(object):
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


class ArticleImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        try:
            if "front_image_url" in item:
                image_file_path = ''
                for ok, value in results:
                    image_file_path = value["path"]
                item["front_image_path"] = image_file_path
            return item
        except Exception as e:
            print(e)
            item['front_image_path'] = 'image not available'
            return item


class ElasticsearchPipeline(object):
    # Write scraped items into Elasticsearch
    def process_item(self, item, spider):
        # article = ArticleType()
        # article.title = item['title']
        # article.create_date = item['create_date']
        # article.content = remove_tags(item['content'])
        # article.front_image_url = item['front_image_url']
        # if 'front_image_path' in item:
        #     article.front_image_path = item['front_image_path']
        # article.praise_nums = item['praise_nums']
        # article.fav_nums = item['fav_nums']
        # article.comment_nums = item['comment_nums']
        # article.url = item['url']
        # article.tags = item['tags']
        # article.meta.id = item['url_object_id']
        # article.save()
        # Convert the item into an ES document and save it
        item.save_to_es()
        return item
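The two MySQL pipelines assume a `jobbole_article` table already exists; its definition is not part of this commit, so the DDL below is only a guess inferred from the INSERT statement. In particular, `ON DUPLICATE KEY UPDATE` only takes effect if the table has a primary or unique key, assumed here to be `url_object_id`, and the `parise_nums` column name is kept exactly as spelled in the SQL above. Column types are assumptions.

```python
# Hypothetical schema for the jobbole_article table; types and key are guesses.
import MySQLdb

CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS jobbole_article (
    url_object_id    VARCHAR(50)  NOT NULL PRIMARY KEY,
    title            VARCHAR(200) NOT NULL,
    url              VARCHAR(300) NOT NULL,
    front_image_url  VARCHAR(500),
    front_image_path VARCHAR(200),
    parise_nums      INT DEFAULT 0,
    comment_nums     INT DEFAULT 0,
    fav_nums         INT DEFAULT 0,
    tags             VARCHAR(255),
    content          LONGTEXT,
    create_date      DATE
) DEFAULT CHARSET = utf8
"""

conn = MySQLdb.connect('127.0.0.1', 'root', 'qweasdzxc227', 'article_spider', charset="utf8")
try:
    conn.cursor().execute(CREATE_TABLE_SQL)
    conn.commit()
finally:
    conn.close()
```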
@@ -0,0 +1,115 @@
# Scrapy settings for ArticleSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import os
import sys

import scrapy.downloadermiddlewares.useragent

import ArticleSpider.pipelines

BOT_NAME = "ArticleSpider"

SPIDER_MODULES = ["ArticleSpider.spiders"]
NEWSPIDER_MODULE = "ArticleSpider.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True
COOKIES_DEBUG = True

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#     "Accept-Language": "en",
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     "ArticleSpider.middlewares.ArticlespiderSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # "ArticleSpider.middlewares.ArticlespiderDownloaderMiddleware": 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    # 'ArticleSpider.pipelines.JsonWithEncodingPipeline': 2,
    # 'ArticleSpider.pipelines.JsonExporterPipeline': 3,
    # 'ArticleSpider.pipelines.MysqlPipeline': 4,
    # 'ArticleSpider.pipelines.MysqlTwistedPipline': 5,
    'ArticleSpider.pipelines.ElasticsearchPipeline': 6,
    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# IMAGES_URLS_FIELD = 'front_image_url'
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')

MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'article_spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'qweasdzxc227'
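`ArticleImagePipeline` and the image-URL field setting exist in this commit but are disabled above. If cover-image downloading were switched back on, the `ImagesPipeline` machinery would also need to know which item field carries the URLs. A sketch of what the settings could look like, using the field names from items.py; the priority values are assumptions:

```python
# Sketch only: image pipeline re-enabled alongside the existing pipelines.
ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,    # ImagesPipeline subclass that fills front_image_path
    'ArticleSpider.pipelines.ElasticsearchPipeline': 6,
    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
}
IMAGES_URLS_FIELD = 'front_image_url'   # stays a list thanks to the Identity() output processor in items.py
# IMAGES_STORE is already defined above and keeps pointing at <project>/images
```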
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,14 @@
import hashlib


# Generate the md5 digest of a URL (used as a stable document/row id)
def get_md5(url):
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()


if __name__ == '__main__':
    print(get_md5("https://cnblogs.com"))
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-
__author__ = 'bobby'

from scrapy.cmdline import execute

import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "jobbole"])
# execute(["scrapy", "crawl", "zhihu"])
# execute(["scrapy", "crawl", "lagou"])
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = ArticleSpider.settings

[deploy]
#url = http://localhost:6800/
project = ArticleSpider