You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
133 lines
6.7 KiB
133 lines
6.7 KiB
# Scrapy settings for spider project
|
|
#
|
|
# For simplicity, this file contains only settings considered important or
|
|
# commonly used. You can find more settings consulting the documentation:
|
|
#
|
|
# https://docs.scrapy.org/en/latest/topics/settings.html
|
|
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
|
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
|
|
|
# Project identity: the bot name and the packages that hold the spiders.
BOT_NAME = 'spider'

SPIDER_MODULES = ['spider.spiders']
NEWSPIDER_MODULE = 'spider.spiders'

# Encoding used when exporting feeds.
FEED_EXPORT_ENCODING = 'utf-8'
|
|
|
|
|
|
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
|
#USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'
|
|
# robots.txt handling: this project does NOT obey robots.txt rules.
ROBOTSTXT_OBEY = False
|
|
|
|
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
|
#CONCURRENT_REQUESTS = 32
|
|
|
|
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 5  # seconds to wait between consecutive requests to one site
|
|
# The download delay setting will honor only one of:
|
|
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
|
#CONCURRENT_REQUESTS_PER_IP = 16
|
|
|
|
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
|
|
|
|
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
|
|
|
|
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Connection': 'keep-alive',
}
|
|
|
|
# Custom User-Agent strings (desktop and mobile browsers).
# NOTE(review): presumably consumed by a downloader middleware that picks one
# per request -- confirm against spider/middlewares.py.
# NOTE(review): "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)" appears
# twice below; kept as-is so the list contents are unchanged -- confirm whether
# the duplicate is intentional.
USER_AGENTS_LIST = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
]
|
|
#
|
|
|
|
# Enable or disable spider middlewares
|
|
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
|
#SPIDER_MIDDLEWARES = {
|
|
# 'spider.middlewares.SpiderSpiderMiddleware': 543,
|
|
#}
|
|
|
|
#REDIRECT_ENABLED = False
|
|
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'spider.middlewares.SpiderDownloaderMiddleware': 543,
    #'spider.middlewares.ipDownloaderMiddleware': 243
}
|
|
|
|
# Enable or disable extensions
|
|
# See https://docs.scrapy.org/en/latest/topics/extensions.html
|
|
#EXTENSIONS = {
|
|
# 'scrapy.extensions.telnet.TelnetConsole': None,
|
|
#}
|
|
|
|
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Pipelines run in ascending order of their value, so ExcelPipeline (400)
# processes each item before RedisSavePipeline (500).
ITEM_PIPELINES = {
    'spider.pipelines.RedisSavePipeline': 500,
    'spider.pipelines.ExcelPipeline': 400,
}
|
|
|
|
# Enable and configure the AutoThrottle extension (disabled by default)
|
|
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
|
|
#AUTOTHROTTLE_ENABLED = True
|
|
# The initial download delay
|
|
#AUTOTHROTTLE_START_DELAY = 5
|
|
# The maximum download delay to be set in case of high latencies
|
|
#AUTOTHROTTLE_MAX_DELAY = 60
|
|
# The average number of requests Scrapy should be sending in parallel to
|
|
# each remote server
|
|
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
|
# Enable showing throttling stats for every response received:
|
|
#AUTOTHROTTLE_DEBUG = False
|
|
|
|
# Enable and configure HTTP caching (disabled by default)
|
|
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
|
#HTTPCACHE_ENABLED = True
|
|
#HTTPCACHE_EXPIRATION_SECS = 0
|
|
#HTTPCACHE_DIR = 'httpcache'
|
|
#HTTPCACHE_IGNORE_HTTP_CODES = []
|
|
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
|