parent 104a58dff1
commit 53b479b643
@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/venv" />
    </content>
    <orderEntry type="jdk" jdkName="Python 3.10" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
</project>
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/jd-distributed-crawler.iml" filepath="$PROJECT_DIR$/.idea/jd-distributed-crawler.iml" />
    </modules>
  </component>
</project>
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>
@@ -0,0 +1,37 @@
# JD Distributed Crawler

#### Description
Selenium crawls the search pages
Scrapy crawls the detail pages (a sketch of the Redis handoff between the two follows this README)

#### Software Architecture
Software architecture description

#### Installation

1. xxxx
2. xxxx
3. xxxx

#### Instructions

1. xxxx
2. xxxx
3. xxxx

#### Contribution

1. Fork the repository
2. Create Feat_xxx branch
3. Commit your code
4. Create Pull Request


#### Gitee Feature

1. You can use Readme\_XXX.md to support different languages, such as Readme\_en.md, Readme\_zh.md
2. Gitee blog [blog.gitee.com](https://blog.gitee.com)
3. Explore open source project [https://gitee.com/explore](https://gitee.com/explore)
4. The most valuable open source project [GVP](https://gitee.com/gvp)
5. The manual of Gitee [https://gitee.com/help](https://gitee.com/help)
6. The most popular members [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/)
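The Description above is the whole architecture: the Selenium script pushes detail-page URLs into a Redis list and the Scrapy project consumes them. The consuming spider is not part of this commit, so the following is only a minimal sketch, assuming scrapy_redis's `RedisSpider` and the `jd:start_urls` key configured later in this commit; the spider name and class are made up for illustration.

```python
# Hypothetical sketch of the consuming side; this spider does not exist in the commit.
from scrapy_redis.spiders import RedisSpider


class JdDetailSpider(RedisSpider):
    name = 'jd_detail'             # assumed spider name
    redis_key = 'jd:start_urls'    # the list the Selenium script rpushes URLs to

    def parse(self, response):
        # Detail-page parsing would go here.
        self.logger.info('Fetched %s', response.url)
```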
Binary image files added (4.9 KiB, 116 KiB, 116 KiB, 92 KiB, 149 KiB, 149 KiB)
@@ -0,0 +1,115 @@
from time import sleep
import yaml
import redis
from selenium import webdriver
from selenium.webdriver.common.by import By
from pymongo import MongoClient


def scroll_by_xpath(webDriver, xpath):
    """
    Scroll the page held by webDriver down to the element located by xpath
    :param webDriver: WebDriver used to control the driven browser
    :param xpath: XPath of the element to locate, str
    """
    target = webDriver.find_element(By.XPATH, xpath)
    webDriver.execute_script("arguments[0].scrollIntoView();", target)


if __name__ == '__main__':
    # Load the configuration file
    yaml_file = "./settings.yaml"
    with open(yaml_file, 'r') as f:
        settings = yaml.safe_load(f)

    # Open the Redis connection
    redis_connection = redis.Redis(
        host=settings['redis']['host'],
        port=settings['redis']['port'],
        password=settings['redis']['password'],
        db=settings['redis']['db'],
        charset='UTF-8',
        encoding='UTF-8')

    mongo_connection = MongoClient('mongodb://{}:{}'.format(settings['mongodb']['host'], settings['mongodb']['port']),
                                   username=settings['mongodb']['username'],
                                   password=settings['mongodb']['password'])

    # Open the JD product search page
    driver = webdriver.Chrome()
    driver.get('https://search.jd.com/Search?keyword={}&enc=utf-8&wq={}'.format(settings['search']['keyword'], settings['search']['keyword']))
    # Scroll down to the pagination navigation
    scroll_by_xpath(driver, '//*[@id="J_bottomPage"]/span[1]/a[9]')
    # Sleep before reading elements: the JD search page only loads fully after scrolling,
    # the scroll is asynchronous, and reading too early returns an incomplete element list
    sleep(settings['search']['sleep'])
    # Get the number of result pages for the current search keyword
    max_page = int(driver.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[2]/em[1]/b').text)

    skuid_prev = []
    skuid_url = []
    page = 0
    while page < max_page:
        sku_price = []

        # Scroll down
        scroll_by_xpath(driver, '//*[@id="J_bottomPage"]/span[1]/a[9]')
        sleep(settings['search']['sleep'])

        # Collect each product's SKU id and price from the search page
        li_list = driver.find_elements(By.XPATH, '//*[@id="J_goodsList"]/ul/li')
        for li in li_list:
            li_text = li.text.split("\n")
            result = 0
            for text in li_text:
                if len(text) > 2 and text[1] in [str(i) for i in range(10)] and text[2] in [str(i) for i in range(10)]:
                    result = text[1:-1]
                    break
            price = result
            sku_price.append({li.get_attribute("data-sku"): price})
        # If the same page was read again, redo this page
        if page != 0 and sku_price[0] == skuid_prev[0]:
            page -= 1
            continue
        # Re-read in case the elements were read before the page finished loading
        while len(sku_price) < len(skuid_prev):
            sleep(settings['search']['sleep'])
            sku_price = []
            li_list = driver.find_elements(By.XPATH, '//*[@id="J_goodsList"]/ul/li')
            for li in li_list:
                price = li.find_element(By.XPATH, ".//*[@class='gl-i-wrap']/*[@class='p-price']//strong/i").text
                sku_price.append({li.get_attribute("data-sku"): price})

        # Build detail-page URLs and push them into the Redis start_urls list
        for dic in sku_price:
            for key in dic.keys():
                skuid_url.append('https://item.jd.com/{}.html'.format(key))
        for url in skuid_url:
            redis_connection.rpush(settings['result']['rpush_key'], url)

        # Store each sku and its price in MongoDB
        mongoDB = mongo_connection['admin']
        mongoCollection = mongoDB['a']

        for dic in sku_price:
            document = {}
            for key in dic.keys():
                document['sku'] = key
                document['price'] = dic[key]
            # Update MongoDB: insert only when the sku is not already present
            mongoCollection.update_one(
                {'sku': document['sku']},
                {
                    '$setOnInsert': document  # takes effect only with upsert=True
                },
                upsert=True
            )

        print(str(page) + " " + str(len(skuid_url)) + " ", end="")
        print(sku_price)
        print(skuid_url)
        sleep(settings['search']['sleep'])

        skuid_prev = sku_price.copy()
        skuid_url = []
        page += 1
        # Go to the next page
        driver.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[1]/a[9]').click()
@@ -0,0 +1,18 @@
search:
  keyword: '3080'
  sleep: 5

redis:
  host: '120.24.87.40'
  port: '6379'
  password: 'Guet@207'
  db: 1

mongodb:
  host: '120.24.87.40'
  port: '27017'
  username: "admin"
  password: "123456"

result:
  rpush_key: "jd:start_urls"
@@ -0,0 +1,28 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class JdskuspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # sku-id
    sku = scrapy.Field()

    # title
    title = scrapy.Field()

    # GPU model
    model = scrapy.Field()

    # GPU memory type
    memoryType = scrapy.Field()

    # GPU memory bus width
    bitWidth = scrapy.Field()

    # GPU memory capacity
    memoryCapacity = scrapy.Field()
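The spider code that fills these fields is not included in the diff. Purely as an illustration of how a detail-page callback might populate the item, with a hypothetical `parse_detail` function and placeholder selectors rather than JD's real page markup:

```python
# Illustration only: parse_detail and every selector below are placeholders,
# not code or markup taken from this repository.
from jdSkuSpider.items import JdskuspiderItem


def parse_detail(response):
    item = JdskuspiderItem()
    # The SKU id is the file-name part of a detail URL such as
    # https://item.jd.com/100017846659.html
    item['sku'] = response.url.split('/')[-1].split('.')[0]
    item['title'] = response.xpath('//title/text()').get()
    # The spec fields (model, memoryType, bitWidth, memoryCapacity) would come
    # from the parameter table on the page; placeholder XPath shown.
    item['model'] = response.xpath('//dl[dt="显卡型号"]/dd/text()').get()
    yield item
```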
@@ -0,0 +1,14 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class JdskuspiderPipeline:
    def process_item(self, item, spider):
        print(item)
        return item
@@ -0,0 +1,160 @@
# Scrapy settings for jdSkuSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'jdSkuSpider'

SPIDER_MODULES = ['jdSkuSpider.spiders']
NEWSPIDER_MODULE = 'jdSkuSpider.spiders'

# Proxy API URL provided by the proxy vendor
PROXY_SERVER_URL = 'https://h.shanchendaili.com/api.html?action=get_ip&key=HU015c86520413222339Rp2a&time=10&count=1&protocol=http&type=text&textSep=1&only=1'
# Used to test whether the current proxy IP has been banned
TEST_PAGE = 'https://item.jd.com/100017846659.html'
# Disable request deduplication (True = do not deduplicate, False = deduplicate)
DONT_FILTER = True

REDIS_HOST = '120.24.87.40'
REDIS_PORT = '6379'
REDIS_DB = 1
REDIS_PARAMS = {
    'password': 'Guet@207',
}
# Use the scrapy_redis scheduler, i.e. the Scheduler that guarantees each host crawls different URLs
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
# Deduplication class used by Scrapy, i.e. RFPDupeFilter
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# Serialization
SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"
# Auto-throttling (limits the crawl speed to reduce load on the target server)
AUTOTHROTTLE_ENABLED = True

mongodb = {
    'host': '120.24.87.40',
    'port': 27017,
    'username': "admin",
    'password': "123456"
}


cookies = {
    'shshshfpa': '88442a32-4008-3016-2ed4-5f00ba6e02a6-1583306994',
    '__jdu': '1645417385963897370071',
    'shshshfpb': 'eKzbp55ekVbG%2BmXNii7FmnA%3D%3D',
    'unpl': 'JF8EAKpnNSttC0hdDR9XGhMQTw5XW15YGURUPDQCVFwIHFEHGQoSExZ7XlVdXhRKFR9vZxRXXlNKUQ4fBCsSE3tdVV9cD0gVBWduNWRtW0tkBCsCHBcUTl1SXFQMQxABZm8DVltZSlIFKzIcEhl7bWRbXQlKFQVpZAVXbVl7VgQaAB8XFEJcU24cZk4XAmZlSFRaXU9RBR0AEhYYTF9dVlsKTRYCaWc1VlReQ1I1GA',
    '__jdv': '76161171|baidu|-|organic|notset|1648984819662',
    'areaId': '20',
    'PCSYCityID': 'CN_450000_450300_0',
    'ipLoc-djd': '20-1726-22884-51455',
    'pinId': 'bxSvmv8CNc-KKJVv2AYoCLV9-x-f3wj7',
    'pin': 'jd_701eaef2a29ff',
    'unick': '%E6%9D%9C%E6%92%B0%E4%B8%AD%E7%9A%84%E6%9C%AA%E5%90%8D',
    '_tp': 'Wy5hpT1Zse8ScnmqROf6D38%2BJ8IK3ESqvwXymUySsLE%3D',
    '_pst': 'jd_701eaef2a29ff',
    'shshshfp': '9dc45f7c100e4d56fc0ddb7d96bc59ab',
    '__jdc': '122270672',
    '__jda': '122270672.1645417385963897370071.1645417385.1649430078.1649491583.20',
    'thor': 'FC76D1F441FDD23810222D65B21A1E62936594BACBBD52CF88EB830809C78B7F75DEEDA34458FE020C54AF1DD6E55F389B2063BCCBBD829B977631DD6FB67A17BA374FEFAA00AF1C8264A11F080FA449884B327D73A31031D1F35232730745A6BD1570FDB90E15A4002FCBF4C8CDED1F588BFA29B2272823A7263C9A88F289B84E2075F1A710BED202A651BD7ABBC09D354497FAFBB4A0A7CBDC9590803F6162',
    'ceshi3.com': '000',
    'token': '5a5e7384ead314efe3237efb8d9825fb,3,916384',
    '__tk': 'OINnrcq4NDN5NiuEqcrdriKiOcJ5sINEsca4rfKiqIhgNLbhOIN4sG,3,916384',
    'ip_cityCode': '1726',
    'wlfstk_smdl': 'xrr3qwmzs7zoj7y6kbt5nfkai2clqo2n',
    'shshshsID': '3c84311b6180b15c5bf06ba762b90ce2_7_1649491714829',
    '__jdb': '122270672.15.1645417385963897370071|20.1649491583',
    '3AB9D23F7A4B3C9B': '3WE3SSBAN2EJKLE3VP6ZK2I4UOCXTYVZTD7O46KFL2S7J2UVXVZ7IJGBJKC4S3RAWL2DRAMRLPN63TK3LHWTA5JVQQ',
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'jdSkuSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'authority': 'item.jd.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'zh,zh-CN;q=0.9',
    'cache-control': 'max-age=0',
    # Requests sorts cookies= alphabetically
    # 'cookie': 'shshshfpa=88442a32-4008-3016-2ed4-5f00ba6e02a6-1583306994; __jdu=1645417385963897370071; shshshfpb=eKzbp55ekVbG%2BmXNii7FmnA%3D%3D; unpl=JF8EAKpnNSttC0hdDR9XGhMQTw5XW15YGURUPDQCVFwIHFEHGQoSExZ7XlVdXhRKFR9vZxRXXlNKUQ4fBCsSE3tdVV9cD0gVBWduNWRtW0tkBCsCHBcUTl1SXFQMQxABZm8DVltZSlIFKzIcEhl7bWRbXQlKFQVpZAVXbVl7VgQaAB8XFEJcU24cZk4XAmZlSFRaXU9RBR0AEhYYTF9dVlsKTRYCaWc1VlReQ1I1GA; __jdv=76161171|baidu|-|organic|notset|1648984819662; areaId=20; PCSYCityID=CN_450000_450300_0; ipLoc-djd=20-1726-22884-51455; pinId=bxSvmv8CNc-KKJVv2AYoCLV9-x-f3wj7; pin=jd_701eaef2a29ff; unick=%E6%9D%9C%E6%92%B0%E4%B8%AD%E7%9A%84%E6%9C%AA%E5%90%8D; _tp=Wy5hpT1Zse8ScnmqROf6D38%2BJ8IK3ESqvwXymUySsLE%3D; _pst=jd_701eaef2a29ff; shshshfp=9dc45f7c100e4d56fc0ddb7d96bc59ab; __jdc=122270672; __jda=122270672.1645417385963897370071.1645417385.1649430078.1649491583.20; thor=FC76D1F441FDD23810222D65B21A1E62936594BACBBD52CF88EB830809C78B7F75DEEDA34458FE020C54AF1DD6E55F389B2063BCCBBD829B977631DD6FB67A17BA374FEFAA00AF1C8264A11F080FA449884B327D73A31031D1F35232730745A6BD1570FDB90E15A4002FCBF4C8CDED1F588BFA29B2272823A7263C9A88F289B84E2075F1A710BED202A651BD7ABBC09D354497FAFBB4A0A7CBDC9590803F6162; ceshi3.com=000; token=5a5e7384ead314efe3237efb8d9825fb,3,916384; __tk=OINnrcq4NDN5NiuEqcrdriKiOcJ5sINEsca4rfKiqIhgNLbhOIN4sG,3,916384; ip_cityCode=1726; wlfstk_smdl=xrr3qwmzs7zoj7y6kbt5nfkai2clqo2n; shshshsID=3c84311b6180b15c5bf06ba762b90ce2_7_1649491714829; __jdb=122270672.15.1645417385963897370071|20.1649491583; 3AB9D23F7A4B3C9B=3WE3SSBAN2EJKLE3VP6ZK2I4UOCXTYVZTD7O46KFL2S7J2UVXVZ7IJGBJKC4S3RAWL2DRAMRLPN63TK3LHWTA5JVQQ',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'jdSkuSpider.middlewares.JdskuspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'jdSkuSpider.middlewares.ProxyMiddleWare': 542,
    'jdSkuSpider.middlewares.JdskuspiderDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,
    'jdSkuSpider.pipelines.JdskuspiderPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
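DOWNLOADER_MIDDLEWARES above registers 'jdSkuSpider.middlewares.ProxyMiddleWare', but middlewares.py is not part of this commit. Below is a minimal sketch of how such a middleware could consume the PROXY_SERVER_URL setting; the implementation details are assumptions rather than the project's actual code, including the assumption that the vendor API returns a single "ip:port" line of plain text.

```python
# Assumed sketch of a proxy downloader middleware; not the repository's middlewares.py.
import requests


class ProxyMiddleWare:
    def __init__(self, proxy_server_url):
        self.proxy_server_url = proxy_server_url
        self.proxy = None

    @classmethod
    def from_crawler(cls, crawler):
        # Read the vendor API URL from the Scrapy settings shown above.
        return cls(crawler.settings.get('PROXY_SERVER_URL'))

    def process_request(self, request, spider):
        if self.proxy is None:
            # Fetch one proxy from the vendor API (assumed "ip:port" plain-text response).
            self.proxy = requests.get(self.proxy_server_url).text.strip()
        request.meta['proxy'] = 'http://{}'.format(self.proxy)
```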
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = jdSkuSpider.settings

[deploy]
#url = http://localhost:6800/
project = jdSkuSpider