master
Zhao 2 years ago
parent a8b8584cf4
commit 08809d2216

@ -0,0 +1 @@
[{"domain": ".taobao.com", "expiry": 1685468327, "httpOnly": false, "name": "l", "path": "/", "secure": false, "value": "fB_ILMgVTXNQ7v3bBOfwourza77OSIRxMuPzaNbMi9fPOmfp5yQAW65TY0T9C3hVFsQvR3ykA-nXBeYBqS0dz4mNB_LvYHkmnmOk-Wf.."}, {"domain": ".taobao.com", "expiry": 1685468327, "httpOnly": false, "name": "tfstk", "path": "/", "secure": false, "value": "cq51BK9gncmsS4VmshaU0aYlW_RfZYEB-RtO5lOE_r7LV3b1iZGyVDklnDA2ke1.."}, {"domain": ".taobao.com", "httpOnly": false, "name": "_l_g_", "path": "/", "sameSite": "None", "secure": true, "value": "Ug%3D%3D"}, {"domain": ".taobao.com", "httpOnly": false, "name": "_nk_", "path": "/", "sameSite": "None", "secure": true, "value": "tb321304412"}, {"domain": ".taobao.com", "httpOnly": false, "name": "existShop", "path": "/", "sameSite": "None", "secure": true, "value": "MTY2OTkxNjMyOA%3D%3D"}, {"domain": ".taobao.com", "httpOnly": true, "name": "cookie1", "path": "/", "sameSite": "None", "secure": true, "value": "AimWB5HotT62bF1%2BmBCcRd8gjVJXFKBHJr%2Fjjn6Vekk%3D"}, {"domain": ".taobao.com", "httpOnly": false, "name": "dnk", "path": "/", "sameSite": "None", "secure": true, "value": "tb321304412"}, {"domain": ".taobao.com", "httpOnly": false, "name": "cancelledSubSites", "path": "/", "sameSite": "None", "secure": true, "value": "empty"}, {"domain": ".taobao.com", "httpOnly": false, "name": "sg", "path": "/", "sameSite": "None", "secure": true, "value": "28e"}, {"domain": ".taobao.com", "expiry": 1672537127, "httpOnly": false, "name": "lgc", "path": "/", "sameSite": "None", "secure": true, "value": "tb321304412"}, {"domain": ".taobao.com", "httpOnly": false, "name": "csg", "path": "/", "sameSite": "None", "secure": true, "value": "d27084b6"}, {"domain": ".taobao.com", "expiry": 1672537127, "httpOnly": true, "name": "uc3", "path": "/", "sameSite": "None", "secure": true, "value": "lg2=V32FPkk%2Fw0dUvg%3D%3D&vt3=F8dCvjcIfs1PFPkuQrw%3D&nk2=F5RGNGdwoOrDWFI%3D&id2=UUphzW%2B%2BovW1q2RoMg%3D%3D"}, {"domain": ".taobao.com", "httpOnly": 
true, "name": "unb", "path": "/", "sameSite": "None", "secure": true, "value": "2207876772278"}, {"domain": ".taobao.com", "expiry": 1685468327, "httpOnly": false, "name": "isg", "path": "/", "sameSite": "None", "secure": true, "value": "BBoas2OnLZ2ml6ExsppfBWpwa8A8S54lOnUtxySTxq14l7rRDNvuNeBlYmMLFxa9"}, {"domain": ".taobao.com", "httpOnly": true, "name": "skt", "path": "/", "sameSite": "None", "secure": true, "value": "873c7ff29e138663"}, {"domain": ".taobao.com", "expiry": 1672537127, "httpOnly": true, "name": "uc4", "path": "/", "sameSite": "None", "secure": true, "value": "id4=0%40U2grFnDvlmnfW%2BNp%2B6ZGX9l5MMfTPSq%2F&nk4=0%40FY4NAeZV1KZ7RC8Zkc8%2FSEPqFqwKaQ%3D%3D"}, {"domain": ".taobao.com", "httpOnly": true, "name": "cookie2", "path": "/", "sameSite": "None", "secure": true, "value": "10339a3aaef951d408d9fe6c690714d0"}, {"domain": ".taobao.com", "expiry": 1701481127, "httpOnly": true, "name": "sgcookie", "path": "/", "sameSite": "None", "secure": true, "value": "E10071Id7xaU2Yoct9CgGOcxaZWAP3q4bb7bTdhFAibJTucWghbnO1%2Fryls4Hn399PNM16PK3LJh3Wjxt7cLslg5TIVrhUfAij890ae4N%2Fq0Po0%3D"}, {"domain": ".taobao.com", "httpOnly": false, "name": "uc1", "path": "/", "sameSite": "None", "secure": true, "value": "cookie15=URm48syIIVrSKA%3D%3D&cookie21=VFC%2FuZ9ainBZ&cookie14=UoeyBziqc0Vuvw%3D%3D&cookie16=UIHiLt3xCS3yM2h4eKHS9lpEOw%3D%3D&pas=0&existShop=false"}, {"domain": ".taobao.com", "expiry": 1701481127, "httpOnly": false, "name": "_cc_", "path": "/", "sameSite": "None", "secure": true, "value": "URm48syIZQ%3D%3D"}, {"domain": ".taobao.com", "httpOnly": true, "name": "cookie17", "path": "/", "sameSite": "None", "secure": true, "value": "UUphzW%2B%2BovW1q2RoMg%3D%3D"}, {"domain": ".taobao.com", "expiry": 1670002714, "httpOnly": false, "name": "xlly_s", "path": "/", "sameSite": "None", "secure": true, "value": "1"}, {"domain": ".taobao.com", "expiry": 1677721127, "httpOnly": false, "name": "t", "path": "/", "sameSite": "None", "secure": true, "value": 
"cf7516845097b2b6ef1fa91777a658c7"}, {"domain": ".taobao.com", "expiry": 1701481127, "httpOnly": false, "name": "tracknick", "path": "/", "sameSite": "None", "secure": true, "value": "tb321304412"}, {"domain": ".taobao.com", "expiry": 1704476313, "httpOnly": false, "name": "cna", "path": "/", "sameSite": "None", "secure": true, "value": "mtQPHA0NHXICAbZb2NqBoZ0S"}, {"domain": ".taobao.com", "httpOnly": false, "name": "_tb_token_", "path": "/", "sameSite": "None", "secure": true, "value": "e8b5b3e0b6583"}, {"domain": ".taobao.com", "httpOnly": true, "name": "_samesite_flag_", "path": "/", "sameSite": "None", "secure": true, "value": "true"}]

@ -0,0 +1,5 @@
import scrapy
class TaobaoItem(scrapy.Item):
    """Container for one scraped search-result entry.

    Carries the result-page URL together with the user-agent and cookie
    string that were used to fetch it, so slave crawlers can replay the
    request with the same identity.
    """
    url = scrapy.Field()      # search-result page URL
    headers = scrapy.Field()  # user-agent string used for the request
    cookie = scrapy.Field()   # cookie header string ('name=value;...')

@ -0,0 +1,12 @@
import re
from scrapy.exceptions import IgnoreRequest
class TaobaoDownloaderMiddleware:
    """Downloader-middleware hooks for the Taobao crawler.

    Currently a pass-through: responses are forwarded untouched and
    download exceptions are merely reported to stdout.
    """

    def process_response(self, request, response, spider):
        """Intercept every downloaded response; forward it unchanged."""
        return response

    def process_exception(self, request, exception, spider):
        """Intercept a request that raised during download.

        Prints a notice; returning None hands the exception back to
        Scrapy's default exception processing.
        """
        print('发生异常')
        return None

@ -0,0 +1,33 @@
import re
import redis
# useful for handling different item types with a single interface
# class TaobaoPipeline:
# def process_item(self,item,spider):
# return item
# NOTE: after this pipeline returns the item, the scrapy-redis RedisPipeline
# sends the serialized item on to Redis, so that pipeline must run last.
class MasterPipeline(object):
    """Routes scraped URLs into Redis lists for the slave crawlers.

    URLs that were bounced to the Taobao login page are pushed onto an
    error list; all other search-result URLs are pushed onto the shared
    ``TaobaoMaster:start_urls`` list consumed by the slaves.
    """

    def __init__(self, host, port):
        # Connect to Redis; decode_responses=True so values come back as
        # str rather than bytes.
        self.r = redis.StrictRedis(
            host=host, port=port, db=0, decode_responses=True)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from crawler settings (REDIS_HOST / REDIS_PORT)."""
        return cls(
            host=crawler.settings.get("REDIS_HOST"),
            port=crawler.settings.get("REDIS_PORT", 6379),
        )

    def process_item(self, item, spider):
        """Push the item's URL to the appropriate Redis list and pass it on.

        A 'login' fragment in the URL means the request was redirected to
        the login page, so it is recorded separately instead of being fed
        to the slave crawlers.  (Plain substring test replaces the
        previous ``re.findall`` call — identical truthiness, no regex.)
        """
        if 'login' in item['url']:
            self.r.lpush('TaobaoMaster:error', item['url'])
        else:
            self.r.lpush('TaobaoMaster:start_urls', item['url'])
        return item

@ -0,0 +1,164 @@
# Scrapy settings for Taobao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'Taobao'
SPIDER_MODULES = ['Taobao.spiders']
NEWSPIDER_MODULE = 'Taobao.spiders'
# Log only errors, to ./log.log (raise to WARNING when debugging).
LOG_LEVEL = "ERROR" # WARNING
LOG_FILE = "./log.log"
# Number of search-result pages to enqueue (spider requests page*44 offsets).
MAX_PAGE = 60
# Search keyword ("sports shoes").
GOODS = '运动鞋'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = None
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable Scrapy's own cookie handling — the spider injects cookies manually
# through the request headers it builds from cookies.json.
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'Taobao.middlewares.TaobaoSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'Taobao.middlewares.TaobaoDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'Taobao.pipelines.TaobaoPipeline': 300,
    'Taobao.pipelines.MasterPipeline': 300,
    # scrapy-redis pipeline: stores the serialized item in Redis; runs last.
    'scrapy_redis.pipelines.RedisPipeline': 900,
}
# IMAGES_STORE ='/downloadImg'
# Dedup filter backed by a Redis set, so request-fingerprint deduplication
# persists across runs and is shared between master and slaves.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use scrapy-redis's own scheduler (request queue lives in Redis).
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether the scheduler state persists after the crawl ends: True keeps the
# Redis request queue and fingerprint set; False clears them on finish.
SCHEDULER_PERSIST = False
# Queue class used to order crawl URLs.
# Default: priority ordering (Scrapy default), a non-FIFO/LIFO queue
# implemented with a Redis sorted set.
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
REDIS_HOST = "10.70.107.213"
REDIS_PORT = 6379
FEED_EXPORT_ENCODING = 'utf-8'
# Pool of user-agent strings for random header spoofing.
USER_AGENTS = [
    # Opera
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Opera/8.0 (Windows NT 5.1; U; en)",
    "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
    # Firefox
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    # Safari
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    # Chrome
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    # 360 Browser
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    # Taobao Browser
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    # Liebao (Cheetah) Browser
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    # QQ Browser
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    # Sogou Browser
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
    # Maxthon Browser
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
    # UC Browser
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
]
# Account username and password used by the Selenium login flow.
# SECURITY(review): plaintext Taobao credentials are committed to the repo.
# Move these into environment variables or an untracked local settings file,
# and rotate the leaked credentials.
# USERNAME = '13211501016'
# PASSWORD = 'qq998765mM'
USERNAME = '13387732452'
PASSWORD = 'qq@2654922683eE'

@ -0,0 +1,105 @@
import os
import json
import scrapy
import random
from scrapy_redis.spiders import RedisCrawlSpider
from Taobao.items import TaobaoItem
# selenium模块引入
from selenium import webdriver
from selenium.webdriver.common.by import By
from time import sleep
class TaobaoSpider(RedisCrawlSpider):
    """Master spider: generates Taobao search-result URLs for a keyword.

    Login cookies are loaded from ./Taobao/info/cookies.json when the cache
    file is non-empty; otherwise a Selenium-driven login is attempted (up to
    3 times) and the fresh cookies are written back to that file for reuse.
    NOTE(review): per the original comment, a cookie set is only usable for
    ~30 minutes, so headers should be refreshed via middleware on long crawls.
    """
    name = 'TaobaoMaster'
    # allowed_domain = ['.taobao.com']
    # start_urls = ['https://s.taobao.com/search?s=0&q=运动鞋']

    # Request headers shared by all requests; filled in by start_requests().
    head = {
        'user-agent': '',
        'cookie': '',
    }

    def start_requests(self):
        print('执行初始化url操作')
        # Randomize the user-agent once per crawl session.
        self.head['user-agent'] = random.choice(self.settings['USER_AGENTS'])
        # Build the paginated search URLs; Taobao's 's' parameter is an
        # offset of 44 items per page.
        base_url = []
        for page in range(1, self.settings['MAX_PAGE'] + 1):
            url = 'https://s.taobao.com/search?q=' + self.settings['GOODS'] + '&s=' + str(page * 44)
            base_url.append(url)
        try:
            cookiesStr = self._obtain_cookies()
            self.head['cookie'] = cookiesStr
            print(cookiesStr)
            for url in base_url:
                # 'cookiejar' meta exposes the cookie string to parse()/pipelines.
                yield scrapy.Request(url, headers=self.head, callback=self.parse,
                                     meta={'cookiejar': cookiesStr, 'proxy': ''})
        except Exception as e:
            # Any failure (missing cookie file, Selenium error, ...) aborts
            # the crawl after reporting the cause.
            print(e)
            return None

    def _read_cookie_string(self):
        """Read cookies.json and join entries into a 'name=value;...' string."""
        with open(r'./Taobao/info/cookies.json', 'r', encoding='utf-8') as f:
            cookies_list = json.loads(f.read())
        return ';'.join(item["name"] + "=" + item["value"] for item in cookies_list)

    def _obtain_cookies(self):
        """Return a cookie header string, from the cache or via Selenium login.

        Retries the login up to 3 times, wiping the cookie cache after each
        failed attempt.  BUGFIX: the Chrome driver is now quit on the
        successful-login path too (it previously leaked — quit() only ran on
        the failure path), and the unreachable in-loop ``count == 3`` check
        was moved after the loop where it can actually fire.
        """
        if os.path.getsize(r'./Taobao/info/cookies.json'):
            print('已存在cookies')
            return self._read_cookie_string()
        print('不存在cookies')
        options = webdriver.ChromeOptions()
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument('--disable-gpu')
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        count = 0
        cookiesStr = ''
        while count != 3:
            driver = webdriver.Chrome(options=options)
            driver.implicitly_wait(15)
            driver.get("https://s.taobao.com/")
            # Drive the login form with the credentials from settings.
            driver.find_element(by=By.CLASS_NAME, value='h').click()
            sleep(4)
            driver.find_element(by=By.NAME, value='fm-login-id').send_keys(self.settings['USERNAME'])
            sleep(4)
            driver.find_element(by=By.NAME, value="fm-login-password").send_keys(self.settings['PASSWORD'])
            sleep(4)
            driver.find_element(by=By.CLASS_NAME, value='fm-submit').click()
            sleep(10)  # allow time for phone confirmation of the login
            # Persist the browser cookies, then re-read them as a header string.
            with open(r'./Taobao/info/cookies.json', 'w') as f:
                f.write(json.dumps(driver.get_cookies()))
            cookiesStr = self._read_cookie_string()
            print('str:', cookiesStr[0])
            # A logged-in cookie string starts with 'l' or 't' (see the
            # cached cookies: first names are 'l'/'tfstk') — anything else
            # means the login attempt failed.
            if cookiesStr[0] != 'l' and cookiesStr[0] != 't':
                print('%d次登陆失败' % count)
                with open(r'./Taobao/info/cookies.json', 'w') as f:
                    f.write('')
                count += 1
                sleep(4)
                driver.quit()
                continue
            driver.quit()  # release the browser on success as well
            break
        if count == 3:
            print('登陆失败')
        return cookiesStr

    def parse(self, response):
        """Wrap each search-result URL (plus the identity used) in a TaobaoItem."""
        item = TaobaoItem()
        item['url'] = response.url
        item['cookie'] = response.meta['cookiejar']
        item['headers'] = self.head['user-agent']
        return item

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = Taobao.settings
[deploy]
#url = http://localhost:6800/
project = Taobao
Loading…
Cancel
Save