parent
a8b8584cf4
commit
08809d2216
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1 @@
|
||||
[{"domain": ".taobao.com", "expiry": 1685468327, "httpOnly": false, "name": "l", "path": "/", "secure": false, "value": "fB_ILMgVTXNQ7v3bBOfwourza77OSIRxMuPzaNbMi9fPOmfp5yQAW65TY0T9C3hVFsQvR3ykA-nXBeYBqS0dz4mNB_LvYHkmnmOk-Wf.."}, {"domain": ".taobao.com", "expiry": 1685468327, "httpOnly": false, "name": "tfstk", "path": "/", "secure": false, "value": "cq51BK9gncmsS4VmshaU0aYlW_RfZYEB-RtO5lOE_r7LV3b1iZGyVDklnDA2ke1.."}, {"domain": ".taobao.com", "httpOnly": false, "name": "_l_g_", "path": "/", "sameSite": "None", "secure": true, "value": "Ug%3D%3D"}, {"domain": ".taobao.com", "httpOnly": false, "name": "_nk_", "path": "/", "sameSite": "None", "secure": true, "value": "tb321304412"}, {"domain": ".taobao.com", "httpOnly": false, "name": "existShop", "path": "/", "sameSite": "None", "secure": true, "value": "MTY2OTkxNjMyOA%3D%3D"}, {"domain": ".taobao.com", "httpOnly": true, "name": "cookie1", "path": "/", "sameSite": "None", "secure": true, "value": "AimWB5HotT62bF1%2BmBCcRd8gjVJXFKBHJr%2Fjjn6Vekk%3D"}, {"domain": ".taobao.com", "httpOnly": false, "name": "dnk", "path": "/", "sameSite": "None", "secure": true, "value": "tb321304412"}, {"domain": ".taobao.com", "httpOnly": false, "name": "cancelledSubSites", "path": "/", "sameSite": "None", "secure": true, "value": "empty"}, {"domain": ".taobao.com", "httpOnly": false, "name": "sg", "path": "/", "sameSite": "None", "secure": true, "value": "28e"}, {"domain": ".taobao.com", "expiry": 1672537127, "httpOnly": false, "name": "lgc", "path": "/", "sameSite": "None", "secure": true, "value": "tb321304412"}, {"domain": ".taobao.com", "httpOnly": false, "name": "csg", "path": "/", "sameSite": "None", "secure": true, "value": "d27084b6"}, {"domain": ".taobao.com", "expiry": 1672537127, "httpOnly": true, "name": "uc3", "path": "/", "sameSite": "None", "secure": true, "value": "lg2=V32FPkk%2Fw0dUvg%3D%3D&vt3=F8dCvjcIfs1PFPkuQrw%3D&nk2=F5RGNGdwoOrDWFI%3D&id2=UUphzW%2B%2BovW1q2RoMg%3D%3D"}, {"domain": ".taobao.com", "httpOnly": 
true, "name": "unb", "path": "/", "sameSite": "None", "secure": true, "value": "2207876772278"}, {"domain": ".taobao.com", "expiry": 1685468327, "httpOnly": false, "name": "isg", "path": "/", "sameSite": "None", "secure": true, "value": "BBoas2OnLZ2ml6ExsppfBWpwa8A8S54lOnUtxySTxq14l7rRDNvuNeBlYmMLFxa9"}, {"domain": ".taobao.com", "httpOnly": true, "name": "skt", "path": "/", "sameSite": "None", "secure": true, "value": "873c7ff29e138663"}, {"domain": ".taobao.com", "expiry": 1672537127, "httpOnly": true, "name": "uc4", "path": "/", "sameSite": "None", "secure": true, "value": "id4=0%40U2grFnDvlmnfW%2BNp%2B6ZGX9l5MMfTPSq%2F&nk4=0%40FY4NAeZV1KZ7RC8Zkc8%2FSEPqFqwKaQ%3D%3D"}, {"domain": ".taobao.com", "httpOnly": true, "name": "cookie2", "path": "/", "sameSite": "None", "secure": true, "value": "10339a3aaef951d408d9fe6c690714d0"}, {"domain": ".taobao.com", "expiry": 1701481127, "httpOnly": true, "name": "sgcookie", "path": "/", "sameSite": "None", "secure": true, "value": "E10071Id7xaU2Yoct9CgGOcxaZWAP3q4bb7bTdhFAibJTucWghbnO1%2Fryls4Hn399PNM16PK3LJh3Wjxt7cLslg5TIVrhUfAij890ae4N%2Fq0Po0%3D"}, {"domain": ".taobao.com", "httpOnly": false, "name": "uc1", "path": "/", "sameSite": "None", "secure": true, "value": "cookie15=URm48syIIVrSKA%3D%3D&cookie21=VFC%2FuZ9ainBZ&cookie14=UoeyBziqc0Vuvw%3D%3D&cookie16=UIHiLt3xCS3yM2h4eKHS9lpEOw%3D%3D&pas=0&existShop=false"}, {"domain": ".taobao.com", "expiry": 1701481127, "httpOnly": false, "name": "_cc_", "path": "/", "sameSite": "None", "secure": true, "value": "URm48syIZQ%3D%3D"}, {"domain": ".taobao.com", "httpOnly": true, "name": "cookie17", "path": "/", "sameSite": "None", "secure": true, "value": "UUphzW%2B%2BovW1q2RoMg%3D%3D"}, {"domain": ".taobao.com", "expiry": 1670002714, "httpOnly": false, "name": "xlly_s", "path": "/", "sameSite": "None", "secure": true, "value": "1"}, {"domain": ".taobao.com", "expiry": 1677721127, "httpOnly": false, "name": "t", "path": "/", "sameSite": "None", "secure": true, "value": 
"cf7516845097b2b6ef1fa91777a658c7"}, {"domain": ".taobao.com", "expiry": 1701481127, "httpOnly": false, "name": "tracknick", "path": "/", "sameSite": "None", "secure": true, "value": "tb321304412"}, {"domain": ".taobao.com", "expiry": 1704476313, "httpOnly": false, "name": "cna", "path": "/", "sameSite": "None", "secure": true, "value": "mtQPHA0NHXICAbZb2NqBoZ0S"}, {"domain": ".taobao.com", "httpOnly": false, "name": "_tb_token_", "path": "/", "sameSite": "None", "secure": true, "value": "e8b5b3e0b6583"}, {"domain": ".taobao.com", "httpOnly": true, "name": "_samesite_flag_", "path": "/", "sameSite": "None", "secure": true, "value": "true"}]
|
@ -0,0 +1,5 @@
|
||||
import scrapy
|
||||
class TaobaoItem(scrapy.Item):
    """Item carrying one seed search URL plus the request identity used for it."""

    # URL of the page the item was produced from.
    url = scrapy.Field()
    # Request header information recorded alongside the URL.
    # NOTE(review): the spider in this project appears to store only the
    # User-Agent string here, not a full header mapping - confirm.
    headers = scrapy.Field()
    # Cookie string that was attached to the request.
    cookie = scrapy.Field()
|
@ -0,0 +1,12 @@
|
||||
import re
|
||||
from scrapy.exceptions import IgnoreRequest
|
||||
|
||||
class TaobaoDownloaderMiddleware:
    """Downloader middleware hooks for the Taobao project."""

    def process_response(self, request, response, spider):
        """Intercept every response and hand it on unchanged.

        Args:
            request: The request that produced ``response``.
            response: The downloaded response.
            spider: The spider the request belongs to.

        Returns:
            The same ``response`` object, untouched.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Record a download failure and let normal exception handling go on.

        The exception and the failing request are logged so the failure can
        be diagnosed; previously only a fixed marker string was printed and
        the details were lost.

        Args:
            request: The request whose download raised.
            exception: The exception that was raised.
            spider: The spider the request belongs to. Its ``logger`` is used
                when present; otherwise a module-level stdlib logger is used.

        Returns:
            Always ``None``, so this hook neither replaces the request nor
            supplies a response.
        """
        # Local import: this block does not otherwise need `logging`, and the
        # file's import section is left untouched.
        import logging

        logger = getattr(spider, 'logger', None) or logging.getLogger(__name__)
        logger.error(
            '发生异常: %r (request: %s)',
            exception,
            getattr(request, 'url', request),
        )
        return None
|
@ -0,0 +1,105 @@
|
||||
import os
|
||||
import json
|
||||
import scrapy
|
||||
import random
|
||||
from scrapy_redis.spiders import RedisCrawlSpider
|
||||
from Taobao.items import TaobaoItem
|
||||
# selenium模块引入
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from time import sleep
|
||||
|
||||
|
||||
class TaobaoSpider(RedisCrawlSpider):
    """Master spider that seeds Taobao search URLs together with a login cookie.

    ``start_requests`` builds the search-result URLs from the ``GOODS`` and
    ``MAX_PAGE`` settings, obtains a cookie string (from the cached cookie
    file, or by logging in through a Selenium-driven Chrome session), and
    yields one request per URL. ``parse`` turns each response into a
    ``TaobaoItem`` holding the URL, the cookie string and the User-Agent.
    """

    name = 'TaobaoMaster'
    # allowed_domain = ['.taobao.com']
    # start_urls = ['https://s.taobao.com/search?s=0&q=运动鞋']

    # Cached login cookies: a JSON list of cookie dicts as returned by Selenium.
    COOKIE_FILE = r'./Taobao/info/cookies.json'
    # Number of browser logins attempted before giving up.
    MAX_LOGIN_ATTEMPTS = 3

    # Headers sent with every seed request; filled in by start_requests().
    head = {
        'user-agent': '',
        'cookie': '',
    }

    def start_requests(self):
        """Yield one search request per result page, authenticated by cookie.

        Yields:
            scrapy.Request: one request per generated search URL. Nothing is
            yielded when no cookie could be obtained (login failed every
            attempt, or acquiring the cookie raised).
        """
        print('执行初始化url操作')
        # Pick the disguise User-Agent once for this run.
        self.head['user-agent'] = random.choice(self.settings['USER_AGENTS'])
        base_url = self._build_search_urls()

        # Original author's note: one set of cookies is only usable for about
        # 30 minutes, so headers need to be swapped while the crawl runs.
        try:
            cookiesStr = self._load_saved_cookies()
            if cookiesStr is None:
                print('不存在cookies')
                cookiesStr = self._login_and_fetch_cookies()
        except Exception as e:
            # Boundary handler: report the problem and emit no requests.
            print(e)
            return

        if cookiesStr is None:
            # Every login attempt failed; do not crawl with unusable cookies.
            print('登陆失败')
            return

        self.head['cookie'] = cookiesStr
        # NOTE(review): this prints the full session cookie to stdout.
        print(cookiesStr)
        for url in base_url:
            yield scrapy.Request(
                url,
                headers=self.head,
                callback=self.parse,
                meta={'cookiejar': cookiesStr, 'proxy': ''},
            )

    def _build_search_urls(self):
        """Return the search URLs for pages 1..MAX_PAGE of the GOODS query."""
        goods = self.settings['GOODS']
        # NOTE(review): offsets start at 44, so the s=0 page is never
        # requested - confirm whether the first results page is skipped
        # on purpose.
        return [
            'https://s.taobao.com/search?q=' + goods + '&s=' + str(page * 44)
            for page in range(1, self.settings['MAX_PAGE'] + 1)
        ]

    @staticmethod
    def _cookie_header(cookies_list):
        """Join cookie dicts into a ``name=value;name=value`` header string."""
        return ';'.join(
            item['name'] + '=' + item['value'] for item in cookies_list
        )

    def _load_saved_cookies(self):
        """Return the cached cookie string, or None if the cache is missing or empty."""
        path = self.COOKIE_FILE
        # A missing file is treated like an empty one, so a fresh checkout
        # falls through to the browser login instead of raising.
        if not (os.path.isfile(path) and os.path.getsize(path)):
            return None
        print('已存在cookies')
        with open(path, 'r', encoding='utf-8') as f:
            cookies_list = json.load(f)
        return self._cookie_header(cookies_list)

    def _save_cookies(self, cookies_list):
        """Write the cookie dicts to the cache file as JSON."""
        folder = os.path.dirname(self.COOKIE_FILE)
        if folder:
            os.makedirs(folder, exist_ok=True)
        with open(self.COOKIE_FILE, 'w', encoding='utf-8') as f:
            json.dump(cookies_list, f)

    def _login_and_fetch_cookies(self):
        """Log in through Chrome and return the cookie string, or None on failure.

        Up to ``MAX_LOGIN_ATTEMPTS`` browser sessions are tried. Cookies are
        cached to ``COOKIE_FILE`` only after an attempt is judged successful.
        """
        options = webdriver.ChromeOptions()
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument('--disable-gpu')
        options.add_experimental_option('excludeSwitches', ['enable-logging'])

        for attempt in range(1, self.MAX_LOGIN_ATTEMPTS + 1):
            driver = webdriver.Chrome(options=options)
            try:
                driver.implicitly_wait(15)
                driver.get("https://s.taobao.com/")
                # Open the login form and submit the credentials.
                driver.find_element(by=By.CLASS_NAME, value='h').click()
                sleep(4)
                driver.find_element(by=By.NAME, value='fm-login-id').send_keys(self.settings['USERNAME'])
                sleep(4)
                driver.find_element(by=By.NAME, value="fm-login-password").send_keys(self.settings['PASSWORD'])
                sleep(4)
                driver.find_element(by=By.CLASS_NAME, value='fm-submit').click()
                sleep(10)  # time to confirm the login on the phone
                cookies_list = driver.get_cookies()
            finally:
                # Always release the browser, on success as well as on error.
                driver.quit()

            cookiesStr = self._cookie_header(cookies_list)
            print('str:', cookiesStr[:1])
            # A login is treated as successful when the cookie string starts
            # with 'l' or 't'; an empty string counts as a failure.
            if cookiesStr.startswith(('l', 't')):
                self._save_cookies(cookies_list)
                return cookiesStr

            print('第%d次登陆失败' % attempt)
            sleep(4)
        return None

    def parse(self, response):
        """Build a TaobaoItem from the response URL, cookie string and User-Agent."""
        item = TaobaoItem()
        item['url'] = response.url
        item['cookie'] = response.meta['cookiejar']
        item['headers'] = self.head['user-agent']
        return item
|
Binary file not shown.
Binary file not shown.
@ -0,0 +1,11 @@
|
||||
# Automatically created by: scrapy startproject
|
||||
#
|
||||
# For more information about the [deploy] section see:
|
||||
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||
|
||||
[settings]
|
||||
default = Taobao.settings
|
||||
|
||||
[deploy]
|
||||
#url = http://localhost:6800/
|
||||
project = Taobao
|
Loading…
Reference in new issue