You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
34 lines
983 B
34 lines
983 B
3 years ago
|
import re
|
||
|
|
||
|
import scrapy
|
||
|
|
||
|
|
||
|
class IpproxySpider(scrapy.Spider):
    """Scrape free proxy addresses from the kuaidaili.com listing page.

    Each row of the table under the element with ``id="list"`` is read as
    ``<ip>`` (first cell) and ``<port>`` (second cell), and one item of the
    form ``{'ip': 'http://<ip>:<port>'}`` is yielded per usable row.
    """

    name = 'ipproxy'
    allowed_domains = ['kuaidaili.com']
    start_urls = ['http://kuaidaili.com/free']

    # Per-spider overrides: route requests through the project's proxy
    # downloader middleware, hand items to the proxy pipeline, and throttle
    # requests with Scrapy's DOWNLOAD_DELAY setting.
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'spider.middlewares.IPDownloaderMiddleware': 543,
        },
        'ITEM_PIPELINES': {
            'spider.pipelines.IPProxyPipeline': 500,
        },
        'DOWNLOAD_DELAY': 5,
    }

    # Rows of the proxy table; cells are addressed relative to each row.
    _ROWS_XPATH = '//*[@id="list"]/table/tbody/tr'
    # First ">...<" run in a serialized cell, i.e. the text between the
    # opening and closing tag. Compiled once instead of on every row.
    _CELL_TEXT_RE = re.compile(r'>.*?<')

    @classmethod
    def _cell_text(cls, cell_html):
        """Return the text between the tags of a serialized cell, or None.

        ``cell_html`` is the markup returned by ``extract_first()`` and may be
        ``None`` when the cell does not exist. ``None`` is also returned when
        the markup contains no ``>...<`` run on a single line.
        """
        if cell_html is None:
            return None
        match = cls._CELL_TEXT_RE.search(cell_html)
        if match is None:
            return None
        # Drop the surrounding '>' and '<' captured by the pattern.
        return match.group(0)[1:-1]

    def parse(self, response):
        """Yield one ``{'ip': 'http://<ip>:<port>'}`` dict per table row.

        Rows that lack an IP or port cell, or whose cells are empty or not in
        the expected ``<td ...>value</td>`` form, are skipped instead of
        raising, so a short or partially rendered page does not abort the
        crawl.
        """
        for row in response.xpath(self._ROWS_XPATH):
            ip = self._cell_text(row.xpath('td[1]').extract_first())
            port = self._cell_text(row.xpath('td[2]').extract_first())
            if not ip or not port:
                self.logger.debug('Skipping row without ip/port: %r', row.extract())
                continue

            proxy_url = 'http://' + ip + ':' + port
            self.logger.debug('Found proxy %s', proxy_url)
            yield {'ip': proxy_url}
|
||
|
|