parent
b8642fcca1
commit
f881ba4164
@ -0,0 +1,33 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
|
||||||
|
|
||||||
|
class IpproxySpider(scrapy.Spider):
    """Scrape the free-proxy table on kuaidaili.com and yield proxy URLs.

    Each item yielded by :meth:`parse` is a dict of the form
    ``{'ip': 'http://<address>:<port>'}``; downstream handling is done by
    the project pipeline/middleware wired in via ``custom_settings``.
    """

    name = 'ipproxy'
    allowed_domains = ['kuaidaili.com']
    start_urls = ['http://kuaidaili.com/free']
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'spider.middlewares.IPDownloaderMiddleware': 543,
        },
        'ITEM_PIPELINES': {
            'spider.pipelines.IPProxyPipeline': 500,
        },
        # Throttle requests: the free-proxy listing blocks aggressive crawlers.
        'DOWNLOAD_DELAY': 5,
    }

    def parse(self, response):
        """Parse the proxy-list table on the response page.

        Yields:
            dict: ``{'ip': 'http://<address>:<port>'}`` for every complete
            table row found.
        """
        # Iterate over the rows actually present instead of the original
        # hard-coded range(1, 16), which broke on pages with more or fewer
        # than 15 rows.
        for row in response.xpath('//*[@id="list"]/table/tbody/tr'):
            # text() extracts the cell content directly; the original
            # regex-scraped the raw '<td>...</td>' markup for the same value.
            ip = row.xpath('td[1]/text()').extract_first()
            port = row.xpath('td[2]/text()').extract_first()
            if not ip or not port:
                # Incomplete row: skip it rather than crash on None
                # (the original passed None into re.findall -> TypeError).
                continue
            proxy = 'http://' + ip.strip() + ':' + port.strip()
            # Use the spider's logger instead of bare print() calls.
            self.logger.debug('found proxy %s', proxy)
            yield {'ip': proxy}
|
||||||
|
|
Loading…
Reference in new issue