import re

import scrapy

# Captures the text between the first '>' and the following '<' of an HTML
# fragment, i.e. the text of a simple ``<td ...>text</td>`` cell.
_CELL_TEXT_RE = re.compile(r'>(.*?)<')

# XPath template for one cell of the proxy table; the placeholders are the
# 1-based row and column positions.
_CELL_XPATH = '//*[@id="list"]/table/tbody/tr[{}]/td[{}]'

# Number of table rows inspected per response (rows 1 through 15).
_MAX_ROWS = 15


def _cell_text(cell_html):
    """Return the text between the first '>' and the next '<' of *cell_html*.

    Returns ``None`` when *cell_html* is ``None`` (the selector matched
    nothing) or when it contains no ``>...<`` span on a single line.
    """
    if cell_html is None:
        return None
    match = _CELL_TEXT_RE.search(cell_html)
    if match is None:
        return None
    return match.group(1)


class IpproxySpider(scrapy.Spider):
    """Spider that reads ip/port pairs from the kuaidaili.com free-proxy table.

    Each item yielded by :meth:`parse` is a dict with a single ``'ip'`` key
    whose value has the form ``'http://<ip>:<port>'``.
    """

    name = 'ipproxy'
    allowed_domains = ['kuaidaili.com']
    start_urls = ['http://kuaidaili.com/free']
    # Settings applied to this spider only.
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'spider.middlewares.IPDownloaderMiddleware': 543,
        },
        'ITEM_PIPELINES': {
            'spider.pipelines.IPProxyPipeline': 500,
        },
        'DOWNLOAD_DELAY': 5,
    }

    def parse(self, response):
        """Yield one ``{'ip': 'http://<ip>:<port>'}`` dict per table row.

        Looks at rows 1.._MAX_ROWS of the table under the element with
        ``id="list"``, taking the first column as the IP and the second as
        the port. Rows that are missing, or whose cells cannot be parsed,
        are skipped instead of raising.
        """
        for row in range(1, _MAX_ROWS + 1):
            ip_html = response.xpath(_CELL_XPATH.format(row, 1)).extract_first()
            port_html = response.xpath(_CELL_XPATH.format(row, 2)).extract_first()

            ip = _cell_text(ip_html)
            port = _cell_text(port_html)
            if ip is None or port is None:
                # extract_first() gives None when the row does not exist
                # (e.g. the page lists fewer than _MAX_ROWS proxies).
                self.logger.debug(
                    'Skipping row %d: ip cell=%r, port cell=%r',
                    row, ip_html, port_html,
                )
                continue

            proxy_url = 'http://' + ip + ':' + port
            self.logger.debug('Found proxy %s', proxy_url)
            yield {'ip': proxy_url}