import scrapy
from bs4 import BeautifulSoup
from scrapy_redis.spiders import RedisSpider

from VulCrawl.items import VulcrawlItem


class Vulcrawl2Spider(RedisSpider):
    """Distributed (scrapy-redis) spider scraping vulnerability listings
    from the WooYun mirror's paginated search results.

    The initial request URL is popped from the Redis list named by
    ``redis_key``; subsequent pages are scheduled locally in ``parse``.
    """

    name = 'vulcrawl2'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://wooyun.m4d3bug.com/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page=1']
    # Redis list key the scrapy-redis scheduler reads start URLs from.
    redis_key = "Vul"
    # Next results page to request; crawling stops before page 10.
    # NOTE(review): class-level counter — in a distributed run each worker
    # process keeps its own count. Confirm that is the intended behavior.
    page = 2

    def parse(self, response):
        """Parse one search-results page.

        Yields one ``VulcrawlItem`` per data row of the results table
        (the header row is skipped), then schedules a request for the
        next page until ``self.page`` reaches 10.
        """
        if self.page < 10:
            soup = BeautifulSoup(response.body, "lxml")
            rows = soup.findAll('tr')
            # rows[0] is the table header; iterate only the data rows.
            for row in rows[1:]:
                td = row.findAll('td')
                # Create a fresh item per row: the original reused one
                # mutable item across yields, so later rows could clobber
                # earlier ones before the pipeline handled them.
                item = VulcrawlItem()
                item['Numbering'] = td[2].string.strip()
                item['time'] = td[0].string
                item['title'] = td[1].string.strip()
                item['url'] = "https://wooyun.m4d3bug.com/" + td[1].find('a')['href']
                yield item
            self.page += 1
            url2 = "https://wooyun.m4d3bug.com/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page=" + str(self.page)
            yield scrapy.Request(url=url2, callback=self.parse)