def parse(self, response):
    """Scrape one search-results page and queue the following page.

    Every ``<tr>`` after the first one in ``response.body`` is turned into
    its own ``VulcrawlItem`` carrying the ``Numbering``, ``time``,
    ``title`` and ``url`` fields.  After the rows, ``self.page`` is
    incremented and a request for that page number is yielded with this
    method as its callback.  Once ``self.page`` has reached 10 nothing is
    yielded at all, which ends the pagination chain.

    Args:
        response: The downloaded search-results page.

    Yields:
        One ``VulcrawlItem`` per data row, followed by a single
        ``scrapy.Request`` for the next results page.
    """
    # Guard clause: stop following pagination once the page cap is hit.
    if self.page >= 10:
        return

    soup = BeautifulSoup(response.body, "lxml")

    # The first <tr> is dropped (presumably the table header -- confirm
    # against the site markup).  Slicing, unlike ``del rows[0]``, does not
    # raise IndexError when the page contains no rows at all.
    for row in soup.find_all("tr")[1:]:
        cells = row.find_all("td")
        if len(cells) < 3:
            # Not a data row (too few cells); skip it instead of failing
            # the whole page with an IndexError.
            continue

        # Build a fresh item for every row.  Re-using one instance would
        # make every yielded item alias the same object, so later rows
        # would overwrite earlier ones still held by pipelines.
        item = VulcrawlItem()
        item['Numbering'] = cells[2].string.strip()
        item['time'] = cells[0].string
        item['title'] = cells[1].string.strip()
        item['url'] = "https://wooyun.m4d3bug.com/" + cells[1].find('a')['href']
        yield item

    # NOTE(review): ``page`` starts at 2 on the class and is incremented
    # before the URL is built, so the first follow-up request is for
    # page 3 -- confirm the start URL(s) already cover page 2.
    self.page += 1
    url2 = "https://wooyun.m4d3bug.com/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page=" + str(self.page)
    yield scrapy.Request(url=url2, callback=self.parse)