import scrapy
from VulCrawl.items import Vulcrawl2Item
from bs4 import BeautifulSoup
from scrapy_redis.spiders import RedisCrawlSpider
from scrapy_redis.spiders import RedisSpider


class Vulcrawl2Spider(RedisSpider):
    """Distributed (scrapy-redis) spider for the wooyun.m4d3bug.com
    vulnerability search listing: one item per table row, with pagination."""

    name = 'vulcrawl2'
    # allowed_domains = ['www.xxx.com']
    # NOTE(review): with RedisSpider the seed URL is normally pushed onto the
    # `redis_key` queue; start_urls is kept as the page-1 reference URL.
    start_urls = ['https://wooyun.m4d3bug.com/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page=1']
    redis_key = "Vul"
    page = 2  # next results page to request (page 1 is the seed URL)

    def parse(self, response):
        """Parse one listing page and schedule the next one.

        Each data <tr> (the first <tr> is the table header) has cells:
          td[0] = date, td[1] = title with the <a href> to the report,
          td[2] = vulnerability type.

        Yields one Vulcrawl2Item per row, then a Request for the next page.
        """
        soup = BeautifulSoup(response.body, "lxml")
        # Slice off the header row instead of mutating the list with del.
        rows = soup.findAll('tr')[1:]
        # Robustness fix: an empty page previously still queued the next
        # page, paginating forever past the end of the results.
        if not rows:
            return
        for row in rows:
            td = row.findAll('td')
            # Bug fix: create a fresh item per row. The original reused one
            # mutable item across yields, so queued items could all end up
            # holding the last row's values.
            item = Vulcrawl2Item()
            # Bug fix: title/time were swapped (title read td[0], the date
            # cell). The url assignment below takes the <a> from td[1],
            # confirming td[1] is the title cell — matching the original
            # commented-out field mapping.
            item['time'] = td[0].string
            item['title'] = td[1].string.strip()
            item['Vulnerability_Type'] = td[2].string.strip()
            item['url'] = "https://wooyun.m4d3bug.com/" + td[1].find('a')['href']
            yield item

        url2 = "https://wooyun.m4d3bug.com/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page=" + str(self.page)
        self.page += 1
        yield scrapy.Request(url=url2, callback=self.parse)