import scrapy
from VulCrawl.items import Vulcrawl2Item
from bs4 import BeautifulSoup
from scrapy_redis.spiders import RedisCrawlSpider
from scrapy_redis.spiders import RedisSpider


class Vulcrawl2Spider(RedisSpider):
    """Distributed spider for the WooYun mirror's bug-search listing.

    Start URLs are fed through the Redis key ``Vul`` (scrapy-redis);
    subsequent result pages are followed by incrementing ``page``.
    Each table row on a results page yields one ``Vulcrawl2Item``.
    """

    name = 'vulcrawl2'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://wooyun.m4d3bug.com/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page=1']
    redis_key = "Vul"
    # Next results page to request; page 1 arrives via start_urls / redis.
    page = 2

    def parse(self, response):
        """Extract vulnerability rows from one results page and paginate.

        :param response: scrapy Response for a search-results page.
        :yields: ``Vulcrawl2Item`` per table row, then a ``Request`` for
                 the next page while rows keep appearing.
        """
        soup = BeautifulSoup(response.body, "lxml")
        # First <tr> is the table header — skip it with a slice instead of
        # mutating the result list with del().
        rows = soup.findAll('tr')[1:]

        for row in rows:
            td = row.findAll('td')
            if len(td) < 3:
                # Defensive: malformed/empty rows would otherwise IndexError.
                continue

            # BUGFIX: a single shared item was previously mutated and
            # re-yielded for every row; create a fresh item per row so
            # pipelines don't see later rows overwrite earlier ones.
            item = Vulcrawl2Item()

            # Column layout (confirmed by the link living in the title cell):
            # td[0] = submit time, td[1] = title + <a href>, td[2] = type.
            # The original code had title/time swapped relative to its own
            # URL extraction from td[1].
            # get_text(strip=True) is used because .string returns None when
            # a cell contains nested tags (the title cell holds an <a>).
            item['time'] = td[0].get_text(strip=True)
            item['title'] = td[1].get_text(strip=True)
            item['Vulnerability_Type'] = td[2].get_text(strip=True)

            link = td[1].find('a')
            item['url'] = "https://wooyun.m4d3bug.com/" + link['href'] if link is not None else ""

            yield item

        # Only follow the next page while the current one produced rows;
        # the original yielded a new Request unconditionally and never
        # terminated.
        if rows:
            url2 = "https://wooyun.m4d3bug.com/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page=" + str(self.page)
            self.page += 1
            yield scrapy.Request(url=url2, callback=self.parse)