From 79573409e29a297cac691e0510ab9612476987c5 Mon Sep 17 00:00:00 2001
From: pmh9c3ri2 <1306209041@qq.com>
Date: Fri, 22 Apr 2022 10:33:40 +0800
Subject: [PATCH] ADD file via upload

---
 VulCrawler/spiders | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 VulCrawler/spiders

diff --git a/VulCrawler/spiders b/VulCrawler/spiders
new file mode 100644
index 0000000..6cd17fd
--- /dev/null
+++ b/VulCrawler/spiders
@@ -0,0 +1,47 @@
+import scrapy
+from bs4 import BeautifulSoup
+from scrapy_redis.spiders import RedisSpider
+from VulCrawl.items import VulcrawlItem, Vulcrawl2Item
+
+
+# Inherits RedisSpider (rather than scrapy.Spider) so start URLs are pulled
+# from the Redis list named by redis_key, enabling distributed crawling.
+class VulcrawlSpider(RedisSpider):
+    name = 'vulcrawl'
+    # start_urls = ['http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=1']
+    redis_key = "Vul"
+    page = [1, 1, 1]  # per-source page counters
+
+    def parse_two(self, response):
+        # Parse one results page of the WooYun mirror: every <tr> after the
+        # header row holds the date, title, and vulnerability-type columns.
+        soup = BeautifulSoup(response.body, "lxml")
+        tr_list = soup.findAll('tr')[1:]  # skip the header row
+        for tr in tr_list:
+            td = tr.findAll('td')
+            item = Vulcrawl2Item()  # fresh item per row
+            item['time'] = td[0].string
+            item['title'] = td[1].string.strip()
+            item['Vulnerability_Type'] = td[2].string.strip()
+            item['url'] = "https://wooyun.m4d3bug.com/" + td[1].find('a')['href']
+            yield item
+
+    def parse(self, response):
+        # Parse one CNNVD query-list page, then queue a request for the next page.
+        if self.page[0] < 10:  # 10000:
+            li_list = response.xpath("/html/body/div[4]/div/div[1]/div/div[2]/ul/li")
+            for li in li_list:
+                item = VulcrawlItem()
+                item['title'] = li.xpath("./div[1]/a/text()").extract()[0].strip()
+                item['Numbering'] = li.xpath("./div[1]/p/a/text()").extract()[0]
+                item['url'] = "http://www.cnnvd.org.cn/" + li.xpath("./div[1]/a/@href").extract()[0].strip()
+                item['time'] = li.xpath("./div[2]/text()").extract()[2].strip()
+                yield item
+
+            url = 'http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=' + str(self.page[0])
+            self.page[0] += 1
+            yield scrapy.Request(url=url, callback=self.parse)
+        # elif self.page[1] <= 2:  # 4400:
+        #     url2 = "https://wooyun.m4d3bug.com/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page=" + str(self.page[1])
+        #     self.page[1] += 1
+        #     yield scrapy.Request(url=url2, callback=self.parse_two)
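
Note: because VulcrawlSpider is a scrapy-redis RedisSpider, it starts idle and waits until a start URL is pushed onto the Redis list named by redis_key ("Vul"). A minimal sketch of seeding that queue with redis-py, assuming Redis runs locally on the default port and the project's settings already enable the scrapy-redis scheduler:

    import redis

    # Connect to the Redis instance the spider reads from (assumed localhost:6379).
    r = redis.Redis(host="localhost", port=6379)

    # Push the first CNNVD list page onto the "Vul" list; the idle spider pops
    # it and begins crawling, with parse() queueing subsequent pages itself.
    r.lpush("Vul", "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=1")

The same seed can be pushed from the command line with:

    redis-cli lpush Vul "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=1"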