From 79573409e29a297cac691e0510ab9612476987c5 Mon Sep 17 00:00:00 2001
From: pmh9c3ri2 <1306209041@qq.com>
Date: Fri, 22 Apr 2022 10:33:40 +0800
Subject: [PATCH] ADD file via upload

---
 VulCrawler/spiders | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 VulCrawler/spiders

diff --git a/VulCrawler/spiders b/VulCrawler/spiders
new file mode 100644
index 0000000..6cd17fd
--- /dev/null
+++ b/VulCrawler/spiders
@@ -0,0 +1,47 @@
+import scrapy
+from bs4 import BeautifulSoup
+from scrapy_redis.spiders import RedisSpider
+from VulCrawl.items import VulcrawlItem, Vulcrawl2Item
+
+
+# Inherits RedisSpider (rather than scrapy.Spider) so start URLs are pulled
+# from the Redis list named by redis_key, enabling distributed crawling.
+class VulcrawlSpider(RedisSpider):
+    name = 'vulcrawl'
+    # start_urls = ['http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=1']
+    redis_key = "Vul"
+    page = [1, 1, 1]  # per-source page counters
+
+    def parse_two(self, response):
+        # Parse one results page of the WooYun mirror: every <tr> after the
+        # header row holds the date, title, and vulnerability-type columns.
+        soup = BeautifulSoup(response.body, "lxml")
+        tr_list = soup.findAll('tr')[1:]  # skip the header row
+        for tr in tr_list:
+            td = tr.findAll('td')
+            item = Vulcrawl2Item()  # fresh item per row
+            item['time'] = td[0].string
+            item['title'] = td[1].string.strip()
+            item['Vulnerability_Type'] = td[2].string.strip()
+            item['url'] = "https://wooyun.m4d3bug.com/" + td[1].find('a')['href']
+            yield item
+
+    def parse(self, response):
+        # Parse one CNNVD query-list page, then queue a request for the next page.
+        if self.page[0] < 10:  # 10000:
+            li_list = response.xpath("/html/body/div[4]/div/div[1]/div/div[2]/ul/li")
+            for li in li_list:
+                item = VulcrawlItem()
+                item['title'] = li.xpath("./div[1]/a/text()").extract()[0].strip()
+                item['Numbering'] = li.xpath("./div[1]/p/a/text()").extract()[0]
+                item['url'] = "http://www.cnnvd.org.cn/" + li.xpath("./div[1]/a/@href").extract()[0].strip()
+                item['time'] = li.xpath("./div[2]/text()").extract()[2].strip()
+                yield item
+
+            url = 'http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=' + str(self.page[0])
+            self.page[0] += 1
+            yield scrapy.Request(url=url, callback=self.parse)
+        # elif self.page[1] <= 2:  # 4400:
+        #     url2 = "https://wooyun.m4d3bug.com/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page=" + str(self.page[1])
+        #     self.page[1] += 1
+        #     yield scrapy.Request(url=url2, callback=self.parse_two)
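
Note: because VulcrawlSpider is a scrapy-redis RedisSpider, it starts idle and waits until a start URL is pushed onto the Redis list named by redis_key ("Vul"). A minimal sketch of seeding that queue with redis-py, assuming Redis runs locally on the default port and the project's settings already enable the scrapy-redis scheduler:

    import redis

    # Connect to the Redis instance the spider reads from (assumed localhost:6379).
    r = redis.Redis(host="localhost", port=6379)

    # Push the first CNNVD list page onto the "Vul" list; the idle spider pops
    # it and begins crawling, with parse() queueing subsequent pages itself.
    r.lpush("Vul", "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=1")

The same seed can be pushed from the command line with:

    redis-cli lpush Vul "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=1"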