import scrapy
from VulCrawl.items import Vulcrawl2Item
from bs4 import BeautifulSoup
from scrapy_redis.spiders import RedisCrawlSpider
from scrapy_redis.spiders import RedisSpider


class Vulcrawl2Spider(RedisSpider):
    """Distributed spider for the WooYun mirror's bug-search listing.

    Start URLs are fed through the Redis key ``Vul`` (scrapy-redis);
    subsequent result pages are followed by incrementing ``page``.
    Each table row on a results page yields one ``Vulcrawl2Item``.
    """

    name = 'vulcrawl2'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://wooyun.m4d3bug.com/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page=1']
    redis_key = "Vul"
    # Next results page to request; page 1 arrives via start_urls / redis.
    page = 2

    def parse(self, response):
        """Extract vulnerability rows from one results page and paginate.

        :param response: scrapy Response for a search-results page.
        :yields: ``Vulcrawl2Item`` per table row, then a ``Request`` for
                 the next page while rows keep appearing.
        """
        soup = BeautifulSoup(response.body, "lxml")
        # First <tr> is the table header — skip it with a slice instead of
        # mutating the result list with del().
        rows = soup.findAll('tr')[1:]

        for row in rows:
            td = row.findAll('td')
            if len(td) < 3:
                # Defensive: malformed/empty rows would otherwise IndexError.
                continue

            # BUGFIX: a single shared item was previously mutated and
            # re-yielded for every row; create a fresh item per row so
            # pipelines don't see later rows overwrite earlier ones.
            item = Vulcrawl2Item()

            # Column layout (confirmed by the link living in the title cell):
            # td[0] = submit time, td[1] = title + <a href>, td[2] = type.
            # The original code had title/time swapped relative to its own
            # URL extraction from td[1].
            # get_text(strip=True) is used because .string returns None when
            # a cell contains nested tags (the title cell holds an <a>).
            item['time'] = td[0].get_text(strip=True)
            item['title'] = td[1].get_text(strip=True)
            item['Vulnerability_Type'] = td[2].get_text(strip=True)

            link = td[1].find('a')
            item['url'] = "https://wooyun.m4d3bug.com/" + link['href'] if link is not None else ""

            yield item

        # Only follow the next page while the current one produced rows;
        # the original yielded a new Request unconditionally and never
        # terminated.
        if rows:
            url2 = "https://wooyun.m4d3bug.com/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page=" + str(self.page)
            self.page += 1
            yield scrapy.Request(url=url2, callback=self.parse)