|
|
|
# NOTE(review): stray diff hunk header removed from code path — this file was
# reconstructed from a mangled unified diff; original marker: @ -1,5 +1,5 @@
|
|
|
|
|
import scrapy
|
|
|
|
|
from VulCrawl.items import Vulcrawl2Item
|
|
|
|
|
from VulCrawl.items import VulcrawlItem
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
from scrapy_redis.spiders import RedisCrawlSpider
|
|
|
|
|
from scrapy_redis.spiders import RedisSpider
|
|
|
|
# NOTE(review): stray diff hunk header removed from code path — original marker:
# @ -12,24 +1,25 @@ class Vulcrawl2Spider(RedisSpider):
|
|
|
|
|
page = 2
|
|
|
|
|
|
|
|
|
|
def parse(self, response):
    """Parse one search-results page of the WooYun mirror.

    Extracts one item per table row (skipping the header row) and, while
    ``self.page < 10``, schedules the next results page back into this
    same callback.

    Fixes over the original (which was a corrupted merge of two versions
    of this method):
      * the duplicated second copy of the scraping loop (using the old
        ``VulcrawlItem``) is removed — this spider is ``Vulcrawl2Spider``
        and populates ``Vulcrawl2Item``;
      * a fresh item is created for every row instead of mutating a single
        shared instance (yielding the same mutated object repeatedly can
        corrupt downstream pipeline output);
      * the pagination request is yielded once, after the loop, instead of
        once per table row.

    :param response: scrapy Response for a search-results page.
    :yields: ``Vulcrawl2Item`` per vulnerability row, then at most one
        ``scrapy.Request`` for the next page.
    """
    soup = BeautifulSoup(response.body, "lxml")

    tr_list = soup.findAll('tr')
    # First <tr> is the table header — drop it before scraping rows.
    del tr_list[0]

    for row in tr_list:
        td = row.findAll('td')
        item = Vulcrawl2Item()
        # Column layout (from the original field mapping): td[0] = title,
        # td[1] = date + link to the report, td[2] = vulnerability type.
        # NOTE(review): td[0].string may be None for nested markup — the
        # original also stored it unstripped; preserved as-is. TODO confirm
        # column order against a live page.
        item['title'] = td[0].string
        item['time'] = td[1].string.strip()
        item['Vulnerability_Type'] = td[2].string.strip()
        item['url'] = "https://wooyun.m4d3bug.com/" + td[1].find('a')['href']
        yield item

    # Follow pagination: request pages 2..9, advancing the counter only
    # after the URL for the current page has been built.
    if self.page < 10:
        url2 = "https://wooyun.m4d3bug.com/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page=" + str(self.page)
        self.page += 1
        yield scrapy.Request(url=url2, callback=self.parse)