|
|
|
# NOTE(review): stray diff hunk header removed from code path — this file was
# reconstructed from a mangled unified diff; original marker: @ -1,5 +1,5 @@
|
|
|
|
|
import scrapy
|
|
|
|
|
from VulCrawl.items import Vulcrawl2Item
|
|
|
|
|
from VulCrawl.items import VulcrawlItem
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
from scrapy_redis.spiders import RedisCrawlSpider
|
|
|
|
|
from scrapy_redis.spiders import RedisSpider
|
|
|
|
# NOTE(review): stray diff hunk header removed from code path — original marker:
# @ -12,24 +1,25 @@ class Vulcrawl2Spider(RedisSpider):
|
|
|
|
|
page = 2
|
|
|
|
|
|
|
|
|
|
def parse(self, response):
    """Parse one search-results page of the WooYun mirror.

    Extracts one item per table row (skipping the header row) and, while
    ``self.page < 10``, schedules the next results page back into this
    same callback.

    Fixes over the original (which was a corrupted merge of two versions
    of this method):
      * the duplicated second copy of the scraping loop (using the old
        ``VulcrawlItem``) is removed — this spider is ``Vulcrawl2Spider``
        and populates ``Vulcrawl2Item``;
      * a fresh item is created for every row instead of mutating a single
        shared instance (yielding the same mutated object repeatedly can
        corrupt downstream pipeline output);
      * the pagination request is yielded once, after the loop, instead of
        once per table row.

    :param response: scrapy Response for a search-results page.
    :yields: ``Vulcrawl2Item`` per vulnerability row, then at most one
        ``scrapy.Request`` for the next page.
    """
    soup = BeautifulSoup(response.body, "lxml")

    tr_list = soup.findAll('tr')
    # First <tr> is the table header — drop it before scraping rows.
    del tr_list[0]

    for row in tr_list:
        td = row.findAll('td')
        item = Vulcrawl2Item()
        # Column layout (from the original field mapping): td[0] = title,
        # td[1] = date + link to the report, td[2] = vulnerability type.
        # NOTE(review): td[0].string may be None for nested markup — the
        # original also stored it unstripped; preserved as-is. TODO confirm
        # column order against a live page.
        item['title'] = td[0].string
        item['time'] = td[1].string.strip()
        item['Vulnerability_Type'] = td[2].string.strip()
        item['url'] = "https://wooyun.m4d3bug.com/" + td[1].find('a')['href']
        yield item

    # Follow pagination: request pages 2..9, advancing the counter only
    # after the URL for the current page has been built.
    if self.page < 10:
        url2 = "https://wooyun.m4d3bug.com/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page=" + str(self.page)
        self.page += 1
        yield scrapy.Request(url=url2, callback=self.parse)