import scrapy
from bs4 import BeautifulSoup
from scrapy_redis.spiders import RedisSpider
from VulCrawl.items import VulcrawlItem
from VulCrawl.items import Vulcrawl2Item


# scrapy.Spider (original base class; switched to RedisSpider so seed URLs come from Redis)
class VulcrawlSpider(RedisSpider):
    name = 'vulcrawl'
    # start_urls = ['http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=1']
    redis_key = "Vul"   # Redis list key the spider pops its seed URLs from
    page = [1, 1, 1]    # page counters: [CNNVD listing, wooyun mirror, unused]
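
    # Usage note (assumption, not in the original file): with scrapy-redis the
    # crawl starts when a seed URL is pushed onto the "Vul" list, e.g. from redis-cli:
    #
    #   lpush Vul http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=1
    #
    # This also assumes the project settings enable the scrapy-redis scheduler, e.g.:
    #
    #   SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    #   DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    #   REDIS_URL = "redis://localhost:6379"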

    def parse_tow(self, response):
        # Parse a search-result page from the wooyun mirror: every <tr> after the
        # header row is one vulnerability record.
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        tr_list = soup.findAll('tr')
        del tr_list[0]  # drop the table header row
        for i in tr_list:
            td = i.findAll('td')
            item = Vulcrawl2Item()  # fresh item for every row
            item['Vulnerability_Type'] = td[2].string.strip()
            item['time'] = td[0].string
            item['title'] = td[1].string.strip()
            item['url'] = "https://wooyun.m4d3bug.com/" + td[1].find('a')['href']

            yield item
            # print(info_one)

    def parse(self, response):
        # Parse a CNNVD listing page: one <li> per vulnerability entry, then queue
        # the next listing page until the page counter reaches its limit.
        # print(response.text)
        if self.page[0] < 10:  # 10000:
            li_list = response.xpath("/html/body/div[4]/div/div[1]/div/div[2]/ul/li")
            for i in li_list:
                item = VulcrawlItem()
                item['title'] = i.xpath("./div[1]/a/text()").extract()[0].strip()
                item['Numbering'] = i.xpath("./div[1]/p/a/text()").extract()[0]
                item['url'] = "http://www.cnnvd.org.cn/" + i.xpath("./div[1]/a/@href").extract()[0].strip()
                item['time'] = i.xpath("./div[2]/text()").extract()[2].strip()
                yield item

            url = 'http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=' + str(self.page[0])
            self.page[0] += 1
            yield scrapy.Request(url=url, callback=self.parse)
        # elif self.page[1] <= 2:  # 4400:
        #     url2 = "https://wooyun.m4d3bug.com/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page=" + str(self.page[1])
        #     self.page[1] += 1
        #     yield scrapy.Request(url=url2, callback=self.parse_tow)
        #     # pass
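
# Sketch (assumption) of the item classes this spider expects in VulCrawl/items.py,
# reconstructed only from the field names used above; the real definitions live in
# that module:
#
#   import scrapy
#
#   class VulcrawlItem(scrapy.Item):        # CNNVD listing entries
#       title = scrapy.Field()
#       Numbering = scrapy.Field()
#       url = scrapy.Field()
#       time = scrapy.Field()
#
#   class Vulcrawl2Item(scrapy.Item):       # wooyun mirror entries
#       Vulnerability_Type = scrapy.Field()
#       time = scrapy.Field()
#       title = scrapy.Field()
#       url = scrapy.Field()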