You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							51 lines
						
					
					
						
							2.0 KiB
						
					
					
				
			
		
		
	
	
							51 lines
						
					
					
						
							2.0 KiB
						
					
					
				| import scrapy
 | |
| from bs4 import BeautifulSoup
 | |
| from scrapy_redis.spiders import RedisSpider
 | |
| from VulCrawl.items import VulcrawlItem
 | |
| from VulCrawl.items import Vulcrawl2Item
 | |
| 
 | |
| # scrapy.Spider
 | |
class VulcrawlSpider(RedisSpider):
    """Spider that turns vulnerability listing pages into Vulcrawl items.

    ``parse`` handles CNNVD list pages and schedules the next list page
    itself. ``parse_tow`` handles the table-based search result pages of
    the Wooyun mirror; it is only referenced by the currently disabled
    branch at the end of ``parse``.
    """

    name = 'vulcrawl'
    #start_urls = ['http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=1']
    # Key handed to RedisSpider; seed URLs are expected to be pushed there.
    redis_key = "Vul"
    # Page counters: index 0 drives the CNNVD pagination in ``parse``,
    # index 1 is only used by the disabled Wooyun branch, index 2 is unused
    # in the visible code.
    # NOTE: this is a class-level list mutated in place, so the counters are
    # shared by every instance of the spider in the same process.
    page = [1, 1, 1]

    def parse_tow(self, response):
        """Yield one ``Vulcrawl2Item`` per data row of a result table.

        The first ``<tr>`` of the page is treated as the header and skipped.
        Rows that do not have at least three ``<td>`` cells, or whose second
        cell carries no link, are logged and skipped instead of aborting the
        whole page.

        :param response: downloaded search result page.
        :return: generator of ``Vulcrawl2Item`` objects.
        """
        soup = BeautifulSoup(response.body, "lxml")

        # Slicing (instead of ``del rows[0]``) also copes with a page that
        # contains no table rows at all.
        for row in soup.find_all('tr')[1:]:
            cells = row.find_all('td')
            link = cells[1].find('a') if len(cells) >= 3 else None
            if link is None or not link.get('href'):
                self.logger.warning(
                    "Skipping unexpected table row on %s", response.url
                )
                continue

            # A fresh item per row: re-using a single instance would make
            # every yielded item alias the same mutable object, so later
            # rows would overwrite earlier ones still held by pipelines.
            item = Vulcrawl2Item()
            # get_text() also works when the cell has several child nodes,
            # where ``.string`` would be None.
            item['Vulnerability_Type'] = cells[2].get_text(strip=True)
            item['time'] = cells[0].string
            item['title'] = cells[1].get_text(strip=True)
            item['url'] = "https://wooyun.m4d3bug.com/" + link['href']
            yield item

    def parse(self, response):
        """Yield the items of one CNNVD list page, then request another page.

        Nothing is produced once ``page[0]`` has reached the page limit.
        List entries that lack one of the expected nodes are logged and
        skipped so that a single malformed entry cannot prevent the
        follow-up request from being scheduled.

        :param response: downloaded CNNVD vulnerability list page.
        :return: generator of ``VulcrawlItem`` objects followed by one
            ``scrapy.Request`` for the next list page.
        """
        # Small limit kept from the original (the author noted 10000 as the
        # intended value).
        # NOTE(review): because items are only extracted while the counter
        # is below the limit, the last page that gets requested is fetched
        # but never parsed - confirm whether that is intended.
        if self.page[0] >= 10:
            return

        entries = response.xpath("/html/body/div[4]/div/div[1]/div/div[2]/ul/li")
        for entry in entries:
            try:
                item = VulcrawlItem()
                item['title'] = entry.xpath("./div[1]/a/text()").extract()[0].strip()
                item['Numbering'] = entry.xpath("./div[1]/p/a/text()").extract()[0]
                item['url'] = "http://www.cnnvd.org.cn/" + entry.xpath("./div[1]/a/@href").extract()[0].strip()
                # The date is taken from the third text node of the second div.
                item['time'] = entry.xpath("./div[2]/text()").extract()[2].strip()
            except IndexError:
                self.logger.warning(
                    "Skipping malformed list entry on %s", response.url
                )
                continue
            yield item

        # NOTE(review): the URL is built before the counter is incremented,
        # so the first follow-up request targets pageno=1; if the seed URL is
        # already page 1 that page is fetched twice - verify against the seed.
        url = 'http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=' + str(self.page[0])
        self.page[0] += 1
        yield scrapy.Request(url=url, callback=self.parse)

        # Disabled second stage (Wooyun mirror), kept from the original:
        # elif self.page[1] <= 2: #4400:
        #     url2 = "https://wooyun.m4d3bug.com/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page=" + str(self.page[1])
        #     self.page[1] += 1
        #     yield scrapy.Request(url=url2, callback=self.parse_tow)
 | |
| 
 |