diff --git a/VulCrawler/spiders b/VulCrawler/spiders
deleted file mode 100644
index 6cd17fd..0000000
--- a/VulCrawler/spiders
+++ /dev/null
@@ -1,50 +0,0 @@
-import scrapy
-from bs4 import BeautifulSoup
-from scrapy_redis.spiders import RedisSpider
-from VulCrawl.items import VulcrawlItem
-from VulCrawl.items import Vulcrawl2Item
-
-# scrapy.Spider
-class VulcrawlSpider(RedisSpider):
-    name = 'vulcrawl'
-    #start_urls = ['http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=1']
-    redis_key = "Vul"
-    page = [1, 1, 1]
-
-    def parse_tow(self, response):
-        html = response.body
-        soup = BeautifulSoup(html, "lxml")
-        tr_list = soup.findAll('tr')
-        item = Vulcrawl2Item()
-        del (tr_list[0])
-        for i in tr_list:
-            td = i.findAll('td')
-            item['Vulnerability_Type'] = td[2].string.strip()
-            item['time'] = td[0].string
-            item['title'] = td[1].string.strip()
-            item['url'] = "https://wooyun.m4d3bug.com/"+td[1].find('a')['href']
-
-            yield item
-            #print(info_one)
-
-    def parse(self, response):
-        #print(response.text())
-        if self.page[0] < 10:#10000:
-            li_list = response.xpath("/html/body/div[4]/div/div[1]/div/div[2]/ul/li")
-            for i in li_list:
-                item = VulcrawlItem()
-                item['title'] = i.xpath("./div[1]/a/text()").extract()[0].strip()
-                item['Numbering'] = i.xpath("./div[1]/p/a/text()").extract()[0]
-                item['url'] = "http://www.cnnvd.org.cn/" + i.xpath("./div[1]/a/@href").extract()[0].strip()
-                item['time'] = i.xpath("./div[2]/text()").extract()[2].strip()
-                yield item
-
-            url = 'http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=' + str(self.page[0])
-            self.page[0] += 1
-            yield scrapy.Request(url=url, callback=self.parse)
-        # elif self.page[1] <= 2: #4400:
-        #     url2 = "https://wooyun.m4d3bug.com/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page=" + str(self.page[1])
-        #     self.page[1] += 1
-        #     yield scrapy.Request(url=url2, callback=self.parse_tow)
-        #     #pass
-
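
A note for anyone reviving this deleted spider: the removed `parse_tow` built a single `Vulcrawl2Item` before the row loop and then yielded that same object once per row. Scrapy items are mutable, so every yield references one shared object, and any pipeline that buffers items may observe only the last row's values. Below is a minimal corrected sketch, not the repo's original code, assuming the same `Vulcrawl2Item` fields and the same WooYun mirror table layout; the `parse_two` name is a hypothetical typo fix.

```python
from bs4 import BeautifulSoup
from VulCrawl.items import Vulcrawl2Item

def parse_two(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    # Skip the header <tr>, then emit one fresh item per data row.
    for row in soup.findAll('tr')[1:]:
        td = row.findAll('td')
        item = Vulcrawl2Item()  # new instance per row, never shared
        item['time'] = td[0].string
        item['title'] = td[1].string.strip()
        item['Vulnerability_Type'] = td[2].string.strip()
        item['url'] = "https://wooyun.m4d3bug.com/" + td[1].find('a')['href']
        yield item
```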