#!/usr/bin/env python """ Copyright (c) 2006-2024 sqlmap developers (https://sqlmap.org/) See the file 'LICENSE' for copying permission """ from __future__ import division import os import re import tempfile import time from lib.core.common import checkSameHost from lib.core.common import clearConsoleLine from lib.core.common import dataToStdout from lib.core.common import extractRegexResult from lib.core.common import findPageForms from lib.core.common import getSafeExString from lib.core.common import openFile from lib.core.common import readInput from lib.core.common import safeCSValue from lib.core.common import urldecode from lib.core.compat import xrange from lib.core.convert import htmlUnescape from lib.core.data import conf from lib.core.data import kb from lib.core.data import logger from lib.core.datatype import OrderedSet from lib.core.enums import MKSTEMP_PREFIX from lib.core.exception import SqlmapConnectionException from lib.core.exception import SqlmapSyntaxException from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS from lib.core.threads import getCurrentThreadData from lib.core.threads import runThreads from lib.parse.sitemap import parseSitemap from lib.request.connect import Connect as Request from thirdparty import six from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup from thirdparty.six.moves import http_client as _http_client from thirdparty.six.moves import urllib as _urllib def crawl(target, post=None, cookie=None): # 如果目标为空,直接返回 if not target: return try: # 创建一个已访问集合 visited = set() threadData = getCurrentThreadData() threadData.shared.value = OrderedSet() threadData.shared.formsFound = False def crawlThread(): threadData = getCurrentThreadData() # 当线程继续时 while kb.threadContinue: with kb.locks.limit: # 如果还有未处理的链接 if threadData.shared.unprocessed: current = threadData.shared.unprocessed.pop() # 如果已经访问过,跳过 if current in visited: continue # 如果有排除规则且当前链接符合排除规则,跳过 elif conf.crawlExclude and re.search(conf.crawlExclude, current): dbgMsg = "skipping '%s'" % current logger.debug(dbgMsg) continue else: visited.add(current) else: break content = None try: # 发送请求获取页面内容 if current: content = Request.getPage(url=current, post=post, cookie=None, crawling=True, raise404=False)[0] # 处理不同的异常 except SqlmapConnectionException as ex: errMsg = "connection exception detected ('%s'). skipping " % getSafeExString(ex) errMsg += "URL '%s'" % current logger.critical(errMsg) except SqlmapSyntaxException: errMsg = "invalid URL detected. skipping '%s'" % current logger.critical(errMsg) except _http_client.InvalidURL as ex: errMsg = "invalid URL detected ('%s'). 
skipping " % getSafeExString(ex) errMsg += "URL '%s'" % current logger.critical(errMsg) if not kb.threadContinue: break # 如果内容是文本类型 if isinstance(content, six.text_type): try: # 提取 HTML 内容 match = re.search(r"(?si)]*>(.+)", content) if match: content = "%s" % match.group(1) soup = BeautifulSoup(content) tags = soup('a') # 查找其他可能的链接 tags += re.finditer(r'(?i)\s(href|src)=["\'](?P[^>"\']+)', content) tags += re.finditer(r'(?i)window\.open\(["\'](?P[^)"\']+)["\']', content) for tag in tags: href = tag.get("href") if hasattr(tag, "get") else tag.group("href") if href: if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID: current = threadData.lastRedirectURL[1] url = _urllib.parse.urljoin(current, htmlUnescape(href)) # 检查是否是同一主机 _ = checkSameHost(url, target) # 检查是否在范围中 if conf.scope: if not re.search(conf.scope, url, re.I): continue elif not _: continue # 检查扩展是否在排除列表中 if (extractRegexResult(r"\A[^?]+\.(?P\w+)(\?|\Z)", url) or "").lower() not in CRAWL_EXCLUDE_EXTENSIONS: with kb.locks.value: threadData.shared.deeper.add(url) # 筛选链接添加到不同集合 if re.search(r"(.*?)\?(.+)", url) and not re.search(r"\?(v=)?\d+\Z", url) and not re.search(r"(?i)\.(js|css)(\?|\Z)", url): threadData.shared.value.add(url) except UnicodeEncodeError: # 处理非 HTML 文件异常 pass except ValueError: # 处理无效链接异常 pass except AssertionError: # 处理无效 HTML 异常 pass finally: # 检查是否找到表单 if conf.forms: threadData.shared.formsFound |= len(findPageForms(content, current, False, True)) > 0 if conf.verbose in (1, 2): threadData.shared.count += 1 # 输出状态信息 status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length)) dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True) threadData.shared.deeper = set() threadData.shared.unprocessed = set([target]) # 处理目标 URL _ = re.sub(r"(? 1: threadData.shared.unprocessed.update(items) logger.info("%s links found" % ("no" if not items else len(items))) if not conf.bulkFile: infoMsg = "starting crawler for target URL '%s'" % target logger.info(infoMsg) # 启动多个线程进行爬取 for i in xrange(conf.crawlDepth): threadData.shared.count = 0 threadData.shared.length = len(threadData.shared.unprocessed) numThreads = min(conf.threads, len(threadData.shared.unprocessed)) if not conf.bulkFile: logger.info("searching for links with depth %d" % (i + 1)) runThreads(numThreads, crawlThread, threadChoice=(i > 0)) clearConsoleLine(True) if threadData.shared.deeper: threadData.shared.unprocessed = set(threadData.shared.deeper) else: break except KeyboardInterrupt: warnMsg = "user aborted during crawling. sqlmap " warnMsg += "will use partial list" logger.warning(warnMsg) finally: # 清除控制台行 clearConsoleLine(True) # 如果没有找到可用链接 if not threadData.shared.value: # 如果没有找到表单 if not (conf.forms and threadData.shared.formsFound): # 输出警告信息 warnMsg = "no usable links found (with GET parameters)" if conf.forms: warnMsg += " or forms" logger.warning(warnMsg) else: # 遍历找到的链接添加到 kb.targets 中 for url in threadData.shared.value: kb.targets.add((urldecode(url, kb.pageEncoding), None, None, None, None)) # 如果 kb.targets 中有链接 if kb.targets: # 如果未选择规范化选项 if kb.normalizeCrawlingChoice is None: message = "do you want to normalize " message += "crawling results [Y/n] " kb.normalizeCrawlingChoice = readInput(message, default='Y', boolean=True) # 如果用户选择规范化 if kb.normalizeCrawlingChoice: seen = set() results = OrderedSet() for target in kb.targets: value = "%s%s%s" % (target[0], '&' if '?' 
                    value = "%s%s%s" % (target[0], '&' if '?' in target[0] else '?', target[2] or "")
                    match = re.search(r"/[^/?]*\?.+\Z", value)
                    if match:
                        # strip parameter values so that only parameter names are compared
                        key = re.sub(r"=[^=&]*", "=", match.group(0)).strip("&?")
                        if '=' in key and key not in seen:
                            results.add(target)
                            seen.add(key)

                kb.targets = results

            # optionally store the results to a temporary file
            storeResultsToFile(kb.targets)

def storeResultsToFile(results):
    # nothing to store
    if not results:
        return

    # ask (once) whether crawling results should be stored to a temporary file
    if kb.storeCrawlingChoice is None:
        message = "do you want to store crawling results to a temporary file "
        message += "for eventual further processing with other tools [y/N] "

        kb.storeCrawlingChoice = readInput(message, default='N', boolean=True)

    if kb.storeCrawlingChoice:
        # create the temporary file (CSV when '--forms' is used, plain text otherwise)
        handle, filename = tempfile.mkstemp(prefix=MKSTEMP_PREFIX.CRAWLER, suffix=".csv" if conf.forms else ".txt")
        os.close(handle)

        infoMsg = "writing crawling results to a temporary file '%s' " % filename
        logger.info(infoMsg)

        with openFile(filename, "w+b") as f:
            # CSV header
            if conf.forms:
                f.write("URL,POST\n")

            # write each URL (and its POST data when '--forms' is used)
            for url, _, data, _, _ in results:
                if conf.forms:
                    f.write("%s,%s\n" % (safeCSValue(url), safeCSValue(data or "")))
                else:
                    f.write("%s\n" % url)
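
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of sqlmap itself): shows, in isolation, the
# normalization idea used in crawl() above, where crawled URLs that differ
# only in GET parameter values are collapsed onto a single representative.
# The helper name _normalize_demo and the sample URLs are hypothetical and
# exist only for this example; the regular expressions are the same as above.
# ---------------------------------------------------------------------------

def _normalize_demo(urls):
    """Keep one URL per distinct path and parameter-name combination."""

    seen = set()
    results = []

    for url in urls:
        match = re.search(r"/[^/?]*\?.+\Z", url)
        if not match:
            continue

        # strip parameter values so that "id=1" and "id=2" produce the same key
        key = re.sub(r"=[^=&]*", "=", match.group(0)).strip("&?")

        if '=' in key and key not in seen:
            results.append(url)
            seen.add(key)

    return results

if __name__ == "__main__":
    sample = [
        "http://www.example.com/item.php?id=1",
        "http://www.example.com/item.php?id=2",        # collapses onto the first entry
        "http://www.example.com/list.php?page=1&q=a",
    ]

    # expected output: ['http://www.example.com/item.php?id=1', 'http://www.example.com/list.php?page=1&q=a']
    print(_normalize_demo(sample))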