sqlmap/src/sqlmap-master/lib/utils/crawler.py

#!/usr/bin/env python
"""
Copyright (c) 2006-2024 sqlmap developers (https://sqlmap.org/)
See the file 'LICENSE' for copying permission
"""
from __future__ import division
import os
import re
import tempfile
import time
from lib.core.common import checkSameHost
from lib.core.common import clearConsoleLine
from lib.core.common import dataToStdout
from lib.core.common import extractRegexResult
from lib.core.common import findPageForms
from lib.core.common import getSafeExString
from lib.core.common import openFile
from lib.core.common import readInput
from lib.core.common import safeCSValue
from lib.core.common import urldecode
from lib.core.compat import xrange
from lib.core.convert import htmlUnescape
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.datatype import OrderedSet
from lib.core.enums import MKSTEMP_PREFIX
from lib.core.exception import SqlmapConnectionException
from lib.core.exception import SqlmapSyntaxException
from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.parse.sitemap import parseSitemap
from lib.request.connect import Connect as Request
from thirdparty import six
from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
from thirdparty.six.moves import http_client as _http_client
from thirdparty.six.moves import urllib as _urllib

def crawl(target, post=None, cookie=None):
    # Return immediately if no target was provided
    if not target:
        return

    try:
        # Set of already visited links
        visited = set()
        threadData = getCurrentThreadData()
        threadData.shared.value = OrderedSet()
        threadData.shared.formsFound = False

        def crawlThread():
            threadData = getCurrentThreadData()

            # Keep going while the thread is allowed to continue
            while kb.threadContinue:
                with kb.locks.limit:
                    # If there are still unprocessed links
                    if threadData.shared.unprocessed:
                        current = threadData.shared.unprocessed.pop()

                        # Skip links that have already been visited
                        if current in visited:
                            continue
                        # Skip links matching the exclusion regex (if any)
                        elif conf.crawlExclude and re.search(conf.crawlExclude, current):
                            dbgMsg = "skipping '%s'" % current
                            logger.debug(dbgMsg)
                            continue
                        else:
                            visited.add(current)
                    else:
                        break

                content = None
                try:
                    # Request the page content
                    if current:
                        content = Request.getPage(url=current, post=post, cookie=None, crawling=True, raise404=False)[0]
                # Handle the different request exceptions
                except SqlmapConnectionException as ex:
                    errMsg = "connection exception detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)
                except SqlmapSyntaxException:
                    errMsg = "invalid URL detected. skipping '%s'" % current
                    logger.critical(errMsg)
                except _http_client.InvalidURL as ex:
                    errMsg = "invalid URL detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)

                if not kb.threadContinue:
                    break

                # Only parse textual (HTML) content
                if isinstance(content, six.text_type):
                    try:
                        # Extract the HTML portion of the content
                        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
                        if match:
                            content = "<html>%s</html>" % match.group(1)

                        soup = BeautifulSoup(content)
                        tags = soup('a')

                        # Look for other potential links (href/src attributes, window.open() calls)
                        tags += re.finditer(r'(?i)\s(href|src)=["\'](?P<href>[^>"\']+)', content)
                        tags += re.finditer(r'(?i)window\.open\(["\'](?P<href>[^)"\']+)["\']', content)

                        for tag in tags:
                            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                            if href:
                                if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
                                    current = threadData.lastRedirectURL[1]
                                url = _urllib.parse.urljoin(current, htmlUnescape(href))

                                # Check whether the URL points to the same host as the target
                                _ = checkSameHost(url, target)

                                # Check whether the URL is within the configured scope
                                if conf.scope:
                                    if not re.search(conf.scope, url, re.I):
                                        continue
                                elif not _:
                                    continue

                                # Skip URLs whose extension is in the exclusion list
                                if (extractRegexResult(r"\A[^?]+\.(?P<result>\w+)(\?|\Z)", url) or "").lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                                    with kb.locks.value:
                                        threadData.shared.deeper.add(url)

                                        # Only URLs carrying query parameters (and not static resources) become candidate targets
                                        if re.search(r"(.*?)\?(.+)", url) and not re.search(r"\?(v=)?\d+\Z", url) and not re.search(r"(?i)\.(js|css)(\?|\Z)", url):
                                            threadData.shared.value.add(url)
                    except UnicodeEncodeError:  # non-HTML files
                        pass
                    except ValueError:  # invalid links
                        pass
                    except AssertionError:  # invalid HTML
                        pass
                    finally:
                        # Check whether any forms were found on the page
                        if conf.forms:
                            threadData.shared.formsFound |= len(findPageForms(content, current, False, True)) > 0

                if conf.verbose in (1, 2):
                    threadData.shared.count += 1

                    # Print progress status
                    status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)

        threadData.shared.deeper = set()
        threadData.shared.unprocessed = set([target])

        # Also enqueue the site root derived from the target URL
        _ = re.sub(r"(?<!/)/(?!/).*", "", target)
        if _:
            if target.strip('/') != _.strip('/'):
                threadData.shared.unprocessed.add(_)

        if re.search(r"\?.*\b\w+=", target):
            threadData.shared.value.add(target)

        if kb.checkSitemap is None:
            message = "do you want to check for the existence of "
            message += "site's sitemap(.xml) [y/N] "
            kb.checkSitemap = readInput(message, default='N', boolean=True)

        if kb.checkSitemap:
            found = True
            items = None
            url = _urllib.parse.urljoin(target, "/sitemap.xml")
            try:
                # Parse the sitemap
                items = parseSitemap(url)
            except SqlmapConnectionException as ex:
                if "page not found" in getSafeExString(ex):
                    found = False
                    logger.warning("'sitemap.xml' not found")
            except:
                pass
            finally:
                if found:
                    if items:
                        for item in items:
                            if re.search(r"(.*?)\?(.+)", item):
                                threadData.shared.value.add(item)

                        if conf.crawlDepth > 1:
                            threadData.shared.unprocessed.update(items)

                    logger.info("%s links found" % ("no" if not items else len(items)))

        if not conf.bulkFile:
            infoMsg = "starting crawler for target URL '%s'" % target
            logger.info(infoMsg)

        # Crawl with multiple threads, one pass per depth level
        for i in xrange(conf.crawlDepth):
            threadData.shared.count = 0
            threadData.shared.length = len(threadData.shared.unprocessed)
            numThreads = min(conf.threads, len(threadData.shared.unprocessed))

            if not conf.bulkFile:
                logger.info("searching for links with depth %d" % (i + 1))

            runThreads(numThreads, crawlThread, threadChoice=(i > 0))
            clearConsoleLine(True)

            if threadData.shared.deeper:
                threadData.shared.unprocessed = set(threadData.shared.deeper)
            else:
                break

    except KeyboardInterrupt:
        warnMsg = "user aborted during crawling. sqlmap "
        warnMsg += "will use partial list"
        logger.warning(warnMsg)

    finally:
        # Clear the console line
        clearConsoleLine(True)

        # If no usable links were found
        if not threadData.shared.value:
            # ... and no forms were found either, print a warning
            if not (conf.forms and threadData.shared.formsFound):
                warnMsg = "no usable links found (with GET parameters)"

                if conf.forms:
                    warnMsg += " or forms"

                logger.warning(warnMsg)
        else:
            # Add the found links to kb.targets
            for url in threadData.shared.value:
                kb.targets.add((urldecode(url, kb.pageEncoding), None, None, None, None))

        if kb.targets:
            # Ask whether to normalize crawling results (if not decided yet)
            if kb.normalizeCrawlingChoice is None:
                message = "do you want to normalize "
                message += "crawling results [Y/n] "

                kb.normalizeCrawlingChoice = readInput(message, default='Y', boolean=True)

            # If the user chose normalization, deduplicate targets by path and parameter names
            if kb.normalizeCrawlingChoice:
                seen = set()
                results = OrderedSet()

                for target in kb.targets:
                    value = "%s%s%s" % (target[0], '&' if '?' in target[0] else '?', target[2] or "")
                    match = re.search(r"/[^/?]*\?.+\Z", value)
                    if match:
                        key = re.sub(r"=[^=&]*", "=", match.group(0)).strip("&?")
                        if '=' in key and key not in seen:
                            results.add(target)
                            seen.add(key)

                kb.targets = results

        # Store the results to a file
        storeResultsToFile(kb.targets)

def storeResultsToFile(results):
    # Return if there are no results
    if not results:
        return

    # If kb.storeCrawlingChoice is unset, ask the user whether to store the crawling results to a temporary file
    if kb.storeCrawlingChoice is None:
        message = "do you want to store crawling results to a temporary file "
        message += "for eventual further processing with other tools [y/N] "

        # Read user input (default 'N') as a boolean value
        kb.storeCrawlingChoice = readInput(message, default='N', boolean=True)

    # If the user chose to store the results, create a temporary file
    if kb.storeCrawlingChoice:
        # Create the temporary file, getting back a handle and a filename
        handle, filename = tempfile.mkstemp(prefix=MKSTEMP_PREFIX.CRAWLER, suffix=".csv" if conf.forms else ".txt")
        # Close the file handle
        os.close(handle)

        # Log that the crawling results are being written to the temporary file
        infoMsg = "writing crawling results to a temporary file '%s' " % filename
        logger.info(infoMsg)

        # Open the file in binary write mode
        with openFile(filename, "w+b") as f:
            # Write a CSV header when form crawling is enabled
            if conf.forms:
                f.write("URL,POST\n")

            # Write the URL (and POST data) of each result
            for url, _, data, _, _ in results:
                if conf.forms:
                    f.write("%s,%s\n" % (safeCSValue(url), safeCSValue(data or "")))
                else:
                    f.write("%s\n" % url)