sqlmap/src/sqlmap-master/lib/utils/crawler.py

#!/usr/bin/env python
"""
Copyright (c) 2006-2024 sqlmap developers (https://sqlmap.org/)
See the file 'LICENSE' for copying permission
"""
from __future__ import division
import os
import re
import tempfile
import time
from lib.core.common import checkSameHost
from lib.core.common import clearConsoleLine
from lib.core.common import dataToStdout
from lib.core.common import extractRegexResult
from lib.core.common import findPageForms
from lib.core.common import getSafeExString
from lib.core.common import openFile
from lib.core.common import readInput
from lib.core.common import safeCSValue
from lib.core.common import urldecode
from lib.core.compat import xrange
from lib.core.convert import htmlUnescape
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.datatype import OrderedSet
from lib.core.enums import MKSTEMP_PREFIX
from lib.core.exception import SqlmapConnectionException
from lib.core.exception import SqlmapSyntaxException
from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.parse.sitemap import parseSitemap
from lib.request.connect import Connect as Request
from thirdparty import six
from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
from thirdparty.six.moves import http_client as _http_client
from thirdparty.six.moves import urllib as _urllib

def crawl(target, post=None, cookie=None):
    # Return immediately if no target was provided
    if not target:
        return

    try:
        # Set of already visited links
        visited = set()
        threadData = getCurrentThreadData()
        threadData.shared.value = OrderedSet()
        threadData.shared.formsFound = False

        def crawlThread():
            threadData = getCurrentThreadData()

            # Keep going while the thread is allowed to continue
            while kb.threadContinue:
                with kb.locks.limit:
                    # If there are still unprocessed links
                    if threadData.shared.unprocessed:
                        current = threadData.shared.unprocessed.pop()

                        # Skip links that have already been visited
                        if current in visited:
                            continue
                        # Skip links matching the exclusion regex (if any)
                        elif conf.crawlExclude and re.search(conf.crawlExclude, current):
                            dbgMsg = "skipping '%s'" % current
                            logger.debug(dbgMsg)
                            continue
                        else:
                            visited.add(current)
                    else:
                        break

                content = None
                try:
                    # Request the page content
                    if current:
                        content = Request.getPage(url=current, post=post, cookie=None, crawling=True, raise404=False)[0]
                # Handle the different request exceptions
                except SqlmapConnectionException as ex:
                    errMsg = "connection exception detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)
                except SqlmapSyntaxException:
                    errMsg = "invalid URL detected. skipping '%s'" % current
                    logger.critical(errMsg)
                except _http_client.InvalidURL as ex:
                    errMsg = "invalid URL detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)

                if not kb.threadContinue:
                    break

                # Only parse textual (HTML) content
                if isinstance(content, six.text_type):
                    try:
                        # Extract the HTML portion of the content
                        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
                        if match:
                            content = "<html>%s</html>" % match.group(1)

                        soup = BeautifulSoup(content)
                        tags = soup('a')

                        # Look for other potential links (href/src attributes, window.open() calls)
                        tags += re.finditer(r'(?i)\s(href|src)=["\'](?P<href>[^>"\']+)', content)
                        tags += re.finditer(r'(?i)window\.open\(["\'](?P<href>[^)"\']+)["\']', content)

                        for tag in tags:
                            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                            if href:
                                if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
                                    current = threadData.lastRedirectURL[1]
                                url = _urllib.parse.urljoin(current, htmlUnescape(href))

                                # Check whether the URL points to the same host as the target
                                _ = checkSameHost(url, target)

                                # Check whether the URL is within the configured scope
                                if conf.scope:
                                    if not re.search(conf.scope, url, re.I):
                                        continue
                                elif not _:
                                    continue

                                # Skip URLs whose extension is in the exclusion list
                                if (extractRegexResult(r"\A[^?]+\.(?P<result>\w+)(\?|\Z)", url) or "").lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                                    with kb.locks.value:
                                        threadData.shared.deeper.add(url)

                                        # Only URLs carrying query parameters (and not static resources) become candidate targets
                                        if re.search(r"(.*?)\?(.+)", url) and not re.search(r"\?(v=)?\d+\Z", url) and not re.search(r"(?i)\.(js|css)(\?|\Z)", url):
                                            threadData.shared.value.add(url)
                    except UnicodeEncodeError:  # non-HTML files
                        pass
                    except ValueError:  # invalid links
                        pass
                    except AssertionError:  # invalid HTML
                        pass
                    finally:
                        # Check whether any forms were found on the page
                        if conf.forms:
                            threadData.shared.formsFound |= len(findPageForms(content, current, False, True)) > 0

                if conf.verbose in (1, 2):
                    threadData.shared.count += 1

                    # Print progress status
                    status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)

        threadData.shared.deeper = set()
        threadData.shared.unprocessed = set([target])

        # Also enqueue the site root derived from the target URL
        _ = re.sub(r"(?<!/)/(?!/).*", "", target)
        if _:
            if target.strip('/') != _.strip('/'):
                threadData.shared.unprocessed.add(_)

        if re.search(r"\?.*\b\w+=", target):
            threadData.shared.value.add(target)

        if kb.checkSitemap is None:
            message = "do you want to check for the existence of "
            message += "site's sitemap(.xml) [y/N] "
            kb.checkSitemap = readInput(message, default='N', boolean=True)

        if kb.checkSitemap:
            found = True
            items = None
            url = _urllib.parse.urljoin(target, "/sitemap.xml")
            try:
                # Parse the sitemap
                items = parseSitemap(url)
            except SqlmapConnectionException as ex:
                if "page not found" in getSafeExString(ex):
                    found = False
                    logger.warning("'sitemap.xml' not found")
            except:
                pass
            finally:
                if found:
                    if items:
                        for item in items:
                            if re.search(r"(.*?)\?(.+)", item):
                                threadData.shared.value.add(item)

                        if conf.crawlDepth > 1:
                            threadData.shared.unprocessed.update(items)

                    logger.info("%s links found" % ("no" if not items else len(items)))

        if not conf.bulkFile:
            infoMsg = "starting crawler for target URL '%s'" % target
            logger.info(infoMsg)

        # Crawl with multiple threads, one pass per depth level
        for i in xrange(conf.crawlDepth):
            threadData.shared.count = 0
            threadData.shared.length = len(threadData.shared.unprocessed)
            numThreads = min(conf.threads, len(threadData.shared.unprocessed))

            if not conf.bulkFile:
                logger.info("searching for links with depth %d" % (i + 1))

            runThreads(numThreads, crawlThread, threadChoice=(i > 0))
            clearConsoleLine(True)

            if threadData.shared.deeper:
                threadData.shared.unprocessed = set(threadData.shared.deeper)
            else:
                break

    except KeyboardInterrupt:
        warnMsg = "user aborted during crawling. sqlmap "
        warnMsg += "will use partial list"
        logger.warning(warnMsg)

    finally:
        # Clear the console line
        clearConsoleLine(True)

        # If no usable links were found
        if not threadData.shared.value:
            # ... and no forms were found either, print a warning
            if not (conf.forms and threadData.shared.formsFound):
                warnMsg = "no usable links found (with GET parameters)"

                if conf.forms:
                    warnMsg += " or forms"

                logger.warning(warnMsg)
        else:
            # Add the found links to kb.targets
            for url in threadData.shared.value:
                kb.targets.add((urldecode(url, kb.pageEncoding), None, None, None, None))

        if kb.targets:
            # Ask whether to normalize crawling results (if not decided yet)
            if kb.normalizeCrawlingChoice is None:
                message = "do you want to normalize "
                message += "crawling results [Y/n] "

                kb.normalizeCrawlingChoice = readInput(message, default='Y', boolean=True)

            # If the user chose normalization, deduplicate targets by path and parameter names
            if kb.normalizeCrawlingChoice:
                seen = set()
                results = OrderedSet()

                for target in kb.targets:
                    value = "%s%s%s" % (target[0], '&' if '?' in target[0] else '?', target[2] or "")
                    match = re.search(r"/[^/?]*\?.+\Z", value)
                    if match:
                        key = re.sub(r"=[^=&]*", "=", match.group(0)).strip("&?")
                        if '=' in key and key not in seen:
                            results.add(target)
                            seen.add(key)

                kb.targets = results

        # Store the results to a file
        storeResultsToFile(kb.targets)

def storeResultsToFile(results):
    # Return if there are no results
    if not results:
        return

    # If kb.storeCrawlingChoice is unset, ask the user whether to store the crawling results to a temporary file
    if kb.storeCrawlingChoice is None:
        message = "do you want to store crawling results to a temporary file "
        message += "for eventual further processing with other tools [y/N] "

        # Read user input (default 'N') as a boolean value
        kb.storeCrawlingChoice = readInput(message, default='N', boolean=True)

    # If the user chose to store the results, create a temporary file
    if kb.storeCrawlingChoice:
        # Create the temporary file, getting back a handle and a filename
        handle, filename = tempfile.mkstemp(prefix=MKSTEMP_PREFIX.CRAWLER, suffix=".csv" if conf.forms else ".txt")
        # Close the file handle
        os.close(handle)

        # Log that the crawling results are being written to the temporary file
        infoMsg = "writing crawling results to a temporary file '%s' " % filename
        logger.info(infoMsg)

        # Open the file in binary write mode
        with openFile(filename, "w+b") as f:
            # Write a CSV header when form crawling is enabled
            if conf.forms:
                f.write("URL,POST\n")

            # Write the URL (and POST data) of each result
            for url, _, data, _, _ in results:
                if conf.forms:
                    f.write("%s,%s\n" % (safeCSValue(url), safeCSValue(data or "")))
                else:
                    f.write("%s\n" % url)