You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
sqlmap/src/sqlmap-master/lib/utils/sgmllib.py

619 lines
18 KiB

"""A parser for SGML, using the derived class as a static DTD (document type definition)."""
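# NOTE: this module was removed in Python 3
# XXX This parser only supports the SGML features used by HTML
# XXX There should be a way to distinguish between the following three kinds of data:
# PCDATA (parsed character data - the normal case)
# RCDATA (replaceable character data - only character references, entity references and end tags are special)
# CDATA (character data - only end tags are special)
# RCDATA is currently not supported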
from __future__ import print_function
# Bind the ParserBase provider to the name ``markupbase``: try the
# ``_markupbase`` module first and fall back to ``markupbase`` when that
# import is unavailable.  Only ImportError triggers the fallback so that
# unrelated failures (e.g. KeyboardInterrupt) are not silently swallowed.
try:
    import _markupbase as markupbase
except ImportError:
    import markupbase
import re # 导入正则表达式模块
__all__ = ["SGMLParser", "SGMLParseError"] # 指定可被导入的公共接口
# Regular expressions used while scanning the input.

# The two characters that interrupt plain text: '&' and '<'.
interesting = re.compile(r'[&<]')

# A tag or an entity/character reference that may still be incomplete.
incomplete = re.compile(
    r'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
    r'<([a-zA-Z][^<>]*|'
    r'/([a-zA-Z][^<>]*)?|'
    r'![^<>]*)?'
)

# Entity reference such as &amp; (followed by one terminating character).
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')

# Character reference such as &#160; (followed by one terminating character).
charref = re.compile(r'&#([0-9]+)[^0-9]')

# Beginning of a start tag ('<' followed by a letter or by '>').
starttagopen = re.compile(r'<[>a-zA-Z]')

# Beginning of a short tag, e.g. <tag/
shorttagopen = re.compile(r'<[a-zA-Z][-.a-zA-Z0-9]*/')

# A complete short tag, e.g. <tag/data/
shorttag = re.compile(r'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')

# The '>' that terminates a processing instruction.
piclose = re.compile(r'>')

# Either angle bracket.
endbracket = re.compile(r'[<>]')

# A tag name.
tagfind = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*')

# One attribute: a name, optionally followed by '=' and a quoted or bare value.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?'
)
class SGMLParseError(RuntimeError):
    """Exception raised by ``SGMLParser.error()`` when parsing fails."""
class SGMLParser(markupbase.ParserBase):
    """Base SGML parser: finds tags and dispatches to handler methods.

    Usage: ``p = SGMLParser(); p.feed(data); ...; p.close()``

    The DTD is expressed by a derived class that defines specially named
    methods:

    - ``start_foo`` and ``end_foo`` handle ``<foo>`` and ``</foo>``;
    - ``do_foo`` alone handles a ``<foo>`` that is not pushed on the stack.

    Tag names are lowercased before the handler lookup.  Text between tags
    is passed to ``handle_data(data)``; entity references are passed to
    ``handle_entityref(name)``.
    """

    # Matches an entity reference (group 1) or a character reference
    # (group 2) with an optional trailing ';' (group 3); used to expand
    # references inside attribute values.
    entity_or_charref = re.compile('&(?:'
                                   '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
                                   ')(;?)')

    def __init__(self, verbose=0):
        """Store the verbosity flag and reset the parser state."""
        self.verbose = verbose  # when true, report_unbalanced() prints
        self.reset()

    def reset(self):
        """Reset the instance, discarding any unprocessed data."""
        self.__starttag_text = None  # raw text of the latest start tag
        self.rawdata = ''            # buffered, not yet consumed input
        self.stack = []              # open tags that have a start_* handler
        self.lasttag = '???'         # last start tag name seen (for '<>')
        self.nomoretags = 0          # true: treat all remaining input as data
        self.literal = 0             # true: literal (CDATA) mode
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) until the end of input.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA); any arguments are ignored.

        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Append *data* to the buffer and process as much as possible.

        May be called repeatedly with text of any length (newlines
        included); the actual work is done by goahead().
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Process whatever is left in the buffer as if it were complete."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError with *message*."""
        raise SGMLParseError(message)

    def goahead(self, end):
        """Internal: process as much of the buffered data as possible.

        Unconsumed input (e.g. an incomplete tag) stays in ``self.rawdata``
        for a later call.  If *end* is true, whatever remains after the
        scan is flushed through handle_data().
        """
        rawdata = self.rawdata
        i = 0  # current scan position
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything left is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Find the next special character ('&' or '<').
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            # Emit the plain text that precedes it.
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):  # start tag
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i + 1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):  # end tag
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    self.literal = 0  # an end tag leaves literal mode
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i + 1
                    else:
                        # A lone '<' at the end of the buffer: wait for more.
                        break
                    continue
                if rawdata.startswith("<!--", i):  # comment
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):  # processing instruction
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    # parse_pi() returns a length, not an index.
                    i = i + k
                    continue
                if rawdata.startswith("<!", i):  # declaration (e.g. DOCTYPE)
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i + 1
                    continue
                match = charref.match(rawdata, i)  # character reference
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumes one terminator character; give
                    # it back unless it was the ';'.
                    if rawdata[i - 1] != ';':
                        i = i - 1
                    continue
                match = entityref.match(rawdata, i)  # entity reference
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i - 1] != ';':
                        i = i - 1
                    continue
            else:
                self.error('neither < nor & ??')
            # Reached only when none of the constructs above matched.
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i + 1
                continue
            j = match.end(0)
            if j == n:
                break  # really incomplete: wait for more input
            self.handle_data(rawdata[i:j])
            i = j
        # Flush the remainder when the caller says the input is finished.
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]

    def parse_pi(self, i):
        """Internal: parse the processing instruction starting at *i*.

        Returns the number of characters consumed, or -1 if the closing
        '>' has not been buffered yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i + 2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i + 2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i + 2: j])
        j = match.end(0)
        return j - i

    def get_starttag_text(self):
        """Return the raw text of the most recently parsed start tag."""
        return self.__starttag_text

    def parse_starttag(self, i):
        """Internal: handle the start tag beginning at index *i*.

        Returns the index just past the tag, or -1 if the tag is not yet
        terminated in the buffer.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ is equivalent to <tag>data</tag>
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # Locate the bracket that ends the tag.
        match = endbracket.search(rawdata, i + 1)
        if not match:
            return -1
        j = match.start(0)
        # Parse the text between i + 1 and j into a tag name and attributes.
        attrs = []
        if rawdata[i:i + 2] == '<>':
            # SGML shorthand: <> repeats the last start tag seen.
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i + 1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i + 1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute without a value: the name doubles as the value.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                        attrvalue[:1] == '"' == attrvalue[-1:]):
                    attrvalue = attrvalue[1:-1]  # strip the quotes
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        if rawdata[j] == '>':
            j = j + 1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    def _convert_ref(self, match):
        """Internal: replacement callback for ``entity_or_charref.sub()``.

        Returns the converted text, or the reference unchanged when it
        cannot be converted.
        """
        if match.group(2):  # character reference
            return self.convert_charref(match.group(2)) or \
                '&#%s%s' % match.groups()[1:]
        elif match.group(3):  # entity reference terminated by ';'
            return self.convert_entityref(match.group(1)) or \
                '&%s;' % match.group(1)
        else:
            # Entity name without ';': left untouched.
            return '&%s' % match.group(1)

    def parse_endtag(self, i):
        """Internal: parse the end tag beginning at index *i*.

        Returns the index just past the tag, or -1 if the tag is not yet
        terminated in the buffer.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i + 1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i + 2:j].strip().lower()
        if rawdata[j] == '>':
            j = j + 1
        self.finish_endtag(tag)
        return j

    def finish_shorttag(self, tag, data):
        """Internal: finish a short tag.

        ``<tag/data/`` is treated like ``<tag>data</tag>``.
        """
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    def finish_starttag(self, tag, attrs):
        """Internal: dispatch a start tag to its handler.

        Returns:
            -1 for an unknown tag (unknown_starttag() was called),
            0 for a start-only tag handled by ``do_<tag>``,
            1 for a balanced tag handled by ``start_<tag>`` (pushed on
            the stack).
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    def finish_endtag(self, tag):
        """Internal: dispatch an end tag, closing open tags as needed."""
        if not tag:
            # Empty end tag: close only the innermost open tag.
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                # End tag without a matching open tag.
                try:
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Index of the last occurrence of the tag on the stack.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        # Close every tag from the top of the stack down to `found`.
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # The methods below may be overridden by derived classes.

    def handle_starttag(self, tag, method, attrs):
        """Call the start tag handler *method* with *attrs*."""
        method(attrs)

    def handle_endtag(self, tag, method):
        """Call the end tag handler *method*."""
        method()

    def report_unbalanced(self, tag):
        """Print a report about an unbalanced end tag when verbose is set."""
        if self.verbose:
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack:', self.stack)

    def convert_charref(self, name):
        """Convert a decimal character reference to a string.

        Returns None when *name* is not an integer or lies outside 0..127.
        """
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127:
            return
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        """Convert a numeric code point to a character."""
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle a character reference; unknown ones go to unknown_charref()."""
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    # Entity definitions -- derived classes may override.
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def convert_entityref(self, name):
        """Convert an entity reference to a string via ``self.entitydefs``.

        Returns None when *name* is not in the table.
        """
        table = self.entitydefs
        if name in table:
            return table[name]
        else:
            return

    def handle_entityref(self, name):
        """Handle an entity reference; unknown ones go to unknown_entityref()."""
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            self.handle_data(replacement)

    # Example handlers -- meant to be overridden.

    def handle_data(self, data):
        """Handle text data (no-op by default)."""
        pass

    def handle_comment(self, data):
        """Handle a comment (no-op by default)."""
        pass

    def handle_decl(self, decl):
        """Handle a declaration (no-op by default)."""
        pass

    def handle_pi(self, data):
        """Handle a processing instruction (no-op by default)."""
        pass

    # Handlers for unknown objects -- meant to be overridden.

    def unknown_starttag(self, tag, attrs):
        """Handle a start tag that has no handler (no-op by default)."""
        pass

    def unknown_endtag(self, tag):
        """Handle an end tag that has no handler (no-op by default)."""
        pass

    def unknown_charref(self, ref):
        """Handle an unconvertible character reference (no-op by default)."""
        pass

    def unknown_entityref(self, ref):
        """Handle an unknown entity reference (no-op by default)."""
        pass
class TestSGMLParser(SGMLParser):
    """SGML parser used for testing: prints every event it receives."""

    def __init__(self, verbose=0):
        """Create the text accumulator, then initialise the base parser."""
        # Must exist before SGMLParser.__init__() runs reset().
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Accumulate text and flush once its repr reaches 70 characters."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print the accumulated text, if any, and clear the accumulator."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data:', repr(data))

    def handle_comment(self, data):
        """Print a comment, abbreviating reprs longer than 68 characters."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment:', r)

    def unknown_starttag(self, tag, attrs):
        """Print a start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            print('start tag: <' + tag, end=' ')
            for name, value in attrs:
                print(name + '=' + '"' + value + '"', end=' ')
            print('>')

    def unknown_endtag(self, tag):
        """Print an end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print an unknown entity reference."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print an unknown character reference."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        """Print an unknown declaration."""
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print any text still accumulated."""
        SGMLParser.close(self)
        self.flush()
def test(args=None):
    """Parse a file one character at a time with a test parser.

    *args* defaults to ``sys.argv[1:]``.  A leading ``-s`` selects the
    silent SGMLParser instead of TestSGMLParser.  The next argument is the
    file to read (default ``test.html``); ``-`` reads standard input.
    If the file cannot be opened, the error is printed and the process
    exits with status 1.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        path = args[0]
    else:
        path = 'test.html'

    if path == '-':
        f = sys.stdin
    else:
        try:
            f = open(path, 'r')
        except IOError as msg:
            print(path, ":", msg)
            sys.exit(1)

    try:
        data = f.read()
    finally:
        # Close the file even when read() fails; never close stdin.
        if f is not sys.stdin:
            f.close()

    x = klass()
    # Feed a single character at a time to exercise incremental parsing.
    for c in data:
        x.feed(c)
    x.close()


if __name__ == '__main__':
    test()