"""A parser for SGML, using the derived class as a static DTD."""

# NOTE: this module has been removed in Python 3.

# XXX This parser only supports the SGML features used in HTML.

# XXX There should be a way to distinguish between three kinds of data:
# PCDATA (parsed character data -- the normal case),
# RCDATA (replaceable character data -- only characters, entity
# references and end tags are special), and
# CDATA (character data -- only end tags are special).
# RCDATA is currently not supported.

from __future__ import print_function

try:
    # Prefer the private ``_markupbase`` module when it can be imported.
    import _markupbase as markupbase
except ImportError:
    # Otherwise fall back to the module's public name.
    import markupbase

import re  # regular expressions used by the tokenizer patterns

# Names exported by ``from <module> import *``.
__all__ = ["SGMLParser", "SGMLParseError"]
# Regular expressions used for parsing.

# The two characters that interrupt plain text: '&' and '<'.
interesting = re.compile('[&<]')

# A tag, entity reference or character reference that may still be
# incomplete (every part after the leading '&' or '<' is optional).
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                        '/([a-zA-Z][^<>]*)?|'
                        '![^<>]*)?')

# An entity reference such as "&amp;".  The pattern also consumes the one
# non-alphanumeric character that follows the name.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')

# A decimal character reference such as "&#160;".  The pattern also
# consumes the one non-digit character that follows the number.
charref = re.compile('&#([0-9]+)[^0-9]')

# The opening of a start tag: '<' followed by a letter, or the empty "<>".
starttagopen = re.compile('<[>a-zA-Z]')

# The opening of a short tag, i.e. "<tag/".
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')

# A complete short tag, i.e. "<tag/data/".
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')

# The '>' that closes a processing instruction.
piclose = re.compile('>')

# Either angle bracket.
endbracket = re.compile('[<>]')

# A tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')

# One attribute: a name, optionally followed by '=' and a value that is
# single-quoted, double-quoted or unquoted.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
class SGMLParseError(RuntimeError):
    """Exception raised when a parse error occurs."""
class SGMLParser(markupbase.ParserBase):
    """SGML parser base class -- find tags and call handler functions.

    Usage: p = SGMLParser(); p.feed(data); ...; p.close().

    The DTD is defined by a derived class, which defines specially named
    methods to handle tags:

    - start_foo and end_foo handle <foo> and </foo> respectively;
    - or do_foo handles <foo> by itself.

    (Tag names are converted to lower case.)

    The data between tags is passed to the parser by calling
    self.handle_data(data); entity references are passed by calling
    self.handle_entityref(name).
    """

    # An entity reference ("&name") or a decimal character reference
    # ("&#123"), each followed by an optional ';'.
    entity_or_charref = re.compile('&(?:'
                                   '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
                                   ')(;?)')

    def __init__(self, verbose=0):
        """Initialize and reset this instance."""
        self.verbose = verbose  # true: report_unbalanced() prints diagnostics
        self.reset()

    def reset(self):
        """Reset this instance, discarding all unprocessed data."""
        self.__starttag_text = None  # raw text of the latest start tag
        self.rawdata = ''  # buffered input that has not been consumed yet
        self.stack = []  # tags opened through a start_* handler
        self.lasttag = '???'  # name of the last start tag seen
        self.nomoretags = 0  # true: all remaining input is plain data
        self.literal = 0  # true: literal mode is active
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Intended for derived classes only.  Any arguments are ignored.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        May be called repeatedly with text of any length (which may
        contain newlines).  The text is appended to the buffer and
        goahead() does the actual processing.
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError with the given message."""
        raise SGMLParseError(message)

    def goahead(self, end):
        """Internal -- handle as much buffered data as possible.

        State and unconsumed data may be left over for a later call.  If
        *end* is true, everything still buffered is passed on as data.
        """
        rawdata = self.rawdata
        i = 0  # index of the next unprocessed character
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything that remains is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break

            # Pass on the plain text that precedes the next '&' or '<'.
            match = interesting.search(rawdata, i)
            j = match.start() if match else n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break

            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # Start tags are plain data in literal mode.
                        self.handle_data(rawdata[i])
                        i = i + 1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break  # incomplete: wait for more data
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    self.literal = 0  # an end tag leaves literal mode
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i + 1
                    else:
                        break  # incomplete: nothing after '<' yet
                    continue
                if rawdata.startswith("<!--", i):
                    # parse_comment() is not defined in this class; it
                    # is expected to come from the base class.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i + k  # parse_pi() returns a length, not an index
                    continue
                if rawdata.startswith("<!", i):
                    # A declaration such as DOCTYPE; parse_declaration()
                    # is likewise expected from the base class.
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    # References are plain data in literal mode.
                    self.handle_data(rawdata[i])
                    i = i + 1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed one terminating character;
                    # give it back unless it was the ';'.
                    if rawdata[i - 1] != ';':
                        i = i - 1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i - 1] != ';':
                        i = i - 1
                    continue
            else:
                self.error('neither < nor & ??')

            # Reached only when none of the constructs above matched.
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i + 1
                continue
            j = match.end(0)
            if j == n:
                break  # really incomplete: keep it buffered
            self.handle_data(rawdata[i:j])
            i = j

        # Pass on whatever is left when the caller forces completion.
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]

    def parse_pi(self, i):
        """Internal -- parse the processing instruction starting at *i*.

        Returns the number of characters consumed, or -1 if the
        instruction is not terminated yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i + 2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i + 2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i + 2: j])
        j = match.end(0)
        return j - i

    def get_starttag_text(self):
        """Return the text recorded for the most recent start tag."""
        return self.__starttag_text

    def parse_starttag(self, i):
        """Internal -- handle the start tag beginning at *i*.

        Returns the index at which parsing should continue, or -1 if
        the tag is not complete yet.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # Short tag: <tag/data/ is handled like <tag>data</tag>.
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k

        # Locate the end of the tag: the next '<' or '>'.
        match = endbracket.search(rawdata, i + 1)
        if not match:
            return -1
        j = match.start(0)

        # Split the text between i + 1 and j into a tag name and attrs.
        attrs = []
        if rawdata[i:i + 2] == '<>':
            # "<>" repeats the last start tag seen.
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i + 1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i + 1:k].lower()
            self.lasttag = tag

        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # An attribute without a value gets its name as value.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                        attrvalue[:1] == '"' == attrvalue[-1:]):
                    attrvalue = attrvalue[1:-1]  # strip the quotes
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)

        if rawdata[j] == '>':
            j = j + 1  # consume the closing bracket; a '<' is left alone
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    def _convert_ref(self, match):
        """Internal -- convert one entity or character reference.

        *match* is a match of entity_or_charref.  A reference that
        cannot be converted is returned as its original text.
        """
        if match.group(2):  # character reference
            return self.convert_charref(match.group(2)) or \
                '&#%s%s' % match.groups()[1:]
        elif match.group(3):  # entity reference terminated by ';'
            return self.convert_entityref(match.group(1)) or \
                '&%s;' % match.group(1)
        else:  # entity reference without ';' is left untouched
            return '&%s' % match.group(1)

    def parse_endtag(self, i):
        """Internal -- parse the end tag beginning at *i*.

        Returns the index at which parsing should continue, or -1 if
        the tag is not complete yet.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i + 1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i + 2:j].strip().lower()
        if rawdata[j] == '>':
            j = j + 1
        self.finish_endtag(tag)
        return j

    def finish_shorttag(self, tag, data):
        """Internal -- finish processing of a short tag.

        <tag/data/ is handled like <tag>data</tag>.
        """
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    def finish_starttag(self, tag, attrs):
        """Internal -- finish processing of a start tag.

        Returns -1 for an unknown tag, 0 for an open-only tag (handled
        by do_<tag>) and 1 for a balanced tag (handled by start_<tag>).
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            # Only tags with a start_* handler are tracked on the stack.
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    def finish_endtag(self, tag):
        """Internal -- finish processing of an end tag."""
        if not tag:
            # An empty end tag closes the innermost open tag.
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                # An end tag without a matching open tag.
                try:
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Index of the innermost (last) occurrence of the tag.
            found = len(self.stack) - 1 - self.stack[::-1].index(tag)

        # Close every tag opened at or after that position.
        while len(self.stack) > found:
            tag = self.stack[-1]
            method = getattr(self, 'end_' + tag, None)
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # The methods below may be overridden by derived classes.

    def handle_starttag(self, tag, method, attrs):
        """Handle a start tag by calling its handler with *attrs*."""
        method(attrs)

    def handle_endtag(self, tag, method):
        """Handle an end tag by calling its handler."""
        method()

    def report_unbalanced(self, tag):
        """Report an unbalanced end tag; prints only when verbose."""
        if self.verbose:
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack:', self.stack)

    def convert_charref(self, name):
        """Convert a character reference.

        Returns None when *name* is not an integer or lies outside the
        range 0..127.
        """
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127:
            return
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        """Convert a code point to a character."""
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle a character reference.

        The converted character is passed to handle_data(); a reference
        that cannot be converted goes to unknown_charref().
        """
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    # Entity definitions -- derived classes may override.
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def convert_entityref(self, name):
        """Convert an entity reference.

        The lookup uses self.entitydefs, which can be replaced to
        customize the conversion.  Returns None for an unknown name.
        """
        return self.entitydefs.get(name)

    def handle_entityref(self, name):
        """Handle an entity reference.

        The replacement text is passed to handle_data(); an unknown
        entity goes to unknown_entityref().
        """
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            self.handle_data(replacement)

    # Example handlers -- meant to be overridden.

    def handle_data(self, data):
        """Handle text data."""
        pass

    def handle_comment(self, data):
        """Handle a comment."""
        pass

    def handle_decl(self, decl):
        """Handle a declaration."""
        pass

    def handle_pi(self, data):
        """Handle a processing instruction."""
        pass

    # Handlers for unknown objects -- meant to be overridden.

    def unknown_starttag(self, tag, attrs):
        """Handle an unknown start tag."""
        pass

    def unknown_endtag(self, tag):
        """Handle an unknown end tag."""
        pass

    def unknown_charref(self, ref):
        """Handle an unknown character reference."""
        pass

    def unknown_entityref(self, ref):
        """Handle an unknown entity reference."""
        pass
class TestSGMLParser(SGMLParser):
    """SGML parser used for testing: prints what it encounters."""

    def __init__(self, verbose=0):
        """Create the text buffer, then initialize the base parser."""
        self.testdata = ""  # text collected but not printed yet
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Collect text data and print it once it gets long enough."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print the collected text data, if any, and clear the buffer."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data:', repr(data))

    def handle_comment(self, data):
        """Print a comment, abbreviating a long one in the middle."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment:', r)

    def unknown_starttag(self, tag, attrs):
        """Print an unknown start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            print('start tag: <' + tag, end=' ')
            for name, value in attrs:
                print(name + '=' + '"' + value + '"', end=' ')
            print('>')

    def unknown_endtag(self, tag):
        """Print an unknown end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print an unknown entity reference."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print an unknown character reference."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        """Print an unknown declaration."""
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Close the parser and print any text that is still buffered."""
        SGMLParser.close(self)
        self.flush()
def test(args=None):
    """Parse a file with a parser instance, one character at a time.

    *args* is a command-line style argument list and defaults to
    sys.argv[1:].  A leading '-s' selects the plain SGMLParser instead
    of TestSGMLParser.  The next argument names the file to parse
    ('-' reads standard input); without it 'test.html' is used.  If the
    file cannot be opened, the error is printed and the process exits
    with status 1.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    path = args[0] if args else 'test.html'

    if path == '-':
        # Standard input is read but deliberately left open.
        data = sys.stdin.read()
    else:
        try:
            f = open(path, 'r')
        except IOError as msg:
            print(path, ":", msg)
            sys.exit(1)
        # Close the file even when reading it fails.
        with f:
            data = f.read()

    x = klass()
    # Feed single characters so that every construct arrives in pieces.
    for c in data:
        x.feed(c)
    x.close()
# Run the self-test when this file is executed as a script.
if __name__ == '__main__':
    test()