@ -1,141 +1,165 @@
""" A parser for SGML, using the derived class as a static DTD. """
""" 一个用于解析SGML的解析器,使用派生类作为静态DTD(文档类型定义)。 """
# Note: missing in Python3
# 注意:Python3中已移除此模块
# XXX This only supports those SGML features used by HTML.
# XXX 这个解析器只支持HTML中使用的SGML特性
# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special )
# and CDATA (character data -- only end tags are special). RCDATA is
# not supported at all.
# XXX 应该有一种方法来区分以下三种数据类型:
# PCDATA(解析字符数据 - 正常情况)
# RCDATA(可替换字符数据 - 只有字符、实体引用和结束标签是特殊的 )
# CDATA(字符数据 - 只有结束标签是特殊的)
# 目前不支持RCDATA
from __future__ import print_function
try:
    # Python 3 ships the parser base class in the private ``_markupbase``.
    import _markupbase as markupbase
except ImportError:
    # Fall back to the module's public name when ``_markupbase`` is absent.
    import markupbase
import re
import re # 导入正则表达式模块
# Names exported by a star-import of this module.
__all__ = ["SGMLParser", "SGMLParseError"]

# Regular expressions used for parsing
# 用于解析的正则表达式定义
# 匹配有趣的字符(&和<)
# Start of anything that is not plain character data: "&" or "<".
interesting = re.compile('[&<]')

# A tag or reference that is only partially present in the buffer.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                        '/([a-zA-Z][^<>]*)?|'
                        '![^<>]*)?')

# Entity reference such as "&amp;"; group 1 is the entity name.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Numeric character reference such as "&#160;"; group 1 is the digits.
charref = re.compile('&#([0-9]+)[^0-9]')

# Opening of a start tag ("<" followed by a letter, or the "<>" shorthand).
starttagopen = re.compile('<[>a-zA-Z]')
# Opening of the SGML short-tag form "<tag/".
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# Complete short tag "<tag/data/"; groups are (tag, data).
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Terminator of a processing instruction.
piclose = re.compile('>')
# Either angle bracket.
endbracket = re.compile('[<>]')
# A tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 3 the (possibly quoted) value.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.) The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks). Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
class SGMLParser ( markupbase . ParserBase ) :
# Definition of entities -- derived classes may override
""" SGML解析器基类 - 查找标签并调用处理函数
用法 : p = SGMLParser ( ) ; p . feed ( data ) ; . . . ; p . close ( )
DTD通过派生类定义 , 派生类需要定义特殊名称的方法来处理标签 :
- start_foo和end_foo分别处理 < foo > 和 < / foo >
- 或者do_foo单独处理 < foo >
( 标签名会被转换为小写 )
标签之间的数据通过调用self . handle_data ( data ) 传递给解析器
实体引用通过调用self . handle_entityref ( name ) 传递
"""
# 实体或字符引用的正则表达式
# Entity or character reference with an optional trailing ";".
# Groups: (entity name, character-reference digits, terminator).
entity_or_charref = re.compile('&(?:'
                               '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
                               ')(;?)')
def __init__(self, verbose=0):
    """Initialize and reset this instance.

    ``verbose`` is stored on the instance as ``self.verbose``; all other
    state is established by ``reset()``.
    """
    self.verbose = verbose
    self.reset()
def reset(self):
    """Reset this instance.  Loses all unprocessed data."""
    self.__starttag_text = None  # raw text of the most recent start tag
    self.rawdata = ''            # input received but not yet consumed
    self.stack = []              # tag stack
    self.lasttag = '???'         # last tag handled
    self.nomoretags = 0          # non-zero: stop recognising tags
    self.literal = 0             # non-zero: literal (CDATA) mode
    markupbase.ParserBase.reset(self)
def setnomoretags(self):
    """Enter literal mode (CDATA) till EOF.

    Intended for derived classes only.
    """
    self.nomoretags = self.literal = 1
def setliteral(self, *args):
    """Enter literal mode (CDATA).

    Intended for derived classes only.  Any positional arguments are
    accepted and ignored.
    """
    self.literal = 1
def feed(self, data):
    """Feed some data to the parser.

    Call this as often as you want, with as little or as much text
    as you want (may include '\\n').  (This just saves the text,
    all the processing is done by goahead().)
    """
    self.rawdata = self.rawdata + data
    self.goahead(0)
def close(self):
    """Handle the remaining data by running goahead() with end set."""
    force_end = 1
    self.goahead(force_end)
def error(self, message):
    """Raise SGMLParseError carrying ``message``."""
    failure = SGMLParseError(message)
    raise failure
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
def goahead ( self , end ) :
""" 内部方法 - 尽可能处理数据
可能会留下状态和数据等待后续调用处理
如果end为True , 则强制处理所有数据
"""
rawdata = self . rawdata
i = 0
i = 0 # 当前处理位置
n = len ( rawdata )
while i < n :
if self . nomoretags :
if self . nomoretags : # 如果在文字模式下
self . handle_data ( rawdata [ i : n ] )
i = n
break
# 查找下一个特殊字符(&或<)
match = interesting . search ( rawdata , i )
if match :
j = match . start ( )
else :
j = n
# 处理普通文本
if i < j :
self . handle_data ( rawdata [ i : j ] )
i = j
if i == n :
break
# 处理标签和实体引用
if rawdata [ i ] == ' < ' :
if starttagopen . match ( rawdata , i ) :
if starttagopen . match ( rawdata , i ) : # 开始标签
if self . literal :
self . handle_data ( rawdata [ i ] )
i = i + 1
@ -145,7 +169,7 @@ class SGMLParser(markupbase.ParserBase):
break
i = k
continue
if rawdata . startswith ( " </ " , i ) :
if rawdata . startswith ( " </ " , i ) : # 结束标签
k = self . parse_endtag ( i )
if k < 0 :
break
@ -157,40 +181,32 @@ class SGMLParser(markupbase.ParserBase):
self . handle_data ( " < " )
i = i + 1
else :
# incomplete
break
continue
if rawdata . startswith ( " <!-- " , i ) :
# Strictly speaking, a comment is --.*--
# within a declaration tag <!...>.
# This should be removed,
# and comments handled only in parse_declaration.
if rawdata . startswith ( " <!-- " , i ) : # 注释
k = self . parse_comment ( i )
if k < 0 :
break
i = k
continue
if rawdata . startswith ( " <? " , i ) :
if rawdata . startswith ( " <? " , i ) : # 处理指令
k = self . parse_pi ( i )
if k < 0 :
break
i = i + k
continue
if rawdata . startswith ( " <! " , i ) :
# This is some sort of declaration; in "HTML as
# deployed," this should only be the document type
# declaration ("<!DOCTYPE html...>").
if rawdata . startswith ( " <! " , i ) : # 声明(如DOCTYPE)
k = self . parse_declaration ( i )
if k < 0 :
break
i = k
continue
elif rawdata [ i ] == ' & ' :
elif rawdata [ i ] == ' & ' : # 处理实体引用
if self . literal :
self . handle_data ( rawdata [ i ] )
i = i + 1
continue
match = charref . match ( rawdata , i )
match = charref . match ( rawdata , i ) # 字符引用
if match :
name = match . group ( 1 )
self . handle_charref ( name )
@ -198,7 +214,7 @@ class SGMLParser(markupbase.ParserBase):
if rawdata [ i - 1 ] != ' ; ' :
i = i - 1
continue
match = entityref . match ( rawdata , i )
match = entityref . match ( rawdata , i ) # 实体引用
if match :
name = match . group ( 1 )
self . handle_entityref ( name )
@ -208,8 +224,7 @@ class SGMLParser(markupbase.ParserBase):
continue
else :
self . error ( ' neither < nor & ?? ' )
# We get here only if incomplete matches but
# nothing else
# 处理不完整的匹配
match = incomplete . match ( rawdata , i )
if not match :
self . handle_data ( rawdata [ i ] )
@ -217,21 +232,20 @@ class SGMLParser(markupbase.ParserBase):
continue
j = match . end ( 0 )
if j == n :
break # Really incomplete
break
self . handle_data ( rawdata [ i : j ] )
i = j
# end while
# 处理剩余数据
if end and i < n :
self . handle_data ( rawdata [ i : n ] )
i = n
self . rawdata = rawdata [ i : ]
# XXX if end: check for empty stack
# Extensions for the DOCTYPE scanner:
_decl_otherchars = '='
# Internal -- parse processing instr, return length or -1 if not terminated
def parse_pi ( self , i ) :
""" 内部方法 - 解析处理指令
返回处理的字符数 , 如果未结束则返回 - 1
"""
rawdata = self . rawdata
if rawdata [ i : i + 2 ] != ' <? ' :
self . error ( ' unexpected call to parse_pi() ' )
@ -244,18 +258,18 @@ class SGMLParser(markupbase.ParserBase):
return j - i
def get_starttag_text(self):
    """Return the stored text of the most recently processed start tag."""
    text = self.__starttag_text
    return text
# Internal -- handle starttag, return length or -1 if not terminated
def parse_starttag ( self , i ) :
""" 内部方法 - 处理开始标签
返回处理的字符数 , 如果未结束则返回 - 1
"""
self . __starttag_text = None
start_pos = i
rawdata = self . rawdata
if shorttagopen . match ( rawdata , i ) :
# SGML shorthand: <tag/data/ == <tag>data</tag>
# XXX Can data contain &... (entity or char refs)?
# XXX Can data contain < or > (tag characters)?
# XXX Can there be whitespace before the first /?
if shorttagopen . match ( rawdata , i ) : # 简写标签
match = shorttag . match ( rawdata , i )
if not match :
return - 1
@ -266,18 +280,16 @@ class SGMLParser(markupbase.ParserBase):
self . finish_shorttag ( tag , data )
self . __starttag_text = rawdata [ start_pos : match . end ( 1 ) + 1 ]
return k
# XXX The following should skip matching quotes (' or ")
# As a shortcut way to exit, this isn't so bad, but shouldn't
# be used to locate the actual end of the start tag since the
# < or > characters may be embedded in an attribute value.
# 查找标签结束位置
match = endbracket . search ( rawdata , i + 1 )
if not match :
return - 1
j = match . start ( 0 )
# Now parse the data between i + 1 and j into a tag and attrs
# 解析标签名和属性
attrs = [ ]
if rawdata [ i : i + 2 ] == ' <> ' :
# SGML shorthand: <> == <last open tag seen>
if rawdata [ i : i + 2 ] == ' <> ' : # <>表示重复上一个开始标签
k = j
tag = self . lasttag
else :
@ -287,6 +299,8 @@ class SGMLParser(markupbase.ParserBase):
k = match . end ( 0 )
tag = rawdata [ i + 1 : k ] . lower ( )
self . lasttag = tag
# 解析属性
while k < j :
match = attrfind . match ( rawdata , k )
if not match :
@ -297,31 +311,31 @@ class SGMLParser(markupbase.ParserBase):
else :
if ( attrvalue [ : 1 ] == " ' " == attrvalue [ - 1 : ] or
attrvalue [ : 1 ] == ' " ' == attrvalue [ - 1 : ] ) :
# strip quotes
attrvalue = attrvalue [ 1 : - 1 ]
attrvalue = attrvalue [ 1 : - 1 ] # 去掉引号
attrvalue = self . entity_or_charref . sub (
self . _convert_ref , attrvalue )
attrs . append ( ( attrname . lower ( ) , attrvalue ) )
k = match . end ( 0 )
if rawdata [ j ] == ' > ' :
j = j + 1
self . __starttag_text = rawdata [ start_pos : j ]
self . finish_starttag ( tag , attrs )
return j
# Internal -- convert entity or character reference
def _convert_ref(self, match):
    """Internal -- convert an entity or character reference.

    ``match`` is a match of ``entity_or_charref``; its groups are the
    entity name, the character-reference digits and the optional ";".
    Returns the converted text, or the reference text itself when the
    conversion yields nothing or the entity has no terminating ";".
    """
    entity, number, terminator = match.groups()
    if number:
        # Character reference: fall back to the text as it was written.
        return self.convert_charref(number) or \
            '&#%s%s' % (number, terminator)
    if terminator:
        # Terminated entity reference.
        return self.convert_entityref(entity) or '&%s;' % entity
    # Entity name without ";" is passed through untouched.
    return '&%s' % entity
# Internal -- parse endtag
def parse_endtag ( self , i ) :
""" 内部方法 - 解析结束标签 """
rawdata = self . rawdata
match = endbracket . search ( rawdata , i + 1 )
if not match :
@ -333,15 +347,23 @@ class SGMLParser(markupbase.ParserBase):
self . finish_endtag ( tag )
return j
# Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
def finish_shorttag(self, tag, data):
    """Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)."""
    no_attrs = []
    self.finish_starttag(tag, no_attrs)
    self.handle_data(data)
    self.finish_endtag(tag)
# Internal -- finish processing of start tag
# Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
def finish_starttag ( self , tag , attrs ) :
""" 内部方法 - 完成开始标签的处理
返回 :
- 1 : 未知标签
0 : 仅开始标签
1 : 平衡标签
"""
try :
method = getattr ( self , ' start_ ' + tag )
except AttributeError :
@ -358,15 +380,15 @@ class SGMLParser(markupbase.ParserBase):
self . handle_starttag ( tag , method , attrs )
return 1
# Internal -- finish processing of end tag
def finish_endtag ( self , tag ) :
if not tag :
""" 内部方法 - 完成结束标签的处理 """
if not tag : # 空标签
found = len ( self . stack ) - 1
if found < 0 :
self . unknown_endtag ( tag )
return
else :
if tag not in self . stack :
if tag not in self . stack : # 未匹配的结束标签
try :
method = getattr ( self , ' end_ ' + tag )
except AttributeError :
@ -378,6 +400,8 @@ class SGMLParser(markupbase.ParserBase):
for i in range ( found ) :
if self . stack [ i ] == tag :
found = i
# 处理所有未闭合的标签
while len ( self . stack ) > found :
tag = self . stack [ - 1 ]
try :
@ -390,22 +414,24 @@ class SGMLParser(markupbase.ParserBase):
self . unknown_endtag ( tag )
del self . stack [ - 1 ]
# Overridable -- handle start tag
# 以下方法可被派生类重写
def handle_starttag(self, tag, method, attrs):
    """Overridable -- handle a start tag by calling ``method(attrs)``."""
    handler = method
    handler(attrs)
# Overridable -- handle end tag
def handle_endtag(self, tag, method):
    """Overridable -- handle an end tag by calling ``method()``."""
    handler = method
    handler()
# Example -- report an unbalanced </...> tag.
def report_unbalanced(self, tag):
    """Example -- report an unbalanced </...> tag.

    Prints the tag and the current tag stack, but only when
    ``self.verbose`` is true.
    """
    if self.verbose:
        print('*** Unbalanced </' + tag + '>')
        print('*** Stack:', self.stack)
def convert_charref ( self , name ) :
""" Convert character reference, may be overridden. """
""" 转换字符引用 """
try :
n = int ( name )
except ValueError :
@ -415,25 +441,25 @@ class SGMLParser(markupbase.ParserBase):
return self . convert_codepoint ( n )
def convert_codepoint(self, codepoint):
    """Return the character for ``codepoint`` as given by ``chr``."""
    character = chr(codepoint)
    return character
def handle_charref(self, name):
    """Handle character reference, no need to override.

    The converted text goes to handle_data(); a reference that
    convert_charref() maps to None goes to unknown_charref().
    """
    text = self.convert_charref(name)
    if text is not None:
        self.handle_data(text)
        return
    self.unknown_charref(name)
# Definition of entities -- derived classes may override
# 实体定义 - 派生类可以重写
entitydefs = {
    'lt': '<',
    'gt': '>',
    'amp': '&',
    'quot': '"',
    'apos': '\'',
}
def convert_entityref ( self , name ) :
""" Convert entity references.
As an alternative to overriding this method ; one can tailor the
results by setting up the self . entitydefs mapping appropriately .
""" 转换实体引用
可以通过设置self . entitydefs来自定义转换规则
"""
table = self . entitydefs
if name in table :
@ -442,61 +468,72 @@ class SGMLParser(markupbase.ParserBase):
return
def handle_entityref(self, name):
    """Handle entity references, no need to override.

    The converted text goes to handle_data(); a reference that
    convert_entityref() maps to None goes to unknown_entityref().
    """
    text = self.convert_entityref(name)
    if text is not None:
        self.handle_data(text)
        return
    self.unknown_entityref(name)
# Example -- handle data, should be overridden
# 以下是示例处理方法 - 应该被重写
def handle_data(self, data):
    """Example -- handle text data; does nothing here, should be overridden."""
    return None
# Example -- handle comment, could be overridden
def handle_comment(self, data):
    """Example -- handle a comment; does nothing here, could be overridden."""
    return None
# Example -- handle declaration, could be overridden
def handle_decl(self, decl):
    """Example -- handle a declaration; does nothing here, could be overridden."""
    return None
# Example -- handle processing instruction, could be overridden
def handle_pi(self, data):
    """Example -- handle a processing instruction; does nothing here."""
    return None
# To be overridden -- handlers for unknown objects
# 处理未知对象的方法 - 需要重写
def unknown_starttag(self, tag, attrs):
    """Hook for an unknown start tag; does nothing here, to be overridden."""
    return None
def unknown_endtag(self, tag):
    """Hook for an unknown end tag; does nothing here, to be overridden."""
    return None
def unknown_charref(self, ref):
    """Hook for an unknown character reference; does nothing here."""
    return None
def unknown_entityref(self, ref):
    """Hook for an unknown entity reference; does nothing here."""
    return None
class TestSGMLParser ( SGMLParser ) :
""" 用于测试的SGML解析器 """
def __init__(self, verbose=0):
    """Create the test parser with an empty buffer of pending text."""
    self.testdata = ""  # text collected by handle_data(), not yet printed
    SGMLParser.__init__(self, verbose)
def handle_data(self, data):
    """Collect text data, flushing once its repr reaches 70 characters."""
    self.testdata = self.testdata + data
    if len(repr(self.testdata)) >= 70:
        self.flush()
def flush(self):
    """Print the collected text, if any, and empty the buffer."""
    data = self.testdata
    if data:
        self.testdata = ""
        print('data:', repr(data))
def handle_comment ( self , data ) :
""" 打印注释 """
self . flush ( )
r = repr ( data )
if len ( r ) > 68 :
@ -504,6 +541,7 @@ class TestSGMLParser(SGMLParser):
print ( ' comment: ' , r )
def unknown_starttag ( self , tag , attrs ) :
""" 打印未知开始标签 """
self . flush ( )
if not attrs :
print ( ' start tag: < ' + tag + ' > ' )
@ -514,27 +552,33 @@ class TestSGMLParser(SGMLParser):
print ( ' > ' )
def unknown_endtag(self, tag):
    """Flush pending text, then print the unknown end tag."""
    self.flush()
    print('end tag: </' + tag + '>')
def unknown_entityref(self, ref):
    """Flush pending text, then print the unknown entity reference."""
    self.flush()
    print('*** unknown entity ref: &' + ref + ';')
def unknown_charref(self, ref):
    """Flush pending text, then print the unknown character reference."""
    self.flush()
    print('*** unknown char ref: &#' + ref + ';')
def unknown_decl(self, data):
    """Flush pending text, then print the unknown declaration."""
    self.flush()
    print('*** unknown decl: [' + data + ']')
def close(self):
    """Run SGMLParser.close(), then print whatever text is still buffered."""
    SGMLParser.close(self)
    self.flush()
def test ( args = None ) :
""" 测试函数 """
import sys
if args is None :