|
|
|
@ -79,51 +79,71 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
|
|
|
|
|
from __future__ import generators
|
|
|
|
|
from __future__ import print_function
|
|
|
|
|
|
|
|
|
|
# __future__ imports: opt in to newer language features so the same source runs on old Python 2 interpreters as well as Python 3
|
|
|
|
|
# generators: enables generator syntax (the yield keyword) on Python 2.2; it is a no-op on every later version
|
|
|
|
|
# print_function: makes print a function, as in Python 3, when running on Python 2.6+
|
|
|
|
|
|
|
|
|
|
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
|
|
|
|
__version__ = "3.2.1b"
|
|
|
|
|
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
|
|
|
|
|
__license__ = "New-style BSD"
|
|
|
|
|
|
|
|
|
|
# Library metadata: author, version, copyright and license
|
|
|
|
|
|
|
|
|
|
import codecs
|
|
|
|
|
import re
|
|
|
|
|
import sys
|
|
|
|
|
|
|
|
|
|
# Standard-library modules
|
|
|
|
|
|
|
|
|
|
if sys.version_info >= (3, 0):
|
|
|
|
|
# Python 3: alias the Python 2 built-in names (xrange, basestring, unichr) to their Python 3 equivalents
|
|
|
|
|
xrange = range
|
|
|
|
|
text_type = str
|
|
|
|
|
binary_type = bytes
|
|
|
|
|
basestring = str
|
|
|
|
|
unichr = chr
|
|
|
|
|
else:
|
|
|
|
|
# Python 2: text is unicode and str is the byte-string type
|
|
|
|
|
text_type = unicode
|
|
|
|
|
binary_type = str
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
# Python 3 location of name2codepoint
|
|
|
|
|
from html.entities import name2codepoint
|
|
|
|
|
except ImportError:
|
|
|
|
|
# On Python 2 the import above fails; the same table lives in htmlentitydefs
|
|
|
|
|
from htmlentitydefs import name2codepoint
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
# Probe for the built-in set type (added in Python 2.4); the bare name raises NameError where it does not exist
|
|
|
|
|
set
|
|
|
|
|
except NameError:
|
|
|
|
|
# Before Python 2.4 there is no built-in set; use sets.Set under the same name
|
|
|
|
|
from sets import Set as set
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
# sgmllib is in the standard library on Python 2 only (it was removed in Python 3.0)
|
|
|
|
|
import sgmllib
|
|
|
|
|
except ImportError:
|
|
|
|
|
# If the standard-library import fails (as on Python 3), use the copy bundled under lib.utils
|
|
|
|
|
from lib.utils import sgmllib
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
# markupbase is the Python 2 name of this module
|
|
|
|
|
import markupbase
|
|
|
|
|
except ImportError:
|
|
|
|
|
# On Python 3 the module is named _markupbase; import it under the old name
|
|
|
|
|
import _markupbase as markupbase
|
|
|
|
|
|
|
|
|
|
#These hacks make Beautiful Soup able to parse XML with namespaces
|
|
|
|
|
# Both replacement patterns accept ':' inside a name, so a prefixed name such as "ns:tag" is read as a single name
|
|
|
|
|
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
|
|
|
|
|
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
|
|
|
|
|
|
|
|
|
|
DEFAULT_OUTPUT_ENCODING = "utf-8"
|
|
|
|
|
|
|
|
|
|
# The default output encoding is UTF-8
|
|
|
|
|
|
|
|
|
|
def _match_css_class(str):
|
|
|
|
|
"""Build a RE to match the given CSS class."""
|
|
|
|
|
return re.compile(r"(^|.*\s)%s($|\s)" % str)
|
|
|
|
|