|  |  | @ -79,51 +79,71 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. | 
			
		
	
		
		
			
				
					
					|  |  |  | from __future__ import generators |  |  |  | from __future__ import generators | 
			
		
	
		
		
			
				
					
					|  |  |  | from __future__ import print_function |  |  |  | from __future__ import print_function | 
			
		
	
		
		
			
				
					
					|  |  |  | 
 |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | # __future__ imports: opt in to newer language features on older interpreters that already support them | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | # generators: enables generator functions (`yield`) on Python 2.2; a no-op on later versions, where generators are always available | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | # print_function: 使得代码可以使用 Python 3.x 中的 print 函数 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  | __author__ = "Leonard Richardson (leonardr@segfault.org)" |  |  |  | __author__ = "Leonard Richardson (leonardr@segfault.org)" | 
			
		
	
		
		
			
				
					
					|  |  |  | __version__ = "3.2.1b" |  |  |  | __version__ = "3.2.1b" | 
			
		
	
		
		
			
				
					
					|  |  |  | __copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" |  |  |  | __copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" | 
			
		
	
		
		
			
				
					
					|  |  |  | __license__ = "New-style BSD" |  |  |  | __license__ = "New-style BSD" | 
			
		
	
		
		
			
				
					
					|  |  |  | 
 |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | # 定义库的作者、版本、版权和许可证信息 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  | import codecs |  |  |  | import codecs | 
			
		
	
		
		
			
				
					
					|  |  |  | import re |  |  |  | import re | 
			
		
	
		
		
			
				
					
					|  |  |  | import sys |  |  |  | import sys | 
			
		
	
		
		
			
				
					
					|  |  |  | 
 |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | # 导入 Python 标准库中的模块 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  | if sys.version_info >= (3, 0): |  |  |  | if sys.version_info >= (3, 0): | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     # Python 3.x 版本兼容处理 | 
			
		
	
		
		
			
				
					
					|  |  |  |     xrange = range |  |  |  |     xrange = range | 
			
		
	
		
		
			
				
					
					|  |  |  |     text_type = str |  |  |  |     text_type = str | 
			
		
	
		
		
			
				
					
					|  |  |  |     binary_type = bytes |  |  |  |     binary_type = bytes | 
			
		
	
		
		
			
				
					
					|  |  |  |     basestring = str |  |  |  |     basestring = str | 
			
		
	
		
		
			
				
					
					|  |  |  |     unichr = chr |  |  |  |     unichr = chr | 
			
		
	
		
		
			
				
					
					|  |  |  | else: |  |  |  | else: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     # Python 2.x 版本兼容处理 | 
			
		
	
		
		
			
				
					
					|  |  |  |     text_type = unicode |  |  |  |     text_type = unicode | 
			
		
	
		
		
			
				
					
					|  |  |  |     binary_type = str |  |  |  |     binary_type = str | 
			
		
	
		
		
			
				
					
					|  |  |  | 
 |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  | try: |  |  |  | try: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     # 尝试从 Python 3.x 的标准库中导入 name2codepoint | 
			
		
	
		
		
			
				
					
					|  |  |  |     from html.entities import name2codepoint |  |  |  |     from html.entities import name2codepoint | 
			
		
	
		
		
			
				
					
					|  |  |  | except ImportError: |  |  |  | except ImportError: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     # 如果导入失败(在 Python 2.x 中),则从 htmlentitydefs 模块导入 | 
			
		
	
		
		
			
				
					
					|  |  |  |     from htmlentitydefs import name2codepoint |  |  |  |     from htmlentitydefs import name2codepoint | 
			
		
	
		
		
			
				
					
					|  |  |  | 
 |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  | try: |  |  |  | try: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     # Check whether the built-in set type exists (it has been a builtin since Python 2.4) | 
			
		
	
		
		
			
				
					
					|  |  |  |     set |  |  |  |     set | 
			
		
	
		
		
			
				
					
					|  |  |  | except NameError: |  |  |  | except NameError: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     # No built-in set (Python older than 2.4): fall back to the Set class from the sets module | 
			
		
	
		
		
			
				
					
					|  |  |  |     from sets import Set as set |  |  |  |     from sets import Set as set | 
			
		
	
		
		
			
				
					
					|  |  |  | 
 |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  | try: |  |  |  | try: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     # Try the standard-library sgmllib module (present on Python 2; removed in Python 3) | 
			
		
	
		
		
			
				
					
					|  |  |  |     import sgmllib |  |  |  |     import sgmllib | 
			
		
	
		
		
			
				
					
					|  |  |  | except ImportError: |  |  |  | except ImportError: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     # If that import fails (e.g. on Python 3), use the copy of sgmllib bundled under lib.utils | 
			
		
	
		
		
			
				
					
					|  |  |  |     from lib.utils import sgmllib |  |  |  |     from lib.utils import sgmllib | 
			
		
	
		
		
			
				
					
					|  |  |  | 
 |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  | try: |  |  |  | try: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     # Try the markupbase module (its name on Python 2) | 
			
		
	
		
		
			
				
					
					|  |  |  |     import markupbase |  |  |  |     import markupbase | 
			
		
	
		
		
			
				
					
					|  |  |  | except ImportError: |  |  |  | except ImportError: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     # If that import fails (on Python 3), the module is named _markupbase instead | 
			
		
	
		
		
			
				
					
					|  |  |  |     import _markupbase as markupbase |  |  |  |     import _markupbase as markupbase | 
			
		
	
		
		
			
				
					
					|  |  |  | 
 |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  | #These hacks make Beautiful Soup able to parse XML with namespaces |  |  |  | # 这些 hack 使得 Beautiful Soup 能够解析带有命名空间的 XML | 
			
				
				
			
		
	
		
		
	
		
		
			
				
					
					|  |  |  | sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') |  |  |  | sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') | 
			
		
	
		
		
			
				
					
					|  |  |  | markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match |  |  |  | markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match | 
			
		
	
		
		
			
				
					
					|  |  |  | 
 |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  | DEFAULT_OUTPUT_ENCODING = "utf-8" |  |  |  | DEFAULT_OUTPUT_ENCODING = "utf-8" | 
			
		
	
		
		
			
				
					
					|  |  |  | 
 |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | # 设置默认的输出编码为 UTF-8 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  | def _match_css_class(str): |  |  |  | def _match_css_class(str): | 
			
		
	
		
		
			
				
					
					|  |  |  |     """Build a RE to match the given CSS class.""" |  |  |  |     """Build a RE to match the given CSS class.""" | 
			
		
	
		
		
			
				
					
					|  |  |  |     return re.compile(r"(^|.*\s)%s($|\s)" % str) |  |  |  |     return re.compile(r"(^|.*\s)%s($|\s)" % str) | 
			
		
	
	
		
		
			
				
					|  |  | 
 |