From 28f0660564ef22e3ad04e5ecdb875824aefa4ebd Mon Sep 17 00:00:00 2001 From: snh <1476164672@qq.com> Date: Sun, 29 Dec 2024 20:45:52 +0800 Subject: [PATCH] beautifulsoup --- .../thirdparty/beautifulsoup/beautifulsoup.py | 1626 ++++++++--------- 1 file changed, 807 insertions(+), 819 deletions(-) diff --git a/src/sqlmap-master/thirdparty/beautifulsoup/beautifulsoup.py b/src/sqlmap-master/thirdparty/beautifulsoup/beautifulsoup.py index 76c08fd..e51cb22 100644 --- a/src/sqlmap-master/thirdparty/beautifulsoup/beautifulsoup.py +++ b/src/sqlmap-master/thirdparty/beautifulsoup/beautifulsoup.py @@ -145,31 +145,37 @@ DEFAULT_OUTPUT_ENCODING = "utf-8" # 设置默认的输出编码为 UTF-8 def _match_css_class(str): +#构建一个正则表达式,以便匹配给定的CSS类名 """Build a RE to match the given CSS class.""" return re.compile(r"(^|.*\s)%s($|\s)" % str) # First, the classes that represent markup elements. class PageElement(object): +#作为页面元素的基类,包含导航信息,可以是标签或文本 """Contains the navigational information for some part of the page (either a tag or a piece of text)""" def _invert(h): + #创建一个新的字典,将原字典的键值对颠倒 "Cheap function to invert a hash." 
i = {} for k,v in h.items(): i[v] = k return i + #将XML实体映射到它们对应的特殊字符 XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", "quot" : '"', "amp" : "&", "lt" : "<", "gt" : ">" } + #创建一个新的字典,将特殊字符映射回XML实体 XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) def setup(self, parent=None, previous=None): + #初始化元素之间的关系,包括父元素、前一个元素、后一个元素以及兄弟元素 """Sets up the initial relations between this element and other elements.""" self.parent = parent @@ -182,6 +188,7 @@ class PageElement(object): self.previousSibling.nextSibling = self def replaceWith(self, replaceWith): + #首先记录当前元素的父元素和索引位置,然后提取当前元素,最后在原来的位置插入新元素 oldParent = self.parent myIndex = self.parent.index(self) if hasattr(replaceWith, "parent")\ @@ -197,6 +204,7 @@ class PageElement(object): oldParent.insert(myIndex, replaceWith) def replaceWithChildren(self): + #首先提取当前元素,然后将子元素逆序插入到父元素中 myParent = self.parent myIndex = self.parent.index(self) self.extract() @@ -206,6 +214,7 @@ class PageElement(object): myParent.insert(myIndex, child) def extract(self): + #从树中提取(移除)当前元素 """Destructively rips this element out of the tree.""" if self.parent: try: @@ -219,6 +228,7 @@ class PageElement(object): lastChild = self._lastRecursiveChild() nextElement = lastChild.next + #更新previous和next指针,以保持元素之间的正确连接 if self.previous: self.previous.next = nextElement if nextElement: @@ -226,6 +236,7 @@ class PageElement(object): self.previous = None lastChild.next = None + #清除当前元素的所有关系,包括父元素和兄弟元素,并返回当前元素 self.parent = None if self.previousSibling: self.previousSibling.nextSibling = self.nextSibling @@ -235,6 +246,7 @@ class PageElement(object): return self def _lastRecursiveChild(self): + #找到当前元素下最后一个被解析的子元素 "Finds the last element beneath this object to be parsed." 
lastChild = self while hasattr(lastChild, 'contents') and lastChild.contents: @@ -242,6 +254,7 @@ class PageElement(object): return lastChild def insert(self, position, newChild): + #在当前元素的内容列表中的指定位置插入一个新的子元素 if isinstance(newChild, basestring) \ and not isinstance(newChild, NavigableString): newChild = NavigableString(newChild) @@ -301,29 +314,31 @@ class PageElement(object): self.contents.insert(position, newChild) def append(self, tag): + #将给定的标签追加到当前元素的内容列表的末尾 """Appends the given tag to the contents of this tag.""" self.insert(len(self.contents), tag) def findNext(self, name=None, attrs={}, text=None, **kwargs): + #查找文档中当前标签之后第一个匹配给定条件的元素 """Returns the first item that matches the given criteria and appears after this Tag in the document.""" return self._findOne(self.findAllNext, name, attrs, text, **kwargs) - def findAllNext(self, name=None, attrs={}, text=None, limit=None, - **kwargs): + def findAllNext(self, name=None, attrs={}, text=None, limit=None, **kwargs): + #查找文档中当前标签之后所有匹配给定条件的元素 """Returns all items that match the given criteria and appear after this Tag in the document.""" - return self._findAll(name, attrs, text, limit, self.nextGenerator, - **kwargs) + return self._findAll(name, attrs, text, limit, self.nextGenerator, **kwargs) def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): + #查找文档中当前标签之后第一个匹配给定条件的兄弟标签 """Returns the closest sibling to this Tag that matches the given criteria and appears after this Tag in the document.""" return self._findOne(self.findNextSiblings, name, attrs, text, **kwargs) - def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, - **kwargs): + def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs): + #查找文档中当前标签之后所有匹配给定条件的兄弟标签 """Returns the siblings of this Tag that match the given criteria and appear after this Tag in the document.""" return self._findAll(name, attrs, text, limit, @@ -331,6 +346,7 @@ class PageElement(object): fetchNextSiblings = 
findNextSiblings # Compatibility with pre-3.x def findPrevious(self, name=None, attrs={}, text=None, **kwargs): + #查找文档中当前标签之前第一个匹配给定条件的元素 """Returns the first item that matches the given criteria and appears before this Tag in the document.""" return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) @@ -344,171 +360,230 @@ class PageElement(object): fetchPrevious = findAllPrevious # Compatibility with pre-3.x def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): + #查找文档中当前标签之前最近的、匹配给定条件的兄弟标签 """Returns the closest sibling to this Tag that matches the given criteria and appears before this Tag in the document.""" - return self._findOne(self.findPreviousSiblings, name, attrs, text, - **kwargs) + return self._findOne(self.findPreviousSiblings, name, attrs, text, **kwargs) - def findPreviousSiblings(self, name=None, attrs={}, text=None, - limit=None, **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear before this Tag in the document.""" + + def findPreviousSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs): + """ + 返回在文档中出现在当前标签之前且符合给定条件的所有兄弟标签。 + + 参数: + name -- 要搜索的标签名称,可以是字符串、正则表达式或列表。 + attrs -- 要搜索的标签属性,可以是字典或关键字参数。 + text -- 要搜索的文本内容,可以是字符串、正则表达式或列表。 + limit -- 返回结果的数量限制。 + **kwargs -- 其他关键字参数,用于扩展搜索条件。 + + 返回: + 符合条件的兄弟标签列表。 + """ return self._findAll(name, attrs, text, limit, self.previousSiblingGenerator, **kwargs) - fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x + + + # 为了与3.x之前的版本兼容,保留fetchPreviousSiblings作为findPreviousSiblings的别名 + fetchPreviousSiblings = findPreviousSiblings + def findParent(self, name=None, attrs={}, **kwargs): - """Returns the closest parent of this Tag that matches the given - criteria.""" - # NOTE: We can't use _findOne because findParents takes a different - # set of arguments. 
+ """ + 返回与给定条件匹配的最近的父标签。 + + 参数: + name -- 要搜索的标签名称,可以是字符串、正则表达式或列表。 + attrs -- 要搜索的标签属性,可以是字典或关键字参数。 + **kwargs -- 其他关键字参数,用于扩展搜索条件。 + + 返回: + 与条件匹配的最近的父标签,如果没有找到则返回None。 + """ + # 注意:我们不能使用_findOne,因为findParents接受不同的参数集。 r = None l = self.findParents(name, attrs, 1) if l: r = l[0] return r + def findParents(self, name=None, attrs={}, limit=None, **kwargs): - """Returns the parents of this Tag that match the given - criteria.""" + """返回与给定条件匹配的父标签列表。""" + return self._findAll(name, attrs, None, limit, self.parentGenerator, **kwargs) - return self._findAll(name, attrs, None, limit, self.parentGenerator, - **kwargs) - fetchParents = findParents # Compatibility with pre-3.x - #These methods do the real heavy lifting. + # 为了与3.x之前的版本兼容,保留fetchParents作为findParents的别名 + fetchParents = findParents # 兼容旧版本 + + + # 这些方法执行实际的查找操作。 def _findOne(self, method, name, attrs, text, **kwargs): + """使用指定的方法查找第一个匹配的标签。""" r = None - l = method(name, attrs, text, 1, **kwargs) + l = method(name, attrs, text, 1, **kwargs) # 调用指定的方法查找 if l: - r = l[0] + r = l[0] # 如果找到匹配项,返回第一个 return r - def _findAll(self, name, attrs, text, limit, generator, **kwargs): - "Iterates over a generator looking for things that match." + def _findAll(self, name, attrs, text, limit, generator, **kwargs): + """遍历生成器,查找所有匹配的标签。""" if isinstance(name, SoupStrainer): - strainer = name - # (Possibly) special case some findAll*(...) 
searches + strainer = name # 如果name是SoupStrainer实例,直接使用 + # 特殊情况处理 elif text is None and not limit and not attrs and not kwargs: - # findAll*(True) + # findAll*(True)的情况 if name is True: - return [element for element in generator() - if isinstance(element, Tag)] - # findAll*('tag-name') + return [element for element in generator() if isinstance(element, Tag)] + # findAll*('tag-name')的情况 elif isinstance(name, basestring): - return [element for element in generator() - if isinstance(element, Tag) and - element.name == name] + return [element for element in generator() if isinstance(element, Tag) and element.name == name] else: - strainer = SoupStrainer(name, attrs, text, **kwargs) - # Build a SoupStrainer + strainer = SoupStrainer(name, attrs, text, **kwargs) # 构建SoupStrainer else: - strainer = SoupStrainer(name, attrs, text, **kwargs) - results = ResultSet(strainer) - g = generator() + strainer = SoupStrainer(name, attrs, text, **kwargs) # 构建SoupStrainer + + results = ResultSet(strainer) # 创建结果集 + g = generator() # 获取生成器 while True: try: - i = next(g) + i = next(g) # 获取下一个元素 except StopIteration: - break + break # 如果没有更多元素,退出循环 if i: - found = strainer.search(i) + found = strainer.search(i) # 使用strainer查找匹配的标签 if found: - results.append(found) + results.append(found) # 将找到的标签添加到结果集中 if limit and len(results) >= limit: - break + break # 如果达到限制,退出循环 return results - #These Generators can be used to navigate starting from both - #NavigableStrings and Tags. 
+ + # 这些生成器可用于从NavigableStrings和Tags开始导航。 def nextGenerator(self): + """生成器,遍历当前标签的下一个元素。""" i = self while i is not None: - i = i.next - yield i + i = i.next # 获取下一个元素 + yield i # 生成下一个元素 + def nextSiblingGenerator(self): + """生成器,遍历当前标签的下一个兄弟标签。""" i = self while i is not None: - i = i.nextSibling - yield i + i = i.nextSibling # 获取下一个兄弟标签 + yield i # 生成下一个兄弟标签 + def previousGenerator(self): + """生成器,遍历当前标签的前一个元素。""" i = self while i is not None: - i = i.previous - yield i + i = i.previous # 获取前一个元素 + yield i # 生成前一个元素 + def previousSiblingGenerator(self): + """生成器,遍历当前标签的前一个兄弟标签。""" i = self while i is not None: - i = i.previousSibling - yield i + i = i.previousSibling # 获取前一个兄弟标签 + yield i # 生成前一个兄弟标签 + def parentGenerator(self): + """生成器,遍历当前标签的父标签。""" i = self while i is not None: - i = i.parent - yield i + i = i.parent # 获取父标签 + yield i # 生成父标签 + - # Utility methods + # 工具方法 def substituteEncoding(self, str, encoding=None): - encoding = encoding or "utf-8" - return str.replace("%SOUP-ENCODING%", encoding) + """替换字符串中的编码占位符为指定的编码。 + + 参数: + str -- 需要进行编码替换的字符串。 + encoding -- 指定的编码,默认为"utf-8"。 + + 返回: + 替换后的字符串。 + """ + encoding = encoding or "utf-8" # 如果没有提供编码,则使用默认的"utf-8" + return str.replace("%SOUP-ENCODING%", encoding) # 替换占位符 + def toEncoding(self, s, encoding=None): - """Encodes an object to a string in some encoding, or to Unicode. 
- .""" + """将对象编码为某种编码的字符串,或转换为Unicode字符串。""" if isinstance(s, text_type): if encoding: - s = s.encode(encoding) + s = s.encode(encoding) # 如果是文本类型且指定了编码,则进行编码 elif isinstance(s, binary_type): - s = s.encode(encoding or "utf8") + s = s.encode(encoding or "utf8") # 如果是二进制类型,则使用指定编码或默认的UTF-8进行编码 else: - s = self.toEncoding(str(s), encoding or "utf8") - return s + s = self.toEncoding(str(s), encoding or "utf8") # 其他类型,先转换为字符串再进行编码 + return s # 返回编码后的字符串 + + # 用于匹配尖括号(<、>)以及不属于任何实体引用的裸&符号的正则表达式 BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))") + def _sub_entity(self, x): - """Used with a regular expression to substitute the - appropriate XML entity for an XML special character.""" - return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" + """用于正则表达式替换,将XML特殊字符替换为相应的XML实体。""" + return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" # 返回对应的XML实体 class NavigableString(text_type, PageElement): + """ + NavigableString类是BeautifulSoup中用于处理和导航字符串的类。 + 它继承自Python的text_type(Python 3中的str类型)和PageElement,使其既可以当作字符串使用, + 也可以像页面元素一样进行导航。 + """ def __new__(cls, value): - """Create a new NavigableString. + """创建一个新的NavigableString实例。 - When unpickling a NavigableString, this method is called with - the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be - passed in to the superclass's __new__ or the superclass won't know - how to handle non-ASCII characters. + 当反序列化(unpickling)一个NavigableString时,会调用此方法, + 并且传入DEFAULT_OUTPUT_ENCODING编码的字符串。需要将这个编码传给superclass的__new__, + 否则superclass不知道如何处理非ASCII字符。 """ if isinstance(value, text_type): return text_type.__new__(cls, value) return text_type.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) def __getnewargs__(self): + """返回创建NavigableString实例时的参数。""" return (NavigableString.__str__(self),) def __getattr__(self, attr): - """text.string gives you text. 
This is for backwards - compatibility for Navigable*String, but for CData* it lets you - get the string without the CData wrapper.""" + """对于NavigableString,text.string返回的就是text本身。 + + 这是为了向后兼容Navigable*String,但对于CData*,它允许你获取没有CData包装器的字符串。 + """ if attr == 'string': return self else: - raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)) + raise AttributeError("'%s'对象没有属性'%s'" % (self.__class__.__name__, attr)) def __unicode__(self): + """返回NavigableString的Unicode表示。""" return str(self).decode(DEFAULT_OUTPUT_ENCODING) def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - # Substitute outgoing XML entities. + """返回NavigableString的字符串表示,并进行XML实体替换。 + + 参数: + encoding -- 指定编码,默认为DEFAULT_OUTPUT_ENCODING。 + + 返回: + 根据指定编码编码后的字符串,如果没有指定编码,则返回Unicode字符串。 + """ + # 替换XML特殊字符为对应的XML实体 data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self) if encoding: return data.encode(encoding) @@ -516,202 +591,205 @@ class NavigableString(text_type, PageElement): return data class CData(NavigableString): - + """ + CData类用于表示XML中的CDATA区域,它允许在XML文档中嵌入未经处理的文本数据。 + """ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + """返回CDATA区域的字符串表示,使用指定的编码。""" return "" % NavigableString.__str__(self, encoding) class ProcessingInstruction(NavigableString): + """ + ProcessingInstruction类用于表示XML处理指令,它允许在XML文档中包含处理器指令。 + """ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + """返回处理指令的字符串表示,使用指定的编码。""" output = self if "%SOUP-ENCODING%" in output: + # 如果输出中包含编码占位符,则替换为实际编码 output = self.substituteEncoding(output, encoding) return "" % self.toEncoding(output, encoding) class Comment(NavigableString): + """ + Comment类用于表示XML中的注释,它允许在XML文档中添加注释信息。 + """ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + """返回注释的字符串表示,使用指定的编码。""" return "" % NavigableString.__str__(self, encoding) class Declaration(NavigableString): + """ + Declaration类用于表示XML声明,它定义了XML文档的版本和编码等信息。 + """ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + 
"""返回XML声明的字符串表示,使用指定的编码。""" return "" % NavigableString.__str__(self, encoding) class Tag(PageElement): - - """Represents a found HTML tag with its attributes and contents.""" + """表示找到的HTML标签及其属性和内容。""" def _convertEntities(self, match): - """Used in a call to re.sub to replace HTML, XML, and numeric - entities with the appropriate Unicode characters. If HTML - entities are being converted, any unrecognized entities are - escaped.""" + """用于调用re.sub,将HTML、XML和数字实体替换为相应的Unicode字符。 + 如果正在转换HTML实体,则任何未识别的实体都会被转义。""" try: - x = match.group(1) + x = match.group(1) # 获取匹配的实体名称 if self.convertHTMLEntities and x in name2codepoint: - return unichr(name2codepoint[x]) + return unichr(name2codepoint[x]) # 转换为对应的Unicode字符 elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: - return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] + return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] # 转换为XML特殊字符 else: - return u'&%s;' % x + return u'&%s;' % x # 返回未转换的实体 elif len(x) > 0 and x[0] == '#': - # Handle numeric entities + # 处理数字实体 if len(x) > 1 and x[1] == 'x': - return unichr(int(x[2:], 16)) + return unichr(int(x[2:], 16)) # 处理十六进制数字实体 else: - return unichr(int(x[1:])) + return unichr(int(x[1:])) # 处理十进制数字实体 elif self.escapeUnrecognizedEntities: - return u'&%s;' % x + return u'&%s;' % x # 转义未识别的实体 - except ValueError: # e.g. ValueError: unichr() arg not in range(0x10000) + except ValueError: # 处理unichr()引发的值错误 pass - return u'&%s;' % x - - def __init__(self, parser, name, attrs=None, parent=None, - previous=None): - "Basic constructor." 
+ return u'&%s;' % x # 返回未识别的实体 - # We don't actually store the parser object: that lets extracted - # chunks be garbage-collected + def __init__(self, parser, name, attrs=None, parent=None, previous=None): + """基本构造函数。""" + # 我们实际上并不存储解析器对象:这使得提取的块可以被垃圾回收 self.parserClass = parser.__class__ - self.isSelfClosing = parser.isSelfClosingTag(name) - self.name = name + self.isSelfClosing = parser.isSelfClosingTag(name) # 判断标签是否为自闭合标签 + self.name = name # 标签名称 if attrs is None: - attrs = [] + attrs = [] # 如果没有提供属性,则初始化为空列表 elif isinstance(attrs, dict): - attrs = attrs.items() - self.attrs = attrs - self.contents = [] - self.setup(parent, previous) - self.hidden = False - self.containsSubstitutions = False - self.convertHTMLEntities = parser.convertHTMLEntities - self.convertXMLEntities = parser.convertXMLEntities - self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities - - # Convert any HTML, XML, or numeric entities in the attribute values. - # Reference: https://github.com/pkrumins/xgoogle/pull/16/commits/3dba1165c436b0d6e5bdbd09e53ca0dbf8a043f8 + attrs = attrs.items() # 如果提供的是字典,则转换为元组列表 + self.attrs = attrs # 标签属性 + self.contents = [] # 标签内容 + self.setup(parent, previous) # 设置父标签和前一个标签 + self.hidden = False # 标签是否隐藏 + self.containsSubstitutions = False # 是否包含替换 + self.convertHTMLEntities = parser.convertHTMLEntities # 是否转换HTML实体 + self.convertXMLEntities = parser.convertXMLEntities # 是否转换XML实体 + self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities # 是否转义未识别的实体 + + # 转换属性值中的HTML、XML或数字实体 convert = lambda k_val: (k_val[0], re.sub(r"&(#\d+|#x[0-9a-fA-F]+|\w+);", self._convertEntities, - k_val[1])) - self.attrs = map(convert, self.attrs) + k_val[1])) # 使用正则表达式替换实体 + self.attrs = map(convert, self.attrs) # 更新属性列表 def getString(self): - if (len(self.contents) == 1 - and isinstance(self.contents[0], NavigableString)): + """获取标签的字符串内容,如果内容只有一个NavigableString,则返回该内容。""" + if (len(self.contents) == 1 and isinstance(self.contents[0], 
NavigableString)): return self.contents[0] def setString(self, string): - """Replace the contents of the tag with a string""" - self.clear() - self.append(string) + """用字符串替换标签的内容。""" + self.clear() # 清空当前内容 + self.append(string) # 添加新的字符串内容 - string = property(getString, setString) + string = property(getString, setString) # 将getString和setString方法绑定到string属性 def getText(self, separator=u""): + """获取标签的文本内容,使用给定的分隔符连接多个文本。""" if not len(self.contents): - return u"" - stopNode = self._lastRecursiveChild().next - strings = [] - current = self.contents[0] + return u"" # 如果没有内容,返回空字符串 + stopNode = self._lastRecursiveChild().next # 获取最后一个子元素的下一个元素 + strings = [] # 存储文本内容的列表 + current = self.contents[0] # 从第一个内容开始 while current and current is not stopNode: if isinstance(current, NavigableString): - strings.append(current.strip()) - current = current.next - return separator.join(strings) + strings.append(current.strip()) # 去除文本两端的空白并添加到列表 + current = current.next # 移动到下一个内容 + return separator.join(strings) # 使用分隔符连接所有文本并返回 - text = property(getText) + text = property(getText) # 将getText方法绑定到text属性 def get(self, key, default=None): - """Returns the value of the 'key' attribute for the tag, or - the value given for 'default' if it doesn't have that - attribute.""" + """返回标签的指定属性的值,如果没有该属性,则返回默认值。""" return self._getAttrMap().get(key, default) def clear(self): - """Extract all children.""" + """提取所有子元素。""" for child in self.contents[:]: - child.extract() + child.extract() # 从树中移除每个子元素 def index(self, element): + """返回指定元素在当前标签内容中的索引,如果未找到则抛出异常。""" for i, child in enumerate(self.contents): if child is element: return i - raise ValueError("Tag.index: element not in tag") + raise ValueError("Tag.index: element not in tag") # 如果未找到,抛出值错误 def has_key(self, key): + """检查标签是否包含指定的属性。""" return self._getAttrMap().has_key(key) def __getitem__(self, key): - """tag[key] returns the value of the 'key' attribute for the tag, - and throws an exception if it's not there.""" + 
"""通过key访问标签的属性值,如果不存在则抛出异常。""" return self._getAttrMap()[key] def __iter__(self): - "Iterating over a tag iterates over its contents." + """迭代标签的内容。""" return iter(self.contents) def __len__(self): - "The length of a tag is the length of its list of contents." + """返回标签内容的长度。""" return len(self.contents) def __contains__(self, x): + """检查指定元素是否在标签内容中。""" return x in self.contents def __nonzero__(self): - "A tag is non-None even if it has no contents." + """标签即使没有内容也被视为非空。""" return True def __setitem__(self, key, value): - """Setting tag[key] sets the value of the 'key' attribute for the - tag.""" - self._getAttrMap() - self.attrMap[key] = value + """设置标签的属性值。""" + self._getAttrMap() # 初始化属性映射 + self.attrMap[key] = value # 更新属性映射 found = False for i in xrange(0, len(self.attrs)): if self.attrs[i][0] == key: - self.attrs[i] = (key, value) + self.attrs[i] = (key, value) # 更新现有属性 found = True if not found: - self.attrs.append((key, value)) - self._getAttrMap()[key] = value + self.attrs.append((key, value)) # 添加新属性 + self._getAttrMap()[key] = value # 更新属性映射 def __delitem__(self, key): - "Deleting tag[key] deletes all 'key' attributes for the tag." + """删除标签的指定属性。""" for item in self.attrs: if item[0] == key: - self.attrs.remove(item) - #We don't break because bad HTML can define the same - #attribute multiple times. - self._getAttrMap() - if self.attrMap.has_key(key): - del self.attrMap[key] + self.attrs.remove(item) # 移除属性 + # 不中断,因为坏HTML可能定义相同的属性多次 + self._getAttrMap() # 初始化属性映射 + if self.attrMap.has_key(key): + del self.attrMap[key] # 删除属性映射中的属性 def __call__(self, *args, **kwargs): - """Calling a tag like a function is the same as calling its - findAll() method. Eg. 
tag('a') returns a list of all the A tags - found within this tag.""" + """调用标签就像调用其findAll()方法一样。""" return self.findAll(*args, **kwargs) def __getattr__(self, tag): - #print "Getattr %s.%s" % (self.__class__, tag) + """根据标签名称获取标签内容。""" if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: - return self.find(tag[:-3]) + return self.find(tag[:-3]) # 如果标签名以'Tag'结尾,返回对应的标签 elif tag.find('__') != 0: - return self.find(tag) - raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag)) + return self.find(tag) # 否则,返回对应的标签 + raise AttributeError("'%s'对象没有属性'%s'" % (self.__class__, tag)) def __eq__(self, other): - """Returns true iff this tag has the same name, the same attributes, - and the same contents (recursively) as the given tag. - - NOTE: right now this will return false if two tags have the - same attributes in a different order. Should this be fixed?""" + """判断当前标签是否与另一个标签相等,比较名称、属性和内容。""" if other is self: return True - if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): + if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or \ + self.name != other.name or self.attrs != other.attrs or len(self) != len(other): return False for i in xrange(0, len(self.contents)): if self.contents[i] != other.contents[i]: @@ -719,378 +797,339 @@ class Tag(PageElement): return True def __ne__(self, other): - """Returns true iff this tag is not identical to the other tag, - as defined in __eq__.""" + """判断当前标签是否与另一个标签不相等。""" return not self == other def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): - """Renders this tag as a string.""" + """将标签渲染为字符串。""" return self.__str__(encoding) def __unicode__(self): + """返回标签的Unicode表示。""" return self.__str__(None) def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0): - """Returns a string or Unicode representation of 
this tag and - its contents. To get Unicode, pass None for encoding. - - NOTE: since Python's HTML parser consumes whitespace, this - method is not certain to reproduce the whitespace present in - the original string.""" - - encodedName = self.toEncoding(self.name, encoding) - - attrs = [] - if self.attrs: - for key, val in self.attrs: - fmt = '%s="%s"' - if isinstance(val, basestring): - if self.containsSubstitutions and '%SOUP-ENCODING%' in val: - val = self.substituteEncoding(val, encoding) - - # The attribute value either: - # - # * Contains no embedded double quotes or single quotes. - # No problem: we enclose it in double quotes. - # * Contains embedded single quotes. No problem: - # double quotes work here too. - # * Contains embedded double quotes. No problem: - # we enclose it in single quotes. - # * Embeds both single _and_ double quotes. This - # can't happen naturally, but it can happen if - # you modify an attribute value after parsing - # the document. Now we have a bit of a - # problem. We solve it by enclosing the - # attribute in single quotes, and escaping any - # embedded single quotes to XML entities. + """返回此标签及其内容的字符串或Unicode表示。 + 如果传递None作为encoding,将获得Unicode字符串。 + + 注意:由于Python的HTML解析器会消耗空白字符,此方法不能保证重现原始字符串中的空白字符。""" + + encodedName = self.toEncoding(self.name, encoding) # 将标签名编码为指定编码 + + attrs = [] # 初始化属性列表 + if self.attrs: # 如果有属性 + for key, val in self.attrs: # 遍历属性 + fmt = '%s="%s"' # 属性格式 + if isinstance(val, basestring): # 如果属性值是字符串 + if self.containsSubstitutions and '%SOUP-ENCODING%' in val: # 如果包含编码占位符 + val = self.substituteEncoding(val, encoding) # 替换编码 + + # 根据属性值中是否包含引号来决定使用单引号还是双引号 if '"' in val: fmt = "%s='%s'" - if "'" in val: - # TODO: replace with apos when - # appropriate. + if "'" in val: # 如果同时包含单双引号,则替换单引号为实体 val = val.replace("'", "&squot;") - # Now we're okay w/r/t quotes. But the attribute - # value might also contain angle brackets, or - # ampersands that aren't part of entities. 
We need - # to escape those to XML entities too. + # 转义属性值中的小于号、大于号和未包含在实体中的和号 val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) - attrs.append(fmt % (self.toEncoding(key, encoding), - self.toEncoding(val, encoding))) + attrs.append(fmt % (self.toEncoding(key, encoding), self.toEncoding(val, encoding))) # 添加编码后的属性 close = '' closeTag = '' - if self.isSelfClosing: + if self.isSelfClosing: # 如果是自闭合标签 close = ' /' else: - closeTag = '' % encodedName + closeTag = '' % encodedName # 标签结束符号 - indentTag, indentContents = 0, 0 - if prettyPrint: + indentTag, indentContents = 0, 0 # 初始化缩进级别 + if prettyPrint: # 如果需要美化输出 indentTag = indentLevel - space = (' ' * (indentTag-1)) + space = (' ' * (indentTag - 1)) indentContents = indentTag + 1 - contents = self.renderContents(encoding, prettyPrint, indentContents) - if self.hidden: + contents = self.renderContents(encoding, prettyPrint, indentContents) # 渲染标签内容 + if self.hidden: # 如果标签是隐藏的 s = contents else: - s = [] - attributeString = '' - if attrs: - attributeString = ' ' + ' '.join(attrs) - if prettyPrint: + s = [] # 初始化字符串列表 + attributeString = '' # 初始化属性字符串 + if attrs: # 如果有属性 + attributeString = ' ' + ' '.join(attrs) # 属性字符串 + if prettyPrint: # 如果需要美化输出 s.append(space) - s.append('<%s%s%s>' % (encodedName, attributeString, close)) - if prettyPrint: + s.append('<%s%s%s>' % (encodedName, attributeString, close)) # 开始标签 + if prettyPrint: # 如果需要美化输出 s.append("\n") - s.append(contents) - if prettyPrint and contents and contents[-1] != "\n": + s.append(contents) # 内容 + if prettyPrint and contents and contents[-1] != "\n": # 如果需要美化输出且内容不以换行符结尾 s.append("\n") - if prettyPrint and closeTag: + if prettyPrint and closeTag: # 如果需要美化输出且有结束标签 s.append(space) - s.append(closeTag) - if prettyPrint and closeTag and self.nextSibling: + s.append(closeTag) # 结束标签 + if prettyPrint and closeTag and self.nextSibling: # 如果需要美化输出且有下一个兄弟节点 s.append("\n") - s = ''.join(s) - return s + s = ''.join(s) # 合并字符串 + return s # 返回标签字符串 - def 
decompose(self): - """Recursively destroys the contents of this tree.""" - self.extract() - if len(self.contents) == 0: + def decompose(self): # 递归销毁树的内容 + self.extract() # 提取自身 + if len(self.contents) == 0: # 如果没有内容 return - current = self.contents[0] - while current is not None: - next = current.next - if isinstance(current, Tag): - del current.contents[:] - current.parent = None - current.previous = None - current.previousSibling = None - current.next = None - current.nextSibling = None - current = next - - def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): + current = self.contents[0] # 获取第一个内容 + while current is not None: # 遍历内容 + next = current.next # 下一个内容 + if isinstance(current, Tag): # 如果是标签 + del current.contents[:] # 删除内容 + current.parent = None # 清除父节点 + current.previous = None # 清除前一个节点 + current.previousSibling = None # 清除前一个兄弟节点 + current.next = None # 清除下一个节点 + current.nextSibling = None # 清除下一个兄弟节点 + current = next # 移动到下一个内容 + + def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): # 美化输出 return self.__str__(encoding, True) def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, - prettyPrint=False, indentLevel=0): - """Renders the contents of this tag as a string in the given - encoding. 
If encoding is None, returns a Unicode string..""" - s=[] - for c in self: - text = None - if isinstance(c, NavigableString): - text = c.__str__(encoding) - elif isinstance(c, Tag): - s.append(c.__str__(encoding, prettyPrint, indentLevel)) - if text and prettyPrint: - text = text.strip() - if text: - if prettyPrint: - s.append(" " * (indentLevel-1)) - s.append(text) - if prettyPrint: + prettyPrint=False, indentLevel=0): # 渲染标签内容 + """以给定编码渲染此标签的内容。如果编码为None,返回Unicode字符串。""" + s = [] # 初始化字符串列表 + for c in self: # 遍历内容 + text = None # 初始化文本 + if isinstance(c, NavigableString): # 如果是可导航字符串 + text = c.__str__(encoding) # 转换为字符串 + elif isinstance(c, Tag): # 如果是标签 + s.append(c.__str__(encoding, prettyPrint, indentLevel)) # 添加标签字符串 + if text and prettyPrint: # 如果是文本且需要美化输出 + text = text.strip() # 去除空白 + if text: # 如果有文本 + if prettyPrint: # 如果需要美化输出 + s.append(" " * (indentLevel - 1)) + s.append(text) # 添加文本 + if prettyPrint: # 如果需要美化输出 s.append("\n") + return ''.join(s) # 返回内容字符串 - return ''.join(s) - - #Soup methods + # Soup方法 def find(self, name=None, attrs={}, recursive=True, text=None, - **kwargs): - """Return only the first child of this Tag matching the given - criteria.""" - r = None - l = self.findAll(name, attrs, recursive, text, 1, **kwargs) - if l: - r = l[0] - return r + **kwargs): # 查找第一个匹配的子标签 + """返回此标签的第一个匹配给定条件的子标签。""" + r = None # 初始化结果 + l = self.findAll(name, attrs, recursive, text, 1, **kwargs) # 查找所有匹配的子标签 + if l: # 如果有结果 + r = l[0] # 第一个结果 + return r # 返回结果 + findChild = find def findAll(self, name=None, attrs={}, recursive=True, text=None, - limit=None, **kwargs): - """Extracts a list of Tag objects that match the given - criteria. You can specify the name of the Tag and any - attributes you want the Tag to have. 
- - The value of a key-value pair in the 'attrs' map can be a - string, a list of strings, a regular expression object, or a - callable that takes a string and returns whether or not the - string matches for some custom definition of 'matches'. The - same is true of the tag name.""" - generator = self.recursiveChildGenerator - if not recursive: - generator = self.childGenerator - return self._findAll(name, attrs, text, limit, generator, **kwargs) + limit=None, **kwargs): # 查找所有匹配的子标签 + """提取匹配给定条件的标签列表。你可以指定标签的名称和任何你希望标签拥有的属性。""" + generator = self.recursiveChildGenerator # 递归子生成器 + if not recursive: # 如果不需要递归 + generator = self.childGenerator # 子生成器 + return self._findAll(name, attrs, text, limit, generator, **kwargs) # 查找所有匹配的标签 + findChildren = findAll - # Pre-3.x compatibility methods - first = find - fetch = findAll + # Pre-3.x兼容性方法 + first = find # 第一个匹配的子标签 + fetch = findAll # 查找所有匹配的子标签 - def fetchText(self, text=None, recursive=True, limit=None): + def fetchText(self, text=None, recursive=True, limit=None): # 查找所有匹配的文本 return self.findAll(text=text, recursive=recursive, limit=limit) - def firstText(self, text=None, recursive=True): + def firstText(self, text=None, recursive=True): # 查找第一个匹配的文本 return self.find(text=text, recursive=recursive) - #Private methods + # 私有方法 + + def _getAttrMap(self): # 获取属性映射 + """如果尚未初始化,则初始化此标签属性的映射表示。""" + if not getattr(self, 'attrMap'): # 如果没有属性映射 + self.attrMap = {} # 初始化属性映射 + for (key, value) in self.attrs: # 遍历属性 + self.attrMap[key] = value # 添加属性 + return self.attrMap # 返回属性映射 - def _getAttrMap(self): - """Initializes a map representation of this tag's attributes, - if not already initialized.""" - if not getattr(self, 'attrMap'): - self.attrMap = {} - for (key, value) in self.attrs: - self.attrMap[key] = value - return self.attrMap + # 生成器方法 - #Generator methods - def childGenerator(self): - # Just use the iterator from the contents + def childGenerator(self): # 子生成器 + # 直接使用内容的迭代器 return iter(self.contents) - def 
recursiveChildGenerator(self): - if not len(self.contents): - return # Note: https://stackoverflow.com/a/30217723 (PEP 479) - stopNode = self._lastRecursiveChild().next - current = self.contents[0] - while current and current is not stopNode: - yield current - current = current.next + def recursiveChildGenerator(self): # 递归子生成器 + if not len(self.contents): # 如果没有内容 + return # 返回 + stopNode = self._lastRecursiveChild().next # 停止节点 + current = self.contents[0] # 当前节点 + while current and current is not stopNode: # 遍历节点 + yield current # 产生当前节点 + current = current.next # 移动到下一个节点 # Next, a couple classes to represent queries and their results. class SoupStrainer: - """Encapsulates a number of ways of matching a markup element (tag or - text).""" + """封装了多种匹配标记元素(标签或文本)的方法。""" def __init__(self, name=None, attrs={}, text=None, **kwargs): - self.name = name - if isinstance(attrs, basestring): - kwargs['class'] = _match_css_class(attrs) - attrs = None - if kwargs: - if attrs: - attrs = attrs.copy() - attrs.update(kwargs) + self.name = name # 标签名 + if isinstance(attrs, basestring): # 如果attrs是字符串 + kwargs['class'] = _match_css_class(attrs) # 将CSS类名转换为可匹配的形式 + attrs = None # 重置attrs + if kwargs: # 如果有额外的关键字参数 + if attrs: # 如果已有attrs + attrs = attrs.copy() # 复制attrs + attrs.update(kwargs) # 更新attrs else: - attrs = kwargs - self.attrs = attrs - self.text = text + attrs = kwargs # 否则直接设置attrs + self.attrs = attrs # 属性字典 + self.text = text # 文本内容 def __str__(self): - if self.text: - return self.text + if self.text: # 如果有文本内容 + return self.text # 返回文本内容 else: - return "%s|%s" % (self.name, self.attrs) + return "%s|%s" % (self.name, self.attrs) # 返回标签名和属性 def searchTag(self, markupName=None, markupAttrs={}): - found = None - markup = None - if isinstance(markupName, Tag): - markup = markupName - markupAttrs = markup + found = None # 初始化找到的元素 + markup = None # 初始化标记 + if isinstance(markupName, Tag): # 如果传入的是Tag对象 + markup = markupName # 设置标记 + markupAttrs = markup # 设置标记属性 
callFunctionWithTagData = callable(self.name) \ - and not isinstance(markupName, Tag) + and not isinstance(markupName, Tag) # 判断是否是可调用的函数 if (not self.name) \ or callFunctionWithTagData \ or (markup and self._matches(markup, self.name)) \ or (not markup and self._matches(markupName, self.name)): - if callFunctionWithTagData: - match = self.name(markupName, markupAttrs) + # 如果没有指定标签名或函数调用匹配成功 + if callFunctionWithTagData: # 如果是函数调用 + match = self.name(markupName, markupAttrs) # 调用函数 else: - match = True - markupAttrMap = None - for attr, matchAgainst in self.attrs.items(): - if not markupAttrMap: - if hasattr(markupAttrs, 'get'): - markupAttrMap = markupAttrs + match = True # 默认匹配成功 + markupAttrMap = None # 初始化属性映射 + for attr, matchAgainst in self.attrs.items(): # 遍历属性 + if not markupAttrMap: # 如果没有属性映射 + if hasattr(markupAttrs, 'get'): # 如果有get方法 + markupAttrMap = markupAttrs # 设置属性映射 else: - markupAttrMap = {} - for k,v in markupAttrs: + markupAttrMap = {} # 初始化属性映射 + for k,v in markupAttrs: # 复制属性 markupAttrMap[k] = v - attrValue = markupAttrMap.get(attr) - if not self._matches(attrValue, matchAgainst): - match = False + attrValue = markupAttrMap.get(attr) # 获取属性值 + if not self._matches(attrValue, matchAgainst): # 如果属性不匹配 + match = False # 设置不匹配 break - if match: - if markup: - found = markup + if match: # 如果匹配成功 + if markup: # 如果是Tag对象 + found = markup # 设置找到的元素 else: - found = markupName - return found + found = markupName # 设置找到的元素 + return found # 返回找到的元素 def search(self, markup): + # 打印寻找信息 #print 'looking for %s in %s' % (self, markup) - found = None - # If given a list of items, scan it for a text element that - # matches. 
+ found = None # 初始化找到的元素 + # 如果给定的是一个元素列表,扫描它以找到匹配的文本元素 if hasattr(markup, "__iter__") \ and not isinstance(markup, Tag): - for element in markup: + for element in markup: # 遍历元素 if isinstance(element, NavigableString) \ - and self.search(element): - found = element + and self.search(element): # 如果是可导航字符串并且匹配 + found = element # 设置找到的元素 break - # If it's a Tag, make sure its name or attributes match. - # Don't bother with Tags if we're searching for text. + # 如果它是一个Tag,确保它的名称或属性匹配 + # 如果我们正在寻找文本,就不要麻烦处理Tags elif isinstance(markup, Tag): - if not self.text: - found = self.searchTag(markup) - # If it's text, make sure the text matches. + if not self.text: # 如果不是寻找文本 + found = self.searchTag(markup) # 搜索标签 + # 如果它是文本,确保文本匹配 elif isinstance(markup, NavigableString) or \ isinstance(markup, basestring): - if self._matches(markup, self.text): - found = markup + if self._matches(markup, self.text): # 如果文本匹配 + found = markup # 设置找到的元素 else: raise Exception("I don't know how to match against a %s" \ - % markup.__class__) - return found + % markup.__class__) # 抛出异常 + return found # 返回找到的元素 def _matches(self, markup, matchAgainst): + # 打印匹配信息 #print "Matching %s against %s" % (markup, matchAgainst) - result = False - if matchAgainst is True: - result = markup is not None - elif callable(matchAgainst): - result = matchAgainst(markup) + result = False # 初始化匹配结果 + if matchAgainst is True: # 如果匹配条件是True + result = markup is not None # 只要markup不是None就匹配 + elif callable(matchAgainst): # 如果匹配条件是可调用的 + result = matchAgainst(markup) # 调用函数 else: - #Custom match methods take the tag as an argument, but all - #other ways of matching match the tag name as a string. - if isinstance(markup, Tag): - markup = markup.name - if markup and not isinstance(markup, basestring): - markup = text_type(markup) - #Now we know that chunk is either a string, or None. - if hasattr(matchAgainst, 'match'): - # It's a regexp object. 
- result = markup and matchAgainst.search(markup) - elif hasattr(matchAgainst, '__iter__'): # list-like - result = markup in matchAgainst - elif hasattr(matchAgainst, 'items'): - result = markup.has_key(matchAgainst) - elif matchAgainst and isinstance(markup, basestring): - if isinstance(markup, text_type): - matchAgainst = text_type(matchAgainst) + # 自定义匹配方法接受标签作为参数,但所有其他匹配方式都匹配标签名称作为字符串 + if isinstance(markup, Tag): # 如果是Tag对象 + markup = markup.name # 获取标签名 + if markup and not isinstance(markup, basestring): # 如果markup不是字符串 + markup = text_type(markup) # 转换为字符串 + # 现在我们知道chunk要么是字符串,要么None + if hasattr(matchAgainst, 'match'): # 如果是正则表达式对象 + result = markup and matchAgainst.search(markup) # 搜索匹配 + elif hasattr(matchAgainst, '__iter__'): # 如果是可迭代的 + result = markup in matchAgainst # 是否在其中 + elif hasattr(matchAgainst, 'items'): # 如果有items方法 + result = markup.has_key(matchAgainst) # 是否有键 + elif matchAgainst and isinstance(markup, basestring): # 如果都是字符串 + if isinstance(markup, text_type): # 如果是unicode + matchAgainst = text_type(matchAgainst) # 转换为unicode else: - matchAgainst = str(matchAgainst) + matchAgainst = str(matchAgainst) # 转换为字符串 + + if not result: # 如果不匹配 + result = matchAgainst == markup # 比较是否相等 + return result # 返回匹配结果 - if not result: - result = matchAgainst == markup - return result class ResultSet(list): - """A ResultSet is just a list that keeps track of the SoupStrainer - that created it.""" + """ResultSet是一个特殊的列表,它记录了创建它的SoupStrainer。""" + def __init__(self, source): - list.__init__([]) - self.source = source + list.__init__([]) # 初始化列表 + self.source = source # 记录创建ResultSet的SoupStrainer + -# Now, some helper functions. +# 以下是一些辅助函数。 def buildTagMap(default, *args): - """Turns a list of maps, lists, or scalars into a single map. - Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and - NESTING_RESET_TAGS maps out of lists and partial maps.""" - built = {} - for portion in args: - if hasattr(portion, 'items'): - #It's a map. Merge it. 
- for k,v in portion.items(): + """将多个映射、列表或标量转换为一个单一的映射。 + 用于构建SELF_CLOSING_TAGS、NESTABLE_TAGS和NESTING_RESET_TAGS映射。""" + + built = {} # 初始化空字典 + for portion in args: # 遍历传入的参数 + if hasattr(portion, 'items'): # 如果参数是映射 + # 合并映射 + for k, v in portion.items(): built[k] = v - elif hasattr(portion, '__iter__'): # is a list - #It's a list. Map each item to the default. + elif hasattr(portion, '__iter__'): # 如果参数是列表 + # 将列表中的每个项映射到默认值 for k in portion: built[k] = default else: - #It's a scalar. Map it to the default. + # 如果参数是标量,将其映射到默认值 built[portion] = default - return built + return built # 返回构建的映射 # Now, the parser classes. class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): + """这个类包含了基本的解析器和搜索代码。它定义了一个解析器,除了以下规则外,对标签行为一无所知: - """This class contains the basic parser and search code. It defines - a parser that knows nothing about tag behavior except for the - following: + 你不能在不关闭它所包含的所有标签的情况下关闭一个标签。 + 也就是说,""实际上意味着""。 - You can't close a tag without closing all the tags it encloses. - That is, "" actually means - "". + [另一种可能的解释是"",但由于这个类没有定义SELF_CLOSING_TAGS,它永远不会使用这种解释。] - [Another possible explanation is "", but since - this class defines no SELF_CLOSING_TAGS, it will never use that - explanation.] - - This class is useful for parsing XML or made-up markup languages, - or when BeautifulSoup makes an assumption counter to what you were - expecting.""" + 这个类对于解析XML或自创的标记语言,或者当BeautifulSoup做出了与您预期相反的假设时非常有用。""" SELF_CLOSING_TAGS = {} NESTABLE_TAGS = {} @@ -1105,57 +1144,45 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): ] ROOT_TAG_NAME = u'[document]' + """根标签的名称。""" HTML_ENTITIES = "html" XML_ENTITIES = "xml" XHTML_ENTITIES = "xhtml" - # TODO: This only exists for backwards-compatibility + # TODO: 这只用于向后兼容 ALL_ENTITIES = XHTML_ENTITIES - # Used when determining whether a text node is all whitespace and - # can be replaced with a single space. A text node that contains - # fancy Unicode spaces (usually non-breaking) should be left - # alone. 
+ # 用于确定一个文本节点是否全部是空白字符 + # 并且可以被替换为一个空格。包含花式Unicode空格(通常是不间断的)的文本节点应该保持不变。 STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, markupMassage=True, smartQuotesTo=XML_ENTITIES, convertEntities=None, selfClosingTags=None, isHTML=False): - """The Soup object is initialized as the 'root tag', and the - provided markup (which can be a string or a file-like object) - is fed into the underlying parser. + """Soup对象被初始化为'根标签',提供的标记(可以是字符串或文件类对象) + 被送入底层解析器。 - sgmllib will process most bad HTML, and the BeautifulSoup - class has some tricks for dealing with some HTML that kills - sgmllib, but Beautiful Soup can nonetheless choke or lose data - if your data uses self-closing tags or declarations - incorrectly. + sgmllib会处理大多数错误的HTML,BeautifulSoup类有一些技巧来处理一些使sgmllib死亡的HTML, + 但是BeautifulSoup仍然可能因为数据使用自闭合标签或声明不正确而窒息或丢失数据。 - By default, Beautiful Soup uses regexes to sanitize input, - avoiding the vast majority of these problems. If the problems - don't apply to you, pass in False for markupMassage, and - you'll get better performance. + 默认情况下,BeautifulSoup使用正则表达式来清理输入,避免了绝大多数这些问题。如果这些问题不适用于您, + 传递False给markupMassage,您将获得更好的性能。 - The default parser massage techniques fix the two most common - instances of invalid HTML that choke sgmllib: + 默认的解析器按摩技术修复了使sgmllib窒息的两种最常见的无效HTML实例: -
(No space between name of closing tag and tag close) - (Extraneous whitespace in declaration) +
(闭合标签名称和标签关闭之间没有空格) + (声明中的多余空白) - You can pass in a custom list of (RE object, replace method) - tuples to get Beautiful Soup to scrub your input the way you - want.""" + 您可以传递自定义的(RE对象,替换方法)元组列表,让BeautifulSoup按照您想要的方式清理您的输入。""" self.parseOnlyThese = parseOnlyThese self.fromEncoding = fromEncoding self.smartQuotesTo = smartQuotesTo self.convertEntities = convertEntities - # Set the rules for how we'll deal with the entities we - # encounter + # 设置我们将如何处理我们遇到的实体的规则 if self.convertEntities: - # It doesn't make sense to convert encoded characters to - # entities even while you're converting entities to Unicode. - # Just convert it all to Unicode. + # 将编码字符转换为实体是没有意义的,即使在您正在将实体转换为Unicode时也是如此。 + # 将所有内容都转换为Unicode。 self.smartQuotesTo = None if convertEntities == self.HTML_ENTITIES: self.convertXMLEntities = False @@ -1177,7 +1204,7 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) sgmllib.SGMLParser.__init__(self) - if hasattr(markup, 'read'): # It's a file-type object. + if hasattr(markup, 'read'): # 它是一个文件类型对象。 markup = markup.read() self.markup = markup self.markupMassage = markupMassage @@ -1185,20 +1212,20 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): self._feed(isHTML=isHTML) except StopParsing: pass - self.markup = None # The markup can now be GCed + self.markup = None # 标记现在可以被GCed了 def convert_charref(self, name): - """This method fixes a bug in Python's SGMLParser.""" + """这个方法修复了Python的SGMLParser中的一个bug。""" try: n = int(name) except ValueError: return - if not 0 <= n <= 127 : # ASCII ends at 127, not 255 + if not 0 <= n <= 127 : # ASCII在127结束,不是255 return return self.convert_codepoint(n) def _feed(self, inDocumentEncoding=None, isHTML=False): - # Convert the document to Unicode. 
+ # 将文档转换为Unicode。 markup = self.markup if isinstance(markup, text_type): if not hasattr(self, 'originalEncoding'): @@ -1216,148 +1243,163 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) - # TODO: We get rid of markupMassage so that the - # soup object can be deepcopied later on. Some - # Python installations can't copy regexes. If anyone - # was relying on the existence of markupMassage, this - # might cause problems. + # TODO: 我们摆脱markupMassage,以便soup对象可以稍后被深度复制。 + # 一些Python安装无法复制正则表达式。如果有人依赖markupMassage的存在,这可能会导致问题。 del(self.markupMassage) self.reset() sgmllib.SGMLParser.feed(self, markup) - # Close out any unfinished strings and close all the open tags. + # 关闭任何未完成的字符串并关闭所有打开的标签。 self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag() def __getattr__(self, methodName): - """This method routes method call requests to either the SGMLParser - superclass or the Tag superclass, depending on the method name.""" - #print "__getattr__ called on %s.%s" % (self.__class__, methodName) + """这个方法将方法调用请求路由到SGMLParser超类或Tag超类,具体取决于方法名。""" + # 打印出被调用的方法名 + # print "__getattr__ called on %s.%s" % (self.__class__, methodName) if methodName.startswith('start_') or methodName.startswith('end_') \ - or methodName.startswith('do_'): + or methodName.startswith('do_'): + # 如果方法是SGMLParser的方法,则从SGMLParser中获取 return sgmllib.SGMLParser.__getattr__(self, methodName) elif not methodName.startswith('__'): + # 否则,如果方法不是特殊方法,则从Tag中获取 return Tag.__getattr__(self, methodName) else: + # 如果方法是特殊方法,则抛出属性错误 raise AttributeError def isSelfClosingTag(self, name): - """Returns true iff the given string is the name of a - self-closing tag according to this parser.""" + """返回true,当且仅当给定的字符串是此解析器中自闭合标签的名称。""" + # 检查标签是否是自闭合标签 return name in self.SELF_CLOSING_TAGS \ - or name in self.instanceSelfClosingTags + or name in self.instanceSelfClosingTags def reset(self): + # 
重置Tag对象,并初始化ROOT_TAG_NAME Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 + # 重置SGMLParser对象 sgmllib.SGMLParser.reset(self) - self.currentData = [] - self.currentTag = None - self.tagStack = [] - self.quoteStack = [] + self.currentData = [] # 存储当前数据 + self.currentTag = None # 当前标签 + self.tagStack = [] # 标签堆栈 + self.quoteStack = [] # 引号堆栈 + # 将ROOT_TAG_NAME推入标签堆栈 self.pushTag(self) def popTag(self): + # 从标签堆栈中弹出一个标签 tag = self.tagStack.pop() - - #print "Pop", tag.name if self.tagStack: - self.currentTag = self.tagStack[-1] + self.currentTag = self.tagStack[-1] # 更新当前标签 return self.currentTag def pushTag(self, tag): - #print "Push", tag.name + # 将一个标签推入标签堆栈 if self.currentTag: - self.currentTag.contents.append(tag) - self.tagStack.append(tag) - self.currentTag = self.tagStack[-1] + self.currentTag.contents.append(tag) # 将标签添加到当前标签的内容中 + self.tagStack.append(tag) # 推入堆栈 + self.currentTag = self.tagStack[-1] # 更新当前标签 def endData(self, containerClass=NavigableString): + # 结束当前数据的处理 if self.currentData: - currentData = u''.join(self.currentData) + currentData = u''.join(self.currentData) # 合并当前数据 + # 如果数据只包含ASCII空白字符,并且不在PRESERVE_WHITESPACE_TAGS中,则替换为单个空格 if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and - not set([tag.name for tag in self.tagStack]).intersection( - self.PRESERVE_WHITESPACE_TAGS)): + not set([tag.name for tag in self.tagStack]).intersection( + self.PRESERVE_WHITESPACE_TAGS)): if '\n' in currentData: currentData = '\n' else: currentData = ' ' self.currentData = [] + # 如果设置了parseOnlyThese,并且不在顶层标签,并且当前数据不匹配,则不处理 if self.parseOnlyThese and len(self.tagStack) <= 1 and \ - (not self.parseOnlyThese.text or \ - not self.parseOnlyThese.search(currentData)): + (not self.parseOnlyThese.text or \ + not self.parseOnlyThese.search(currentData)): return - o = containerClass(currentData) - o.setup(self.currentTag, self.previous) + o = containerClass(currentData) # 创建一个新的NavigableString对象 + o.setup(self.currentTag, self.previous) # 设置对象 if 
self.previous: - self.previous.next = o - self.previous = o - self.currentTag.contents.append(o) - + self.previous.next = o # 设置前一个对象的下一个对象 + self.previous = o # 更新前一个对象 + self.currentTag.contents.append(o) # 将对象添加到当前标签的内容中 def _popToTag(self, name, inclusivePop=True): - """Pops the tag stack up to and including the most recent - instance of the given tag. If inclusivePop is false, pops the tag - stack up to but *not* including the most recent instqance of - the given tag.""" - #print "Popping to %s" % name + """弹出标签堆栈直到并包括最近的给定标签。如果inclusivePop为false,则弹出标签堆栈直到但不包括最近的给定标签。""" + # 打印出正在弹出到的标签名 + # print "Popping to %s" % name if name == self.ROOT_TAG_NAME: return numPops = 0 mostRecentTag = None - for i in xrange(len(self.tagStack)-1, 0, -1): + # 从后向前查找给定标签的位置 + for i in xrange(len(self.tagStack) - 1, 0, -1): if name == self.tagStack[i].name: - numPops = len(self.tagStack)-i + numPops = len(self.tagStack) - i break if not inclusivePop: numPops = numPops - 1 + # 弹出标签 for i in xrange(0, numPops): mostRecentTag = self.popTag() return mostRecentTag def _smartPop(self, name): - - """We need to pop up to the previous tag of this type, unless - one of this tag's nesting reset triggers comes between this - tag and the previous tag of this type, OR unless this tag is a - generic nesting trigger and another generic nesting trigger - comes between this tag and the previous tag of this type. - - Examples: -

FooBar *

* should pop to 'p', not 'b'. -

FooBar *

* should pop to 'table', not 'p'. -

Foo

Bar *

* should pop to 'tr', not 'p'. - -

    • *
    • * should pop to 'ul', not the first 'li'. -
  • ** should pop to 'table', not the first 'tr' - 标签应该隐式地关闭同一
    ** should pop to 'tr', not the first 'td' - """ + """我们需要弹出到这种类型的前一个标签,除非在当前标签和这种类型的前一个标签之间出现了这种标签的嵌套重置触发器, + 或者除非这个标签是一个通用嵌套触发器,并且在这个标签和这种类型的前一个标签之间出现了另一个通用嵌套触发器。 + + 例子: + < p > Foo < b > Bar * < p > + *应该弹出到 + 'p',而不是 + 'b'。 + < p > Foo < table > Bar * < p > + *应该弹出到 + 'table',而不是 + 'p'。 + < p > Foo < table > < tr > Bar * < p > + *应该弹出到 + 'tr',而不是 + 'p'。 + + < li > < ul > < li > * < li > + *应该弹出到 + 'ul',而不是第一个 + 'li'。 + < tr > < table > < tr > * < tr > + *应该弹出到 + 'table',而不是第一个 + 'tr' + < td > < tr > < td > * < td > + *应该弹出到 + 'tr',而不是第一个 + 'td' + + """ nestingResetTriggers = self.NESTABLE_TAGS.get(name) isNestable = nestingResetTriggers != None isResetNesting = name in self.RESET_NESTING_TAGS popTo = None inclusive = True - for i in xrange(len(self.tagStack)-1, 0, -1): + for i in xrange(len(self.tagStack) - 1, 0, -1): p = self.tagStack[i] if (not p or p.name == name) and not isNestable: - #Non-nestable tags get popped to the top or to their - #last occurance. + # 非嵌套标签被弹出到顶部或它们的最后一次出现。 popTo = name break if (nestingResetTriggers is not None and p.name in nestingResetTriggers) \ - or (nestingResetTriggers is None and isResetNesting - and p.name in self.RESET_NESTING_TAGS): - - #If we encounter one of the nesting reset triggers - #peculiar to this tag, or we encounter another tag - #that causes nesting to reset, pop up to but not - #including that tag. + or (nestingResetTriggers is None and isResetNesting + and p.name in self.RESET_NESTING_TAGS): + # 如果我们遇到了这个标签特有的一个嵌套重置触发器,或者我们遇到了另一个导致嵌套重置的标签, + # 弹出到但不包括那个标签。 popTo = p.name inclusive = False break @@ -1365,11 +1407,14 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): if popTo: self._popToTag(popTo, inclusive) + def unknown_starttag(self, name, attrs, selfClosing=0): - #print "Start tag %s: %s" % (name, attrs) + # 打印开始标签信息 + # print "Start tag %s: %s" % (name, attrs) if self.quoteStack: - #This is not a real tag. - #print "<%s> is not real!" % name + # 这不是一个真正的标签。 + # 打印信息 + # print "<%s> is not real!" 
% name attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs]) self.handle_data('<%s%s>' % (name, attrs)) return @@ -1379,7 +1424,7 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): self._smartPop(name) if self.parseOnlyThese and len(self.tagStack) <= 1 \ - and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): + and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): return tag = Tag(self, name, attrs, self.currentTag, self.previous) @@ -1390,16 +1435,20 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): if selfClosing or self.isSelfClosingTag(name): self.popTag() if name in self.QUOTE_TAGS: - #print "Beginning quote (%s)" % name + # 打印开始引用信息 + # print "Beginning quote (%s)" % name self.quoteStack.append(name) self.literal = 1 return tag + def unknown_endtag(self, name): - #print "End tag %s" % name + # 打印结束标签信息 + # print "End tag %s" % name if self.quoteStack and self.quoteStack[-1] != name: - #This is not a real end tag. - #print " is not real!" % name + # 这不是一个真正的结束标签。 + # 打印信息 + # print " is not real!" 
% name self.handle_data('' % name) return self.endData() @@ -1409,149 +1458,104 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): self.literal = (len(self.quoteStack) > 0) def handle_data(self, data): + """将数据添加到当前数据列表中。""" self.currentData.append(data) def _toStringSubclass(self, text, subclass): - """Adds a certain piece of text to the tree as a NavigableString - subclass.""" - self.endData() - self.handle_data(text) - self.endData(subclass) + """将特定文本作为NavigableString子类添加到树中。""" + self.endData() # 结束当前数据的处理 + self.handle_data(text) # 处理文本 + self.endData(subclass) # 结束处理并指定子类 def handle_pi(self, text): - """Handle a processing instruction as a ProcessingInstruction - object, possibly one with a %SOUP-ENCODING% slot into which an - encoding will be plugged later.""" + """将处理指令作为ProcessingInstruction对象处理,可能有一个 % SOUP - ENCODING % 插槽,稍后将插入编码。""" if text[:3] == "xml": text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" self._toStringSubclass(text, ProcessingInstruction) def handle_comment(self, text): - "Handle comments as Comment objects." + """将注释作为Comment对象处理。""" self._toStringSubclass(text, Comment) def handle_charref(self, ref): - "Handle character references as data." 
+ """将字符引用作为数据处理。""" if self.convertEntities: - data = unichr(int(ref)) + data = unichr(int(ref)) # 转换为Unicode字符 else: - data = '&#%s;' % ref + data = '&#%s;' % ref # 保持为实体引用 self.handle_data(data) def handle_entityref(self, ref): - """Handle entity references as data, possibly converting known - HTML and/or XML entity references to the corresponding Unicode - characters.""" + """将实体引用作为数据处理,可能将已知的HTML和 / 或XML实体引用转换为相应的Unicode字符。""" data = None if self.convertHTMLEntities: try: - data = unichr(name2codepoint[ref]) + data = unichr(name2codepoint[ref]) # 尝试转换为Unicode字符 except KeyError: pass if not data and self.convertXMLEntities: - data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) + data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) # 获取特殊字符 if not data and self.convertHTMLEntities and \ - not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): - # TODO: We've got a problem here. We're told this is - # an entity reference, but it's not an XML entity - # reference or an HTML entity reference. Nonetheless, - # the logical thing to do is to pass it through as an - # unrecognized entity reference. - # - # Except: when the input is "&carol;" this function - # will be called with input "carol". When the input is - # "AT&T", this function will be called with input - # "T". We have no way of knowing whether a semicolon - # was present originally, so we don't know whether - # this is an unknown entity or just a misplaced - # ampersand. - # - # The more common case is a misplaced ampersand, so I - # escape the ampersand and omit the trailing semicolon. - data = "&%s" % ref + not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): + # 处理未知实体引用 + data = "&%s" % ref if not data: - # This case is different from the one above, because we - # haven't already gone through a supposedly comprehensive - # mapping of entities to Unicode characters. We might not - # have gone through any mapping at all. So the chances are - # very high that this is a real entity, and not a - # misplaced ampersand. 
+ # 处理真实实体 data = "&%s;" % ref self.handle_data(data) def handle_decl(self, data): - "Handle DOCTYPEs and the like as Declaration objects." + """将DOCTYPE等声明作为Declaration对象处理。""" self._toStringSubclass(data, Declaration) def parse_declaration(self, i): - """Treat a bogus SGML declaration as raw data. Treat a CDATA - declaration as a CData object.""" + """将无效的SGML声明视为原始数据。将CDATA声明视为CData对象。""" j = None - if self.rawdata[i:i+9] == '', i) - if k == -1: - k = len(self.rawdata) - data = self.rawdata[i+9:k] - j = k+3 - self._toStringSubclass(data, CData) + if self.rawdata[i:i + 9] == '', i) # 寻找CDATA结束标志 + if k == -1: + k = len(self.rawdata) + data = self.rawdata[i + 9:k] # 获取CDATA中的数据 + j = k + 3 + self._toStringSubclass(data, CData) # 处理CDATA数据 else: try: - j = sgmllib.SGMLParser.parse_declaration(self, i) + j = sgmllib.SGMLParser.parse_declaration(self, i) # 处理SGML声明 except sgmllib.SGMLParseError: - toHandle = self.rawdata[i:] - self.handle_data(toHandle) + toHandle = self.rawdata[i:] # 获取错误后的数据 + self.handle_data(toHandle) # 处理数据 j = i + len(toHandle) return j class BeautifulSoup(BeautifulStoneSoup): + """这个解析器了解HTML的一些事实: - """This parser knows the following facts about HTML: + * 有些标签没有闭合标签,并且应该被解释为一旦遇到就立即关闭。 - * Some tags have no closing tag and should be interpreted as being - closed as soon as they are encountered. + * 某些标签内的文本(例如'script')可能包含标签,这些标签实际上不是文档的一部分,应该被解析为文本,而不是标签。如果你想将文本作为标签解析,你可以随时获取它并显式地解析。 - * The text inside some tags (ie. 'script') may contain tags which - are not really part of the document and which should be parsed - as text, not tags. If you want to parse the text as tags, you can - always fetch it and parse it explicitly. + * 标签嵌套规则: - * Tag nesting rules: + 大多数标签根本无法嵌套。例如,

    标签的出现应该隐式地关闭前一个

    标签。 - Most tags can't be nested at all. For instance, the occurance of - a

    tag should implicitly close the previous

    tag. +

    Para1

    Para2应该被转换为:

    Para1

    Para2 -

    Para1

    Para2 - should be transformed into: -

    Para1

    Para2 + 有些标签可以任意嵌套。例如,

    标签的出现不应该隐式地关闭前一个
    标签。 - Some tags can be nested arbitrarily. For instance, the occurance - of a
    tag should _not_ implicitly close the previous -
    tag. + Alice said:
    Bob said:
    Blah不应该被转换为: + Alice said:
    Bob said:
    Blah - Alice said:
    Bob said:
    Blah - should NOT be transformed into: - Alice said:
    Bob said:
    Blah + 有些标签可以嵌套,但是嵌套被其他标签的介入重置。例如,
    中的前一个标签,但不应该关闭另一个
    中的标签。 - Some tags can be nested, but the nesting is reset by the - interposition of other tags. For instance, a tag should - implicitly close the previous tag within the same
    , - but not close a tag in another table. +
    BlahBlah应该被转换为: +
    BlahBlah + 但是, + Blah
    Blah不应该被转换为: + Blah
    Blah -
    BlahBlah - should be transformed into: -
    BlahBlah - but, - Blah
    Blah - should NOT be transformed into - Blah
    Blah - - Differing assumptions about tag nesting rules are a major source - of problems with the BeautifulSoup class. If BeautifulSoup is not - treating as nestable a tag your page author treats as nestable, - try ICantBelieveItsBeautifulSoup, MinimalSoup, or - BeautifulStoneSoup before writing your own subclass.""" + 对标签嵌套规则的不同假设是BeautifulSoup类问题的主要来源。如果BeautifulSoup没有将页面作者视为可嵌套的标签视为可嵌套,请尝试ICantBelieveItsBeautifulSoup、MinimalSoup或BeautifulStoneSoup,然后再编写自己的子类。""" def __init__(self, *args, **kwargs): if 'smartQuotesTo' not in kwargs: @@ -1559,26 +1563,25 @@ class BeautifulSoup(BeautifulStoneSoup): kwargs['isHTML'] = True BeautifulStoneSoup.__init__(self, *args, **kwargs) + # 定义自闭合标签 SELF_CLOSING_TAGS = buildTagMap(None, ('br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base', 'col')) + # 定义保持空白字符的标签 PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) + # 定义包含脚本或文本区域的标签 QUOTE_TAGS = {'script' : None, 'textarea' : None} - #According to the HTML standard, each of these inline tags can - #contain another tag of the same type. Furthermore, it's common - #to actually use these tags this way. + # 定义内联标签可以包含另一个相同类型的标签 NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 'center') - #According to the HTML standard, these block tags can contain - #another tag of the same type. Furthermore, it's common - #to actually use these tags this way. + # 定义块级标签可以包含另一个相同类型的标签 NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del') - #Lists can contain other lists, but there are restrictions. + # 定义列表可以包含其他列表,但有限制 NESTABLE_LIST_TAGS = { 'ol' : [], 'ul' : [], 'li' : ['ul', 'ol'], @@ -1586,7 +1589,7 @@ class BeautifulSoup(BeautifulStoneSoup): 'dd' : ['dl'], 'dt' : ['dl'] } - #Tables can contain other tables, but there are restrictions. 
+ # 定义表格可以包含其他表格,但有限制 NESTABLE_TABLE_TAGS = {'table' : [], 'tr' : ['table', 'tbody', 'tfoot', 'thead'], 'td' : ['tr'], @@ -1596,25 +1599,25 @@ class BeautifulSoup(BeautifulStoneSoup): 'tfoot' : ['table'], } + # 定义非嵌套块级标签 NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre') - #If one of these tags is encountered, all tags up to the next tag of - #this type are popped. + # 如果遇到这些标签之一,则直到下一个相同类型的标签为止的所有标签都被弹出 RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', NON_NESTABLE_BLOCK_TAGS, NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) + # 定义可以嵌套的标签 NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) - # Used to detect the charset in a META tag; see start_meta + # 用于检测META标签中的字符集;参见start_meta CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) def start_meta(self, attrs): - """Beautiful Soup can detect a charset included in a META tag, - try to convert the document to that charset, and re-parse the - document from the beginning.""" + """Beautiful Soup可以检测META标签中的字符集, + 尝试将文档转换为该字符集,并从头开始重新解析文档。""" httpEquiv = None contentType = None contentTypeIndex = None @@ -1629,16 +1632,13 @@ class BeautifulSoup(BeautifulStoneSoup): contentType = value contentTypeIndex = i - if httpEquiv and contentType: # It's an interesting meta tag. + if httpEquiv and contentType: # 它是一个有趣的meta标签 match = self.CHARSET_RE.search(contentType) if match: if (self.declaredHTMLEncoding is not None or self.originalEncoding == self.fromEncoding): - # An HTML encoding was sniffed while converting - # the document to Unicode, or an HTML encoding was - # sniffed during a previous pass through the - # document, or an encoding was specified - # explicitly and it worked. Rewrite the meta tag. 
+ # 在将文档转换为Unicode时检测到HTML编码,或在以前的文档遍历中检测到HTML编码,或显式指定的编码有效 + # 重写meta标签 def rewrite(match): return match.group(1) + "%SOUP-ENCODING%" newAttr = self.CHARSET_RE.sub(rewrite, contentType) @@ -1646,89 +1646,74 @@ class BeautifulSoup(BeautifulStoneSoup): newAttr) tagNeedsEncodingSubstitution = True else: - # This is our first pass through the document. - # Go through it again with the encoding information. + # 这是我们第一次通过文档 + # 使用编码信息再次遍历文档 newCharset = match.group(3) if newCharset and newCharset != self.originalEncoding: self.declaredHTMLEncoding = newCharset self._feed(self.declaredHTMLEncoding) raise StopParsing - pass tag = self.unknown_starttag("meta", attrs) if tag and tagNeedsEncodingSubstitution: tag.containsSubstitutions = True class StopParsing(Exception): + """用于停止解析的异常类。""" pass class ICantBelieveItsBeautifulSoup(BeautifulSoup): - - """The BeautifulSoup class is oriented towards skipping over - common HTML errors like unclosed tags. However, sometimes it makes - errors of its own. For instance, consider this fragment: + """BeautifulSoup类通常忽略一些常见的HTML错误,如未闭合的标签。然而,有时它也会犯自己的错误。 + 例如,考虑这个片段: FooBar - This is perfectly valid (if bizarre) HTML. However, the - BeautifulSoup class will implicitly close the first b tag when it - encounters the second 'b'. It will think the author wrote - "FooBar", and didn't close the first 'b' tag, because - there's no real-world reason to bold something that's already - bold. When it encounters '' it will close two more 'b' - tags, for a grand total of three tags closed instead of two. This - can throw off the rest of your document structure. The same is - true of a number of other tags, listed below. - - It's much more common for someone to forget to close a 'b' tag - than to actually use nested 'b' tags, and the BeautifulSoup class - handles the common case. 
This class handles the not-co-common - case: where you can't believe someone wrote what they did, but - it's valid HTML and BeautifulSoup screwed up by assuming it - wouldn't be.""" + 这是完全有效的(如果奇怪的)HTML。然而,BeautifulSoup类会在遇到第二个'b'时隐式地关闭第一个b标签。 + 它会认为作者写了"FooBar",并没有关闭第一个'b'标签,因为在现实中没有理由将已经加粗的内容再次加粗。 + 当它遇到''时,它会关闭两个更多的'b'标签,总共关闭了三个标签而不是两个。这可能会打乱你的文档结构。 + 同样的情况也适用于一些其他标签,如下所述。 + + 人们忘记关闭'b'标签比实际使用嵌套的'b'标签要常见得多,而BeautifulSoup类处理的是常见情况。 + 这个类处理的是不常见的情况:你不敢相信有人写了他们所写的东西,但是它是有效的HTML,而BeautifulSoup通过假设它不会是而搞砸了。""" I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', 'big') + """一些内联标签可以被嵌套,即使它们通常不应该被嵌套。""" I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',) + """一些块级标签可以被嵌套,即使它们通常不应该被嵌套。""" NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) + """构建一个包含所有可以被嵌套的标签的映射。""" class MinimalSoup(BeautifulSoup): - """The MinimalSoup class is for parsing HTML that contains - pathologically bad markup. It makes no assumptions about tag - nesting, but it does know which tags are self-closing, that -