From 28f0660564ef22e3ad04e5ecdb875824aefa4ebd Mon Sep 17 00:00:00 2001
From: snh <1476164672@qq.com>
Date: Sun, 29 Dec 2024 20:45:52 +0800
Subject: [PATCH] beautifulsoup
---
.../thirdparty/beautifulsoup/beautifulsoup.py | 1626 ++++++++---------
1 file changed, 807 insertions(+), 819 deletions(-)
diff --git a/src/sqlmap-master/thirdparty/beautifulsoup/beautifulsoup.py b/src/sqlmap-master/thirdparty/beautifulsoup/beautifulsoup.py
index 76c08fd..e51cb22 100644
--- a/src/sqlmap-master/thirdparty/beautifulsoup/beautifulsoup.py
+++ b/src/sqlmap-master/thirdparty/beautifulsoup/beautifulsoup.py
@@ -145,31 +145,37 @@ DEFAULT_OUTPUT_ENCODING = "utf-8"
# 设置默认的输出编码为 UTF-8
def _match_css_class(str):
+#构建一个正则表达式,以便匹配给定的CSS类名
"""Build a RE to match the given CSS class."""
return re.compile(r"(^|.*\s)%s($|\s)" % str)
# First, the classes that represent markup elements.
class PageElement(object):
+#作为页面元素的基类,包含导航信息,可以是标签或文本
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
def _invert(h):
+ #创建一个新的字典,将原字典的键值对颠倒
"Cheap function to invert a hash."
i = {}
for k,v in h.items():
i[v] = k
return i
+ #将XML实体映射到它们对应的特殊字符
XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
"quot" : '"',
"amp" : "&",
"lt" : "<",
"gt" : ">" }
+ #创建一个新的字典,将特殊字符映射回XML实体
XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
def setup(self, parent=None, previous=None):
+ #初始化元素之间的关系,包括父元素、前一个元素、后一个元素以及兄弟元素
"""Sets up the initial relations between this element and
other elements."""
self.parent = parent
@@ -182,6 +188,7 @@ class PageElement(object):
self.previousSibling.nextSibling = self
def replaceWith(self, replaceWith):
+ #首先记录当前元素的父元素和索引位置,然后提取当前元素,最后在原来的位置插入新元素
oldParent = self.parent
myIndex = self.parent.index(self)
if hasattr(replaceWith, "parent")\
@@ -197,6 +204,7 @@ class PageElement(object):
oldParent.insert(myIndex, replaceWith)
def replaceWithChildren(self):
+ #首先提取当前元素,然后将子元素逆序插入到父元素中
myParent = self.parent
myIndex = self.parent.index(self)
self.extract()
@@ -206,6 +214,7 @@ class PageElement(object):
myParent.insert(myIndex, child)
def extract(self):
+ #从树中提取(移除)当前元素
"""Destructively rips this element out of the tree."""
if self.parent:
try:
@@ -219,6 +228,7 @@ class PageElement(object):
lastChild = self._lastRecursiveChild()
nextElement = lastChild.next
+ #更新previous和next指针,以保持元素之间的正确连接
if self.previous:
self.previous.next = nextElement
if nextElement:
@@ -226,6 +236,7 @@ class PageElement(object):
self.previous = None
lastChild.next = None
+ #清除当前元素的所有关系,包括父元素和兄弟元素,并返回当前元素
self.parent = None
if self.previousSibling:
self.previousSibling.nextSibling = self.nextSibling
@@ -235,6 +246,7 @@ class PageElement(object):
return self
def _lastRecursiveChild(self):
+ #找到当前元素下最后一个被解析的子元素
"Finds the last element beneath this object to be parsed."
lastChild = self
while hasattr(lastChild, 'contents') and lastChild.contents:
@@ -242,6 +254,7 @@ class PageElement(object):
return lastChild
def insert(self, position, newChild):
+ #在当前元素的内容列表中的指定位置插入一个新的子元素
if isinstance(newChild, basestring) \
and not isinstance(newChild, NavigableString):
newChild = NavigableString(newChild)
@@ -301,29 +314,31 @@ class PageElement(object):
self.contents.insert(position, newChild)
def append(self, tag):
+ #将给定的标签追加到当前元素的内容列表的末尾
"""Appends the given tag to the contents of this tag."""
self.insert(len(self.contents), tag)
def findNext(self, name=None, attrs={}, text=None, **kwargs):
+ #查找文档中当前标签之后第一个匹配给定条件的元素
"""Returns the first item that matches the given criteria and
appears after this Tag in the document."""
return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
- def findAllNext(self, name=None, attrs={}, text=None, limit=None,
- **kwargs):
+ def findAllNext(self, name=None, attrs={}, text=None, limit=None, **kwargs):
+ #查找文档中当前标签之后所有匹配给定条件的元素
"""Returns all items that match the given criteria and appear
after this Tag in the document."""
- return self._findAll(name, attrs, text, limit, self.nextGenerator,
- **kwargs)
+ return self._findAll(name, attrs, text, limit, self.nextGenerator, **kwargs)
def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
+ #查找文档中当前标签之后第一个匹配给定条件的兄弟标签
"""Returns the closest sibling to this Tag that matches the
given criteria and appears after this Tag in the document."""
return self._findOne(self.findNextSiblings, name, attrs, text,
**kwargs)
- def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
- **kwargs):
+ def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs):
+ #查找文档中当前标签之后所有匹配给定条件的兄弟标签
"""Returns the siblings of this Tag that match the given
criteria and appear after this Tag in the document."""
return self._findAll(name, attrs, text, limit,
@@ -331,6 +346,7 @@ class PageElement(object):
fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
+ #查找文档中当前标签之前第一个匹配给定条件的元素
"""Returns the first item that matches the given criteria and
appears before this Tag in the document."""
return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
@@ -344,171 +360,230 @@ class PageElement(object):
fetchPrevious = findAllPrevious # Compatibility with pre-3.x
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
+ #Find the closest sibling that appears before this tag in the document and matches the given criteria
"""Returns the closest sibling to this Tag that matches the
given criteria and appears before this Tag in the document."""
- return self._findOne(self.findPreviousSiblings, name, attrs, text,
- **kwargs)
+ return self._findOne(self.findPreviousSiblings, name, attrs, text, **kwargs)
- def findPreviousSiblings(self, name=None, attrs={}, text=None,
- limit=None, **kwargs):
- """Returns the siblings of this Tag that match the given
- criteria and appear before this Tag in the document."""
+
+ def findPreviousSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs):
+ """
+ 返回在文档中出现在当前标签之前且符合给定条件的所有兄弟标签。
+
+ 参数:
+ name -- 要搜索的标签名称,可以是字符串、正则表达式或列表。
+ attrs -- 要搜索的标签属性,可以是字典或关键字参数。
+ text -- 要搜索的文本内容,可以是字符串、正则表达式或列表。
+ limit -- 返回结果的数量限制。
+ **kwargs -- 其他关键字参数,用于扩展搜索条件。
+
+ 返回:
+ 符合条件的兄弟标签列表。
+ """
return self._findAll(name, attrs, text, limit,
self.previousSiblingGenerator, **kwargs)
- fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
+
+
+ # Alias kept for compatibility with pre-3.x code: fetchPreviousSiblings is the old name of findPreviousSiblings
+ fetchPreviousSiblings = findPreviousSiblings
+
def findParent(self, name=None, attrs={}, **kwargs):
- """Returns the closest parent of this Tag that matches the given
- criteria."""
- # NOTE: We can't use _findOne because findParents takes a different
- # set of arguments.
+ """
+ 返回与给定条件匹配的最近的父标签。
+
+ 参数:
+ name -- 要搜索的标签名称,可以是字符串、正则表达式或列表。
+ attrs -- 要搜索的标签属性,可以是字典或关键字参数。
+ **kwargs -- 其他关键字参数,用于扩展搜索条件。
+
+ 返回:
+ 与条件匹配的最近的父标签,如果没有找到则返回None。
+ """
+ # 注意:我们不能使用_findOne,因为findParents接受不同的参数集。
r = None
l = self.findParents(name, attrs, 1)
if l:
r = l[0]
return r
+
def findParents(self, name=None, attrs={}, limit=None, **kwargs):
- """Returns the parents of this Tag that match the given
- criteria."""
+ """返回与给定条件匹配的父标签列表。"""
+ return self._findAll(name, attrs, None, limit, self.parentGenerator, **kwargs)
- return self._findAll(name, attrs, None, limit, self.parentGenerator,
- **kwargs)
- fetchParents = findParents # Compatibility with pre-3.x
- #These methods do the real heavy lifting.
+ # Alias kept for compatibility with pre-3.x code: fetchParents is the old name of findParents
+ fetchParents = findParents # 兼容旧版本
+
+
+ # 这些方法执行实际的查找操作。
def _findOne(self, method, name, attrs, text, **kwargs):
+ """使用指定的方法查找第一个匹配的标签。"""
r = None
- l = method(name, attrs, text, 1, **kwargs)
+ l = method(name, attrs, text, 1, **kwargs) # 调用指定的方法查找
if l:
- r = l[0]
+ r = l[0] # 如果找到匹配项,返回第一个
return r
- def _findAll(self, name, attrs, text, limit, generator, **kwargs):
- "Iterates over a generator looking for things that match."
+ def _findAll(self, name, attrs, text, limit, generator, **kwargs):
+ """遍历生成器,查找所有匹配的标签。"""
if isinstance(name, SoupStrainer):
- strainer = name
- # (Possibly) special case some findAll*(...) searches
+ strainer = name # 如果name是SoupStrainer实例,直接使用
+ # 特殊情况处理
elif text is None and not limit and not attrs and not kwargs:
- # findAll*(True)
+ # findAll*(True)的情况
if name is True:
- return [element for element in generator()
- if isinstance(element, Tag)]
- # findAll*('tag-name')
+ return [element for element in generator() if isinstance(element, Tag)]
+ # findAll*('tag-name')的情况
elif isinstance(name, basestring):
- return [element for element in generator()
- if isinstance(element, Tag) and
- element.name == name]
+ return [element for element in generator() if isinstance(element, Tag) and element.name == name]
else:
- strainer = SoupStrainer(name, attrs, text, **kwargs)
- # Build a SoupStrainer
+ strainer = SoupStrainer(name, attrs, text, **kwargs) # 构建SoupStrainer
else:
- strainer = SoupStrainer(name, attrs, text, **kwargs)
- results = ResultSet(strainer)
- g = generator()
+ strainer = SoupStrainer(name, attrs, text, **kwargs) # 构建SoupStrainer
+
+ results = ResultSet(strainer) # 创建结果集
+ g = generator() # 获取生成器
while True:
try:
- i = next(g)
+ i = next(g) # 获取下一个元素
except StopIteration:
- break
+ break # 如果没有更多元素,退出循环
if i:
- found = strainer.search(i)
+ found = strainer.search(i) # 使用strainer查找匹配的标签
if found:
- results.append(found)
+ results.append(found) # 将找到的标签添加到结果集中
if limit and len(results) >= limit:
- break
+ break # 如果达到限制,退出循环
return results
- #These Generators can be used to navigate starting from both
- #NavigableStrings and Tags.
+
+ # 这些生成器可用于从NavigableStrings和Tags开始导航。
def nextGenerator(self):
+ """生成器,遍历当前标签的下一个元素。"""
i = self
while i is not None:
- i = i.next
- yield i
+ i = i.next # 获取下一个元素
+ yield i # 生成下一个元素
+
def nextSiblingGenerator(self):
+ """生成器,遍历当前标签的下一个兄弟标签。"""
i = self
while i is not None:
- i = i.nextSibling
- yield i
+ i = i.nextSibling # 获取下一个兄弟标签
+ yield i # 生成下一个兄弟标签
+
def previousGenerator(self):
+ """生成器,遍历当前标签的前一个元素。"""
i = self
while i is not None:
- i = i.previous
- yield i
+ i = i.previous # 获取前一个元素
+ yield i # 生成前一个元素
+
def previousSiblingGenerator(self):
+ """生成器,遍历当前标签的前一个兄弟标签。"""
i = self
while i is not None:
- i = i.previousSibling
- yield i
+ i = i.previousSibling # 获取前一个兄弟标签
+ yield i # 生成前一个兄弟标签
+
def parentGenerator(self):
+ """生成器,遍历当前标签的父标签。"""
i = self
while i is not None:
- i = i.parent
- yield i
+ i = i.parent # 获取父标签
+ yield i # 生成父标签
+
- # Utility methods
+ # 工具方法
def substituteEncoding(self, str, encoding=None):
- encoding = encoding or "utf-8"
- return str.replace("%SOUP-ENCODING%", encoding)
+ """替换字符串中的编码占位符为指定的编码。
+
+ 参数:
+ str -- 需要进行编码替换的字符串。
+ encoding -- 指定的编码,默认为"utf-8"。
+
+ 返回:
+ 替换后的字符串。
+ """
+ encoding = encoding or "utf-8" # 如果没有提供编码,则使用默认的"utf-8"
+ return str.replace("%SOUP-ENCODING%", encoding) # 替换占位符
+
def toEncoding(self, s, encoding=None):
- """Encodes an object to a string in some encoding, or to Unicode.
- ."""
+ """将对象编码为某种编码的字符串,或转换为Unicode字符串。"""
if isinstance(s, text_type):
if encoding:
- s = s.encode(encoding)
+ s = s.encode(encoding) # 如果是文本类型且指定了编码,则进行编码
elif isinstance(s, binary_type):
- s = s.encode(encoding or "utf8")
+ s = s.encode(encoding or "utf8") # 如果是二进制类型,则使用指定编码或默认的UTF-8进行编码
else:
- s = self.toEncoding(str(s), encoding or "utf8")
- return s
+ s = self.toEncoding(str(s), encoding or "utf8") # 其他类型,先转换为字符串再进行编码
+ return s # 返回编码后的字符串
+
+ # Regex matching any angle bracket, or an ampersand that is not the start of a numeric or named entity
BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))")
+
def _sub_entity(self, x):
- """Used with a regular expression to substitute the
- appropriate XML entity for an XML special character."""
- return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
+ """用于正则表达式替换,将XML特殊字符替换为相应的XML实体。"""
+ return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" # 返回对应的XML实体
class NavigableString(text_type, PageElement):
+ """
+ NavigableString类是BeautifulSoup中用于处理和导航字符串的类。
+ 它继承自Python的text_type(Python 3中的str类型)和PageElement,使其既可以当作字符串使用,
+ 也可以像页面元素一样进行导航。
+ """
def __new__(cls, value):
- """Create a new NavigableString.
+ """创建一个新的NavigableString实例。
- When unpickling a NavigableString, this method is called with
- the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
- passed in to the superclass's __new__ or the superclass won't know
- how to handle non-ASCII characters.
+ 当反序列化(unpickling)一个NavigableString时,会调用此方法,
+ 并且传入DEFAULT_OUTPUT_ENCODING编码的字符串。需要将这个编码传给superclass的__new__,
+ 否则superclass不知道如何处理非ASCII字符。
"""
if isinstance(value, text_type):
return text_type.__new__(cls, value)
return text_type.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
def __getnewargs__(self):
+ """返回创建NavigableString实例时的参数。"""
return (NavigableString.__str__(self),)
def __getattr__(self, attr):
- """text.string gives you text. This is for backwards
- compatibility for Navigable*String, but for CData* it lets you
- get the string without the CData wrapper."""
+ """对于NavigableString,text.string返回的就是text本身。
+
+ 这是为了向后兼容Navigable*String,但对于CData*,它允许你获取没有CData包装器的字符串。
+ """
if attr == 'string':
return self
else:
- raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))
+ raise AttributeError("'%s'对象没有属性'%s'" % (self.__class__.__name__, attr))
def __unicode__(self):
+ """返回NavigableString的Unicode表示。"""
return str(self).decode(DEFAULT_OUTPUT_ENCODING)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
- # Substitute outgoing XML entities.
+ """返回NavigableString的字符串表示,并进行XML实体替换。
+
+ 参数:
+ encoding -- 指定编码,默认为DEFAULT_OUTPUT_ENCODING。
+
+ 返回:
+ 根据指定编码编码后的字符串,如果没有指定编码,则返回Unicode字符串。
+ """
+ # 替换XML特殊字符为对应的XML实体
data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self)
if encoding:
return data.encode(encoding)
@@ -516,202 +591,205 @@ class NavigableString(text_type, PageElement):
return data
class CData(NavigableString):
-
+ """
+ CData类用于表示XML中的CDATA区域,它允许在XML文档中嵌入未经处理的文本数据。
+ """
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ """返回CDATA区域的字符串表示,使用指定的编码。"""
return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
class ProcessingInstruction(NavigableString):
+ """
+ ProcessingInstruction类用于表示XML处理指令,它允许在XML文档中包含处理器指令。
+ """
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ """返回处理指令的字符串表示,使用指定的编码。"""
output = self
if "%SOUP-ENCODING%" in output:
+ # 如果输出中包含编码占位符,则替换为实际编码
output = self.substituteEncoding(output, encoding)
return "<?%s?>" % self.toEncoding(output, encoding)
class Comment(NavigableString):
+ """
+ Comment类用于表示XML中的注释,它允许在XML文档中添加注释信息。
+ """
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ """返回注释的字符串表示,使用指定的编码。"""
return "<!--%s-->" % NavigableString.__str__(self, encoding)
class Declaration(NavigableString):
+ """
+ The Declaration class represents a markup declaration (e.g. a DOCTYPE), i.e. a node serialized in the form <!...>.
+ """
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ """返回XML声明的字符串表示,使用指定的编码。"""
return "<!%s>" % NavigableString.__str__(self, encoding)
class Tag(PageElement):
-
- """Represents a found HTML tag with its attributes and contents."""
+ """表示找到的HTML标签及其属性和内容。"""
def _convertEntities(self, match):
- """Used in a call to re.sub to replace HTML, XML, and numeric
- entities with the appropriate Unicode characters. If HTML
- entities are being converted, any unrecognized entities are
- escaped."""
+ """用于调用re.sub,将HTML、XML和数字实体替换为相应的Unicode字符。
+ 如果正在转换HTML实体,则任何未识别的实体都会被转义。"""
try:
- x = match.group(1)
+ x = match.group(1) # 获取匹配的实体名称
if self.convertHTMLEntities and x in name2codepoint:
- return unichr(name2codepoint[x])
+ return unichr(name2codepoint[x]) # 转换为对应的Unicode字符
elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
if self.convertXMLEntities:
- return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
+ return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] # 转换为XML特殊字符
else:
- return u'&%s;' % x
+ return u'&%s;' % x # 返回未转换的实体
elif len(x) > 0 and x[0] == '#':
- # Handle numeric entities
+ # 处理数字实体
if len(x) > 1 and x[1] == 'x':
- return unichr(int(x[2:], 16))
+ return unichr(int(x[2:], 16)) # 处理十六进制数字实体
else:
- return unichr(int(x[1:]))
+ return unichr(int(x[1:])) # 处理十进制数字实体
elif self.escapeUnrecognizedEntities:
- return u'&amp;%s;' % x
+ return u'&amp;%s;' % x # escape the unrecognized entity
- except ValueError: # e.g. ValueError: unichr() arg not in range(0x10000)
+ except ValueError: # 处理unichr()引发的值错误
pass
- return u'&%s;' % x
-
- def __init__(self, parser, name, attrs=None, parent=None,
- previous=None):
- "Basic constructor."
+ return u'&%s;' % x # 返回未识别的实体
- # We don't actually store the parser object: that lets extracted
- # chunks be garbage-collected
+ def __init__(self, parser, name, attrs=None, parent=None, previous=None):
+ """基本构造函数。"""
+ # 我们实际上并不存储解析器对象:这使得提取的块可以被垃圾回收
self.parserClass = parser.__class__
- self.isSelfClosing = parser.isSelfClosingTag(name)
- self.name = name
+ self.isSelfClosing = parser.isSelfClosingTag(name) # 判断标签是否为自闭合标签
+ self.name = name # 标签名称
if attrs is None:
- attrs = []
+ attrs = [] # 如果没有提供属性,则初始化为空列表
elif isinstance(attrs, dict):
- attrs = attrs.items()
- self.attrs = attrs
- self.contents = []
- self.setup(parent, previous)
- self.hidden = False
- self.containsSubstitutions = False
- self.convertHTMLEntities = parser.convertHTMLEntities
- self.convertXMLEntities = parser.convertXMLEntities
- self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
-
- # Convert any HTML, XML, or numeric entities in the attribute values.
- # Reference: https://github.com/pkrumins/xgoogle/pull/16/commits/3dba1165c436b0d6e5bdbd09e53ca0dbf8a043f8
+ attrs = attrs.items() # 如果提供的是字典,则转换为元组列表
+ self.attrs = attrs # 标签属性
+ self.contents = [] # 标签内容
+ self.setup(parent, previous) # 设置父标签和前一个标签
+ self.hidden = False # 标签是否隐藏
+ self.containsSubstitutions = False # 是否包含替换
+ self.convertHTMLEntities = parser.convertHTMLEntities # 是否转换HTML实体
+ self.convertXMLEntities = parser.convertXMLEntities # 是否转换XML实体
+ self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities # 是否转义未识别的实体
+
+ # 转换属性值中的HTML、XML或数字实体
convert = lambda k_val: (k_val[0],
re.sub(r"&(#\d+|#x[0-9a-fA-F]+|\w+);",
self._convertEntities,
- k_val[1]))
- self.attrs = map(convert, self.attrs)
+ k_val[1])) # 使用正则表达式替换实体
+ self.attrs = map(convert, self.attrs) # 更新属性列表
def getString(self):
- if (len(self.contents) == 1
- and isinstance(self.contents[0], NavigableString)):
+ """获取标签的字符串内容,如果内容只有一个NavigableString,则返回该内容。"""
+ if (len(self.contents) == 1 and isinstance(self.contents[0], NavigableString)):
return self.contents[0]
def setString(self, string):
- """Replace the contents of the tag with a string"""
- self.clear()
- self.append(string)
+ """用字符串替换标签的内容。"""
+ self.clear() # 清空当前内容
+ self.append(string) # 添加新的字符串内容
- string = property(getString, setString)
+ string = property(getString, setString) # 将getString和setString方法绑定到string属性
def getText(self, separator=u""):
+ """获取标签的文本内容,使用给定的分隔符连接多个文本。"""
if not len(self.contents):
- return u""
- stopNode = self._lastRecursiveChild().next
- strings = []
- current = self.contents[0]
+ return u"" # 如果没有内容,返回空字符串
+ stopNode = self._lastRecursiveChild().next # 获取最后一个子元素的下一个元素
+ strings = [] # 存储文本内容的列表
+ current = self.contents[0] # 从第一个内容开始
while current and current is not stopNode:
if isinstance(current, NavigableString):
- strings.append(current.strip())
- current = current.next
- return separator.join(strings)
+ strings.append(current.strip()) # 去除文本两端的空白并添加到列表
+ current = current.next # 移动到下一个内容
+ return separator.join(strings) # 使用分隔符连接所有文本并返回
- text = property(getText)
+ text = property(getText) # 将getText方法绑定到text属性
def get(self, key, default=None):
- """Returns the value of the 'key' attribute for the tag, or
- the value given for 'default' if it doesn't have that
- attribute."""
+ """返回标签的指定属性的值,如果没有该属性,则返回默认值。"""
return self._getAttrMap().get(key, default)
def clear(self):
- """Extract all children."""
+ """提取所有子元素。"""
for child in self.contents[:]:
- child.extract()
+ child.extract() # 从树中移除每个子元素
def index(self, element):
+ """返回指定元素在当前标签内容中的索引,如果未找到则抛出异常。"""
for i, child in enumerate(self.contents):
if child is element:
return i
- raise ValueError("Tag.index: element not in tag")
+ raise ValueError("Tag.index: element not in tag") # 如果未找到,抛出值错误
def has_key(self, key):
+ """检查标签是否包含指定的属性。"""
return self._getAttrMap().has_key(key)
def __getitem__(self, key):
- """tag[key] returns the value of the 'key' attribute for the tag,
- and throws an exception if it's not there."""
+ """通过key访问标签的属性值,如果不存在则抛出异常。"""
return self._getAttrMap()[key]
def __iter__(self):
- "Iterating over a tag iterates over its contents."
+ """迭代标签的内容。"""
return iter(self.contents)
def __len__(self):
- "The length of a tag is the length of its list of contents."
+ """返回标签内容的长度。"""
return len(self.contents)
def __contains__(self, x):
+ """检查指定元素是否在标签内容中。"""
return x in self.contents
def __nonzero__(self):
- "A tag is non-None even if it has no contents."
+ """标签即使没有内容也被视为非空。"""
return True
def __setitem__(self, key, value):
- """Setting tag[key] sets the value of the 'key' attribute for the
- tag."""
- self._getAttrMap()
- self.attrMap[key] = value
+ """设置标签的属性值。"""
+ self._getAttrMap() # 初始化属性映射
+ self.attrMap[key] = value # 更新属性映射
found = False
for i in xrange(0, len(self.attrs)):
if self.attrs[i][0] == key:
- self.attrs[i] = (key, value)
+ self.attrs[i] = (key, value) # 更新现有属性
found = True
if not found:
- self.attrs.append((key, value))
- self._getAttrMap()[key] = value
+ self.attrs.append((key, value)) # 添加新属性
+ self._getAttrMap()[key] = value # 更新属性映射
def __delitem__(self, key):
- "Deleting tag[key] deletes all 'key' attributes for the tag."
+ """删除标签的指定属性。"""
for item in self.attrs:
if item[0] == key:
- self.attrs.remove(item)
- #We don't break because bad HTML can define the same
- #attribute multiple times.
- self._getAttrMap()
- if self.attrMap.has_key(key):
- del self.attrMap[key]
+ self.attrs.remove(item) # 移除属性
+ # 不中断,因为坏HTML可能定义相同的属性多次
+ self._getAttrMap() # 初始化属性映射
+ if self.attrMap.has_key(key):
+ del self.attrMap[key] # 删除属性映射中的属性
def __call__(self, *args, **kwargs):
- """Calling a tag like a function is the same as calling its
- findAll() method. Eg. tag('a') returns a list of all the A tags
- found within this tag."""
+ """调用标签就像调用其findAll()方法一样。"""
return self.findAll(*args, **kwargs)
def __getattr__(self, tag):
- #print "Getattr %s.%s" % (self.__class__, tag)
+ """根据标签名称获取标签内容。"""
if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
- return self.find(tag[:-3])
+ return self.find(tag[:-3]) # 如果标签名以'Tag'结尾,返回对应的标签
elif tag.find('__') != 0:
- return self.find(tag)
- raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag))
+ return self.find(tag) # 否则,返回对应的标签
+ raise AttributeError("'%s'对象没有属性'%s'" % (self.__class__, tag))
def __eq__(self, other):
- """Returns true iff this tag has the same name, the same attributes,
- and the same contents (recursively) as the given tag.
-
- NOTE: right now this will return false if two tags have the
- same attributes in a different order. Should this be fixed?"""
+ """判断当前标签是否与另一个标签相等,比较名称、属性和内容。"""
if other is self:
return True
- if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
+ if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or \
+ self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
return False
for i in xrange(0, len(self.contents)):
if self.contents[i] != other.contents[i]:
@@ -719,378 +797,339 @@ class Tag(PageElement):
return True
def __ne__(self, other):
- """Returns true iff this tag is not identical to the other tag,
- as defined in __eq__."""
+ """判断当前标签是否与另一个标签不相等。"""
return not self == other
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
- """Renders this tag as a string."""
+ """将标签渲染为字符串。"""
return self.__str__(encoding)
def __unicode__(self):
+ """返回标签的Unicode表示。"""
return self.__str__(None)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
prettyPrint=False, indentLevel=0):
- """Returns a string or Unicode representation of this tag and
- its contents. To get Unicode, pass None for encoding.
-
- NOTE: since Python's HTML parser consumes whitespace, this
- method is not certain to reproduce the whitespace present in
- the original string."""
-
- encodedName = self.toEncoding(self.name, encoding)
-
- attrs = []
- if self.attrs:
- for key, val in self.attrs:
- fmt = '%s="%s"'
- if isinstance(val, basestring):
- if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
- val = self.substituteEncoding(val, encoding)
-
- # The attribute value either:
- #
- # * Contains no embedded double quotes or single quotes.
- # No problem: we enclose it in double quotes.
- # * Contains embedded single quotes. No problem:
- # double quotes work here too.
- # * Contains embedded double quotes. No problem:
- # we enclose it in single quotes.
- # * Embeds both single _and_ double quotes. This
- # can't happen naturally, but it can happen if
- # you modify an attribute value after parsing
- # the document. Now we have a bit of a
- # problem. We solve it by enclosing the
- # attribute in single quotes, and escaping any
- # embedded single quotes to XML entities.
+ """返回此标签及其内容的字符串或Unicode表示。
+ 如果传递None作为encoding,将获得Unicode字符串。
+
+ 注意:由于Python的HTML解析器会消耗空白字符,此方法不能保证重现原始字符串中的空白字符。"""
+
+ encodedName = self.toEncoding(self.name, encoding) # 将标签名编码为指定编码
+
+ attrs = [] # 初始化属性列表
+ if self.attrs: # 如果有属性
+ for key, val in self.attrs: # 遍历属性
+ fmt = '%s="%s"' # 属性格式
+ if isinstance(val, basestring): # 如果属性值是字符串
+ if self.containsSubstitutions and '%SOUP-ENCODING%' in val: # 如果包含编码占位符
+ val = self.substituteEncoding(val, encoding) # 替换编码
+
+ # 根据属性值中是否包含引号来决定使用单引号还是双引号
if '"' in val:
fmt = "%s='%s'"
- if "'" in val:
- # TODO: replace with apos when
- # appropriate.
+ if "'" in val: # 如果同时包含单双引号,则替换单引号为实体
val = val.replace("'", "&squot;")
- # Now we're okay w/r/t quotes. But the attribute
- # value might also contain angle brackets, or
- # ampersands that aren't part of entities. We need
- # to escape those to XML entities too.
+ # 转义属性值中的小于号、大于号和未包含在实体中的和号
val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
- attrs.append(fmt % (self.toEncoding(key, encoding),
- self.toEncoding(val, encoding)))
+ attrs.append(fmt % (self.toEncoding(key, encoding), self.toEncoding(val, encoding))) # 添加编码后的属性
close = ''
closeTag = ''
- if self.isSelfClosing:
+ if self.isSelfClosing: # 如果是自闭合标签
close = ' /'
else:
- closeTag = '</%s>' % encodedName
+ closeTag = '</%s>' % encodedName # closing tag text
- indentTag, indentContents = 0, 0
- if prettyPrint:
+ indentTag, indentContents = 0, 0 # 初始化缩进级别
+ if prettyPrint: # 如果需要美化输出
indentTag = indentLevel
- space = (' ' * (indentTag-1))
+ space = (' ' * (indentTag - 1))
indentContents = indentTag + 1
- contents = self.renderContents(encoding, prettyPrint, indentContents)
- if self.hidden:
+ contents = self.renderContents(encoding, prettyPrint, indentContents) # 渲染标签内容
+ if self.hidden: # 如果标签是隐藏的
s = contents
else:
- s = []
- attributeString = ''
- if attrs:
- attributeString = ' ' + ' '.join(attrs)
- if prettyPrint:
+ s = [] # 初始化字符串列表
+ attributeString = '' # 初始化属性字符串
+ if attrs: # 如果有属性
+ attributeString = ' ' + ' '.join(attrs) # 属性字符串
+ if prettyPrint: # 如果需要美化输出
s.append(space)
- s.append('<%s%s%s>' % (encodedName, attributeString, close))
- if prettyPrint:
+ s.append('<%s%s%s>' % (encodedName, attributeString, close)) # 开始标签
+ if prettyPrint: # 如果需要美化输出
s.append("\n")
- s.append(contents)
- if prettyPrint and contents and contents[-1] != "\n":
+ s.append(contents) # 内容
+ if prettyPrint and contents and contents[-1] != "\n": # 如果需要美化输出且内容不以换行符结尾
s.append("\n")
- if prettyPrint and closeTag:
+ if prettyPrint and closeTag: # 如果需要美化输出且有结束标签
s.append(space)
- s.append(closeTag)
- if prettyPrint and closeTag and self.nextSibling:
+ s.append(closeTag) # 结束标签
+ if prettyPrint and closeTag and self.nextSibling: # 如果需要美化输出且有下一个兄弟节点
s.append("\n")
- s = ''.join(s)
- return s
+ s = ''.join(s) # 合并字符串
+ return s # 返回标签字符串
- def decompose(self):
- """Recursively destroys the contents of this tree."""
- self.extract()
- if len(self.contents) == 0:
+ def decompose(self): # 递归销毁树的内容
+ self.extract() # 提取自身
+ if len(self.contents) == 0: # 如果没有内容
return
- current = self.contents[0]
- while current is not None:
- next = current.next
- if isinstance(current, Tag):
- del current.contents[:]
- current.parent = None
- current.previous = None
- current.previousSibling = None
- current.next = None
- current.nextSibling = None
- current = next
-
- def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ current = self.contents[0] # 获取第一个内容
+ while current is not None: # 遍历内容
+ next = current.next # 下一个内容
+ if isinstance(current, Tag): # 如果是标签
+ del current.contents[:] # 删除内容
+ current.parent = None # 清除父节点
+ current.previous = None # 清除前一个节点
+ current.previousSibling = None # 清除前一个兄弟节点
+ current.next = None # 清除下一个节点
+ current.nextSibling = None # 清除下一个兄弟节点
+ current = next # 移动到下一个内容
+
+ def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): # 美化输出
return self.__str__(encoding, True)
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
- prettyPrint=False, indentLevel=0):
- """Renders the contents of this tag as a string in the given
- encoding. If encoding is None, returns a Unicode string.."""
- s=[]
- for c in self:
- text = None
- if isinstance(c, NavigableString):
- text = c.__str__(encoding)
- elif isinstance(c, Tag):
- s.append(c.__str__(encoding, prettyPrint, indentLevel))
- if text and prettyPrint:
- text = text.strip()
- if text:
- if prettyPrint:
- s.append(" " * (indentLevel-1))
- s.append(text)
- if prettyPrint:
+ prettyPrint=False, indentLevel=0): # 渲染标签内容
+ """以给定编码渲染此标签的内容。如果编码为None,返回Unicode字符串。"""
+ s = [] # 初始化字符串列表
+ for c in self: # 遍历内容
+ text = None # 初始化文本
+ if isinstance(c, NavigableString): # 如果是可导航字符串
+ text = c.__str__(encoding) # 转换为字符串
+ elif isinstance(c, Tag): # 如果是标签
+ s.append(c.__str__(encoding, prettyPrint, indentLevel)) # 添加标签字符串
+ if text and prettyPrint: # 如果是文本且需要美化输出
+ text = text.strip() # 去除空白
+ if text: # 如果有文本
+ if prettyPrint: # 如果需要美化输出
+ s.append(" " * (indentLevel - 1))
+ s.append(text) # 添加文本
+ if prettyPrint: # 如果需要美化输出
s.append("\n")
+ return ''.join(s) # 返回内容字符串
- return ''.join(s)
-
- #Soup methods
+ # Soup方法
def find(self, name=None, attrs={}, recursive=True, text=None,
- **kwargs):
- """Return only the first child of this Tag matching the given
- criteria."""
- r = None
- l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
- if l:
- r = l[0]
- return r
+ **kwargs): # 查找第一个匹配的子标签
+ """返回此标签的第一个匹配给定条件的子标签。"""
+ r = None # 初始化结果
+ l = self.findAll(name, attrs, recursive, text, 1, **kwargs) # 查找所有匹配的子标签
+ if l: # 如果有结果
+ r = l[0] # 第一个结果
+ return r # 返回结果
+
findChild = find
def findAll(self, name=None, attrs={}, recursive=True, text=None,
- limit=None, **kwargs):
- """Extracts a list of Tag objects that match the given
- criteria. You can specify the name of the Tag and any
- attributes you want the Tag to have.
-
- The value of a key-value pair in the 'attrs' map can be a
- string, a list of strings, a regular expression object, or a
- callable that takes a string and returns whether or not the
- string matches for some custom definition of 'matches'. The
- same is true of the tag name."""
- generator = self.recursiveChildGenerator
- if not recursive:
- generator = self.childGenerator
- return self._findAll(name, attrs, text, limit, generator, **kwargs)
+ limit=None, **kwargs): # 查找所有匹配的子标签
+ """提取匹配给定条件的标签列表。你可以指定标签的名称和任何你希望标签拥有的属性。"""
+ generator = self.recursiveChildGenerator # 递归子生成器
+ if not recursive: # 如果不需要递归
+ generator = self.childGenerator # 子生成器
+ return self._findAll(name, attrs, text, limit, generator, **kwargs) # 查找所有匹配的标签
+
findChildren = findAll
- # Pre-3.x compatibility methods
- first = find
- fetch = findAll
+ # Pre-3.x兼容性方法
+ first = find # 第一个匹配的子标签
+ fetch = findAll # 查找所有匹配的子标签
- def fetchText(self, text=None, recursive=True, limit=None):
+ def fetchText(self, text=None, recursive=True, limit=None): # 查找所有匹配的文本
return self.findAll(text=text, recursive=recursive, limit=limit)
- def firstText(self, text=None, recursive=True):
+ def firstText(self, text=None, recursive=True): # 查找第一个匹配的文本
return self.find(text=text, recursive=recursive)
- #Private methods
+ # 私有方法
+
+ def _getAttrMap(self): # 获取属性映射
+ """如果尚未初始化,则初始化此标签属性的映射表示。"""
+ if not getattr(self, 'attrMap'): # 如果没有属性映射
+ self.attrMap = {} # 初始化属性映射
+ for (key, value) in self.attrs: # 遍历属性
+ self.attrMap[key] = value # 添加属性
+ return self.attrMap # 返回属性映射
- def _getAttrMap(self):
- """Initializes a map representation of this tag's attributes,
- if not already initialized."""
- if not getattr(self, 'attrMap'):
- self.attrMap = {}
- for (key, value) in self.attrs:
- self.attrMap[key] = value
- return self.attrMap
+ # 生成器方法
- #Generator methods
- def childGenerator(self):
- # Just use the iterator from the contents
+ def childGenerator(self): # 子生成器
+ # 直接使用内容的迭代器
return iter(self.contents)
- def recursiveChildGenerator(self):
- if not len(self.contents):
- return # Note: https://stackoverflow.com/a/30217723 (PEP 479)
- stopNode = self._lastRecursiveChild().next
- current = self.contents[0]
- while current and current is not stopNode:
- yield current
- current = current.next
+ def recursiveChildGenerator(self): # 递归子生成器
+ if not len(self.contents): # 如果没有内容
+ return # 返回
+ stopNode = self._lastRecursiveChild().next # 停止节点
+ current = self.contents[0] # 当前节点
+ while current and current is not stopNode: # 遍历节点
+ yield current # 产生当前节点
+ current = current.next # 移动到下一个节点
# Next, a couple classes to represent queries and their results.
class SoupStrainer:
- """Encapsulates a number of ways of matching a markup element (tag or
- text)."""
+ """封装了多种匹配标记元素(标签或文本)的方法。"""
def __init__(self, name=None, attrs={}, text=None, **kwargs):
- self.name = name
- if isinstance(attrs, basestring):
- kwargs['class'] = _match_css_class(attrs)
- attrs = None
- if kwargs:
- if attrs:
- attrs = attrs.copy()
- attrs.update(kwargs)
+ self.name = name # 标签名
+ if isinstance(attrs, basestring): # 如果attrs是字符串
+ kwargs['class'] = _match_css_class(attrs) # 将CSS类名转换为可匹配的形式
+ attrs = None # 重置attrs
+ if kwargs: # 如果有额外的关键字参数
+ if attrs: # 如果已有attrs
+ attrs = attrs.copy() # 复制attrs
+ attrs.update(kwargs) # 更新attrs
else:
- attrs = kwargs
- self.attrs = attrs
- self.text = text
+ attrs = kwargs # 否则直接设置attrs
+ self.attrs = attrs # 属性字典
+ self.text = text # 文本内容
def __str__(self):
- if self.text:
- return self.text
+ if self.text: # 如果有文本内容
+ return self.text # 返回文本内容
else:
- return "%s|%s" % (self.name, self.attrs)
+ return "%s|%s" % (self.name, self.attrs) # 返回标签名和属性
def searchTag(self, markupName=None, markupAttrs={}):
- found = None
- markup = None
- if isinstance(markupName, Tag):
- markup = markupName
- markupAttrs = markup
+ found = None # 初始化找到的元素
+ markup = None # 初始化标记
+ if isinstance(markupName, Tag): # 如果传入的是Tag对象
+ markup = markupName # 设置标记
+ markupAttrs = markup # 设置标记属性
callFunctionWithTagData = callable(self.name) \
- and not isinstance(markupName, Tag)
+ and not isinstance(markupName, Tag) # 判断是否是可调用的函数
if (not self.name) \
or callFunctionWithTagData \
or (markup and self._matches(markup, self.name)) \
or (not markup and self._matches(markupName, self.name)):
- if callFunctionWithTagData:
- match = self.name(markupName, markupAttrs)
+ # 如果没有指定标签名或函数调用匹配成功
+ if callFunctionWithTagData: # 如果是函数调用
+ match = self.name(markupName, markupAttrs) # 调用函数
else:
- match = True
- markupAttrMap = None
- for attr, matchAgainst in self.attrs.items():
- if not markupAttrMap:
- if hasattr(markupAttrs, 'get'):
- markupAttrMap = markupAttrs
+ match = True # 默认匹配成功
+ markupAttrMap = None # 初始化属性映射
+ for attr, matchAgainst in self.attrs.items(): # 遍历属性
+ if not markupAttrMap: # 如果没有属性映射
+ if hasattr(markupAttrs, 'get'): # 如果有get方法
+ markupAttrMap = markupAttrs # 设置属性映射
else:
- markupAttrMap = {}
- for k,v in markupAttrs:
+ markupAttrMap = {} # 初始化属性映射
+ for k,v in markupAttrs: # 复制属性
markupAttrMap[k] = v
- attrValue = markupAttrMap.get(attr)
- if not self._matches(attrValue, matchAgainst):
- match = False
+ attrValue = markupAttrMap.get(attr) # 获取属性值
+ if not self._matches(attrValue, matchAgainst): # 如果属性不匹配
+ match = False # 设置不匹配
break
- if match:
- if markup:
- found = markup
+ if match: # 如果匹配成功
+ if markup: # 如果是Tag对象
+ found = markup # 设置找到的元素
else:
- found = markupName
- return found
+ found = markupName # 设置找到的元素
+ return found # 返回找到的元素
def search(self, markup):
+ # 打印寻找信息
#print 'looking for %s in %s' % (self, markup)
- found = None
- # If given a list of items, scan it for a text element that
- # matches.
+ found = None # 初始化找到的元素
+ # 如果给定的是一个元素列表,扫描它以找到匹配的文本元素
if hasattr(markup, "__iter__") \
and not isinstance(markup, Tag):
- for element in markup:
+ for element in markup: # 遍历元素
if isinstance(element, NavigableString) \
- and self.search(element):
- found = element
+ and self.search(element): # 如果是可导航字符串并且匹配
+ found = element # 设置找到的元素
break
- # If it's a Tag, make sure its name or attributes match.
- # Don't bother with Tags if we're searching for text.
+ # 如果它是一个Tag,确保它的名称或属性匹配
+ # 如果我们正在寻找文本,就不要麻烦处理Tags
elif isinstance(markup, Tag):
- if not self.text:
- found = self.searchTag(markup)
- # If it's text, make sure the text matches.
+ if not self.text: # 如果不是寻找文本
+ found = self.searchTag(markup) # 搜索标签
+ # 如果它是文本,确保文本匹配
elif isinstance(markup, NavigableString) or \
isinstance(markup, basestring):
- if self._matches(markup, self.text):
- found = markup
+ if self._matches(markup, self.text): # 如果文本匹配
+ found = markup # 设置找到的元素
else:
raise Exception("I don't know how to match against a %s" \
- % markup.__class__)
- return found
+ % markup.__class__) # 抛出异常
+ return found # 返回找到的元素
def _matches(self, markup, matchAgainst):
+ # 打印匹配信息
#print "Matching %s against %s" % (markup, matchAgainst)
- result = False
- if matchAgainst is True:
- result = markup is not None
- elif callable(matchAgainst):
- result = matchAgainst(markup)
+ result = False # 初始化匹配结果
+ if matchAgainst is True: # 如果匹配条件是True
+ result = markup is not None # 只要markup不是None就匹配
+ elif callable(matchAgainst): # 如果匹配条件是可调用的
+ result = matchAgainst(markup) # 调用函数
else:
- #Custom match methods take the tag as an argument, but all
- #other ways of matching match the tag name as a string.
- if isinstance(markup, Tag):
- markup = markup.name
- if markup and not isinstance(markup, basestring):
- markup = text_type(markup)
- #Now we know that chunk is either a string, or None.
- if hasattr(matchAgainst, 'match'):
- # It's a regexp object.
- result = markup and matchAgainst.search(markup)
- elif hasattr(matchAgainst, '__iter__'): # list-like
- result = markup in matchAgainst
- elif hasattr(matchAgainst, 'items'):
- result = markup.has_key(matchAgainst)
- elif matchAgainst and isinstance(markup, basestring):
- if isinstance(markup, text_type):
- matchAgainst = text_type(matchAgainst)
+ # 自定义匹配方法接受标签作为参数,但所有其他匹配方式都匹配标签名称作为字符串
+ if isinstance(markup, Tag): # 如果是Tag对象
+ markup = markup.name # 获取标签名
+ if markup and not isinstance(markup, basestring): # 如果markup不是字符串
+ markup = text_type(markup) # 转换为字符串
+ # markup is now either a string or a falsy value such as None.
+ if hasattr(matchAgainst, 'match'): # 如果是正则表达式对象
+ result = markup and matchAgainst.search(markup) # 搜索匹配
+ elif hasattr(matchAgainst, '__iter__'): # 如果是可迭代的
+ result = markup in matchAgainst # 是否在其中
+ elif hasattr(matchAgainst, 'items'): # 如果有items方法
+ result = markup.has_key(matchAgainst) # 是否有键
+ elif matchAgainst and isinstance(markup, basestring): # 如果都是字符串
+ if isinstance(markup, text_type): # 如果是unicode
+ matchAgainst = text_type(matchAgainst) # 转换为unicode
else:
- matchAgainst = str(matchAgainst)
+ matchAgainst = str(matchAgainst) # 转换为字符串
+
+ if not result: # 如果不匹配
+ result = matchAgainst == markup # 比较是否相等
+ return result # 返回匹配结果
- if not result:
- result = matchAgainst == markup
- return result
class ResultSet(list):
- """A ResultSet is just a list that keeps track of the SoupStrainer
- that created it."""
+ """ResultSet是一个特殊的列表,它记录了创建它的SoupStrainer。"""
+
def __init__(self, source):
- list.__init__([])
- self.source = source
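+ list.__init__([]) # NOTE(review): initializes a throwaway empty list, not self; the ResultSet itself simply starts out empty.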
+ self.source = source # 记录创建ResultSet的SoupStrainer
+
-# Now, some helper functions.
+# 以下是一些辅助函数。
def buildTagMap(default, *args):
- """Turns a list of maps, lists, or scalars into a single map.
- Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
- NESTING_RESET_TAGS maps out of lists and partial maps."""
- built = {}
- for portion in args:
- if hasattr(portion, 'items'):
- #It's a map. Merge it.
- for k,v in portion.items():
+ """将多个映射、列表或标量转换为一个单一的映射。
+ 用于构建SELF_CLOSING_TAGS、NESTABLE_TAGS和NESTING_RESET_TAGS映射。"""
+
+ built = {} # 初始化空字典
+ for portion in args: # 遍历传入的参数
+ if hasattr(portion, 'items'): # 如果参数是映射
+ # 合并映射
+ for k, v in portion.items():
built[k] = v
- elif hasattr(portion, '__iter__'): # is a list
- #It's a list. Map each item to the default.
+ elif hasattr(portion, '__iter__'): # 如果参数是列表
+ # 将列表中的每个项映射到默认值
for k in portion:
built[k] = default
else:
- #It's a scalar. Map it to the default.
+ # 如果参数是标量,将其映射到默认值
built[portion] = default
- return built
+ return built # 返回构建的映射
# Now, the parser classes.
class BeautifulStoneSoup(Tag, sgmllib.SGMLParser):
+ """这个类包含了基本的解析器和搜索代码。它定义了一个解析器,除了以下规则外,对标签行为一无所知:
- """This class contains the basic parser and search code. It defines
- a parser that knows nothing about tag behavior except for the
- following:
+ 你不能在不关闭它所包含的所有标签的情况下关闭一个标签。
+ 也就是说,"
(No space between name of closing tag and tag close)
- (Extraneous whitespace in declaration)
+
(闭合标签名称和标签关闭之间没有空格)
+ (声明中的多余空白)
- You can pass in a custom list of (RE object, replace method)
- tuples to get Beautiful Soup to scrub your input the way you
- want."""
+ 您可以传递自定义的(RE对象,替换方法)元组列表,让BeautifulSoup按照您想要的方式清理您的输入。"""
self.parseOnlyThese = parseOnlyThese
self.fromEncoding = fromEncoding
self.smartQuotesTo = smartQuotesTo
self.convertEntities = convertEntities
- # Set the rules for how we'll deal with the entities we
- # encounter
+ # 设置我们将如何处理我们遇到的实体的规则
if self.convertEntities:
- # It doesn't make sense to convert encoded characters to
- # entities even while you're converting entities to Unicode.
- # Just convert it all to Unicode.
+ # 将编码字符转换为实体是没有意义的,即使在您正在将实体转换为Unicode时也是如此。
+ # 将所有内容都转换为Unicode。
self.smartQuotesTo = None
if convertEntities == self.HTML_ENTITIES:
self.convertXMLEntities = False
@@ -1177,7 +1204,7 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser):
self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
sgmllib.SGMLParser.__init__(self)
- if hasattr(markup, 'read'): # It's a file-type object.
+ if hasattr(markup, 'read'): # 它是一个文件类型对象。
markup = markup.read()
self.markup = markup
self.markupMassage = markupMassage
@@ -1185,20 +1212,20 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser):
self._feed(isHTML=isHTML)
except StopParsing:
pass
- self.markup = None # The markup can now be GCed
+ self.markup = None # Drop the reference so the markup can be garbage-collected.
def convert_charref(self, name):
- """This method fixes a bug in Python's SGMLParser."""
+ """这个方法修复了Python的SGMLParser中的一个bug。"""
try:
n = int(name)
except ValueError:
return
- if not 0 <= n <= 127 : # ASCII ends at 127, not 255
+ if not 0 <= n <= 127 : # ASCII在127结束,不是255
return
return self.convert_codepoint(n)
def _feed(self, inDocumentEncoding=None, isHTML=False):
- # Convert the document to Unicode.
+ # 将文档转换为Unicode。
markup = self.markup
if isinstance(markup, text_type):
if not hasattr(self, 'originalEncoding'):
@@ -1216,148 +1243,163 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser):
self.markupMassage = self.MARKUP_MASSAGE
for fix, m in self.markupMassage:
markup = fix.sub(m, markup)
- # TODO: We get rid of markupMassage so that the
- # soup object can be deepcopied later on. Some
- # Python installations can't copy regexes. If anyone
- # was relying on the existence of markupMassage, this
- # might cause problems.
+ # TODO: 我们摆脱markupMassage,以便soup对象可以稍后被深度复制。
+ # 一些Python安装无法复制正则表达式。如果有人依赖markupMassage的存在,这可能会导致问题。
del(self.markupMassage)
self.reset()
sgmllib.SGMLParser.feed(self, markup)
- # Close out any unfinished strings and close all the open tags.
+ # 关闭任何未完成的字符串并关闭所有打开的标签。
self.endData()
while self.currentTag.name != self.ROOT_TAG_NAME:
self.popTag()
def __getattr__(self, methodName):
- """This method routes method call requests to either the SGMLParser
- superclass or the Tag superclass, depending on the method name."""
- #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
+ """这个方法将方法调用请求路由到SGMLParser超类或Tag超类,具体取决于方法名。"""
+ # 打印出被调用的方法名
+ # print "__getattr__ called on %s.%s" % (self.__class__, methodName)
if methodName.startswith('start_') or methodName.startswith('end_') \
- or methodName.startswith('do_'):
+ or methodName.startswith('do_'):
+ # 如果方法是SGMLParser的方法,则从SGMLParser中获取
return sgmllib.SGMLParser.__getattr__(self, methodName)
elif not methodName.startswith('__'):
+ # 否则,如果方法不是特殊方法,则从Tag中获取
return Tag.__getattr__(self, methodName)
else:
+ # 如果方法是特殊方法,则抛出属性错误
raise AttributeError
def isSelfClosingTag(self, name):
- """Returns true iff the given string is the name of a
- self-closing tag according to this parser."""
+ """返回true,当且仅当给定的字符串是此解析器中自闭合标签的名称。"""
+ # 检查标签是否是自闭合标签
return name in self.SELF_CLOSING_TAGS \
- or name in self.instanceSelfClosingTags
+ or name in self.instanceSelfClosingTags
def reset(self):
+ # 重置Tag对象,并初始化ROOT_TAG_NAME
Tag.__init__(self, self, self.ROOT_TAG_NAME)
self.hidden = 1
+ # 重置SGMLParser对象
sgmllib.SGMLParser.reset(self)
- self.currentData = []
- self.currentTag = None
- self.tagStack = []
- self.quoteStack = []
+ self.currentData = [] # 存储当前数据
+ self.currentTag = None # 当前标签
+ self.tagStack = [] # 标签堆栈
+ self.quoteStack = [] # 引号堆栈
+ # 将ROOT_TAG_NAME推入标签堆栈
self.pushTag(self)
def popTag(self):
+ # 从标签堆栈中弹出一个标签
tag = self.tagStack.pop()
-
- #print "Pop", tag.name
if self.tagStack:
- self.currentTag = self.tagStack[-1]
+ self.currentTag = self.tagStack[-1] # 更新当前标签
return self.currentTag
def pushTag(self, tag):
- #print "Push", tag.name
+ # 将一个标签推入标签堆栈
if self.currentTag:
- self.currentTag.contents.append(tag)
- self.tagStack.append(tag)
- self.currentTag = self.tagStack[-1]
+ self.currentTag.contents.append(tag) # 将标签添加到当前标签的内容中
+ self.tagStack.append(tag) # 推入堆栈
+ self.currentTag = self.tagStack[-1] # 更新当前标签
def endData(self, containerClass=NavigableString):
+ # 结束当前数据的处理
if self.currentData:
- currentData = u''.join(self.currentData)
+ currentData = u''.join(self.currentData) # 合并当前数据
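+ # If the text is nothing but ASCII whitespace and no tag on the stack is in PRESERVE_WHITESPACE_TAGS, collapse it to one '\n' (if it contained a newline) or one ' '.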
if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
- not set([tag.name for tag in self.tagStack]).intersection(
- self.PRESERVE_WHITESPACE_TAGS)):
+ not set([tag.name for tag in self.tagStack]).intersection(
+ self.PRESERVE_WHITESPACE_TAGS)):
if '\n' in currentData:
currentData = '\n'
else:
currentData = ' '
self.currentData = []
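+ # With parseOnlyThese set and at most the root tag on the stack (len(tagStack) <= 1), drop this text unless the strainer has a text criterion that matches it.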
if self.parseOnlyThese and len(self.tagStack) <= 1 and \
- (not self.parseOnlyThese.text or \
- not self.parseOnlyThese.search(currentData)):
+ (not self.parseOnlyThese.text or \
+ not self.parseOnlyThese.search(currentData)):
return
- o = containerClass(currentData)
- o.setup(self.currentTag, self.previous)
+ o = containerClass(currentData) # 创建一个新的NavigableString对象
+ o.setup(self.currentTag, self.previous) # 设置对象
if self.previous:
- self.previous.next = o
- self.previous = o
- self.currentTag.contents.append(o)
-
+ self.previous.next = o # 设置前一个对象的下一个对象
+ self.previous = o # 更新前一个对象
+ self.currentTag.contents.append(o) # 将对象添加到当前标签的内容中
def _popToTag(self, name, inclusivePop=True):
- """Pops the tag stack up to and including the most recent
- instance of the given tag. If inclusivePop is false, pops the tag
- stack up to but *not* including the most recent instqance of
- the given tag."""
- #print "Popping to %s" % name
+ """弹出标签堆栈直到并包括最近的给定标签。如果inclusivePop为false,则弹出标签堆栈直到但不包括最近的给定标签。"""
+ # 打印出正在弹出到的标签名
+ # print "Popping to %s" % name
if name == self.ROOT_TAG_NAME:
return
numPops = 0
mostRecentTag = None
- for i in xrange(len(self.tagStack)-1, 0, -1):
+ # 从后向前查找给定标签的位置
+ for i in xrange(len(self.tagStack) - 1, 0, -1):
if name == self.tagStack[i].name:
- numPops = len(self.tagStack)-i
+ numPops = len(self.tagStack) - i
break
if not inclusivePop:
numPops = numPops - 1
+ # 弹出标签
for i in xrange(0, numPops):
mostRecentTag = self.popTag()
return mostRecentTag
def _smartPop(self, name):
-
- """We need to pop up to the previous tag of this type, unless
- one of this tag's nesting reset triggers comes between this
- tag and the previous tag of this type, OR unless this tag is a
- generic nesting trigger and another generic nesting trigger
- comes between this tag and the previous tag of this type.
-
- Examples:
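- <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
- <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
- <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
-
- <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
- <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
- <td><tr><td> *<td>* should pop to 'tr', not the first 'td'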
- """
+ """我们需要弹出到这种类型的前一个标签,除非在当前标签和这种类型的前一个标签之间出现了这种标签的嵌套重置触发器,
+ 或者除非这个标签是一个通用嵌套触发器,并且在这个标签和这种类型的前一个标签之间出现了另一个通用嵌套触发器。
+
+ 例子:
+ <p>Foo<b>Bar *<p>*
+ 应该弹出到
+ 'p',而不是
+ 'b'。
+ <p>Foo<table>Bar *<p>*
+ 应该弹出到
+ 'table',而不是
+ 'p'。
+ <p>Foo<table><tr>Bar *<p>*
+ 应该弹出到
+ 'tr',而不是
+ 'p'。
+
+ <li><ul><li> *<li>*
+ 应该弹出到
+ 'ul',而不是第一个
+ 'li'。
+ <tr><table><tr> *<tr>*
+ 应该弹出到
+ 'table',而不是第一个
+ 'tr'
+ <td><tr><td> *<td>*
+ 应该弹出到
+ 'tr',而不是第一个
+ 'td'
+
+ """
nestingResetTriggers = self.NESTABLE_TAGS.get(name)
isNestable = nestingResetTriggers != None
isResetNesting = name in self.RESET_NESTING_TAGS
popTo = None
inclusive = True
- for i in xrange(len(self.tagStack)-1, 0, -1):
+ for i in xrange(len(self.tagStack) - 1, 0, -1):
p = self.tagStack[i]
if (not p or p.name == name) and not isNestable:
- #Non-nestable tags get popped to the top or to their
- #last occurance.
+ # 非嵌套标签被弹出到顶部或它们的最后一次出现。
popTo = name
break
if (nestingResetTriggers is not None
and p.name in nestingResetTriggers) \
- or (nestingResetTriggers is None and isResetNesting
- and p.name in self.RESET_NESTING_TAGS):
-
- #If we encounter one of the nesting reset triggers
- #peculiar to this tag, or we encounter another tag
- #that causes nesting to reset, pop up to but not
- #including that tag.
+ or (nestingResetTriggers is None and isResetNesting
+ and p.name in self.RESET_NESTING_TAGS):
+ # 如果我们遇到了这个标签特有的一个嵌套重置触发器,或者我们遇到了另一个导致嵌套重置的标签,
+ # 弹出到但不包括那个标签。
popTo = p.name
inclusive = False
break
@@ -1365,11 +1407,14 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser):
if popTo:
self._popToTag(popTo, inclusive)
+
def unknown_starttag(self, name, attrs, selfClosing=0):
- #print "Start tag %s: %s" % (name, attrs)
+ # 打印开始标签信息
+ # print "Start tag %s: %s" % (name, attrs)
if self.quoteStack:
- #This is not a real tag.
- #print "<%s> is not real!" % name
+ # 这不是一个真正的标签。
+ # 打印信息
+ # print "<%s> is not real!" % name
attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
self.handle_data('<%s%s>' % (name, attrs))
return
@@ -1379,7 +1424,7 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser):
self._smartPop(name)
if self.parseOnlyThese and len(self.tagStack) <= 1 \
- and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
+ and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
return
tag = Tag(self, name, attrs, self.currentTag, self.previous)
@@ -1390,16 +1435,20 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser):
if selfClosing or self.isSelfClosingTag(name):
self.popTag()
if name in self.QUOTE_TAGS:
- #print "Beginning quote (%s)" % name
+ # 打印开始引用信息
+ # print "Beginning quote (%s)" % name
self.quoteStack.append(name)
self.literal = 1
return tag
+
def unknown_endtag(self, name):
- #print "End tag %s" % name
+ # 打印结束标签信息
+ # print "End tag %s" % name
if self.quoteStack and self.quoteStack[-1] != name:
- #This is not a real end tag.
- #print "</%s> is not real!" % name
+ # 这不是一个真正的结束标签。
+ # 打印信息
+ # print "</%s> is not real!" % name
self.handle_data('</%s>' % name)
return
self.endData()
@@ -1409,149 +1458,104 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser):
self.literal = (len(self.quoteStack) > 0)
def handle_data(self, data):
+ """将数据添加到当前数据列表中。"""
self.currentData.append(data)
def _toStringSubclass(self, text, subclass):
- """Adds a certain piece of text to the tree as a NavigableString
- subclass."""
- self.endData()
- self.handle_data(text)
- self.endData(subclass)
+ """将特定文本作为NavigableString子类添加到树中。"""
+ self.endData() # 结束当前数据的处理
+ self.handle_data(text) # 处理文本
+ self.endData(subclass) # 结束处理并指定子类
def handle_pi(self, text):
- """Handle a processing instruction as a ProcessingInstruction
- object, possibly one with a %SOUP-ENCODING% slot into which an
- encoding will be plugged later."""
+ """将处理指令作为ProcessingInstruction对象处理,可能带有一个 %SOUP-ENCODING% 插槽,稍后将插入编码。"""
if text[:3] == "xml":
text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
self._toStringSubclass(text, ProcessingInstruction)
def handle_comment(self, text):
- "Handle comments as Comment objects."
+ """将注释作为Comment对象处理。"""
self._toStringSubclass(text, Comment)
def handle_charref(self, ref):
- "Handle character references as data."
+ """将字符引用作为数据处理。"""
if self.convertEntities:
- data = unichr(int(ref))
+ data = unichr(int(ref)) # 转换为Unicode字符
else:
- data = '&#%s;' % ref
+ data = '&#%s;' % ref # 保持为实体引用
self.handle_data(data)
def handle_entityref(self, ref):
- """Handle entity references as data, possibly converting known
- HTML and/or XML entity references to the corresponding Unicode
- characters."""
+ """将实体引用作为数据处理,可能将已知的HTML和 / 或XML实体引用转换为相应的Unicode字符。"""
data = None
if self.convertHTMLEntities:
try:
- data = unichr(name2codepoint[ref])
+ data = unichr(name2codepoint[ref]) # 尝试转换为Unicode字符
except KeyError:
pass
if not data and self.convertXMLEntities:
- data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
+ data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) # 获取特殊字符
if not data and self.convertHTMLEntities and \
- not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
- # TODO: We've got a problem here. We're told this is
- # an entity reference, but it's not an XML entity
- # reference or an HTML entity reference. Nonetheless,
- # the logical thing to do is to pass it through as an
- # unrecognized entity reference.
- #
- # Except: when the input is "&carol;" this function
- # will be called with input "carol". When the input is
- # "AT&T", this function will be called with input
- # "T". We have no way of knowing whether a semicolon
- # was present originally, so we don't know whether
- # this is an unknown entity or just a misplaced
- # ampersand.
- #
- # The more common case is a misplaced ampersand, so I
- # escape the ampersand and omit the trailing semicolon.
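- data = "&amp;%s" % ref
+ not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
+ # Unknown entity: most likely a misplaced ampersand (e.g. "AT&T"), so escape the ampersand and omit the trailing semicolon.
+ data = "&amp;%s" % ref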
if not data:
- # This case is different from the one above, because we
- # haven't already gone through a supposedly comprehensive
- # mapping of entities to Unicode characters. We might not
- # have gone through any mapping at all. So the chances are
- # very high that this is a real entity, and not a
- # misplaced ampersand.
+ # 处理真实实体
data = "&%s;" % ref
self.handle_data(data)
def handle_decl(self, data):
- "Handle DOCTYPEs and the like as Declaration objects."
+ """将DOCTYPE等声明作为Declaration对象处理。"""
self._toStringSubclass(data, Declaration)
def parse_declaration(self, i):
- """Treat a bogus SGML declaration as raw data. Treat a CDATA
- declaration as a CData object."""
+ """将无效的SGML声明视为原始数据。将CDATA声明视为CData对象。"""
j = None
- if self.rawdata[i:i+9] == '<![CDATA[':
- k = self.rawdata.find(']]>', i)
- if k == -1:
- k = len(self.rawdata)
- data = self.rawdata[i+9:k]
- j = k+3
- self._toStringSubclass(data, CData)
+ if self.rawdata[i:i + 9] == '<![CDATA[':
+ k = self.rawdata.find(']]>', i) # 寻找CDATA结束标志
+ if k == -1:
+ k = len(self.rawdata)
+ data = self.rawdata[i + 9:k] # 获取CDATA中的数据
+ j = k + 3
+ self._toStringSubclass(data, CData) # 处理CDATA数据
else:
try:
- j = sgmllib.SGMLParser.parse_declaration(self, i)
+ j = sgmllib.SGMLParser.parse_declaration(self, i) # 处理SGML声明
except sgmllib.SGMLParseError:
- toHandle = self.rawdata[i:]
- self.handle_data(toHandle)
+ toHandle = self.rawdata[i:] # 获取错误后的数据
+ self.handle_data(toHandle) # 处理数据
j = i + len(toHandle)
return j
class BeautifulSoup(BeautifulStoneSoup):
+ """这个解析器了解HTML的一些事实:
- """This parser knows the following facts about HTML:
+ * 有些标签没有闭合标签,并且应该被解释为一旦遇到就立即关闭。
- * Some tags have no closing tag and should be interpreted as being
- closed as soon as they are encountered.
+ * 某些标签内的文本(例如'script')可能包含标签,这些标签实际上不是文档的一部分,应该被解析为文本,而不是标签。如果你想将文本作为标签解析,你可以随时获取它并显式地解析。
- * The text inside some tags (ie. 'script') may contain tags which
- are not really part of the document and which should be parsed
- as text, not tags. If you want to parse the text as tags, you can
- always fetch it and parse it explicitly.
+ * 标签嵌套规则:
- * Tag nesting rules:
+ 大多数标签根本无法嵌套。例如, 标签的出现应该隐式地关闭前一个 标签。 - Most tags can't be nested at all. For instance, the occurance of - a tag should implicitly close the previous tag. + Para1 Para2应该被转换为: Para1 Para2 - Para1 Para2 - should be transformed into: - Para1 Para2 + 有些标签可以任意嵌套。例如, 标签的出现不应该隐式地关闭前一个标签。 - Some tags can be nested arbitrarily. For instance, the occurance - of atag should _not_ implicitly close the previous -tag. + Alice said:Bob said:Blah不应该被转换为: + Alice said:Bob said:Blah - Alice said:Bob said:Blah - should NOT be transformed into: - Alice said:Bob said:Blah + 有些标签可以嵌套,但是嵌套被其他标签的介入重置。例如, |