diff --git a/src/sqlmap-master/thirdparty/ansistrm/ansistrm.py b/src/sqlmap-master/thirdparty/ansistrm/ansistrm.py index 4d9731c..6c88776 100644 --- a/src/sqlmap-master/thirdparty/ansistrm/ansistrm.py +++ b/src/sqlmap-master/thirdparty/ansistrm/ansistrm.py @@ -7,7 +7,7 @@ import logging import re import sys -from lib.core.settings import IS_WIN +from lib.core.settings import IS_WIN # 导入一个设置,用于判断是否在Windows系统上运行 if IS_WIN: import ctypes @@ -16,14 +16,15 @@ if IS_WIN: # Reference: https://gist.github.com/vsajip/758430 # https://github.com/ipython/ipython/issues/4252 # https://msdn.microsoft.com/en-us/library/windows/desktop/ms686047%28v=vs.85%29.aspx + # 设置Windows API函数SetConsoleTextAttribute的参数和返回值类型 ctypes.windll.kernel32.SetConsoleTextAttribute.argtypes = [ctypes.wintypes.HANDLE, ctypes.wintypes.WORD] ctypes.windll.kernel32.SetConsoleTextAttribute.restype = ctypes.wintypes.BOOL -def stdoutEncode(data): # Cross-referenced function +def stdoutEncode(data): # 用于编码标准输出数据的函数 return data class ColorizingStreamHandler(logging.StreamHandler): - # color names to indices + # 定义颜色名称到索引的映射 color_map = { 'black': 0, 'red': 1, @@ -35,7 +36,7 @@ class ColorizingStreamHandler(logging.StreamHandler): 'white': 7, } - # levels to (background, foreground, bold/intense) + # 定义日志级别到颜色和样式的映射 level_map = { logging.DEBUG: (None, 'blue', False), logging.INFO: (None, 'green', False), @@ -43,25 +44,30 @@ class ColorizingStreamHandler(logging.StreamHandler): logging.ERROR: (None, 'red', False), logging.CRITICAL: ('red', 'white', False) } - csi = '\x1b[' - reset = '\x1b[0m' - bold = "\x1b[1m" - disable_coloring = False + csi = '\x1b[' # ANSI转义序列的前缀 + reset = '\x1b[0m' # ANSI重置颜色的转义序列 + bold = "\x1b[1m" # ANSI加粗的转义序列 + disable_coloring = False # 是否禁用颜色 @property def is_tty(self): + # 检查流是否是终端 isatty = getattr(self.stream, 'isatty', None) return isatty and isatty() and not self.disable_coloring def emit(self, record): + # 发送日志记录 try: message = stdoutEncode(self.format(record)) stream = 
self.stream + #如果当前流不是TTY,直接写入消息 + if not self.is_tty: if message and message[0] == "\r": message = message[1:] stream.write(message) + #如果是TTY,调用output_colorized方法来输出带颜色的消息 else: self.output_colorized(message) stream.write(getattr(self, 'terminator', '\n')) @@ -70,15 +76,19 @@ class ColorizingStreamHandler(logging.StreamHandler): except (KeyboardInterrupt, SystemExit): raise except IOError: + #IO错误时,什么也不做(pass) pass except: + #其他异常时,调用handleError方法 self.handleError(record) if not IS_WIN: def output_colorized(self, message): + # 如果不是Windows系统,直接写入消息 self.stream.write(message) else: ansi_esc = re.compile(r'\x1b\[((?:\d+)(?:;(?:\d+))*)m') + # 正则表达式,用于匹配ANSI转义序列 nt_color_map = { 0: 0x00, # black @@ -92,26 +102,32 @@ class ColorizingStreamHandler(logging.StreamHandler): } def output_colorized(self, message): + # 如果是Windows系统,解析ANSI转义序列并设置控制台颜色 parts = self.ansi_esc.split(message) h = None fd = getattr(self.stream, 'fileno', None) + #文件描述符有效,并且是标准输出或标准错误,获取对应的Windows句柄 if fd is not None: fd = fd() if fd in (1, 2): # stdout or stderr h = ctypes.windll.kernel32.GetStdHandle(-10 - fd) + #循环处理分割后的消息部分 while parts: text = parts.pop(0) + #如果部分是文本,写入并刷新流 if text: self.stream.write(text) self.stream.flush() + #如果还有部分,取出下一个部分作为参数 if parts: params = parts.pop(0) + #如果句柄有效,将参数分割并转换为整数,初始化颜色代码 if h is not None: params = [int(p) for p in params.split(';')] color = 0 @@ -131,9 +147,12 @@ class ColorizingStreamHandler(logging.StreamHandler): ctypes.windll.kernel32.SetConsoleTextAttribute(h, color) def _reset(self, message): + #重置消息的颜色 if not message.endswith(self.reset): + # 如果消息不以重置序列结尾,则添加重置序列 reset = self.reset - elif self.bold in message: # bold + elif self.bold in message: + # 如果消息包含加粗,则在重置后加粗 reset = self.reset + self.bold else: reset = self.reset @@ -141,19 +160,23 @@ class ColorizingStreamHandler(logging.StreamHandler): return reset def colorize(self, message, levelno): + # 根据日志级别给消息上色 if levelno in self.level_map and self.is_tty: bg, fg, bold = self.level_map[levelno] params 
= [] + #如果背景色有效,添加背景色参数 if bg in self.color_map: params.append(str(self.color_map[bg] + 40)) + #如果前景色有效,添加前景色参数 if fg in self.color_map: params.append(str(self.color_map[fg] + 30)) - + #如果需要加粗,添加加粗参数 if bold: params.append('1') + #如果参数和消息都有效,检查消息是否有前缀(空格),并提取出来 if params and message: if message.lstrip() != message: prefix = re.search(r"\s+", message).group(0) @@ -167,5 +190,6 @@ class ColorizingStreamHandler(logging.StreamHandler): return message def format(self, record): + # 格式化日志记录 message = logging.StreamHandler.format(self, record) - return self.colorize(message, record.levelno) + return self.colorize(message, record.levelno) \ No newline at end of file diff --git a/src/sqlmap-master/thirdparty/beautifulsoup/beautifulsoup.py b/src/sqlmap-master/thirdparty/beautifulsoup/beautifulsoup.py index 7401def..e51cb22 100644 --- a/src/sqlmap-master/thirdparty/beautifulsoup/beautifulsoup.py +++ b/src/sqlmap-master/thirdparty/beautifulsoup/beautifulsoup.py @@ -79,77 +79,103 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. 
from __future__ import generators from __future__ import print_function +# Enable newer language features when running on older Python 2 interpreters. +# generators: enables generators (yield) on Python 2.2; has no effect on later versions +# print_function: makes print a function as in Python 3.x (available since Python 2.6) + __author__ = "Leonard Richardson (leonardr@segfault.org)" __version__ = "3.2.1b" __copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" __license__ = "New-style BSD" +# 定义库的作者、版本、版权和许可证信息 + import codecs import re import sys +# 导入 Python 标准库中的模块 + if sys.version_info >= (3, 0): + # Python 3.x 版本兼容处理 xrange = range text_type = str binary_type = bytes basestring = str unichr = chr else: + # Python 2.x 版本兼容处理 text_type = unicode binary_type = str try: + # 尝试从 Python 3.x 的标准库中导入 name2codepoint from html.entities import name2codepoint except ImportError: + # 如果导入失败(在 Python 2.x 中),则从 htmlentitydefs 模块导入 from htmlentitydefs import name2codepoint try: + # Probe for the built-in set type (built in since Python 2.4) set except NameError: + # No built-in set on this interpreter: fall back to sets.Set from sets import Set as set try: + # Try the standard-library sgmllib (Python 2.x only; removed in Python 3) import sgmllib except ImportError: + # If that fails (on Python 3.x), use the copy bundled under lib.utils from lib.utils import sgmllib try: + # Try markupbase (the module's Python 2.x name) import markupbase except ImportError: + # If that fails (on Python 3.x), the module is named _markupbase import _markupbase as markupbase -#These hacks make Beautiful Soup able to parse XML with namespaces +# 这些 hack 使得 Beautiful Soup 能够解析带有命名空间的 XML sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match DEFAULT_OUTPUT_ENCODING = "utf-8" +# 设置默认的输出编码为 UTF-8 + def _match_css_class(str): +#构建一个正则表达式,以便匹配给定的CSS类名 """Build a RE to match the given CSS class.""" return re.compile(r"(^|.*\s)%s($|\s)" % str) # First, the classes that represent markup elements.
class PageElement(object): +#作为页面元素的基类,包含导航信息,可以是标签或文本 """Contains the navigational information for some part of the page (either a tag or a piece of text)""" def _invert(h): + #创建一个新的字典,将原字典的键值对颠倒 "Cheap function to invert a hash." i = {} for k,v in h.items(): i[v] = k return i + #将XML实体映射到它们对应的特殊字符 XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", "quot" : '"', "amp" : "&", "lt" : "<", "gt" : ">" } + #创建一个新的字典,将特殊字符映射回XML实体 XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) def setup(self, parent=None, previous=None): + #初始化元素之间的关系,包括父元素、前一个元素、后一个元素以及兄弟元素 """Sets up the initial relations between this element and other elements.""" self.parent = parent @@ -162,6 +188,7 @@ class PageElement(object): self.previousSibling.nextSibling = self def replaceWith(self, replaceWith): + #首先记录当前元素的父元素和索引位置,然后提取当前元素,最后在原来的位置插入新元素 oldParent = self.parent myIndex = self.parent.index(self) if hasattr(replaceWith, "parent")\ @@ -177,6 +204,7 @@ class PageElement(object): oldParent.insert(myIndex, replaceWith) def replaceWithChildren(self): + #首先提取当前元素,然后将子元素逆序插入到父元素中 myParent = self.parent myIndex = self.parent.index(self) self.extract() @@ -186,6 +214,7 @@ class PageElement(object): myParent.insert(myIndex, child) def extract(self): + #从树中提取(移除)当前元素 """Destructively rips this element out of the tree.""" if self.parent: try: @@ -199,6 +228,7 @@ class PageElement(object): lastChild = self._lastRecursiveChild() nextElement = lastChild.next + #更新previous和next指针,以保持元素之间的正确连接 if self.previous: self.previous.next = nextElement if nextElement: @@ -206,6 +236,7 @@ class PageElement(object): self.previous = None lastChild.next = None + #清除当前元素的所有关系,包括父元素和兄弟元素,并返回当前元素 self.parent = None if self.previousSibling: self.previousSibling.nextSibling = self.nextSibling @@ -215,6 +246,7 @@ class PageElement(object): return self def _lastRecursiveChild(self): + #找到当前元素下最后一个被解析的子元素 "Finds the last element beneath this object to be parsed." 
lastChild = self while hasattr(lastChild, 'contents') and lastChild.contents: @@ -222,6 +254,7 @@ class PageElement(object): return lastChild def insert(self, position, newChild): + #在当前元素的内容列表中的指定位置插入一个新的子元素 if isinstance(newChild, basestring) \ and not isinstance(newChild, NavigableString): newChild = NavigableString(newChild) @@ -281,29 +314,31 @@ class PageElement(object): self.contents.insert(position, newChild) def append(self, tag): + #将给定的标签追加到当前元素的内容列表的末尾 """Appends the given tag to the contents of this tag.""" self.insert(len(self.contents), tag) def findNext(self, name=None, attrs={}, text=None, **kwargs): + #查找文档中当前标签之后第一个匹配给定条件的元素 """Returns the first item that matches the given criteria and appears after this Tag in the document.""" return self._findOne(self.findAllNext, name, attrs, text, **kwargs) - def findAllNext(self, name=None, attrs={}, text=None, limit=None, - **kwargs): + def findAllNext(self, name=None, attrs={}, text=None, limit=None, **kwargs): + #查找文档中当前标签之后所有匹配给定条件的元素 """Returns all items that match the given criteria and appear after this Tag in the document.""" - return self._findAll(name, attrs, text, limit, self.nextGenerator, - **kwargs) + return self._findAll(name, attrs, text, limit, self.nextGenerator, **kwargs) def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): + #查找文档中当前标签之后第一个匹配给定条件的兄弟标签 """Returns the closest sibling to this Tag that matches the given criteria and appears after this Tag in the document.""" return self._findOne(self.findNextSiblings, name, attrs, text, **kwargs) - def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, - **kwargs): + def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs): + #查找文档中当前标签之后所有匹配给定条件的兄弟标签 """Returns the siblings of this Tag that match the given criteria and appear after this Tag in the document.""" return self._findAll(name, attrs, text, limit, @@ -311,6 +346,7 @@ class PageElement(object): fetchNextSiblings = 
findNextSiblings # Compatibility with pre-3.x def findPrevious(self, name=None, attrs={}, text=None, **kwargs): + #查找文档中当前标签之前第一个匹配给定条件的元素 """Returns the first item that matches the given criteria and appears before this Tag in the document.""" return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) @@ -324,171 +360,230 @@ class PageElement(object): fetchPrevious = findAllPrevious # Compatibility with pre-3.x def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): + # Find the closest sibling before this tag that matches the given criteria """Returns the closest sibling to this Tag that matches the given criteria and appears before this Tag in the document.""" - return self._findOne(self.findPreviousSiblings, name, attrs, text, - **kwargs) + return self._findOne(self.findPreviousSiblings, name, attrs, text, **kwargs) - def findPreviousSiblings(self, name=None, attrs={}, text=None, - limit=None, **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear before this Tag in the document.""" + + def findPreviousSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs): + """ + 返回在文档中出现在当前标签之前且符合给定条件的所有兄弟标签。 + + 参数: + name -- 要搜索的标签名称,可以是字符串、正则表达式或列表。 + attrs -- 要搜索的标签属性,可以是字典或关键字参数。 + text -- 要搜索的文本内容,可以是字符串、正则表达式或列表。 + limit -- 返回结果的数量限制。 + **kwargs -- 其他关键字参数,用于扩展搜索条件。 + + 返回: + 符合条件的兄弟标签列表。 + """ return self._findAll(name, attrs, text, limit, self.previousSiblingGenerator, **kwargs) - fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x + + + # Alias kept for compatibility with pre-3.x code, which used the name fetchPreviousSiblings + fetchPreviousSiblings = findPreviousSiblings + def findParent(self, name=None, attrs={}, **kwargs): - """Returns the closest parent of this Tag that matches the given - criteria.""" - # NOTE: We can't use _findOne because findParents takes a different - # set of arguments.
+ """ + 返回与给定条件匹配的最近的父标签。 + + 参数: + name -- 要搜索的标签名称,可以是字符串、正则表达式或列表。 + attrs -- 要搜索的标签属性,可以是字典或关键字参数。 + **kwargs -- accepted, but not forwarded to findParents by the call below. + + 返回: + 与条件匹配的最近的父标签,如果没有找到则返回None。 + """ + # 注意:我们不能使用_findOne,因为findParents接受不同的参数集。 r = None l = self.findParents(name, attrs, 1) if l: r = l[0] return r + def findParents(self, name=None, attrs={}, limit=None, **kwargs): - """Returns the parents of this Tag that match the given - criteria.""" + """返回与给定条件匹配的父标签列表。""" + return self._findAll(name, attrs, None, limit, self.parentGenerator, **kwargs) + + + # Alias kept for compatibility with pre-3.x code, which used the name fetchParents + fetchParents = findParents # 兼容旧版本 - return self._findAll(name, attrs, None, limit, self.parentGenerator, - **kwargs) - fetchParents = findParents # Compatibility with pre-3.x - #These methods do the real heavy lifting. + # 这些方法执行实际的查找操作。 def _findOne(self, method, name, attrs, text, **kwargs): + """使用指定的方法查找第一个匹配的标签。""" r = None - l = method(name, attrs, text, 1, **kwargs) + l = method(name, attrs, text, 1, **kwargs) # 调用指定的方法查找 if l: - r = l[0] + r = l[0] # 如果找到匹配项,返回第一个 return r - def _findAll(self, name, attrs, text, limit, generator, **kwargs): - "Iterates over a generator looking for things that match." + def _findAll(self, name, attrs, text, limit, generator, **kwargs): + """遍历生成器,查找所有匹配的标签。""" if isinstance(name, SoupStrainer): - strainer = name - # (Possibly) special case some findAll*(...)
searches + strainer = name # 如果name是SoupStrainer实例,直接使用 + # 特殊情况处理 elif text is None and not limit and not attrs and not kwargs: - # findAll*(True) + # findAll*(True)的情况 if name is True: - return [element for element in generator() - if isinstance(element, Tag)] - # findAll*('tag-name') + return [element for element in generator() if isinstance(element, Tag)] + # findAll*('tag-name')的情况 elif isinstance(name, basestring): - return [element for element in generator() - if isinstance(element, Tag) and - element.name == name] + return [element for element in generator() if isinstance(element, Tag) and element.name == name] else: - strainer = SoupStrainer(name, attrs, text, **kwargs) - # Build a SoupStrainer + strainer = SoupStrainer(name, attrs, text, **kwargs) # 构建SoupStrainer else: - strainer = SoupStrainer(name, attrs, text, **kwargs) - results = ResultSet(strainer) - g = generator() + strainer = SoupStrainer(name, attrs, text, **kwargs) # 构建SoupStrainer + + results = ResultSet(strainer) # 创建结果集 + g = generator() # 获取生成器 while True: try: - i = next(g) + i = next(g) # 获取下一个元素 except StopIteration: - break + break # 如果没有更多元素,退出循环 if i: - found = strainer.search(i) + found = strainer.search(i) # 使用strainer查找匹配的标签 if found: - results.append(found) + results.append(found) # 将找到的标签添加到结果集中 if limit and len(results) >= limit: - break + break # 如果达到限制,退出循环 return results - #These Generators can be used to navigate starting from both - #NavigableStrings and Tags. 
+ + # 这些生成器可用于从NavigableStrings和Tags开始导航。 def nextGenerator(self): + """生成器,遍历当前标签的下一个元素。""" i = self while i is not None: - i = i.next - yield i + i = i.next # 获取下一个元素 + yield i # 生成下一个元素 + def nextSiblingGenerator(self): + """生成器,遍历当前标签的下一个兄弟标签。""" i = self while i is not None: - i = i.nextSibling - yield i + i = i.nextSibling # 获取下一个兄弟标签 + yield i # 生成下一个兄弟标签 + def previousGenerator(self): + """生成器,遍历当前标签的前一个元素。""" i = self while i is not None: - i = i.previous - yield i + i = i.previous # 获取前一个元素 + yield i # 生成前一个元素 + def previousSiblingGenerator(self): + """生成器,遍历当前标签的前一个兄弟标签。""" i = self while i is not None: - i = i.previousSibling - yield i + i = i.previousSibling # 获取前一个兄弟标签 + yield i # 生成前一个兄弟标签 + def parentGenerator(self): + """生成器,遍历当前标签的父标签。""" i = self while i is not None: - i = i.parent - yield i + i = i.parent # 获取父标签 + yield i # 生成父标签 - # Utility methods + + # 工具方法 def substituteEncoding(self, str, encoding=None): - encoding = encoding or "utf-8" - return str.replace("%SOUP-ENCODING%", encoding) + """替换字符串中的编码占位符为指定的编码。 + + 参数: + str -- 需要进行编码替换的字符串。 + encoding -- 指定的编码,默认为"utf-8"。 + + 返回: + 替换后的字符串。 + """ + encoding = encoding or "utf-8" # 如果没有提供编码,则使用默认的"utf-8" + return str.replace("%SOUP-ENCODING%", encoding) # 替换占位符 + def toEncoding(self, s, encoding=None): - """Encodes an object to a string in some encoding, or to Unicode. 
- .""" + """将对象编码为某种编码的字符串,或转换为Unicode字符串。""" if isinstance(s, text_type): if encoding: - s = s.encode(encoding) + s = s.encode(encoding) # 如果是文本类型且指定了编码,则进行编码 elif isinstance(s, binary_type): - s = s.encode(encoding or "utf8") + s = s.encode(encoding or "utf8") # 如果是二进制类型,则使用指定编码或默认的UTF-8进行编码 else: - s = self.toEncoding(str(s), encoding or "utf8") - return s + s = self.toEncoding(str(s), encoding or "utf8") # 其他类型,先转换为字符串再进行编码 + return s # 返回编码后的字符串 + + # 用于匹配未闭合的尖括号或不完整的HTML实体的正则表达式 BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))") + def _sub_entity(self, x): - """Used with a regular expression to substitute the - appropriate XML entity for an XML special character.""" - return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" + """用于正则表达式替换,将XML特殊字符替换为相应的XML实体。""" + return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" # 返回对应的XML实体 class NavigableString(text_type, PageElement): + """ + NavigableString类是BeautifulSoup中用于处理和导航字符串的类。 + 它继承自Python的text_type(Python 3中的str类型)和PageElement,使其既可以当作字符串使用, + 也可以像页面元素一样进行导航。 + """ def __new__(cls, value): - """Create a new NavigableString. + """创建一个新的NavigableString实例。 - When unpickling a NavigableString, this method is called with - the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be - passed in to the superclass's __new__ or the superclass won't know - how to handle non-ASCII characters. + 当反序列化(unpickling)一个NavigableString时,会调用此方法, + 并且传入DEFAULT_OUTPUT_ENCODING编码的字符串。需要将这个编码传给superclass的__new__, + 否则superclass不知道如何处理非ASCII字符。 """ if isinstance(value, text_type): return text_type.__new__(cls, value) return text_type.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) def __getnewargs__(self): + """返回创建NavigableString实例时的参数。""" return (NavigableString.__str__(self),) def __getattr__(self, attr): - """text.string gives you text. 
This is for backwards - compatibility for Navigable*String, but for CData* it lets you - get the string without the CData wrapper.""" + """对于NavigableString,text.string返回的就是text本身。 + + 这是为了向后兼容Navigable*String,但对于CData*,它允许你获取没有CData包装器的字符串。 + """ if attr == 'string': return self else: - raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)) + raise AttributeError("'%s'对象没有属性'%s'" % (self.__class__.__name__, attr)) def __unicode__(self): + """返回NavigableString的Unicode表示。""" return str(self).decode(DEFAULT_OUTPUT_ENCODING) def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - # Substitute outgoing XML entities. + """返回NavigableString的字符串表示,并进行XML实体替换。 + + 参数: + encoding -- 指定编码,默认为DEFAULT_OUTPUT_ENCODING。 + + 返回: + 根据指定编码编码后的字符串,如果没有指定编码,则返回Unicode字符串。 + """ + # 替换XML特殊字符为对应的XML实体 data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self) if encoding: return data.encode(encoding) @@ -496,202 +591,205 @@ class NavigableString(text_type, PageElement): return data class CData(NavigableString): - + """ + CData类用于表示XML中的CDATA区域,它允许在XML文档中嵌入未经处理的文本数据。 + """ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + """返回CDATA区域的字符串表示,使用指定的编码。""" return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding) class ProcessingInstruction(NavigableString): + """ + ProcessingInstruction类用于表示XML处理指令,它允许在XML文档中包含处理器指令。 + """ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + """返回处理指令的字符串表示,使用指定的编码。""" output = self if "%SOUP-ENCODING%" in output: + # 如果输出中包含编码占位符,则替换为实际编码 output = self.substituteEncoding(output, encoding) return "<?%s?>" % self.toEncoding(output, encoding) class Comment(NavigableString): + """ + Comment类用于表示XML中的注释,它允许在XML文档中添加注释信息。 + """ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + """返回注释的字符串表示,使用指定的编码。""" return "<!--%s-->" % NavigableString.__str__(self, encoding) class Declaration(NavigableString): + """ + Declaration类用于表示XML声明,它定义了XML文档的版本和编码等信息。 + """ def __str__(self, 
encoding=DEFAULT_OUTPUT_ENCODING): + """返回XML声明的字符串表示,使用指定的编码。""" return "<!%s>" % NavigableString.__str__(self, encoding) class Tag(PageElement): - - """Represents a found HTML tag with its attributes and contents.""" + """表示找到的HTML标签及其属性和内容。""" def _convertEntities(self, match): - """Used in a call to re.sub to replace HTML, XML, and numeric - entities with the appropriate Unicode characters. If HTML - entities are being converted, any unrecognized entities are - escaped.""" + """用于调用re.sub,将HTML、XML和数字实体替换为相应的Unicode字符。 + 如果正在转换HTML实体,则任何未识别的实体都会被转义。""" try: - x = match.group(1) + x = match.group(1) # 获取匹配的实体名称 if self.convertHTMLEntities and x in name2codepoint: - return unichr(name2codepoint[x]) + return unichr(name2codepoint[x]) # 转换为对应的Unicode字符 elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: - return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] + return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] # 转换为XML特殊字符 else: - return u'&%s;' % x + return u'&%s;' % x # 返回未转换的实体 elif len(x) > 0 and x[0] == '#': - # Handle numeric entities + # 处理数字实体 if len(x) > 1 and x[1] == 'x': - return unichr(int(x[2:], 16)) + return unichr(int(x[2:], 16)) # 处理十六进制数字实体 else: - return unichr(int(x[1:])) + return unichr(int(x[1:])) # 处理十进制数字实体 elif self.escapeUnrecognizedEntities: - return u'&%s;' % x + return u'&%s;' % x # 转义未识别的实体 - except ValueError: # e.g. ValueError: unichr() arg not in range(0x10000) + except ValueError: # 处理unichr()引发的值错误 pass - return u'&%s;' % x + return u'&%s;' % x # 返回未识别的实体 - def __init__(self, parser, name, attrs=None, parent=None, - previous=None): - "Basic constructor." 
- - # We don't actually store the parser object: that lets extracted - # chunks be garbage-collected + def __init__(self, parser, name, attrs=None, parent=None, previous=None): + """基本构造函数。""" + # 我们实际上并不存储解析器对象:这使得提取的块可以被垃圾回收 self.parserClass = parser.__class__ - self.isSelfClosing = parser.isSelfClosingTag(name) - self.name = name + self.isSelfClosing = parser.isSelfClosingTag(name) # 判断标签是否为自闭合标签 + self.name = name # 标签名称 if attrs is None: - attrs = [] + attrs = [] # 如果没有提供属性,则初始化为空列表 elif isinstance(attrs, dict): - attrs = attrs.items() - self.attrs = attrs - self.contents = [] - self.setup(parent, previous) - self.hidden = False - self.containsSubstitutions = False - self.convertHTMLEntities = parser.convertHTMLEntities - self.convertXMLEntities = parser.convertXMLEntities - self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities - - # Convert any HTML, XML, or numeric entities in the attribute values. - # Reference: https://github.com/pkrumins/xgoogle/pull/16/commits/3dba1165c436b0d6e5bdbd09e53ca0dbf8a043f8 + attrs = attrs.items() # 如果提供的是字典,则转换为元组列表 + self.attrs = attrs # 标签属性 + self.contents = [] # 标签内容 + self.setup(parent, previous) # 设置父标签和前一个标签 + self.hidden = False # 标签是否隐藏 + self.containsSubstitutions = False # 是否包含替换 + self.convertHTMLEntities = parser.convertHTMLEntities # 是否转换HTML实体 + self.convertXMLEntities = parser.convertXMLEntities # 是否转换XML实体 + self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities # 是否转义未识别的实体 + + # 转换属性值中的HTML、XML或数字实体 convert = lambda k_val: (k_val[0], re.sub(r"&(#\d+|#x[0-9a-fA-F]+|\w+);", self._convertEntities, - k_val[1])) - self.attrs = map(convert, self.attrs) + k_val[1])) # 使用正则表达式替换实体 + self.attrs = map(convert, self.attrs) # 更新属性列表 def getString(self): - if (len(self.contents) == 1 - and isinstance(self.contents[0], NavigableString)): + """获取标签的字符串内容,如果内容只有一个NavigableString,则返回该内容。""" + if (len(self.contents) == 1 and isinstance(self.contents[0], NavigableString)): return self.contents[0] def 
setString(self, string): - """Replace the contents of the tag with a string""" - self.clear() - self.append(string) + """用字符串替换标签的内容。""" + self.clear() # 清空当前内容 + self.append(string) # 添加新的字符串内容 - string = property(getString, setString) + string = property(getString, setString) # 将getString和setString方法绑定到string属性 def getText(self, separator=u""): + """获取标签的文本内容,使用给定的分隔符连接多个文本。""" if not len(self.contents): - return u"" - stopNode = self._lastRecursiveChild().next - strings = [] - current = self.contents[0] + return u"" # 如果没有内容,返回空字符串 + stopNode = self._lastRecursiveChild().next # 获取最后一个子元素的下一个元素 + strings = [] # 存储文本内容的列表 + current = self.contents[0] # 从第一个内容开始 while current and current is not stopNode: if isinstance(current, NavigableString): - strings.append(current.strip()) - current = current.next - return separator.join(strings) + strings.append(current.strip()) # 去除文本两端的空白并添加到列表 + current = current.next # 移动到下一个内容 + return separator.join(strings) # 使用分隔符连接所有文本并返回 - text = property(getText) + text = property(getText) # 将getText方法绑定到text属性 def get(self, key, default=None): - """Returns the value of the 'key' attribute for the tag, or - the value given for 'default' if it doesn't have that - attribute.""" + """返回标签的指定属性的值,如果没有该属性,则返回默认值。""" return self._getAttrMap().get(key, default) def clear(self): - """Extract all children.""" + """提取所有子元素。""" for child in self.contents[:]: - child.extract() + child.extract() # 从树中移除每个子元素 def index(self, element): + """返回指定元素在当前标签内容中的索引,如果未找到则抛出异常。""" for i, child in enumerate(self.contents): if child is element: return i - raise ValueError("Tag.index: element not in tag") + raise ValueError("Tag.index: element not in tag") # 如果未找到,抛出值错误 def has_key(self, key): + """检查标签是否包含指定的属性。""" return self._getAttrMap().has_key(key) def __getitem__(self, key): - """tag[key] returns the value of the 'key' attribute for the tag, - and throws an exception if it's not there.""" + """通过key访问标签的属性值,如果不存在则抛出异常。""" return 
self._getAttrMap()[key] def __iter__(self): - "Iterating over a tag iterates over its contents." + """迭代标签的内容。""" return iter(self.contents) def __len__(self): - "The length of a tag is the length of its list of contents." + """返回标签内容的长度。""" return len(self.contents) def __contains__(self, x): + """检查指定元素是否在标签内容中。""" return x in self.contents def __nonzero__(self): - "A tag is non-None even if it has no contents." + """标签即使没有内容也被视为非空。""" return True def __setitem__(self, key, value): - """Setting tag[key] sets the value of the 'key' attribute for the - tag.""" - self._getAttrMap() - self.attrMap[key] = value + """设置标签的属性值。""" + self._getAttrMap() # 初始化属性映射 + self.attrMap[key] = value # 更新属性映射 found = False for i in xrange(0, len(self.attrs)): if self.attrs[i][0] == key: - self.attrs[i] = (key, value) + self.attrs[i] = (key, value) # 更新现有属性 found = True if not found: - self.attrs.append((key, value)) - self._getAttrMap()[key] = value + self.attrs.append((key, value)) # 添加新属性 + self._getAttrMap()[key] = value # 更新属性映射 def __delitem__(self, key): - "Deleting tag[key] deletes all 'key' attributes for the tag." + """删除标签的指定属性。""" for item in self.attrs: if item[0] == key: - self.attrs.remove(item) - #We don't break because bad HTML can define the same - #attribute multiple times. - self._getAttrMap() - if self.attrMap.has_key(key): - del self.attrMap[key] + self.attrs.remove(item) # 移除属性 + # 不中断,因为坏HTML可能定义相同的属性多次 + self._getAttrMap() # 初始化属性映射 + if self.attrMap.has_key(key): + del self.attrMap[key] # 删除属性映射中的属性 def __call__(self, *args, **kwargs): - """Calling a tag like a function is the same as calling its - findAll() method. Eg. 
tag('a') returns a list of all the A tags - found within this tag.""" + """调用标签就像调用其findAll()方法一样。""" return self.findAll(*args, **kwargs) def __getattr__(self, tag): - #print "Getattr %s.%s" % (self.__class__, tag) + """根据标签名称获取标签内容。""" if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: - return self.find(tag[:-3]) + return self.find(tag[:-3]) # 如果标签名以'Tag'结尾,返回对应的标签 elif tag.find('__') != 0: - return self.find(tag) - raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag)) + return self.find(tag) # 否则,返回对应的标签 + raise AttributeError("'%s'对象没有属性'%s'" % (self.__class__, tag)) def __eq__(self, other): - """Returns true iff this tag has the same name, the same attributes, - and the same contents (recursively) as the given tag. - - NOTE: right now this will return false if two tags have the - same attributes in a different order. Should this be fixed?""" + """判断当前标签是否与另一个标签相等,比较名称、属性和内容。""" if other is self: return True - if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): + if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or \ + self.name != other.name or self.attrs != other.attrs or len(self) != len(other): return False for i in xrange(0, len(self.contents)): if self.contents[i] != other.contents[i]: @@ -699,378 +797,339 @@ class Tag(PageElement): return True def __ne__(self, other): - """Returns true iff this tag is not identical to the other tag, - as defined in __eq__.""" + """判断当前标签是否与另一个标签不相等。""" return not self == other def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): - """Renders this tag as a string.""" + """将标签渲染为字符串。""" return self.__str__(encoding) def __unicode__(self): + """返回标签的Unicode表示。""" return self.__str__(None) def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0): - """Returns a string or Unicode representation of 
this tag and - its contents. To get Unicode, pass None for encoding. - - NOTE: since Python's HTML parser consumes whitespace, this - method is not certain to reproduce the whitespace present in - the original string.""" - - encodedName = self.toEncoding(self.name, encoding) - - attrs = [] - if self.attrs: - for key, val in self.attrs: - fmt = '%s="%s"' - if isinstance(val, basestring): - if self.containsSubstitutions and '%SOUP-ENCODING%' in val: - val = self.substituteEncoding(val, encoding) - - # The attribute value either: - # - # * Contains no embedded double quotes or single quotes. - # No problem: we enclose it in double quotes. - # * Contains embedded single quotes. No problem: - # double quotes work here too. - # * Contains embedded double quotes. No problem: - # we enclose it in single quotes. - # * Embeds both single _and_ double quotes. This - # can't happen naturally, but it can happen if - # you modify an attribute value after parsing - # the document. Now we have a bit of a - # problem. We solve it by enclosing the - # attribute in single quotes, and escaping any - # embedded single quotes to XML entities. + """返回此标签及其内容的字符串或Unicode表示。 + 如果传递None作为encoding,将获得Unicode字符串。 + + 注意:由于Python的HTML解析器会消耗空白字符,此方法不能保证重现原始字符串中的空白字符。""" + + encodedName = self.toEncoding(self.name, encoding) # 将标签名编码为指定编码 + + attrs = [] # 初始化属性列表 + if self.attrs: # 如果有属性 + for key, val in self.attrs: # 遍历属性 + fmt = '%s="%s"' # 属性格式 + if isinstance(val, basestring): # 如果属性值是字符串 + if self.containsSubstitutions and '%SOUP-ENCODING%' in val: # 如果包含编码占位符 + val = self.substituteEncoding(val, encoding) # 替换编码 + + # 根据属性值中是否包含引号来决定使用单引号还是双引号 if '"' in val: fmt = "%s='%s'" - if "'" in val: - # TODO: replace with apos when - # appropriate. + if "'" in val: # 如果同时包含单双引号,则替换单引号为实体 val = val.replace("'", "&squot;") - # Now we're okay w/r/t quotes. But the attribute - # value might also contain angle brackets, or - # ampersands that aren't part of entities. 
We need - # to escape those to XML entities too. + # 转义属性值中的小于号、大于号和未包含在实体中的和号 val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) - attrs.append(fmt % (self.toEncoding(key, encoding), - self.toEncoding(val, encoding))) + attrs.append(fmt % (self.toEncoding(key, encoding), self.toEncoding(val, encoding))) # 添加编码后的属性 close = '' closeTag = '' - if self.isSelfClosing: + if self.isSelfClosing: # 如果是自闭合标签 close = ' /' else: - closeTag = '</%s>' % encodedName + closeTag = '</%s>' % encodedName # 标签结束符号 - indentTag, indentContents = 0, 0 - if prettyPrint: + indentTag, indentContents = 0, 0 # 初始化缩进级别 + if prettyPrint: # 如果需要美化输出 indentTag = indentLevel - space = (' ' * (indentTag-1)) + space = (' ' * (indentTag - 1)) indentContents = indentTag + 1 - contents = self.renderContents(encoding, prettyPrint, indentContents) - if self.hidden: + contents = self.renderContents(encoding, prettyPrint, indentContents) # 渲染标签内容 + if self.hidden: # 如果标签是隐藏的 s = contents else: - s = [] - attributeString = '' - if attrs: - attributeString = ' ' + ' '.join(attrs) - if prettyPrint: + s = [] # 初始化字符串列表 + attributeString = '' # 初始化属性字符串 + if attrs: # 如果有属性 + attributeString = ' ' + ' '.join(attrs) # 属性字符串 + if prettyPrint: # 如果需要美化输出 s.append(space) - s.append('<%s%s%s>' % (encodedName, attributeString, close)) - if prettyPrint: + s.append('<%s%s%s>' % (encodedName, attributeString, close)) # 开始标签 + if prettyPrint: # 如果需要美化输出 s.append("\n") - s.append(contents) - if prettyPrint and contents and contents[-1] != "\n": + s.append(contents) # 内容 + if prettyPrint and contents and contents[-1] != "\n": # 如果需要美化输出且内容不以换行符结尾 s.append("\n") - if prettyPrint and closeTag: + if prettyPrint and closeTag: # 如果需要美化输出且有结束标签 s.append(space) - s.append(closeTag) - if prettyPrint and closeTag and self.nextSibling: + s.append(closeTag) # 结束标签 + if prettyPrint and closeTag and self.nextSibling: # 如果需要美化输出且有下一个兄弟节点 s.append("\n") - s = ''.join(s) - return s + s = ''.join(s) # 合并字符串 + return s # 
返回标签字符串 - def decompose(self): - """Recursively destroys the contents of this tree.""" - self.extract() - if len(self.contents) == 0: + def decompose(self): # 递归销毁树的内容 + self.extract() # 提取自身 + if len(self.contents) == 0: # 如果没有内容 return - current = self.contents[0] - while current is not None: - next = current.next - if isinstance(current, Tag): - del current.contents[:] - current.parent = None - current.previous = None - current.previousSibling = None - current.next = None - current.nextSibling = None - current = next - - def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): + current = self.contents[0] # 获取第一个内容 + while current is not None: # 遍历内容 + next = current.next # 下一个内容 + if isinstance(current, Tag): # 如果是标签 + del current.contents[:] # 删除内容 + current.parent = None # 清除父节点 + current.previous = None # 清除前一个节点 + current.previousSibling = None # 清除前一个兄弟节点 + current.next = None # 清除下一个节点 + current.nextSibling = None # 清除下一个兄弟节点 + current = next # 移动到下一个内容 + + def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): # 美化输出 return self.__str__(encoding, True) def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, - prettyPrint=False, indentLevel=0): - """Renders the contents of this tag as a string in the given - encoding. 
If encoding is None, returns a Unicode string..""" - s=[] - for c in self: - text = None - if isinstance(c, NavigableString): - text = c.__str__(encoding) - elif isinstance(c, Tag): - s.append(c.__str__(encoding, prettyPrint, indentLevel)) - if text and prettyPrint: - text = text.strip() - if text: - if prettyPrint: - s.append(" " * (indentLevel-1)) - s.append(text) - if prettyPrint: + prettyPrint=False, indentLevel=0): # 渲染标签内容 + """以给定编码渲染此标签的内容。如果编码为None,返回Unicode字符串。""" + s = [] # 初始化字符串列表 + for c in self: # 遍历内容 + text = None # 初始化文本 + if isinstance(c, NavigableString): # 如果是可导航字符串 + text = c.__str__(encoding) # 转换为字符串 + elif isinstance(c, Tag): # 如果是标签 + s.append(c.__str__(encoding, prettyPrint, indentLevel)) # 添加标签字符串 + if text and prettyPrint: # 如果是文本且需要美化输出 + text = text.strip() # 去除空白 + if text: # 如果有文本 + if prettyPrint: # 如果需要美化输出 + s.append(" " * (indentLevel - 1)) + s.append(text) # 添加文本 + if prettyPrint: # 如果需要美化输出 s.append("\n") + return ''.join(s) # 返回内容字符串 - return ''.join(s) - - #Soup methods + # Soup方法 def find(self, name=None, attrs={}, recursive=True, text=None, - **kwargs): - """Return only the first child of this Tag matching the given - criteria.""" - r = None - l = self.findAll(name, attrs, recursive, text, 1, **kwargs) - if l: - r = l[0] - return r + **kwargs): # 查找第一个匹配的子标签 + """返回此标签的第一个匹配给定条件的子标签。""" + r = None # 初始化结果 + l = self.findAll(name, attrs, recursive, text, 1, **kwargs) # 查找所有匹配的子标签 + if l: # 如果有结果 + r = l[0] # 第一个结果 + return r # 返回结果 + findChild = find def findAll(self, name=None, attrs={}, recursive=True, text=None, - limit=None, **kwargs): - """Extracts a list of Tag objects that match the given - criteria. You can specify the name of the Tag and any - attributes you want the Tag to have. 
- - The value of a key-value pair in the 'attrs' map can be a - string, a list of strings, a regular expression object, or a - callable that takes a string and returns whether or not the - string matches for some custom definition of 'matches'. The - same is true of the tag name.""" - generator = self.recursiveChildGenerator - if not recursive: - generator = self.childGenerator - return self._findAll(name, attrs, text, limit, generator, **kwargs) + limit=None, **kwargs): # 查找所有匹配的子标签 + """提取匹配给定条件的标签列表。你可以指定标签的名称和任何你希望标签拥有的属性。""" + generator = self.recursiveChildGenerator # 递归子生成器 + if not recursive: # 如果不需要递归 + generator = self.childGenerator # 子生成器 + return self._findAll(name, attrs, text, limit, generator, **kwargs) # 查找所有匹配的标签 + findChildren = findAll - # Pre-3.x compatibility methods - first = find - fetch = findAll + # Pre-3.x兼容性方法 + first = find # 第一个匹配的子标签 + fetch = findAll # 查找所有匹配的子标签 - def fetchText(self, text=None, recursive=True, limit=None): + def fetchText(self, text=None, recursive=True, limit=None): # 查找所有匹配的文本 return self.findAll(text=text, recursive=recursive, limit=limit) - def firstText(self, text=None, recursive=True): + def firstText(self, text=None, recursive=True): # 查找第一个匹配的文本 return self.find(text=text, recursive=recursive) - #Private methods + # 私有方法 + + def _getAttrMap(self): # 获取属性映射 + """如果尚未初始化,则初始化此标签属性的映射表示。""" + if not getattr(self, 'attrMap'): # 如果没有属性映射 + self.attrMap = {} # 初始化属性映射 + for (key, value) in self.attrs: # 遍历属性 + self.attrMap[key] = value # 添加属性 + return self.attrMap # 返回属性映射 - def _getAttrMap(self): - """Initializes a map representation of this tag's attributes, - if not already initialized.""" - if not getattr(self, 'attrMap'): - self.attrMap = {} - for (key, value) in self.attrs: - self.attrMap[key] = value - return self.attrMap + # 生成器方法 - #Generator methods - def childGenerator(self): - # Just use the iterator from the contents + def childGenerator(self): # 子生成器 + # 直接使用内容的迭代器 return iter(self.contents) - def 
recursiveChildGenerator(self): - if not len(self.contents): - return # Note: https://stackoverflow.com/a/30217723 (PEP 479) - stopNode = self._lastRecursiveChild().next - current = self.contents[0] - while current and current is not stopNode: - yield current - current = current.next + def recursiveChildGenerator(self): # 递归子生成器 + if not len(self.contents): # 如果没有内容 + return # 返回 + stopNode = self._lastRecursiveChild().next # 停止节点 + current = self.contents[0] # 当前节点 + while current and current is not stopNode: # 遍历节点 + yield current # 产生当前节点 + current = current.next # 移动到下一个节点 # Next, a couple classes to represent queries and their results. class SoupStrainer: - """Encapsulates a number of ways of matching a markup element (tag or - text).""" + """封装了多种匹配标记元素(标签或文本)的方法。""" def __init__(self, name=None, attrs={}, text=None, **kwargs): - self.name = name - if isinstance(attrs, basestring): - kwargs['class'] = _match_css_class(attrs) - attrs = None - if kwargs: - if attrs: - attrs = attrs.copy() - attrs.update(kwargs) + self.name = name # 标签名 + if isinstance(attrs, basestring): # 如果attrs是字符串 + kwargs['class'] = _match_css_class(attrs) # 将CSS类名转换为可匹配的形式 + attrs = None # 重置attrs + if kwargs: # 如果有额外的关键字参数 + if attrs: # 如果已有attrs + attrs = attrs.copy() # 复制attrs + attrs.update(kwargs) # 更新attrs else: - attrs = kwargs - self.attrs = attrs - self.text = text + attrs = kwargs # 否则直接设置attrs + self.attrs = attrs # 属性字典 + self.text = text # 文本内容 def __str__(self): - if self.text: - return self.text + if self.text: # 如果有文本内容 + return self.text # 返回文本内容 else: - return "%s|%s" % (self.name, self.attrs) + return "%s|%s" % (self.name, self.attrs) # 返回标签名和属性 def searchTag(self, markupName=None, markupAttrs={}): - found = None - markup = None - if isinstance(markupName, Tag): - markup = markupName - markupAttrs = markup + found = None # 初始化找到的元素 + markup = None # 初始化标记 + if isinstance(markupName, Tag): # 如果传入的是Tag对象 + markup = markupName # 设置标记 + markupAttrs = markup # 设置标记属性 
callFunctionWithTagData = callable(self.name) \ - and not isinstance(markupName, Tag) + and not isinstance(markupName, Tag) # 判断是否是可调用的函数 if (not self.name) \ or callFunctionWithTagData \ or (markup and self._matches(markup, self.name)) \ or (not markup and self._matches(markupName, self.name)): - if callFunctionWithTagData: - match = self.name(markupName, markupAttrs) + # 如果没有指定标签名或函数调用匹配成功 + if callFunctionWithTagData: # 如果是函数调用 + match = self.name(markupName, markupAttrs) # 调用函数 else: - match = True - markupAttrMap = None - for attr, matchAgainst in self.attrs.items(): - if not markupAttrMap: - if hasattr(markupAttrs, 'get'): - markupAttrMap = markupAttrs + match = True # 默认匹配成功 + markupAttrMap = None # 初始化属性映射 + for attr, matchAgainst in self.attrs.items(): # 遍历属性 + if not markupAttrMap: # 如果没有属性映射 + if hasattr(markupAttrs, 'get'): # 如果有get方法 + markupAttrMap = markupAttrs # 设置属性映射 else: - markupAttrMap = {} - for k,v in markupAttrs: + markupAttrMap = {} # 初始化属性映射 + for k,v in markupAttrs: # 复制属性 markupAttrMap[k] = v - attrValue = markupAttrMap.get(attr) - if not self._matches(attrValue, matchAgainst): - match = False + attrValue = markupAttrMap.get(attr) # 获取属性值 + if not self._matches(attrValue, matchAgainst): # 如果属性不匹配 + match = False # 设置不匹配 break - if match: - if markup: - found = markup + if match: # 如果匹配成功 + if markup: # 如果是Tag对象 + found = markup # 设置找到的元素 else: - found = markupName - return found + found = markupName # 设置找到的元素 + return found # 返回找到的元素 def search(self, markup): + # 打印寻找信息 #print 'looking for %s in %s' % (self, markup) - found = None - # If given a list of items, scan it for a text element that - # matches. 
+ found = None # 初始化找到的元素 + # 如果给定的是一个元素列表,扫描它以找到匹配的文本元素 if hasattr(markup, "__iter__") \ and not isinstance(markup, Tag): - for element in markup: + for element in markup: # 遍历元素 if isinstance(element, NavigableString) \ - and self.search(element): - found = element + and self.search(element): # 如果是可导航字符串并且匹配 + found = element # 设置找到的元素 break - # If it's a Tag, make sure its name or attributes match. - # Don't bother with Tags if we're searching for text. + # 如果它是一个Tag,确保它的名称或属性匹配 + # 如果我们正在寻找文本,就不要麻烦处理Tags elif isinstance(markup, Tag): - if not self.text: - found = self.searchTag(markup) - # If it's text, make sure the text matches. + if not self.text: # 如果不是寻找文本 + found = self.searchTag(markup) # 搜索标签 + # 如果它是文本,确保文本匹配 elif isinstance(markup, NavigableString) or \ isinstance(markup, basestring): - if self._matches(markup, self.text): - found = markup + if self._matches(markup, self.text): # 如果文本匹配 + found = markup # 设置找到的元素 else: raise Exception("I don't know how to match against a %s" \ - % markup.__class__) - return found + % markup.__class__) # 抛出异常 + return found # 返回找到的元素 def _matches(self, markup, matchAgainst): + # 打印匹配信息 #print "Matching %s against %s" % (markup, matchAgainst) - result = False - if matchAgainst is True: - result = markup is not None - elif callable(matchAgainst): - result = matchAgainst(markup) + result = False # 初始化匹配结果 + if matchAgainst is True: # 如果匹配条件是True + result = markup is not None # 只要markup不是None就匹配 + elif callable(matchAgainst): # 如果匹配条件是可调用的 + result = matchAgainst(markup) # 调用函数 else: - #Custom match methods take the tag as an argument, but all - #other ways of matching match the tag name as a string. - if isinstance(markup, Tag): - markup = markup.name - if markup and not isinstance(markup, basestring): - markup = text_type(markup) - #Now we know that chunk is either a string, or None. - if hasattr(matchAgainst, 'match'): - # It's a regexp object. 
- result = markup and matchAgainst.search(markup) - elif hasattr(matchAgainst, '__iter__'): # list-like - result = markup in matchAgainst - elif hasattr(matchAgainst, 'items'): - result = markup.has_key(matchAgainst) - elif matchAgainst and isinstance(markup, basestring): - if isinstance(markup, text_type): - matchAgainst = text_type(matchAgainst) + # 自定义匹配方法接受标签作为参数,但所有其他匹配方式都匹配标签名称作为字符串 + if isinstance(markup, Tag): # 如果是Tag对象 + markup = markup.name # 获取标签名 + if markup and not isinstance(markup, basestring): # 如果markup不是字符串 + markup = text_type(markup) # 转换为字符串 + # 现在我们知道chunk要么是字符串,要么None + if hasattr(matchAgainst, 'match'): # 如果是正则表达式对象 + result = markup and matchAgainst.search(markup) # 搜索匹配 + elif hasattr(matchAgainst, '__iter__'): # 如果是可迭代的 + result = markup in matchAgainst # 是否在其中 + elif hasattr(matchAgainst, 'items'): # 如果有items方法 + result = markup.has_key(matchAgainst) # 是否有键 + elif matchAgainst and isinstance(markup, basestring): # 如果都是字符串 + if isinstance(markup, text_type): # 如果是unicode + matchAgainst = text_type(matchAgainst) # 转换为unicode else: - matchAgainst = str(matchAgainst) + matchAgainst = str(matchAgainst) # 转换为字符串 + + if not result: # 如果不匹配 + result = matchAgainst == markup # 比较是否相等 + return result # 返回匹配结果 - if not result: - result = matchAgainst == markup - return result class ResultSet(list): - """A ResultSet is just a list that keeps track of the SoupStrainer - that created it.""" + """ResultSet是一个特殊的列表,它记录了创建它的SoupStrainer。""" + def __init__(self, source): - list.__init__([]) - self.source = source + list.__init__([]) # 初始化列表 + self.source = source # 记录创建ResultSet的SoupStrainer -# Now, some helper functions. + +# 以下是一些辅助函数。 def buildTagMap(default, *args): - """Turns a list of maps, lists, or scalars into a single map. - Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and - NESTING_RESET_TAGS maps out of lists and partial maps.""" - built = {} - for portion in args: - if hasattr(portion, 'items'): - #It's a map. Merge it. 
- for k,v in portion.items(): + """将多个映射、列表或标量转换为一个单一的映射。 + 用于构建SELF_CLOSING_TAGS、NESTABLE_TAGS和NESTING_RESET_TAGS映射。""" + + built = {} # 初始化空字典 + for portion in args: # 遍历传入的参数 + if hasattr(portion, 'items'): # 如果参数是映射 + # 合并映射 + for k, v in portion.items(): built[k] = v - elif hasattr(portion, '__iter__'): # is a list - #It's a list. Map each item to the default. + elif hasattr(portion, '__iter__'): # 如果参数是列表 + # 将列表中的每个项映射到默认值 for k in portion: built[k] = default else: - #It's a scalar. Map it to the default. + # 如果参数是标量,将其映射到默认值 built[portion] = default - return built + return built # 返回构建的映射 # Now, the parser classes. class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): + """这个类包含了基本的解析器和搜索代码。它定义了一个解析器,除了以下规则外,对标签行为一无所知: - """This class contains the basic parser and search code. It defines - a parser that knows nothing about tag behavior except for the - following: - - You can't close a tag without closing all the tags it encloses. - That is, "<foo><bar></foo>" actually means - "<foo><bar></bar></foo>". + 你不能在不关闭它所包含的所有标签的情况下关闭一个标签。 + 也就是说,"<foo><bar></foo>"实际上意味着"<foo><bar></bar></foo>"。 - [Another possible explanation is "<foo><bar /></foo>", but since - this class defines no SELF_CLOSING_TAGS, it will never use that - explanation.] + [另一种可能的解释是"<foo><bar /></foo>",但由于这个类没有定义SELF_CLOSING_TAGS,它永远不会使用这种解释。] - This class is useful for parsing XML or made-up markup languages, - or when BeautifulSoup makes an assumption counter to what you were - expecting.""" + 这个类对于解析XML或自创的标记语言,或者当BeautifulSoup做出了与您预期相反的假设时非常有用。""" SELF_CLOSING_TAGS = {} NESTABLE_TAGS = {} @@ -1085,57 +1144,45 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): ] ROOT_TAG_NAME = u'[document]' + """根标签的名称。""" HTML_ENTITIES = "html" XML_ENTITIES = "xml" XHTML_ENTITIES = "xhtml" - # TODO: This only exists for backwards-compatibility + # TODO: 这只用于向后兼容 ALL_ENTITIES = XHTML_ENTITIES - # Used when determining whether a text node is all whitespace and - # can be replaced with a single space. 
A text node that contains - # fancy Unicode spaces (usually non-breaking) should be left - # alone. + # 用于确定一个文本节点是否全部是空白字符 + # 并且可以被替换为一个空格。包含花式Unicode空格(通常是不间断的)的文本节点应该保持不变。 STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, markupMassage=True, smartQuotesTo=XML_ENTITIES, convertEntities=None, selfClosingTags=None, isHTML=False): - """The Soup object is initialized as the 'root tag', and the - provided markup (which can be a string or a file-like object) - is fed into the underlying parser. + """Soup对象被初始化为'根标签',提供的标记(可以是字符串或文件类对象) + 被送入底层解析器。 - sgmllib will process most bad HTML, and the BeautifulSoup - class has some tricks for dealing with some HTML that kills - sgmllib, but Beautiful Soup can nonetheless choke or lose data - if your data uses self-closing tags or declarations - incorrectly. + sgmllib会处理大多数错误的HTML,BeautifulSoup类有一些技巧来处理一些使sgmllib死亡的HTML, + 但是BeautifulSoup仍然可能因为数据使用自闭合标签或声明不正确而窒息或丢失数据。 - By default, Beautiful Soup uses regexes to sanitize input, - avoiding the vast majority of these problems. If the problems - don't apply to you, pass in False for markupMassage, and - you'll get better performance. + 默认情况下,BeautifulSoup使用正则表达式来清理输入,避免了绝大多数这些问题。如果这些问题不适用于您, + 传递False给markupMassage,您将获得更好的性能。 - The default parser massage techniques fix the two most common - instances of invalid HTML that choke sgmllib: + 默认的解析器按摩技术修复了使sgmllib窒息的两种最常见的无效HTML实例: - <br/> (No space between name of closing tag and tag close) - <! --Comment--> (Extraneous whitespace in declaration) + <br/>(闭合标签名称和标签关闭之间没有空格) + <! 
--Comment-->(声明中的多余空白) - You can pass in a custom list of (RE object, replace method) - tuples to get Beautiful Soup to scrub your input the way you - want.""" + 您可以传递自定义的(RE对象,替换方法)元组列表,让BeautifulSoup按照您想要的方式清理您的输入。""" self.parseOnlyThese = parseOnlyThese self.fromEncoding = fromEncoding self.smartQuotesTo = smartQuotesTo self.convertEntities = convertEntities - # Set the rules for how we'll deal with the entities we - # encounter + # 设置我们将如何处理我们遇到的实体的规则 if self.convertEntities: - # It doesn't make sense to convert encoded characters to - # entities even while you're converting entities to Unicode. - # Just convert it all to Unicode. + # 将编码字符转换为实体是没有意义的,即使在您正在将实体转换为Unicode时也是如此。 + # 将所有内容都转换为Unicode。 self.smartQuotesTo = None if convertEntities == self.HTML_ENTITIES: self.convertXMLEntities = False @@ -1157,7 +1204,7 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) sgmllib.SGMLParser.__init__(self) - if hasattr(markup, 'read'): # It's a file-type object. + if hasattr(markup, 'read'): # 它是一个文件类型对象。 markup = markup.read() self.markup = markup self.markupMassage = markupMassage @@ -1165,20 +1212,20 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): self._feed(isHTML=isHTML) except StopParsing: pass - self.markup = None # The markup can now be GCed + self.markup = None # 标记现在可以被GCed了 def convert_charref(self, name): - """This method fixes a bug in Python's SGMLParser.""" + """这个方法修复了Python的SGMLParser中的一个bug。""" try: n = int(name) except ValueError: return - if not 0 <= n <= 127 : # ASCII ends at 127, not 255 + if not 0 <= n <= 127 : # ASCII在127结束,不是255 return return self.convert_codepoint(n) def _feed(self, inDocumentEncoding=None, isHTML=False): - # Convert the document to Unicode. 
+ # 将文档转换为Unicode。 markup = self.markup if isinstance(markup, text_type): if not hasattr(self, 'originalEncoding'): @@ -1196,148 +1243,163 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) - # TODO: We get rid of markupMassage so that the - # soup object can be deepcopied later on. Some - # Python installations can't copy regexes. If anyone - # was relying on the existence of markupMassage, this - # might cause problems. + # TODO: 我们摆脱markupMassage,以便soup对象可以稍后被深度复制。 + # 一些Python安装无法复制正则表达式。如果有人依赖markupMassage的存在,这可能会导致问题。 del(self.markupMassage) self.reset() sgmllib.SGMLParser.feed(self, markup) - # Close out any unfinished strings and close all the open tags. + # 关闭任何未完成的字符串并关闭所有打开的标签。 self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag() def __getattr__(self, methodName): - """This method routes method call requests to either the SGMLParser - superclass or the Tag superclass, depending on the method name.""" - #print "__getattr__ called on %s.%s" % (self.__class__, methodName) + """这个方法将方法调用请求路由到SGMLParser超类或Tag超类,具体取决于方法名。""" + # 打印出被调用的方法名 + # print "__getattr__ called on %s.%s" % (self.__class__, methodName) if methodName.startswith('start_') or methodName.startswith('end_') \ - or methodName.startswith('do_'): + or methodName.startswith('do_'): + # 如果方法是SGMLParser的方法,则从SGMLParser中获取 return sgmllib.SGMLParser.__getattr__(self, methodName) elif not methodName.startswith('__'): + # 否则,如果方法不是特殊方法,则从Tag中获取 return Tag.__getattr__(self, methodName) else: + # 如果方法是特殊方法,则抛出属性错误 raise AttributeError def isSelfClosingTag(self, name): - """Returns true iff the given string is the name of a - self-closing tag according to this parser.""" + """返回true,当且仅当给定的字符串是此解析器中自闭合标签的名称。""" + # 检查标签是否是自闭合标签 return name in self.SELF_CLOSING_TAGS \ - or name in self.instanceSelfClosingTags + or name in self.instanceSelfClosingTags def reset(self): + # 
重置Tag对象,并初始化ROOT_TAG_NAME Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 + # 重置SGMLParser对象 sgmllib.SGMLParser.reset(self) - self.currentData = [] - self.currentTag = None - self.tagStack = [] - self.quoteStack = [] + self.currentData = [] # 存储当前数据 + self.currentTag = None # 当前标签 + self.tagStack = [] # 标签堆栈 + self.quoteStack = [] # 引号堆栈 + # 将ROOT_TAG_NAME推入标签堆栈 self.pushTag(self) def popTag(self): + # 从标签堆栈中弹出一个标签 tag = self.tagStack.pop() - - #print "Pop", tag.name if self.tagStack: - self.currentTag = self.tagStack[-1] + self.currentTag = self.tagStack[-1] # 更新当前标签 return self.currentTag def pushTag(self, tag): - #print "Push", tag.name + # 将一个标签推入标签堆栈 if self.currentTag: - self.currentTag.contents.append(tag) - self.tagStack.append(tag) - self.currentTag = self.tagStack[-1] + self.currentTag.contents.append(tag) # 将标签添加到当前标签的内容中 + self.tagStack.append(tag) # 推入堆栈 + self.currentTag = self.tagStack[-1] # 更新当前标签 def endData(self, containerClass=NavigableString): + # 结束当前数据的处理 if self.currentData: - currentData = u''.join(self.currentData) + currentData = u''.join(self.currentData) # 合并当前数据 + # 如果数据只包含ASCII空白字符,并且不在PRESERVE_WHITESPACE_TAGS中,则替换为单个空格 if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and - not set([tag.name for tag in self.tagStack]).intersection( - self.PRESERVE_WHITESPACE_TAGS)): + not set([tag.name for tag in self.tagStack]).intersection( + self.PRESERVE_WHITESPACE_TAGS)): if '\n' in currentData: currentData = '\n' else: currentData = ' ' self.currentData = [] + # 如果设置了parseOnlyThese,并且不在顶层标签,并且当前数据不匹配,则不处理 if self.parseOnlyThese and len(self.tagStack) <= 1 and \ - (not self.parseOnlyThese.text or \ - not self.parseOnlyThese.search(currentData)): + (not self.parseOnlyThese.text or \ + not self.parseOnlyThese.search(currentData)): return - o = containerClass(currentData) - o.setup(self.currentTag, self.previous) + o = containerClass(currentData) # 创建一个新的NavigableString对象 + o.setup(self.currentTag, self.previous) # 设置对象 if 
self.previous: - self.previous.next = o - self.previous = o - self.currentTag.contents.append(o) - + self.previous.next = o # 设置前一个对象的下一个对象 + self.previous = o # 更新前一个对象 + self.currentTag.contents.append(o) # 将对象添加到当前标签的内容中 def _popToTag(self, name, inclusivePop=True): - """Pops the tag stack up to and including the most recent - instance of the given tag. If inclusivePop is false, pops the tag - stack up to but *not* including the most recent instqance of - the given tag.""" - #print "Popping to %s" % name + """弹出标签堆栈直到并包括最近的给定标签。如果inclusivePop为false,则弹出标签堆栈直到但不包括最近的给定标签。""" + # 打印出正在弹出到的标签名 + # print "Popping to %s" % name if name == self.ROOT_TAG_NAME: return numPops = 0 mostRecentTag = None - for i in xrange(len(self.tagStack)-1, 0, -1): + # 从后向前查找给定标签的位置 + for i in xrange(len(self.tagStack) - 1, 0, -1): if name == self.tagStack[i].name: - numPops = len(self.tagStack)-i + numPops = len(self.tagStack) - i break if not inclusivePop: numPops = numPops - 1 + # 弹出标签 for i in xrange(0, numPops): mostRecentTag = self.popTag() return mostRecentTag def _smartPop(self, name): - - """We need to pop up to the previous tag of this type, unless - one of this tag's nesting reset triggers comes between this - tag and the previous tag of this type, OR unless this tag is a - generic nesting trigger and another generic nesting trigger - comes between this tag and the previous tag of this type. - - Examples: - <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'. - <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'. - <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'. - - <li><ul><li> *<li>* should pop to 'ul', not the first 'li'. 
- <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr' - <td><tr><td> *<td>* should pop to 'tr', not the first 'td' - """ + """我们需要弹出到这种类型的前一个标签,除非在当前标签和这种类型的前一个标签之间出现了这种标签的嵌套重置触发器, + 或者除非这个标签是一个通用嵌套触发器,并且在这个标签和这种类型的前一个标签之间出现了另一个通用嵌套触发器。 + + 例子: + < p > Foo < b > Bar * < p > + *应该弹出到 + 'p',而不是 + 'b'。 + < p > Foo < table > Bar * < p > + *应该弹出到 + 'table',而不是 + 'p'。 + < p > Foo < table > < tr > Bar * < p > + *应该弹出到 + 'tr',而不是 + 'p'。 + + < li > < ul > < li > * < li > + *应该弹出到 + 'ul',而不是第一个 + 'li'。 + < tr > < table > < tr > * < tr > + *应该弹出到 + 'table',而不是第一个 + 'tr' + < td > < tr > < td > * < td > + *应该弹出到 + 'tr',而不是第一个 + 'td' + + """ nestingResetTriggers = self.NESTABLE_TAGS.get(name) isNestable = nestingResetTriggers != None isResetNesting = name in self.RESET_NESTING_TAGS popTo = None inclusive = True - for i in xrange(len(self.tagStack)-1, 0, -1): + for i in xrange(len(self.tagStack) - 1, 0, -1): p = self.tagStack[i] if (not p or p.name == name) and not isNestable: - #Non-nestable tags get popped to the top or to their - #last occurance. + # 非嵌套标签被弹出到顶部或它们的最后一次出现。 popTo = name break if (nestingResetTriggers is not None and p.name in nestingResetTriggers) \ - or (nestingResetTriggers is None and isResetNesting - and p.name in self.RESET_NESTING_TAGS): - - #If we encounter one of the nesting reset triggers - #peculiar to this tag, or we encounter another tag - #that causes nesting to reset, pop up to but not - #including that tag. + or (nestingResetTriggers is None and isResetNesting + and p.name in self.RESET_NESTING_TAGS): + # 如果我们遇到了这个标签特有的一个嵌套重置触发器,或者我们遇到了另一个导致嵌套重置的标签, + # 弹出到但不包括那个标签。 popTo = p.name inclusive = False break @@ -1345,11 +1407,14 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): if popTo: self._popToTag(popTo, inclusive) + def unknown_starttag(self, name, attrs, selfClosing=0): - #print "Start tag %s: %s" % (name, attrs) + # 打印开始标签信息 + # print "Start tag %s: %s" % (name, attrs) if self.quoteStack: - #This is not a real tag. 
- #print "<%s> is not real!" % name + # 这不是一个真正的标签。 + # 打印信息 + # print "<%s> is not real!" % name attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs]) self.handle_data('<%s%s>' % (name, attrs)) return @@ -1359,7 +1424,7 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): self._smartPop(name) if self.parseOnlyThese and len(self.tagStack) <= 1 \ - and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): + and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): return tag = Tag(self, name, attrs, self.currentTag, self.previous) @@ -1370,16 +1435,20 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): if selfClosing or self.isSelfClosingTag(name): self.popTag() if name in self.QUOTE_TAGS: - #print "Beginning quote (%s)" % name + # 打印开始引用信息 + # print "Beginning quote (%s)" % name self.quoteStack.append(name) self.literal = 1 return tag + def unknown_endtag(self, name): - #print "End tag %s" % name + # 打印结束标签信息 + # print "End tag %s" % name if self.quoteStack and self.quoteStack[-1] != name: - #This is not a real end tag. - #print "</%s> is not real!" % name + # 这不是一个真正的结束标签。 + # 打印信息 + # print "</%s> is not real!" 
% name self.handle_data('</%s>' % name) return self.endData() @@ -1389,149 +1458,104 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser): self.literal = (len(self.quoteStack) > 0) def handle_data(self, data): + """将数据添加到当前数据列表中。""" self.currentData.append(data) def _toStringSubclass(self, text, subclass): - """Adds a certain piece of text to the tree as a NavigableString - subclass.""" - self.endData() - self.handle_data(text) - self.endData(subclass) + """将特定文本作为NavigableString子类添加到树中。""" + self.endData() # 结束当前数据的处理 + self.handle_data(text) # 处理文本 + self.endData(subclass) # 结束处理并指定子类 def handle_pi(self, text): - """Handle a processing instruction as a ProcessingInstruction - object, possibly one with a %SOUP-ENCODING% slot into which an - encoding will be plugged later.""" + """将处理指令作为ProcessingInstruction对象处理,可能有一个 % SOUP - ENCODING % 插槽,稍后将插入编码。""" if text[:3] == "xml": text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" self._toStringSubclass(text, ProcessingInstruction) def handle_comment(self, text): - "Handle comments as Comment objects." + """将注释作为Comment对象处理。""" self._toStringSubclass(text, Comment) def handle_charref(self, ref): - "Handle character references as data." 
+ """将字符引用作为数据处理。""" if self.convertEntities: - data = unichr(int(ref)) + data = unichr(int(ref)) # 转换为Unicode字符 else: - data = '&#%s;' % ref + data = '&#%s;' % ref # 保持为实体引用 self.handle_data(data) def handle_entityref(self, ref): - """Handle entity references as data, possibly converting known - HTML and/or XML entity references to the corresponding Unicode - characters.""" + """将实体引用作为数据处理,可能将已知的HTML和 / 或XML实体引用转换为相应的Unicode字符。""" data = None if self.convertHTMLEntities: try: - data = unichr(name2codepoint[ref]) + data = unichr(name2codepoint[ref]) # 尝试转换为Unicode字符 except KeyError: pass if not data and self.convertXMLEntities: - data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) + data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) # 获取特殊字符 if not data and self.convertHTMLEntities and \ - not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): - # TODO: We've got a problem here. We're told this is - # an entity reference, but it's not an XML entity - # reference or an HTML entity reference. Nonetheless, - # the logical thing to do is to pass it through as an - # unrecognized entity reference. - # - # Except: when the input is "&carol;" this function - # will be called with input "carol". When the input is - # "AT&T", this function will be called with input - # "T". We have no way of knowing whether a semicolon - # was present originally, so we don't know whether - # this is an unknown entity or just a misplaced - # ampersand. - # - # The more common case is a misplaced ampersand, so I - # escape the ampersand and omit the trailing semicolon. - data = "&%s" % ref + not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): + # 处理未知实体引用 + data = "&%s" % ref if not data: - # This case is different from the one above, because we - # haven't already gone through a supposedly comprehensive - # mapping of entities to Unicode characters. We might not - # have gone through any mapping at all. So the chances are - # very high that this is a real entity, and not a - # misplaced ampersand. 
+ # 处理真实实体 data = "&%s;" % ref self.handle_data(data) def handle_decl(self, data): - "Handle DOCTYPEs and the like as Declaration objects." + """将DOCTYPE等声明作为Declaration对象处理。""" self._toStringSubclass(data, Declaration) def parse_declaration(self, i): - """Treat a bogus SGML declaration as raw data. Treat a CDATA - declaration as a CData object.""" + """将无效的SGML声明视为原始数据。将CDATA声明视为CData对象。""" j = None - if self.rawdata[i:i+9] == '<![CDATA[': - k = self.rawdata.find(']]>', i) - if k == -1: - k = len(self.rawdata) - data = self.rawdata[i+9:k] - j = k+3 - self._toStringSubclass(data, CData) + if self.rawdata[i:i + 9] == '<![CDATA[': + k = self.rawdata.find(']]>', i) # 寻找CDATA结束标志 + if k == -1: + k = len(self.rawdata) + data = self.rawdata[i + 9:k] # 获取CDATA中的数据 + j = k + 3 + self._toStringSubclass(data, CData) # 处理CDATA数据 else: try: - j = sgmllib.SGMLParser.parse_declaration(self, i) + j = sgmllib.SGMLParser.parse_declaration(self, i) # 处理SGML声明 except sgmllib.SGMLParseError: - toHandle = self.rawdata[i:] - self.handle_data(toHandle) + toHandle = self.rawdata[i:] # 获取错误后的数据 + self.handle_data(toHandle) # 处理数据 j = i + len(toHandle) return j class BeautifulSoup(BeautifulStoneSoup): + """这个解析器了解HTML的一些事实: - """This parser knows the following facts about HTML: - - * Some tags have no closing tag and should be interpreted as being - closed as soon as they are encountered. + * 有些标签没有闭合标签,并且应该被解释为一旦遇到就立即关闭。 - * The text inside some tags (ie. 'script') may contain tags which - are not really part of the document and which should be parsed - as text, not tags. If you want to parse the text as tags, you can - always fetch it and parse it explicitly. + * 某些标签内的文本(例如'script')可能包含标签,这些标签实际上不是文档的一部分,应该被解析为文本,而不是标签。如果你想将文本作为标签解析,你可以随时获取它并显式地解析。 - * Tag nesting rules: + * 标签嵌套规则: - Most tags can't be nested at all. For instance, the occurance of - a <p> tag should implicitly close the previous <p> tag. 
+ 大多数标签根本无法嵌套。例如,<p>标签的出现应该隐式地关闭前一个<p>标签。 - <p>Para1<p>Para2 - should be transformed into: - <p>Para1</p><p>Para2 + <p>Para1<p>Para2应该被转换为:<p>Para1</p><p>Para2 - Some tags can be nested arbitrarily. For instance, the occurance - of a <blockquote> tag should _not_ implicitly close the previous - <blockquote> tag. + 有些标签可以任意嵌套。例如,<blockquote>标签的出现不应该隐式地关闭前一个<blockquote>标签。 - Alice said: <blockquote>Bob said: <blockquote>Blah - should NOT be transformed into: - Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah + Alice said: <blockquote>Bob said: <blockquote>Blah不应该被转换为: + Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah - Some tags can be nested, but the nesting is reset by the - interposition of other tags. For instance, a <tr> tag should - implicitly close the previous <tr> tag within the same <table>, - but not close a <tr> tag in another table. + 有些标签可以嵌套,但是嵌套被其他标签的介入重置。例如,<tr>标签应该隐式地关闭同一<table>中的前一个<tr>标签,但不应该关闭另一个<table>中的<tr>标签。 - <table><tr>Blah<tr>Blah - should be transformed into: - <table><tr>Blah</tr><tr>Blah - but, - <tr>Blah<table><tr>Blah - should NOT be transformed into - <tr>Blah<table></tr><tr>Blah + <table><tr>Blah<tr>Blah应该被转换为: + <table><tr>Blah</tr><tr>Blah + 但是, + <tr>Blah<table><tr>Blah不应该被转换为: + <tr>Blah<table></tr><tr>Blah - Differing assumptions about tag nesting rules are a major source - of problems with the BeautifulSoup class. 
If BeautifulSoup is not - treating as nestable a tag your page author treats as nestable, - try ICantBelieveItsBeautifulSoup, MinimalSoup, or - BeautifulStoneSoup before writing your own subclass.""" + 对标签嵌套规则的不同假设是BeautifulSoup类问题的主要来源。如果BeautifulSoup没有将页面作者视为可嵌套的标签视为可嵌套,请尝试ICantBelieveItsBeautifulSoup、MinimalSoup或BeautifulStoneSoup,然后再编写自己的子类。""" def __init__(self, *args, **kwargs): if 'smartQuotesTo' not in kwargs: @@ -1539,26 +1563,25 @@ class BeautifulSoup(BeautifulStoneSoup): kwargs['isHTML'] = True BeautifulStoneSoup.__init__(self, *args, **kwargs) + # 定义自闭合标签 SELF_CLOSING_TAGS = buildTagMap(None, ('br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base', 'col')) + # 定义保持空白字符的标签 PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) + # 定义包含脚本或文本区域的标签 QUOTE_TAGS = {'script' : None, 'textarea' : None} - #According to the HTML standard, each of these inline tags can - #contain another tag of the same type. Furthermore, it's common - #to actually use these tags this way. + # 定义内联标签可以包含另一个相同类型的标签 NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 'center') - #According to the HTML standard, these block tags can contain - #another tag of the same type. Furthermore, it's common - #to actually use these tags this way. + # 定义块级标签可以包含另一个相同类型的标签 NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del') - #Lists can contain other lists, but there are restrictions. + # 定义列表可以包含其他列表,但有限制 NESTABLE_LIST_TAGS = { 'ol' : [], 'ul' : [], 'li' : ['ul', 'ol'], @@ -1566,7 +1589,7 @@ class BeautifulSoup(BeautifulStoneSoup): 'dd' : ['dl'], 'dt' : ['dl'] } - #Tables can contain other tables, but there are restrictions. 
+ # 定义表格可以包含其他表格,但有限制 NESTABLE_TABLE_TAGS = {'table' : [], 'tr' : ['table', 'tbody', 'tfoot', 'thead'], 'td' : ['tr'], @@ -1576,25 +1599,25 @@ class BeautifulSoup(BeautifulStoneSoup): 'tfoot' : ['table'], } + # 定义非嵌套块级标签 NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre') - #If one of these tags is encountered, all tags up to the next tag of - #this type are popped. + # 如果遇到这些标签之一,则直到下一个相同类型的标签为止的所有标签都被弹出 RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', NON_NESTABLE_BLOCK_TAGS, NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) + # 定义可以嵌套的标签 NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) - # Used to detect the charset in a META tag; see start_meta + # 用于检测META标签中的字符集;参见start_meta CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) def start_meta(self, attrs): - """Beautiful Soup can detect a charset included in a META tag, - try to convert the document to that charset, and re-parse the - document from the beginning.""" + """Beautiful Soup可以检测META标签中的字符集, + 尝试将文档转换为该字符集,并从头开始重新解析文档。""" httpEquiv = None contentType = None contentTypeIndex = None @@ -1609,16 +1632,13 @@ class BeautifulSoup(BeautifulStoneSoup): contentType = value contentTypeIndex = i - if httpEquiv and contentType: # It's an interesting meta tag. + if httpEquiv and contentType: # 它是一个有趣的meta标签 match = self.CHARSET_RE.search(contentType) if match: if (self.declaredHTMLEncoding is not None or self.originalEncoding == self.fromEncoding): - # An HTML encoding was sniffed while converting - # the document to Unicode, or an HTML encoding was - # sniffed during a previous pass through the - # document, or an encoding was specified - # explicitly and it worked. Rewrite the meta tag. 
+ # 在将文档转换为Unicode时检测到HTML编码,或在以前的文档遍历中检测到HTML编码,或显式指定的编码有效 + # 重写meta标签 def rewrite(match): return match.group(1) + "%SOUP-ENCODING%" newAttr = self.CHARSET_RE.sub(rewrite, contentType) @@ -1626,89 +1646,74 @@ class BeautifulSoup(BeautifulStoneSoup): newAttr) tagNeedsEncodingSubstitution = True else: - # This is our first pass through the document. - # Go through it again with the encoding information. + # 这是我们第一次通过文档 + # 使用编码信息再次遍历文档 newCharset = match.group(3) if newCharset and newCharset != self.originalEncoding: self.declaredHTMLEncoding = newCharset self._feed(self.declaredHTMLEncoding) raise StopParsing - pass tag = self.unknown_starttag("meta", attrs) if tag and tagNeedsEncodingSubstitution: tag.containsSubstitutions = True class StopParsing(Exception): + """用于停止解析的异常类。""" pass class ICantBelieveItsBeautifulSoup(BeautifulSoup): - - """The BeautifulSoup class is oriented towards skipping over - common HTML errors like unclosed tags. However, sometimes it makes - errors of its own. For instance, consider this fragment: + """BeautifulSoup类通常忽略一些常见的HTML错误,如未闭合的标签。然而,有时它也会犯自己的错误。 + 例如,考虑这个片段: <b>Foo<b>Bar</b></b> - This is perfectly valid (if bizarre) HTML. However, the - BeautifulSoup class will implicitly close the first b tag when it - encounters the second 'b'. It will think the author wrote - "<b>Foo<b>Bar", and didn't close the first 'b' tag, because - there's no real-world reason to bold something that's already - bold. When it encounters '</b></b>' it will close two more 'b' - tags, for a grand total of three tags closed instead of two. This - can throw off the rest of your document structure. The same is - true of a number of other tags, listed below. - - It's much more common for someone to forget to close a 'b' tag - than to actually use nested 'b' tags, and the BeautifulSoup class - handles the common case. 
This class handles the not-co-common - case: where you can't believe someone wrote what they did, but - it's valid HTML and BeautifulSoup screwed up by assuming it - wouldn't be.""" + 这是完全有效的(如果奇怪的)HTML。然而,BeautifulSoup类会在遇到第二个'b'时隐式地关闭第一个b标签。 + 它会认为作者写了"<b>Foo<b>Bar",并没有关闭第一个'b'标签,因为在现实中没有理由将已经加粗的内容再次加粗。 + 当它遇到'</b></b>'时,它会关闭两个更多的'b'标签,总共关闭了三个标签而不是两个。这可能会打乱你的文档结构。 + 同样的情况也适用于一些其他标签,如下所述。 + + 人们忘记关闭'b'标签比实际使用嵌套的'b'标签要常见得多,而BeautifulSoup类处理的是常见情况。 + 这个类处理的是不常见的情况:你不敢相信有人写了他们所写的东西,但是它是有效的HTML,而BeautifulSoup通过假设它不会是而搞砸了。""" I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', 'big') + """一些内联标签可以被嵌套,即使它们通常不应该被嵌套。""" I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',) + """一些块级标签可以被嵌套,即使它们通常不应该被嵌套。""" NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) + """构建一个包含所有可以被嵌套的标签的映射。""" class MinimalSoup(BeautifulSoup): - """The MinimalSoup class is for parsing HTML that contains - pathologically bad markup. It makes no assumptions about tag - nesting, but it does know which tags are self-closing, that - <script> tags contain Javascript and should not be parsed, that - META tags may contain encoding information, and so on. + """MinimalSoup类用于解析包含极端糟糕标记的HTML。它不对标签嵌套做任何假设,但它知道哪些标签是自闭合的, + <script>标签包含Javascript不应该被解析,META标签可能包含编码信息等。 - This also makes it better for subclassing than BeautifulStoneSoup - or BeautifulSoup.""" + 这也使得它比BeautifulStoneSoup或BeautifulSoup更适合子类化。""" RESET_NESTING_TAGS = buildTagMap('noscript') + """定义重置嵌套的标签。""" + NESTABLE_TAGS = {} + """不假设任何标签可以嵌套。""" class BeautifulSOAP(BeautifulStoneSoup): - """This class will push a tag with only a single string child into - the tag's parent as an attribute. The attribute's name is the tag - name, and the value is the string child. 
An example should give - the flavor of the change: + """这个类会将只有一个字符串子标签的标签推入其父标签作为属性。属性的名称是标签名称,值是字符串子标签。 + 一个例子应该能给出变化的味道: <foo><bar>baz</bar></foo> => <foo bar="baz"><bar>baz</bar></foo> - You can then access fooTag['bar'] instead of fooTag.barTag.string. + 然后你可以访问fooTag['bar']而不是fooTag.barTag.string。 - This is, of course, useful for scraping structures that tend to - use subelements instead of attributes, such as SOAP messages. Note - that it modifies its input, so don't print the modified version - out. + 这当然对于抓取倾向于使用子元素而不是属性的结构(如SOAP消息)很有用。注意它修改了它的输入,所以不要打印出修改后的版本。 - I'm not sure how many people really want to use this class; let me - know if you do. Mainly I like the name.""" + 我不确定有多少人真的想使用这个类;如果你这样做,请告诉我。主要是我喜欢这个名字。""" def popTag(self): if len(self.tagStack) > 1: @@ -1720,6 +1725,7 @@ class BeautifulSOAP(BeautifulStoneSoup): not parent.attrMap.has_key(tag.name)): parent[tag.name] = tag.contents[0] BeautifulStoneSoup.popTag(self) + """当弹出标签时,如果标签只有一个字符串子标签,将其作为属性添加到父标签中。""" #Enterprise class names! It has come to our attention that some people #think the names of the Beautiful Soup parser classes are too silly @@ -1772,118 +1778,111 @@ except ImportError: pass class UnicodeDammit: - """A class for detecting the encoding of a *ML document and - converting it to a Unicode string. If the source encoding is - windows-1252, can replace MS smart quotes with their HTML or XML - equivalents.""" - - # This dictionary maps commonly seen values for "charset" in HTML - # meta tags to the corresponding Python codec names. It only covers - # values that aren't in Python's aliases and can't be determined - # by the heuristics in find_codec. 
+ """一个用于检测*ML文档编码并将其转换为Unicode字符串的类。 + 如果源编码是windows-1252,可以将MS智能引号替换为它们的HTML或XML等价物。""" + + # 这个字典映射了HTML meta标签中常见的"charset"值到相应的Python编解码器名称。 + # 它只涵盖了Python的别名中没有的值,以及不能通过find_codec中的启发式确定的值。 CHARSET_ALIASES = { "macintosh" : "mac-roman", "x-sjis" : "shift-jis" } def __init__(self, markup, overrideEncodings=[], smartQuotesTo='xml', isHTML=False): + """初始化UnicodeDammit类,尝试检测markup的编码并转换为Unicode。""" self.declaredHTMLEncoding = None self.markup, documentEncoding, sniffedEncoding = \ - self._detectEncoding(markup, isHTML) - self.smartQuotesTo = smartQuotesTo - self.triedEncodings = [] - if markup == '' or isinstance(markup, text_type): - self.originalEncoding = None - self.unicode = text_type(markup) + self._detectEncoding(markup, isHTML) # 检测编码 + self.smartQuotesTo = smartQuotesTo # 智能引号转换类型 + self.triedEncodings = [] # 尝试过的编码列表 + if markup == '' or isinstance(markup, text_type): # 如果markup是空或者已经是Unicode + self.originalEncoding = None # 原始编码未知 + self.unicode = text_type(markup) # 直接设置Unicode return u = None - for proposedEncoding in overrideEncodings: + for proposedEncoding in overrideEncodings: # 尝试用户指定的编码 u = self._convertFrom(proposedEncoding) if u: break - if not u: - for proposedEncoding in (documentEncoding, sniffedEncoding): + if not u: # 如果用户指定的编码不工作 + for proposedEncoding in (documentEncoding, sniffedEncoding): # 尝试检测到的编码 u = self._convertFrom(proposedEncoding) if u: break - # If no luck and we have auto-detection library, try that: + # 如果没有成功,并且有自动检测库,尝试使用它: if not u and chardet and not isinstance(self.markup, text_type): u = self._convertFrom(chardet.detect(self.markup)['encoding']) - # As a last resort, try utf-8 and windows-1252: + # 作为最后的手段,尝试utf-8和windows-1252: if not u: for proposed_encoding in ("utf-8", "windows-1252"): u = self._convertFrom(proposed_encoding) if u: break - self.unicode = u - if not u: self.originalEncoding = None + self.unicode = u # 设置Unicode结果 + if not u: self.originalEncoding = None # 如果失败,原始编码未知 def _subMSChar(self, orig): - 
"""Changes a MS smart quote character to an XML or HTML - entity.""" - sub = self.MS_CHARS.get(orig) - if isinstance(sub, tuple): + """将MS智能引号字符更改为XML或HTML实体。""" + sub = self.MS_CHARS.get(orig) # 获取替换字符 + if isinstance(sub, tuple): # 如果是元组,根据smartQuotesTo选择替换方式 if self.smartQuotesTo == 'xml': - sub = '&#x%s;' % sub[1] + sub = '&#x%s;' % sub[1] # XML实体 else: - sub = '&%s;' % sub[0] + sub = '&%s;' % sub[0] # HTML实体 return sub def _convertFrom(self, proposed): - proposed = self.find_codec(proposed) - if not proposed or proposed in self.triedEncodings: + """尝试将文档从proposed编码转换为Unicode。""" + proposed = self.find_codec(proposed) # 查找编解码器 + if not proposed or proposed in self.triedEncodings: # 如果没有编解码器或已尝试过 return None - self.triedEncodings.append(proposed) - markup = self.markup + self.triedEncodings.append(proposed) # 记录尝试过的编码 + markup = self.markup # 待转换的文档 - # Convert smart quotes to HTML if coming from an encoding - # that might have them. + # 如果来自可能包含智能引号的编码,则将其转换为HTML if self.smartQuotesTo and proposed.lower() in("windows-1252", "iso-8859-1", "iso-8859-2"): markup = re.compile("([\x80-\x9f])").sub \ - (lambda x: self._subMSChar(x.group(1)), - markup) + (lambda x: self._subMSChar(x.group(1)), markup) try: - # print "Trying to convert document to %s" % proposed + # 尝试转换文档为Unicode u = self._toUnicode(markup, proposed) - self.markup = u - self.originalEncoding = proposed + self.markup = u # 设置转换后的文档 + self.originalEncoding = proposed # 设置原始编码 except Exception as e: - # print "That didn't work!" - # print e + # 如果失败,返回None return None - #print "Correct encoding: %s" % proposed - return self.markup + return self.markup # 返回转换后的文档 def _toUnicode(self, data, encoding): - '''Given a string and its encoding, decodes the string into Unicode. 
- %encoding is a string recognized by encodings.aliases''' + '''给定一个字符串和它的编码,将该字符串解码为Unicode。 + %encoding是encodings.aliases识别的一个字符串。''' - # strip Byte Order Mark (if present) + # 去除字节顺序标记(如果存在) if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16be' - data = data[2:] + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16be' # 修正编码为utf-16be + data = data[2:] # 去除字节顺序标记 elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16le' - data = data[2:] + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16le' # 修正编码为utf-16le + data = data[2:] # 去除字节顺序标记 elif data[:3] == '\xef\xbb\xbf': - encoding = 'utf-8' - data = data[3:] + encoding = 'utf-8' # 修正编码为utf-8 + data = data[3:] # 去除字节顺序标记 elif data[:4] == '\x00\x00\xfe\xff': - encoding = 'utf-32be' - data = data[4:] + encoding = 'utf-32be' # 修正编码为utf-32be + data = data[4:] # 去除字节顺序标记 elif data[:4] == '\xff\xfe\x00\x00': - encoding = 'utf-32le' - data = data[4:] - newdata = text_type(data, encoding) + encoding = 'utf-32le' # 修正编码为utf-32le + data = data[4:] # 去除字节顺序标记 + newdata = text_type(data, encoding) # 将数据解码为Unicode return newdata def _detectEncoding(self, xml_data, isHTML=False): - """Given a document, tries to detect its XML encoding.""" + """给定一个文档,尝试检测其XML编码。""" xml_encoding = sniffed_xml_encoding = None try: if xml_data[:4] == '\x4c\x6f\xa7\x94': @@ -1894,7 +1893,7 @@ class UnicodeDammit: sniffed_xml_encoding = 'utf-16be' xml_data = text_type(xml_data, 'utf-16be').encode('utf-8') elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ - and (xml_data[2:4] != '\x00\x00'): + and (xml_data[2:4] != '\x00\x00'): # UTF-16BE with BOM sniffed_xml_encoding = 'utf-16be' xml_data = text_type(xml_data[2:], 'utf-16be').encode('utf-8') @@ -1903,7 +1902,7 @@ class UnicodeDammit: sniffed_xml_encoding = 'utf-16le' xml_data = text_type(xml_data, 'utf-16le').encode('utf-8') elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') 
and \ - (xml_data[2:4] != '\x00\x00'): + (xml_data[2:4] != '\x00\x00'): # UTF-16LE with BOM sniffed_xml_encoding = 'utf-16le' xml_data = text_type(xml_data[2:], 'utf-16le').encode('utf-8') @@ -1932,103 +1931,112 @@ class UnicodeDammit: pass except: xml_encoding_match = None + # 尝试从XML声明中匹配编码 xml_encoding_match = re.compile( r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) if not xml_encoding_match and isHTML: + # 尝试从HTML的meta标签中匹配编码 regexp = re.compile(r'<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I) xml_encoding_match = regexp.search(xml_data) if xml_encoding_match is not None: xml_encoding = xml_encoding_match.groups()[0].lower() if isHTML: self.declaredHTMLEncoding = xml_encoding + # 如果检测到的编码和声明的编码不一致,以检测到的编码为准 if sniffed_xml_encoding and \ - (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', - 'iso-10646-ucs-4', 'ucs-4', 'csucs4', - 'utf-16', 'utf-32', 'utf_16', 'utf_32', - 'utf16', 'u16')): + (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', + 'iso-10646-ucs-4', 'ucs-4', 'csucs4', + 'utf-16', 'utf-32', 'utf_16', 'utf_32', + 'utf16', 'u16')): xml_encoding = sniffed_xml_encoding return xml_data, xml_encoding, sniffed_xml_encoding - def find_codec(self, charset): + '''根据给定的字符集名称,找到对应的Python编解码器名称。''' return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ - or (charset and self._codec(charset.replace("-", ""))) \ - or (charset and self._codec(charset.replace("-", "_"))) \ - or charset + or (charset and self._codec(charset.replace("-", ""))) \ + or (charset and self._codec(charset.replace("-", "_"))) \ + or charset def _codec(self, charset): + """检查给定的字符集是否有对应的编解码器。""" if not charset: return charset codec = None try: - codecs.lookup(charset) + codecs.lookup(charset) # 尝试查找编解码器 codec = charset - except (LookupError, ValueError): + except (LookupError, ValueError): # 如果查找失败 pass return codec EBCDIC_TO_ASCII_MAP = None + def _ebcdic_to_ascii(self, s): + """将EBCDIC编码的字符串转换为ASCII编码。""" c = self.__class__ - if not c.EBCDIC_TO_ASCII_MAP: - 
emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, - 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, - 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, - 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, - 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, - 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, - 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, - 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, - 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, - 201,202,106,107,108,109,110,111,112,113,114,203,204,205, - 206,207,208,209,126,115,116,117,118,119,120,121,122,210, - 211,212,213,214,215,216,217,218,219,220,221,222,223,224, - 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, - 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, - 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, - 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, - 250,251,252,253,254,255) + if not c.EBCDIC_TO_ASCII_MAP: # 如果转换表尚未创建 + emap = (0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, + 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, + 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26, + 32, 160, 161, 162, 163, 164, 165, 166, 167, 168, 91, 46, 60, 40, 43, 33, + 38, 169, 170, 171, 172, 173, 174, 175, 176, 177, 93, 36, 42, 41, 59, 94, + 45, 47, 178, 179, 180, 181, 182, 183, 184, 185, 124, 44, 37, 95, 62, 63, + 186, 187, 188, 189, 190, 191, 192, 193, 194, 96, 58, 35, 64, 39, 61, 34, + 195, 97, 98, 99, 100, 101, 102, 103, 104, 105, 196, 197, 198, 199, 200, + 201, 202, 106, 107, 108, 109, 110, 111, 112, 113, 114, 203, 204, 205, + 206, 207, 208, 209, 126, 115, 116, 117, 118, 119, 120, 121, 122, 210, + 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 123, 65, 66, 67, 68, 69, 70, 71, 72, + 73, 232, 233, 234, 235, 
236, 237, 125, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 238, 239, 240, 241, 242, 243, 92, 159, 83, 84, 85, 86, 87, 88, 89, + 90, 244, 245, 246, 247, 248, 249, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, + 250, 251, 252, 253, 254, 255) import string - c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ - ''.join(map(chr, xrange(256))), ''.join(map(chr, emap))) + c.EBCDIC_TO_ASCII_MAP = string.maketrans( + ''.join(map(chr, xrange(256))), ''.join(map(chr, emap))) return s.translate(c.EBCDIC_TO_ASCII_MAP) - MS_CHARS = { '\x80' : ('euro', '20AC'), - '\x81' : ' ', - '\x82' : ('sbquo', '201A'), - '\x83' : ('fnof', '192'), - '\x84' : ('bdquo', '201E'), - '\x85' : ('hellip', '2026'), - '\x86' : ('dagger', '2020'), - '\x87' : ('Dagger', '2021'), - '\x88' : ('circ', '2C6'), - '\x89' : ('permil', '2030'), - '\x8A' : ('Scaron', '160'), - '\x8B' : ('lsaquo', '2039'), - '\x8C' : ('OElig', '152'), - '\x8D' : '?', - '\x8E' : ('#x17D', '17D'), - '\x8F' : '?', - '\x90' : '?', - '\x91' : ('lsquo', '2018'), - '\x92' : ('rsquo', '2019'), - '\x93' : ('ldquo', '201C'), - '\x94' : ('rdquo', '201D'), - '\x95' : ('bull', '2022'), - '\x96' : ('ndash', '2013'), - '\x97' : ('mdash', '2014'), - '\x98' : ('tilde', '2DC'), - '\x99' : ('trade', '2122'), - '\x9a' : ('scaron', '161'), - '\x9b' : ('rsaquo', '203A'), - '\x9c' : ('oelig', '153'), - '\x9d' : '?', - '\x9e' : ('#x17E', '17E'), - '\x9f' : ('Yuml', ''),} + MS_CHARS = { + '\x80': ('euro', '20AC'), # MS智能引号到HTML/XML实体的映射 + '\x81': ' ', + '\x82': ('sbquo', '201A'), + '\x83': ('fnof', '192'), + '\x84': ('bdquo', '201E'), + '\x85': ('hellip', '2026'), + '\x86': ('dagger', '2020'), + '\x87': ('Dagger', '2021'), + '\x88': ('circ', '2C6'), + '\x89': ('permil', '2030'), + '\x8A': ('Scaron', '160'), + '\x8B': ('lsaquo', '2039'), + '\x8C': ('OElig', '152'), + '\x8D': '?', + '\x8E': ('#x17D', '17D'), + '\x8F': '?', + '\x90': '?', + '\x91': ('lsquo', '2018'), + '\x92': ('rsquo', '2019'), + '\x93': ('ldquo', '201C'), + '\x94': ('rdquo', '201D'), + '\x95': ('bull', 
'2022'), + '\x96': ('ndash', '2013'), + '\x97': ('mdash', '2014'), + '\x98': ('tilde', '2DC'), + '\x99': ('trade', '2122'), + '\x9a': ('scaron', '161'), + '\x9b': ('rsaquo', '203A'), + '\x9c': ('oelig', '153'), + '\x9d': '?', + '\x9e': ('#x17E', '17E'), + '\x9f': ('Yuml', ''), + } ####################################################################### #By default, act as an HTML pretty-printer. if __name__ == '__main__': - soup = BeautifulSoup(sys.stdin) - print(soup.prettify()) + # 如果这是主程序(而不是被其他脚本导入的模块),则执行以下代码 + soup = BeautifulSoup(sys.stdin) # 创建一个BeautifulSoup对象,从标准输入读取数据 + print(soup.prettify()) # 打印BeautifulSoup对象的美化版本 \ No newline at end of file diff --git a/src/sqlmap-master/thirdparty/bottle/bottle.py b/src/sqlmap-master/thirdparty/bottle/bottle.py index 9df4629..b934907 100644 --- a/src/sqlmap-master/thirdparty/bottle/bottle.py +++ b/src/sqlmap-master/thirdparty/bottle/bottle.py @@ -29,9 +29,12 @@ __license__ = 'MIT' def _cli_parse(args): # pragma: no coverage + # 导入ArgumentParser模块 from argparse import ArgumentParser + # 创建ArgumentParser对象,设置程序名称和用法 parser = ArgumentParser(prog=args[0], usage="%(prog)s [options] package.module:app") + # 添加参数 opt = parser.add_argument opt("--version", action="store_true", help="show version number.") opt("-b", "--bind", metavar="ADDRESS", help="bind socket to ADDRESS.") @@ -45,6 +48,7 @@ def _cli_parse(args): # pragma: no coverage opt("--reload", action="store_true", help="auto-reload on file changes.") opt('app', help='WSGI app entry point.', nargs='?') + # 解析命令行参数 cli_args = parser.parse_args(args[1:]) return cli_args, parser @@ -179,7 +183,9 @@ def depr(major, minor, cause, fix): def makelist(data): # This is just too handy + # 判断data是否为元组、列表、集合或字典类型 if isinstance(data, (tuple, list, set, dict)): + # 如果是,则返回data的列表形式 return list(data) elif data: return [data] @@ -198,18 +204,24 @@ class DictProperty(object): self.getter, self.key = func, self.key or func.__name__ return self + # 如果obj为None,则返回self def __get__(self, 
obj, cls): + # 获取属性名和存储对象 if obj is None: return self + # 如果属性名不在存储对象中,则调用getter方法获取值并存储 key, storage = self.key, getattr(obj, self.attr) if key not in storage: storage[key] = self.getter(obj) return storage[key] + # 如果属性是只读的,则抛出AttributeError异常 def __set__(self, obj, value): if self.read_only: raise AttributeError("Read-Only property.") getattr(obj, self.attr)[self.key] = value def __delete__(self, obj): + # 如果属性是只读的,则抛出AttributeError异常 if self.read_only: raise AttributeError("Read-Only property.") + # 从存储对象中删除对应的值 del getattr(obj, self.attr)[self.key] @@ -737,26 +749,38 @@ class Bottle(object): self.route('/' + '/'.join(segments), **options) def _mount_app(self, prefix, app, **options): + # 检查app是否已经被挂载,或者app的config中是否已经存在'_mount.app'键 if app in self._mounts or '_mount.app' in app.config: + # 如果app已经被挂载,或者app的config中已经存在'_mount.app'键,则发出警告,并回退到WSGI挂载 depr(0, 13, "Application mounted multiple times. Falling back to WSGI mount.", "Clone application before mounting to a different location.") return self._mount_wsgi(prefix, app, **options) + # 检查options是否为空 if options: + # 如果options不为空,则发出警告,并回退到WSGI挂载 depr(0, 13, "Unsupported mount options. Falling back to WSGI mount.", "Do not specify any route options when mounting bottle application.") return self._mount_wsgi(prefix, app, **options) + # 检查prefix是否以'/'结尾 if not prefix.endswith("/"): + # 如果prefix不以'/'结尾,则发出警告,并回退到WSGI挂载 depr(0, 13, "Prefix must end in '/'. 
Falling back to WSGI mount.", "Consider adding an explicit redirect from '/prefix' to '/prefix/' in the parent application.") return self._mount_wsgi(prefix, app, **options) + # 将app添加到_mounts列表中 self._mounts.append(app) + # 将prefix添加到app的config中 app.config['_mount.prefix'] = prefix + # 将self添加到app的config中 app.config['_mount.app'] = self + # 遍历app的routes for route in app.routes: + # 将route的rule修改为prefix + route.rule.lstrip('/') route.rule = prefix + route.rule.lstrip('/') + # 将修改后的route添加到self的routes中 self.add_route(route) def mount(self, prefix, app, **options): @@ -781,11 +805,15 @@ class Bottle(object): parent application. """ + # 检查prefix是否以'/'开头 if not prefix.startswith('/'): + # 如果prefix不以'/'开头,则抛出ValueError异常 raise ValueError("Prefix must start with '/'") + # 如果app是Bottle实例,则调用_mount_app方法 if isinstance(app, Bottle): return self._mount_app(prefix, app, **options) + # 否则,调用_mount_wsgi方法 else: return self._mount_wsgi(prefix, app, **options) @@ -1089,31 +1117,46 @@ class Bottle(object): def wsgi(self, environ, start_response): """ The bottle WSGI-interface. 
""" try: + # 将environ传递给_handle方法,获取返回值 out = self._cast(self._handle(environ)) # rfc2616 section 4.3 + # 如果返回的状态码是100, 101, 204, 304,或者请求方法是HEAD,则关闭输出流 if response._status_code in (100, 101, 204, 304)\ or environ['REQUEST_METHOD'] == 'HEAD': if hasattr(out, 'close'): out.close() out = [] + # 获取environ中的bottle.exc_info exc_info = environ.get('bottle.exc_info') + # 如果有异常信息,则删除environ中的bottle.exc_info if exc_info is not None: del environ['bottle.exc_info'] + # 调用start_response方法,设置响应状态行、响应头和异常信息 start_response(response._wsgi_status_line(), response.headerlist, exc_info) + # 返回输出流 return out except (KeyboardInterrupt, SystemExit, MemoryError): + # 如果捕获到KeyboardInterrupt, SystemExit, MemoryError异常,则抛出 raise except Exception as E: + # 如果没有开启catchall,则抛出异常 if not self.catchall: raise + # 构造错误页面 err = '<h1>Critical error while processing request: %s</h1>' \ % html_escape(environ.get('PATH_INFO', '/')) + # 如果开启了DEBUG模式,则输出错误信息和堆栈信息 if DEBUG: err += '<h2>Error:</h2>\n<pre>\n%s\n</pre>\n' \ '<h2>Traceback:</h2>\n<pre>\n%s\n</pre>\n' \ % (html_escape(repr(E)), html_escape(format_exc())) + # 将错误页面写入environ中的wsgi.errors environ['wsgi.errors'].write(err) + # 刷新wsgi.errors environ['wsgi.errors'].flush() + # 设置响应头 headers = [('Content-Type', 'text/html; charset=UTF-8')] + # 调用start_response方法,设置响应状态行、响应头和异常信息 start_response('500 INTERNAL SERVER ERROR', headers, sys.exc_info()) + # 返回错误页面 return [tob(err)] def __call__(self, environ, start_response): diff --git a/src/sqlmap-master/thirdparty/chardet/__init__.py b/src/sqlmap-master/thirdparty/chardet/__init__.py index 0f9f820..b67e7ce 100644 --- a/src/sqlmap-master/thirdparty/chardet/__init__.py +++ b/src/sqlmap-master/thirdparty/chardet/__init__.py @@ -15,7 +15,6 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### - from .compat import PY2, PY3 from .universaldetector import UniversalDetector from .version import __version__, VERSION @@ -25,15 +24,28 @@ def detect(byte_str): """ Detect the 
encoding of the given byte string. + This function uses the UniversalDetector class to determine the encoding + of a given byte string. It creates a new UniversalDetector instance, + feeds the byte string to it, and then returns the detected encoding. + :param byte_str: The byte sequence to examine. :type byte_str: ``bytes`` or ``bytearray`` + :return: The detected encoding. """ + # Check if the input is of the correct type if not isinstance(byte_str, bytearray): if not isinstance(byte_str, bytes): raise TypeError('Expected object of type bytes or bytearray, got: ' '{0}'.format(type(byte_str))) else: + # If the input is of type bytes, convert it to bytearray byte_str = bytearray(byte_str) + + # Create a new UniversalDetector instance detector = UniversalDetector() + + # Feed the byte string to the detector detector.feed(byte_str) - return detector.close() + + # Close the detector and return the detected encoding + return detector.close() \ No newline at end of file diff --git a/src/sqlmap-master/thirdparty/chardet/big5prober.py b/src/sqlmap-master/thirdparty/chardet/big5prober.py index 98f9970..5e08ec7 100644 --- a/src/sqlmap-master/thirdparty/chardet/big5prober.py +++ b/src/sqlmap-master/thirdparty/chardet/big5prober.py @@ -32,10 +32,15 @@ from .mbcssm import BIG5_SM_MODEL class Big5Prober(MultiByteCharSetProber): + # 初始化Big5Prober类 def __init__(self): + # 调用父类MultiByteCharSetProber的初始化方法 super(Big5Prober, self).__init__() + # 初始化Big5编码状态机 self.coding_sm = CodingStateMachine(BIG5_SM_MODEL) + # 初始化Big5分布分析器 self.distribution_analyzer = Big5DistributionAnalysis() + # 重置Big5Prober类 self.reset() @property diff --git a/src/sqlmap-master/thirdparty/chardet/charsetgroupprober.py b/src/sqlmap-master/thirdparty/chardet/charsetgroupprober.py index 8b3738e..d1d1470 100644 --- a/src/sqlmap-master/thirdparty/chardet/charsetgroupprober.py +++ b/src/sqlmap-master/thirdparty/chardet/charsetgroupprober.py @@ -30,69 +30,126 @@ from .charsetprober import CharSetProber class 
CharSetGroupProber(CharSetProber): + # 初始化函数,传入语言过滤器 def __init__(self, lang_filter=None): + # 调用父类的初始化函数 super(CharSetGroupProber, self).__init__(lang_filter=lang_filter) + # 初始化活动探测器数量 self._active_num = 0 + # 初始化探测器列表 self.probers = [] + # 初始化最佳猜测探测器 self._best_guess_prober = None + # 重置函数 def reset(self): + # 调用父类的重置函数 super(CharSetGroupProber, self).reset() + # 重置活动探测器数量 self._active_num = 0 + # 遍历探测器列表 for prober in self.probers: + # 如果探测器存在 if prober: + # 重置探测器 prober.reset() + # 设置探测器为活动状态 prober.active = True + # 活动探测器数量加一 self._active_num += 1 + # 重置最佳猜测探测器 self._best_guess_prober = None + # 获取字符集名称的属性函数 @property def charset_name(self): + # 如果最佳猜测探测器不存在 if not self._best_guess_prober: + # 调用获取置信度函数 self.get_confidence() + # 如果最佳猜测探测器仍然不存在 if not self._best_guess_prober: + # 返回None return None + # 返回最佳猜测探测器的字符集名称 return self._best_guess_prober.charset_name + # 获取语言的属性函数 @property def language(self): + # 如果最佳猜测探测器不存在 if not self._best_guess_prober: + # 调用获取置信度函数 self.get_confidence() + # 如果最佳猜测探测器仍然不存在 if not self._best_guess_prober: + # 返回None return None + # 返回最佳猜测探测器的语言 return self._best_guess_prober.language + # 接收字节字符串的函数 def feed(self, byte_str): + # 遍历探测器列表 for prober in self.probers: + # 如果探测器不存在 if not prober: + # 跳过 continue + # 如果探测器不是活动状态 if not prober.active: + # 跳过 continue + # 调用探测器接收字节字符串的函数 state = prober.feed(byte_str) + # 如果探测器返回的状态不是FOUND_IT if not state: + # 跳过 continue + # 如果探测器返回的状态是FOUND_IT if state == ProbingState.FOUND_IT: + # 设置最佳猜测探测器为当前探测器 self._best_guess_prober = prober + # 返回当前探测器的状态 return self.state + # 如果探测器返回的状态是NOT_ME elif state == ProbingState.NOT_ME: + # 设置探测器为非活动状态 prober.active = False + # 活动探测器数量减一 self._active_num -= 1 + # 如果活动探测器数量小于等于0 if self._active_num <= 0: + # 设置当前探测器的状态为NOT_ME self._state = ProbingState.NOT_ME + # 返回当前探测器的状态 return self.state + # 返回当前探测器的状态 return self.state + # 获取置信度的函数 def get_confidence(self): + # 获取当前探测器的状态 state = self.state + # 如果当前探测器的状态是FOUND_IT if state == ProbingState.FOUND_IT: + 
# 返回0.99 return 0.99 + # 如果当前探测器的状态是NOT_ME elif state == ProbingState.NOT_ME: + # 返回0.01 return 0.01 + # 初始化最佳置信度 best_conf = 0.0 + # 重置最佳猜测探测器 self._best_guess_prober = None + # 遍历探测器列表 for prober in self.probers: + # 如果探测器不存在 if not prober: + # 跳过 continue + # 如果探测器不是活动状态 if not prober.active: self.logger.debug('%s not active', prober.charset_name) continue diff --git a/src/sqlmap-master/thirdparty/chardet/charsetprober.py b/src/sqlmap-master/thirdparty/chardet/charsetprober.py index eac4e59..2ebca10 100644 --- a/src/sqlmap-master/thirdparty/chardet/charsetprober.py +++ b/src/sqlmap-master/thirdparty/chardet/charsetprober.py @@ -34,32 +34,42 @@ from .enums import ProbingState class CharSetProber(object): + # 定义一个阈值,当检测到的字符集概率大于这个值时,认为检测成功 SHORTCUT_THRESHOLD = 0.95 def __init__(self, lang_filter=None): + # 初始化状态为检测中 self._state = None + # 设置语言过滤器 self.lang_filter = lang_filter + # 获取日志记录器 self.logger = logging.getLogger(__name__) def reset(self): + # 重置状态为检测中 self._state = ProbingState.DETECTING @property def charset_name(self): + # 返回字符集名称,这里返回None return None def feed(self, buf): + # 接收输入的缓冲区 pass @property def state(self): + # 返回当前状态 return self._state def get_confidence(self): + # 返回检测到的字符集的概率,这里返回0.0 return 0.0 @staticmethod def filter_high_byte_only(buf): + # 过滤掉所有非高字节字符 buf = re.sub(b'([\x00-\x7F])+', b' ', buf) return buf diff --git a/src/sqlmap-master/thirdparty/chardet/codingstatemachine.py b/src/sqlmap-master/thirdparty/chardet/codingstatemachine.py index 68fba44..f56a1d2 100644 --- a/src/sqlmap-master/thirdparty/chardet/codingstatemachine.py +++ b/src/sqlmap-master/thirdparty/chardet/codingstatemachine.py @@ -53,20 +53,29 @@ class CodingStateMachine(object): encoding from consideration from here on. 
""" def __init__(self, sm): + # 初始化函数,sm为传入的模型 self._model = sm + # 当前字节位置 self._curr_byte_pos = 0 + # 当前字符长度 self._curr_char_len = 0 + # 当前状态 self._curr_state = None + # 获取logger self.logger = logging.getLogger(__name__) + # 重置 self.reset() def reset(self): + # 重置函数,将当前状态设置为起始状态 self._curr_state = MachineState.START def next_state(self, c): # for each byte we get its class # if it is first byte, we also get byte length + # 获取当前字节的类别 byte_class = self._model['class_table'][c] + # 如果当前状态为起始状态,则获取当前字符长度 if self._curr_state == MachineState.START: self._curr_byte_pos = 0 self._curr_char_len = self._model['char_len_table'][byte_class] diff --git a/src/sqlmap-master/thirdparty/chardet/compat.py b/src/sqlmap-master/thirdparty/chardet/compat.py index ddd7468..f68c510 100644 --- a/src/sqlmap-master/thirdparty/chardet/compat.py +++ b/src/sqlmap-master/thirdparty/chardet/compat.py @@ -22,13 +22,20 @@ import sys +# 判断当前Python版本是否小于3.0 if sys.version_info < (3, 0): + # 如果是Python2版本 PY2 = True PY3 = False + # 定义base_str为str和unicode类型 base_str = (str, unicode) + # 定义text_type为unicode类型 text_type = unicode else: + # 如果是Python3版本 PY2 = False PY3 = True + # 定义base_str为bytes和str类型 base_str = (bytes, str) + # 定义text_type为str类型 text_type = str diff --git a/src/sqlmap-master/thirdparty/chardet/escprober.py b/src/sqlmap-master/thirdparty/chardet/escprober.py index c70493f..e1976cc 100644 --- a/src/sqlmap-master/thirdparty/chardet/escprober.py +++ b/src/sqlmap-master/thirdparty/chardet/escprober.py @@ -40,62 +40,95 @@ class EscCharSetProber(CharSetProber): """ def __init__(self, lang_filter=None): + # 初始化EscCharSetProber类 super(EscCharSetProber, self).__init__(lang_filter=lang_filter) + # 初始化编码状态机列表 self.coding_sm = [] + # 如果语言过滤器包含简体中文 if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED: + # 添加简体中文编码状态机 self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL)) + # 添加ISO2022CN编码状态机 self.coding_sm.append(CodingStateMachine(ISO2022CN_SM_MODEL)) + # 如果语言过滤器包含日语 if self.lang_filter & 
LanguageFilter.JAPANESE: + # 添加ISO2022JP编码状态机 self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL)) + # 如果语言过滤器包含韩语 if self.lang_filter & LanguageFilter.KOREAN: + # 添加ISO2022KR编码状态机 self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL)) + # 初始化活动状态机数量 self.active_sm_count = None + # 初始化检测到的字符集 self._detected_charset = None + # 初始化检测到的语言 self._detected_language = None + # 初始化状态 self._state = None + # 重置 self.reset() def reset(self): + # 重置EscCharSetProber类 super(EscCharSetProber, self).reset() + # 遍历编码状态机列表 for coding_sm in self.coding_sm: + # 如果编码状态机为空,则跳过 if not coding_sm: continue + # 设置编码状态机为活动状态 coding_sm.active = True + # 重置编码状态机 coding_sm.reset() + # 设置活动状态机数量为编码状态机列表的长度 self.active_sm_count = len(self.coding_sm) + # 设置检测到的字符集为空 self._detected_charset = None + # 设置检测到的语言为空 self._detected_language = None @property def charset_name(self): + # 返回检测到的字符集 return self._detected_charset @property def language(self): + # 返回检测到的语言 return self._detected_language def get_confidence(self): + # 如果检测到了字符集,则返回0.99,否则返回0.00 if self._detected_charset: return 0.99 else: return 0.00 def feed(self, byte_str): + # 遍历字节字符串 for c in byte_str: + # 遍历编码状态机列表 for coding_sm in self.coding_sm: + # 如果编码状态机为空或非活动状态,则跳过 if not coding_sm or not coding_sm.active: continue + # 获取编码状态机的下一个状态 coding_state = coding_sm.next_state(c) + # 如果状态为错误,则设置编码状态机为非活动状态,活动状态机数量减一 if coding_state == MachineState.ERROR: coding_sm.active = False self.active_sm_count -= 1 + # 如果活动状态机数量小于等于0,则设置状态为非匹配 if self.active_sm_count <= 0: self._state = ProbingState.NOT_ME return self.state + # 如果状态为匹配,则设置状态为匹配,设置检测到的字符集和语言 elif coding_state == MachineState.ITS_ME: self._state = ProbingState.FOUND_IT self._detected_charset = coding_sm.get_coding_state_machine() self._detected_language = coding_sm.language return self.state + # 返回状态 return self.state diff --git a/src/sqlmap-master/thirdparty/chardet/eucjpprober.py b/src/sqlmap-master/thirdparty/chardet/eucjpprober.py index 20ce8f7..2b96237 100644 --- 
a/src/sqlmap-master/thirdparty/chardet/eucjpprober.py +++ b/src/sqlmap-master/thirdparty/chardet/eucjpprober.py @@ -34,59 +34,90 @@ from .mbcssm import EUCJP_SM_MODEL class EUCJPProber(MultiByteCharSetProber): + # 初始化EUCJPProber类 def __init__(self): super(EUCJPProber, self).__init__() + # 初始化编码状态机 self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL) + # 初始化分布分析器 self.distribution_analyzer = EUCJPDistributionAnalysis() + # 初始化上下文分析器 self.context_analyzer = EUCJPContextAnalysis() + # 重置 self.reset() + # 重置 def reset(self): super(EUCJPProber, self).reset() self.context_analyzer.reset() + # 获取字符集名称 @property def charset_name(self): return "EUC-JP" + # 获取语言 @property def language(self): return "Japanese" + # 输入字节流 def feed(self, byte_str): for i in range(len(byte_str)): # PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte + # 获取下一个状态 coding_state = self.coding_sm.next_state(byte_str[i]) + # 如果状态为错误 if coding_state == MachineState.ERROR: self.logger.debug('%s %s prober hit error at byte %s', self.charset_name, self.language, i) + # 设置状态为不是该字符集 self._state = ProbingState.NOT_ME break + # 如果状态为确定 elif coding_state == MachineState.ITS_ME: + # 设置状态为确定 self._state = ProbingState.FOUND_IT break + # 如果状态为开始 elif coding_state == MachineState.START: + # 获取当前字符长度 char_len = self.coding_sm.get_current_charlen() + # 如果是第一个字符 if i == 0: + # 更新最后一个字符 self._last_char[1] = byte_str[0] + # 输入最后一个字符和当前字符长度到上下文分析器 self.context_analyzer.feed(self._last_char, char_len) + # 输入最后一个字符和当前字符长度到分布分析器 self.distribution_analyzer.feed(self._last_char, char_len) else: + # 输入前一个字符和当前字符到上下文分析器 self.context_analyzer.feed(byte_str[i - 1:i + 1], char_len) + # 输入前一个字符和当前字符到分布分析器 self.distribution_analyzer.feed(byte_str[i - 1:i + 1], char_len) + # 更新最后一个字符 self._last_char[0] = byte_str[-1] + # 如果状态为检测中 if self.state == ProbingState.DETECTING: + # 如果上下文分析器有足够的数据,并且置信度大于阈值 if (self.context_analyzer.got_enough_data() and (self.get_confidence() > self.SHORTCUT_THRESHOLD)): + # 设置状态为确定 
self._state = ProbingState.FOUND_IT + # 返回状态 return self.state + # 获取置信度 def get_confidence(self): + # 获取上下文分析器的置信度 context_conf = self.context_analyzer.get_confidence() + # 获取分布分析器的置信度 distrib_conf = self.distribution_analyzer.get_confidence() + # 返回最大置信度 return max(context_conf, distrib_conf) diff --git a/src/sqlmap-master/thirdparty/chardet/euckrprober.py b/src/sqlmap-master/thirdparty/chardet/euckrprober.py index 345a060..a38ae86 100644 --- a/src/sqlmap-master/thirdparty/chardet/euckrprober.py +++ b/src/sqlmap-master/thirdparty/chardet/euckrprober.py @@ -32,16 +32,23 @@ from .mbcssm import EUCKR_SM_MODEL class EUCKRProber(MultiByteCharSetProber): + # 初始化EUCKRProber类 def __init__(self): + # 调用父类MultiByteCharSetProber的初始化方法 super(EUCKRProber, self).__init__() + # 初始化编码状态机 self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL) + # 初始化分布分析器 self.distribution_analyzer = EUCKRDistributionAnalysis() + # 重置 self.reset() + # 获取字符集名称 @property def charset_name(self): return "EUC-KR" + # 获取语言 @property def language(self): return "Korean" diff --git a/src/sqlmap-master/thirdparty/chardet/euctwprober.py b/src/sqlmap-master/thirdparty/chardet/euctwprober.py index 35669cc..ceddcda 100644 --- a/src/sqlmap-master/thirdparty/chardet/euctwprober.py +++ b/src/sqlmap-master/thirdparty/chardet/euctwprober.py @@ -31,16 +31,23 @@ from .chardistribution import EUCTWDistributionAnalysis from .mbcssm import EUCTW_SM_MODEL class EUCTWProber(MultiByteCharSetProber): + # 初始化EUCTWProber类 def __init__(self): + # 调用父类MultiByteCharSetProber的初始化方法 super(EUCTWProber, self).__init__() + # 初始化编码状态机 self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL) + # 初始化分布分析器 self.distribution_analyzer = EUCTWDistributionAnalysis() + # 重置 self.reset() + # 获取字符集名称 @property def charset_name(self): return "EUC-TW" + # 获取语言 @property def language(self): return "Taiwan" diff --git a/src/sqlmap-master/thirdparty/chardet/gb2312prober.py b/src/sqlmap-master/thirdparty/chardet/gb2312prober.py index 8446d2d..942e7a7 100644 
--- a/src/sqlmap-master/thirdparty/chardet/gb2312prober.py +++ b/src/sqlmap-master/thirdparty/chardet/gb2312prober.py @@ -31,16 +31,23 @@ from .chardistribution import GB2312DistributionAnalysis from .mbcssm import GB2312_SM_MODEL class GB2312Prober(MultiByteCharSetProber): + # 初始化GB2312Prober类 def __init__(self): + # 调用父类MultiByteCharSetProber的初始化方法 super(GB2312Prober, self).__init__() + # 初始化GB2312编码状态机 self.coding_sm = CodingStateMachine(GB2312_SM_MODEL) + # 初始化GB2312分布分析器 self.distribution_analyzer = GB2312DistributionAnalysis() + # 重置 self.reset() + # 获取字符集名称 @property def charset_name(self): return "GB2312" + # 获取语言 @property def language(self): return "Chinese" diff --git a/src/sqlmap-master/thirdparty/chardet/hebrewprober.py b/src/sqlmap-master/thirdparty/chardet/hebrewprober.py index b0e1bf4..ed56d83 100644 --- a/src/sqlmap-master/thirdparty/chardet/hebrewprober.py +++ b/src/sqlmap-master/thirdparty/chardet/hebrewprober.py @@ -152,17 +152,27 @@ class HebrewProber(CharSetProber): LOGICAL_HEBREW_NAME = "windows-1255" def __init__(self): + # 初始化HebrewProber类 super(HebrewProber, self).__init__() + # 初始化_final_char_logical_score为None self._final_char_logical_score = None + # 初始化_final_char_visual_score为None self._final_char_visual_score = None + # 初始化_prev为None self._prev = None + # 初始化_before_prev为None self._before_prev = None + # 初始化_logical_prober为None self._logical_prober = None + # 初始化_visual_prober为None self._visual_prober = None + # 调用reset方法 self.reset() def reset(self): + # 重置_final_char_logical_score为0 self._final_char_logical_score = 0 + # 重置_final_char_visual_score为0 self._final_char_visual_score = 0 # The two last characters seen in the previous buffer, # mPrev and mBeforePrev are initialized to space in order to simulate diff --git a/src/sqlmap-master/thirdparty/chardet/mbcharsetprober.py b/src/sqlmap-master/thirdparty/chardet/mbcharsetprober.py index 6256ecf..0d73c00 100644 --- a/src/sqlmap-master/thirdparty/chardet/mbcharsetprober.py +++ 
b/src/sqlmap-master/thirdparty/chardet/mbcharsetprober.py @@ -37,17 +37,28 @@ class MultiByteCharSetProber(CharSetProber): """ def __init__(self, lang_filter=None): + # 初始化函数,传入参数lang_filter super(MultiByteCharSetProber, self).__init__(lang_filter=lang_filter) + # 调用父类的初始化函数 self.distribution_analyzer = None + # 初始化分布分析器 self.coding_sm = None + # 初始化编码状态机 self._last_char = [0, 0] + # 初始化最后一个字符 def reset(self): + # 重置函数 super(MultiByteCharSetProber, self).reset() + # 调用父类的重置函数 if self.coding_sm: + # 如果编码状态机存在 self.coding_sm.reset() + # 重置编码状态机 if self.distribution_analyzer: + # 如果分布分析器存在 self.distribution_analyzer.reset() + # 重置分布分析器 self._last_char = [0, 0] @property @@ -59,33 +70,45 @@ class MultiByteCharSetProber(CharSetProber): raise NotImplementedError def feed(self, byte_str): + # 遍历byte_str中的每个字节 for i in range(len(byte_str)): + # 获取当前字节的编码状态 coding_state = self.coding_sm.next_state(byte_str[i]) + # 如果编码状态为错误,则记录错误信息,并将状态设置为NOT_ME if coding_state == MachineState.ERROR: self.logger.debug('%s %s prober hit error at byte %s', self.charset_name, self.language, i) self._state = ProbingState.NOT_ME break + # 如果编码状态为确定,则将状态设置为FOUND_IT elif coding_state == MachineState.ITS_ME: self._state = ProbingState.FOUND_IT break + # 如果编码状态为开始,则获取当前字符长度 elif coding_state == MachineState.START: char_len = self.coding_sm.get_current_charlen() + # 如果是第一个字节,则将当前字节和上一个字节作为参数传入feed方法 if i == 0: self._last_char[1] = byte_str[0] self.distribution_analyzer.feed(self._last_char, char_len) + # 否则,将当前字节和上一个字节作为参数传入feed方法 else: self.distribution_analyzer.feed(byte_str[i - 1:i + 1], char_len) + # 将最后一个字节赋值给_last_char[0] self._last_char[0] = byte_str[-1] + # 如果状态为DETECTING,则判断是否已经获取足够的数据,并且置信度是否大于SHORTCUT_THRESHOLD if self.state == ProbingState.DETECTING: if (self.distribution_analyzer.got_enough_data() and (self.get_confidence() > self.SHORTCUT_THRESHOLD)): + # 如果满足条件,则将状态设置为FOUND_IT self._state = ProbingState.FOUND_IT + # 返回状态 return self.state def get_confidence(self): + # 获取置信度 return 
self.distribution_analyzer.get_confidence() diff --git a/src/sqlmap-master/thirdparty/chardet/mbcsgroupprober.py b/src/sqlmap-master/thirdparty/chardet/mbcsgroupprober.py index 530abe7..3ba0475 100644 --- a/src/sqlmap-master/thirdparty/chardet/mbcsgroupprober.py +++ b/src/sqlmap-master/thirdparty/chardet/mbcsgroupprober.py @@ -39,16 +39,20 @@ from .euctwprober import EUCTWProber class MBCSGroupProber(CharSetGroupProber): + # 初始化MBCSGroupProber类,继承自CharSetGroupProber类 def __init__(self, lang_filter=None): + # 调用父类CharSetGroupProber的初始化方法 super(MBCSGroupProber, self).__init__(lang_filter=lang_filter) + # 定义一个包含多种字符集探测器的列表 self.probers = [ - UTF8Prober(), - SJISProber(), - EUCJPProber(), - GB2312Prober(), - EUCKRProber(), - CP949Prober(), - Big5Prober(), - EUCTWProber() + UTF8Prober(), # UTF-8字符集探测器 + SJISProber(), # Shift_JIS字符集探测器 + EUCJPProber(), # EUC-JP字符集探测器 + GB2312Prober(), # GB2312字符集探测器 + EUCKRProber(), # EUCKR字符集探测器 + CP949Prober(), # CP949字符集探测器 + Big5Prober(), # Big5字符集探测器 + EUCTWProber() # EUCTW字符集探测器 ] + # 重置探测器 self.reset() diff --git a/src/sqlmap-master/thirdparty/chardet/sbcharsetprober.py b/src/sqlmap-master/thirdparty/chardet/sbcharsetprober.py index 0adb51d..5ccee8a 100644 --- a/src/sqlmap-master/thirdparty/chardet/sbcharsetprober.py +++ b/src/sqlmap-master/thirdparty/chardet/sbcharsetprober.py @@ -31,13 +31,19 @@ from .enums import CharacterCategory, ProbingState, SequenceLikelihood class SingleByteCharSetProber(CharSetProber): + # 定义样本大小 SAMPLE_SIZE = 64 + # 定义相对阈值 SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2 + # 定义正向阈值 POSITIVE_SHORTCUT_THRESHOLD = 0.95 + # 定义负向阈值 NEGATIVE_SHORTCUT_THRESHOLD = 0.05 def __init__(self, model, reversed=False, name_prober=None): + # 调用父类构造函数 super(SingleByteCharSetProber, self).__init__() + # 设置模型 self._model = model # TRUE if we need to reverse every pair in the model lookup self._reversed = reversed @@ -51,6 +57,7 @@ class SingleByteCharSetProber(CharSetProber): self.reset() def reset(self): + # 重置函数 
super(SingleByteCharSetProber, self).reset() # char order of last character self._last_order = 255 @@ -69,16 +76,20 @@ class SingleByteCharSetProber(CharSetProber): @property def language(self): + # 如果_name_prober存在,则返回_name_prober的语言,否则返回_model中的语言 if self._name_prober: return self._name_prober.language else: return self._model.get('language') def feed(self, byte_str): + # 如果_model中的keep_english_letter为False,则过滤掉国际字符 if not self._model['keep_english_letter']: byte_str = self.filter_international_words(byte_str) + # 如果byte_str为空,则返回状态 if not byte_str: return self.state + # 获取字符到顺序的映射 char_to_order_map = self._model['char_to_order_map'] for i, c in enumerate(byte_str): # XXX: Order is in range 1-64, so one would think we want 0-63 here, @@ -122,11 +133,17 @@ class SingleByteCharSetProber(CharSetProber): return self.state def get_confidence(self): + # 初始化r为0.01 r = 0.01 + # 如果总序列数大于0 if self._total_seqs > 0: + # 计算r的值 r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) / self._total_seqs / self._model['typical_positive_ratio']) + # 乘以字符频率和总字符数 r = r * self._freq_char / self._total_char + # 如果r大于等于1.0,则将r设置为0.99 if r >= 1.0: r = 0.99 + # 返回r的值 return r diff --git a/src/sqlmap-master/thirdparty/chardet/sjisprober.py b/src/sqlmap-master/thirdparty/chardet/sjisprober.py index 9e29623..f1755f8 100644 --- a/src/sqlmap-master/thirdparty/chardet/sjisprober.py +++ b/src/sqlmap-master/thirdparty/chardet/sjisprober.py @@ -34,59 +34,94 @@ from .enums import ProbingState, MachineState class SJISProber(MultiByteCharSetProber): + # 初始化函数 def __init__(self): + # 调用父类的初始化函数 super(SJISProber, self).__init__() + # 初始化编码状态机 self.coding_sm = CodingStateMachine(SJIS_SM_MODEL) + # 初始化分布分析器 self.distribution_analyzer = SJISDistributionAnalysis() + # 初始化上下文分析器 self.context_analyzer = SJISContextAnalysis() + # 重置分析器 self.reset() + # 重置函数 def reset(self): + # 调用父类的重置函数 super(SJISProber, self).reset() + # 重置上下文分析器 self.context_analyzer.reset() @property def charset_name(self): + # 
返回字符集名称 return self.context_analyzer.charset_name @property def language(self): + # 返回语言 return "Japanese" def feed(self, byte_str): + # 遍历字节字符串 for i in range(len(byte_str)): + # 获取下一个状态 coding_state = self.coding_sm.next_state(byte_str[i]) + # 如果状态为错误 if coding_state == MachineState.ERROR: + # 记录错误日志 self.logger.debug('%s %s prober hit error at byte %s', self.charset_name, self.language, i) + # 设置状态为不是该字符集 self._state = ProbingState.NOT_ME break + # 如果状态为确定 elif coding_state == MachineState.ITS_ME: + # 设置状态为确定 self._state = ProbingState.FOUND_IT break + # 如果状态为开始 elif coding_state == MachineState.START: + # 获取当前字符长度 char_len = self.coding_sm.get_current_charlen() + # 如果是第一个字符 if i == 0: + # 更新最后一个字符 self._last_char[1] = byte_str[0] + # 向上下文分析器输入字符 self.context_analyzer.feed(self._last_char[2 - char_len:], char_len) + # 向分布分析器输入字符 self.distribution_analyzer.feed(self._last_char, char_len) else: + # 向上下文分析器输入字符 self.context_analyzer.feed(byte_str[i + 1 - char_len:i + 3 - char_len], char_len) + # 向分布分析器输入字符 self.distribution_analyzer.feed(byte_str[i - 1:i + 1], char_len) + # 更新最后一个字符 self._last_char[0] = byte_str[-1] + # 如果状态为检测中 if self.state == ProbingState.DETECTING: + # 如果上下文分析器有足够的数据,并且置信度大于阈值 if (self.context_analyzer.got_enough_data() and (self.get_confidence() > self.SHORTCUT_THRESHOLD)): + # 设置状态为确定 self._state = ProbingState.FOUND_IT + # 返回状态 return self.state + # 获取置信度 def get_confidence(self): + # 获取上下文分析器的置信度 context_conf = self.context_analyzer.get_confidence() + # 获取分布分析器的置信度 distrib_conf = self.distribution_analyzer.get_confidence() + # 返回上下文置信度和分布置信度中的最大值 return max(context_conf, distrib_conf) diff --git a/src/sqlmap-master/thirdparty/chardet/universaldetector.py b/src/sqlmap-master/thirdparty/chardet/universaldetector.py index 7b4e92d..796fd03 100644 --- a/src/sqlmap-master/thirdparty/chardet/universaldetector.py +++ b/src/sqlmap-master/thirdparty/chardet/universaldetector.py @@ -79,16 +79,27 @@ class UniversalDetector(object): 'iso-8859-13': 
'Windows-1257'} def __init__(self, lang_filter=LanguageFilter.ALL): + # 初始化语言过滤器 self._esc_charset_prober = None + # 初始化字符集探测器 self._charset_probers = [] + # 初始化结果 self.result = None + # 初始化完成标志 self.done = None + # 初始化是否获取数据标志 self._got_data = None + # 初始化输入状态 self._input_state = None + # 初始化最后一个字符 self._last_char = None + # 设置语言过滤器 self.lang_filter = lang_filter + # 获取日志记录器 self.logger = logging.getLogger(__name__) + # 初始化是否包含Windows字节标志 self._has_win_bytes = None + # 重置 self.reset() def reset(self): @@ -97,14 +108,22 @@ class UniversalDetector(object): initial states. This is called by ``__init__``, so you only need to call this directly in between analyses of different documents. """ + # 重置结果 self.result = {'encoding': None, 'confidence': 0.0, 'language': None} + # 重置完成标志 self.done = False + # 重置是否接收到数据标志 self._got_data = False + # 重置是否有win字节标志 self._has_win_bytes = False + # 重置输入状态 self._input_state = InputState.PURE_ASCII + # 重置最后一个字符 self._last_char = b'' + # 如果有esc字符集探测器,重置它 if self._esc_charset_prober: self._esc_charset_prober.reset() + # 重置所有字符集探测器 for prober in self._charset_probers: prober.reset() diff --git a/src/sqlmap-master/thirdparty/chardet/utf8prober.py b/src/sqlmap-master/thirdparty/chardet/utf8prober.py index 6c3196c..90788b5 100644 --- a/src/sqlmap-master/thirdparty/chardet/utf8prober.py +++ b/src/sqlmap-master/thirdparty/chardet/utf8prober.py @@ -33,50 +33,75 @@ from .mbcssm import UTF8_SM_MODEL class UTF8Prober(CharSetProber): + # 定义一个常量,表示一个字符的初始概率为0.5 ONE_CHAR_PROB = 0.5 + # 初始化函数 def __init__(self): + # 调用父类的初始化函数 super(UTF8Prober, self).__init__() + # 初始化编码状态机 self.coding_sm = CodingStateMachine(UTF8_SM_MODEL) + # 初始化多字节字符数量 self._num_mb_chars = None + # 调用重置函数 self.reset() + # 重置函数 def reset(self): + # 调用父类的重置函数 super(UTF8Prober, self).reset() + # 重置编码状态机 self.coding_sm.reset() + # 重置多字节字符数量 self._num_mb_chars = 0 + # 获取字符集名称的属性 @property def charset_name(self): + # 返回字符集名称 return "utf-8" + # 获取语言名称的属性 @property def language(self): + # 
返回语言名称 return "" def feed(self, byte_str): + # 遍历byte_str中的每个字符 for c in byte_str: + # 获取下一个状态 coding_state = self.coding_sm.next_state(c) + # 如果状态为ERROR,则将状态设置为NOT_ME,并跳出循环 if coding_state == MachineState.ERROR: self._state = ProbingState.NOT_ME break + # 如果状态为ITS_ME,则将状态设置为FOUND_IT,并跳出循环 elif coding_state == MachineState.ITS_ME: self._state = ProbingState.FOUND_IT break + # 如果状态为START,且当前字符长度大于等于2,则将_num_mb_chars加1 elif coding_state == MachineState.START: if self.coding_sm.get_current_charlen() >= 2: self._num_mb_chars += 1 + # 如果状态为DETECTING,且置信度大于SHORTCUT_THRESHOLD,则将状态设置为FOUND_IT if self.state == ProbingState.DETECTING: if self.get_confidence() > self.SHORTCUT_THRESHOLD: self._state = ProbingState.FOUND_IT + # 返回状态 return self.state def get_confidence(self): + # 初始化 unlike 为 0.99 unlike = 0.99 + # 如果_num_mb_chars 小于 6,则 unlike 乘以 ONE_CHAR_PROB 的 _num_mb_chars 次方 if self._num_mb_chars < 6: unlike *= self.ONE_CHAR_PROB ** self._num_mb_chars + # 返回 1.0 减去 unlike return 1.0 - unlike + # 否则返回 unlike else: return unlike diff --git a/src/sqlmap-master/thirdparty/clientform/clientform.py b/src/sqlmap-master/thirdparty/clientform/clientform.py index 34f2f99..bc1eb4a 100644 --- a/src/sqlmap-master/thirdparty/clientform/clientform.py +++ b/src/sqlmap-master/thirdparty/clientform/clientform.py @@ -67,30 +67,47 @@ __all__ = ['AmbiguityError', 'CheckboxControl', 'Control', 'TextareaControl', 'XHTMLCompatibleFormParser'] try: + # 尝试导入logging和inspect模块 import logging import inspect except ImportError: + # 如果导入失败,定义一个空的debug函数 def debug(msg, *args, **kwds): pass else: + # 如果导入成功,定义一个_logger对象 _logger = logging.getLogger("ClientForm") + # 定义一个优化hack变量 OPTIMIZATION_HACK = True + # 定义一个debug函数 def debug(msg, *args, **kwds): + # 如果优化hack为True,则返回 if OPTIMIZATION_HACK: return + # 获取调用者的函数名 caller_name = inspect.stack()[1][3] + # 定义一个扩展的消息 extended_msg = '%%s %s' % msg + # 定义一个扩展的参数 extended_args = (caller_name,)+args + # 调用_logger对象的debug方法 debug = _logger.debug(extended_msg, 
*extended_args, **kwds) + # 定义一个_show_debug_messages函数 def _show_debug_messages(): + # 定义一个全局变量OPTIMIZATION_HACK global OPTIMIZATION_HACK + # 将优化hack设置为False OPTIMIZATION_HACK = False + # 将_logger对象的日志级别设置为DEBUG _logger.setLevel(logging.DEBUG) + # 定义一个StreamHandler对象 handler = logging.StreamHandler(sys.stdout) + # 将StreamHandler对象的日志级别设置为DEBUG handler.setLevel(logging.DEBUG) + # 将StreamHandler对象添加到_logger对象中 _logger.addHandler(handler) try: @@ -114,13 +131,17 @@ except ImportError: import sys, re, random if sys.version_info >= (3, 0): + # 如果Python版本大于等于3.0,则将xrange替换为range xrange = range # monkeypatch to fix http://www.python.org/sf/803422 :-( +# 修补monkeypatch以修复http://www.python.org/sf/803422 :-( sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]") # HTMLParser.HTMLParser is recent, so live without it if it's not available # (also, sgmllib.SGMLParser is much more tolerant of bad HTML) +# HTMLParser.HTMLParser是最近的,如果不可用,则没有它 +# (另外,sgmllib.SGMLParser对不良HTML的容忍度更高) try: import HTMLParser except ImportError: @@ -131,9 +152,11 @@ else: try: import warnings except ImportError: + # 如果没有导入warnings模块,则定义一个空函数 def deprecation(message, stack_offset=0): pass else: + # 如果成功导入warnings模块,则定义一个警告函数 def deprecation(message, stack_offset=0): warnings.warn(message, DeprecationWarning, stacklevel=3+stack_offset) @@ -224,29 +247,39 @@ string. 
return '&'.join(l) def unescape(data, entities, encoding=DEFAULT_ENCODING): + # 如果data为None或者data中不包含"&",则直接返回data if data is None or "&" not in data: return data + # 如果data是字符串类型,则将encoding设置为None if isinstance(data, six.string_types): encoding = None + # 定义一个函数,用于替换实体 def replace_entities(match, entities=entities, encoding=encoding): + # 获取匹配到的实体 ent = match.group() + # 如果实体以"#"开头,则调用unescape_charref函数进行替换 if ent[1] == "#": return unescape_charref(ent[2:-1], encoding) + # 从entities中获取实体的替换值 repl = entities.get(ent) + # 如果替换值存在,并且encoding不为None,则尝试将替换值解码为字符串 if repl is not None: if hasattr(repl, "decode") and encoding is not None: try: repl = repl.decode(encoding) except UnicodeError: repl = ent + # 如果替换值不存在,则将替换值设置为实体本身 else: repl = ent + # 返回替换值 return repl + # 使用正则表达式替换data中的实体 return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data) def unescape_charref(data, encoding): @@ -646,31 +679,47 @@ class _AbstractFormParser: self._textarea = None def start_label(self, attrs): + # 打印attrs debug("%s", attrs) + # 如果当前标签存在,则结束标签 if self._current_label: self.end_label() + # 创建一个空字典 d = {} + # 遍历attrs for key, val in attrs: + # 如果val需要转义,则进行转义 d[key] = self.unescape_attr_if_required(val) + # 如果存在for属性,则taken为True taken = bool(d.get("for")) # empty id is invalid + # 添加__text属性,值为空字符串 d["__text"] = "" + # 添加__taken属性,值为taken d["__taken"] = taken + # 如果taken为True,则将d添加到labels列表中 if taken: self.labels.append(d) + # 将当前标签设置为d self._current_label = d def end_label(self): + # 打印空字符串 debug("") + # 获取当前标签 label = self._current_label + # 如果当前标签不存在,则返回 if label is None: # something is ugly in the HTML, but we're ignoring it return + # 将当前标签设置为None self._current_label = None + # 如果当前标签存在,则删除__taken属性 # if it is staying around, it is True in all cases del label["__taken"] def _add_label(self, d): #debug("%s", d) + # 如果当前标签存在,且__taken属性为False,则将__taken属性设置为True,并将当前标签添加到d的__label属性中 if self._current_label is not None: if not self._current_label["__taken"]: 
self._current_label["__taken"] = True @@ -743,12 +792,16 @@ class _AbstractFormParser: controls.append((type, name, d)) def do_isindex(self, attrs): + # 打印传入的属性 debug("%s", attrs) d = {} + # 遍历属性,将属性名和属性值存入字典 for key, val in attrs: d[key] = self.unescape_attr_if_required(val) + # 获取当前表单的控件 controls = self._current_form[2] + # 添加标签 self._add_label(d) # isindex doesn't have type or name HTML attributes controls.append(("isindex", None, d)) diff --git a/src/sqlmap-master/thirdparty/magic/magic.py b/src/sqlmap-master/thirdparty/magic/magic.py index 0a5c257..1cac7e1 100644 --- a/src/sqlmap-master/thirdparty/magic/magic.py +++ b/src/sqlmap-master/thirdparty/magic/magic.py @@ -64,14 +64,16 @@ class Magic: return magic_file(self.cookie, filename) def __del__(self): - # during shutdown magic_close may have been cleared already + # 析构函数,确保在对象被垃圾回收时关闭 libmagic cookie if self.cookie and magic_close: magic_close(self.cookie) self.cookie = None +# 全局变量,用于保存默认和MIME magic对象 _magic_mime = None _magic = None +# 获取默认和MIME magic对象的函数 def _get_magic_mime(): global _magic_mime if not _magic_mime: @@ -90,6 +92,7 @@ def _get_magic_type(mime): else: return _get_magic() +# 公共函数,用于识别文件和缓冲区 def from_file(filename, mime=False): m = _get_magic_type(mime) return m.from_file(filename) @@ -98,6 +101,7 @@ def from_buffer(buffer, mime=False): m = _get_magic_type(mime) return m.from_buffer(buffer) +# 使用 ctypes 导入 libmagic 库 try: libmagic = None @@ -106,7 +110,7 @@ try: from ctypes import c_char_p, c_int, c_size_t, c_void_p - # Let's try to find magic or magic1 + # 尝试找到 libmagic 库 dll = ctypes.util.find_library('magic') or ctypes.util.find_library('magic1') # This is necessary because find_library returns None if it doesn't find the library @@ -116,6 +120,7 @@ try: except WindowsError: pass + # 如果没有找到,尝试平台特定的路径 if not libmagic or not libmagic._name: platform_to_lib = {'darwin': ['/opt/local/lib/libmagic.dylib', '/usr/local/lib/libmagic.dylib', @@ -126,11 +131,13 @@ try: libmagic = ctypes.CDLL(dll) 
except OSError: pass - + + # 如果仍然没有找到,抛出 ImportError if not libmagic or not libmagic._name: # It is better to raise an ImportError since we are importing magic module raise ImportError('failed to find libmagic. Check your installation') + # 定义 magic_t 类型和错误检查函数 magic_t = ctypes.c_void_p def errorcheck(result, func, args): @@ -145,6 +152,7 @@ try: return None return filename.encode(sys.getfilesystemencoding()) + # 使用 ctypes 定义 libmagic 函数 magic_open = libmagic.magic_open magic_open.restype = magic_t magic_open.argtypes = [c_int] @@ -198,28 +206,31 @@ try: magic_compile.restype = c_int magic_compile.argtypes = [magic_t, c_char_p] +# 如果 libmagic 无法导入,定义回退函数 except (ImportError, OSError): from_file = from_buffer = lambda *args, **kwargs: MAGIC_UNKNOWN_FILETYPE -MAGIC_NONE = 0x000000 # No flags -MAGIC_DEBUG = 0x000001 # Turn on debugging -MAGIC_SYMLINK = 0x000002 # Follow symlinks -MAGIC_COMPRESS = 0x000004 # Check inside compressed files -MAGIC_DEVICES = 0x000008 # Look at the contents of devices -MAGIC_MIME = 0x000010 # Return a mime string -MAGIC_MIME_ENCODING = 0x000400 # Return the MIME encoding -MAGIC_CONTINUE = 0x000020 # Return all matches -MAGIC_CHECK = 0x000040 # Print warnings to stderr -MAGIC_PRESERVE_ATIME = 0x000080 # Restore access time on exit -MAGIC_RAW = 0x000100 # Don't translate unprintable chars -MAGIC_ERROR = 0x000200 # Handle ENOENT etc as real errors -MAGIC_NO_CHECK_COMPRESS = 0x001000 # Don't check for compressed files -MAGIC_NO_CHECK_TAR = 0x002000 # Don't check for tar files -MAGIC_NO_CHECK_SOFT = 0x004000 # Don't check magic entries -MAGIC_NO_CHECK_APPTYPE = 0x008000 # Don't check application type -MAGIC_NO_CHECK_ELF = 0x010000 # Don't check for elf details -MAGIC_NO_CHECK_ASCII = 0x020000 # Don't check for ascii files -MAGIC_NO_CHECK_TROFF = 0x040000 # Don't check ascii/troff -MAGIC_NO_CHECK_FORTRAN = 0x080000 # Don't check ascii/fortran -MAGIC_NO_CHECK_TOKENS = 0x100000 # Don't check ascii/tokens + +# 定义 libmagic 标志常量 +MAGIC_NONE = 0x000000 
# 无标志 +MAGIC_DEBUG = 0x000001 # 打开调试 +MAGIC_SYMLINK = 0x000002 # 跟随符号链接 +MAGIC_COMPRESS = 0x000004 # 检查压缩文件内部 +MAGIC_DEVICES = 0x000008 # 查看设备内容 +MAGIC_MIME = 0x000010 # 返回 MIME 字符串 +MAGIC_MIME_ENCODING = 0x000400 # 返回 MIME 编码 +MAGIC_CONTINUE = 0x000020 # 返回所有匹配项 +MAGIC_CHECK = 0x000040 # 打印警告到标准错误 +MAGIC_PRESERVE_ATIME = 0x000080 # 退出时恢复访问时间 +MAGIC_RAW = 0x000100 # 不转换不可打印字符 +MAGIC_ERROR = 0x000200 # 将 ENOENT 等视为真实错误 +MAGIC_NO_CHECK_COMPRESS = 0x001000 # 不检查压缩文件 +MAGIC_NO_CHECK_TAR = 0x002000 # 不检查 tar 文件 +MAGIC_NO_CHECK_SOFT = 0x004000 # 不检查 magic 条目 +MAGIC_NO_CHECK_APPTYPE = 0x008000 # 不检查应用程序类型 +MAGIC_NO_CHECK_ELF = 0x010000 # 不检查 elf 详细信息 +MAGIC_NO_CHECK_ASCII = 0x020000 # 不检查 ascii 文件 +MAGIC_NO_CHECK_TROFF = 0x040000 # 不检查 ascii/troff +MAGIC_NO_CHECK_FORTRAN = 0x080000 # 不检查 ascii/fortran +MAGIC_NO_CHECK_TOKENS = 0x100000 # 不检查 ascii/tokens MAGIC_UNKNOWN_FILETYPE = b"unknown" diff --git a/src/sqlmap-master/thirdparty/wininetpton/win_inet_pton.py b/src/sqlmap-master/thirdparty/wininetpton/win_inet_pton.py index 50ae621..a821761 100644 --- a/src/sqlmap-master/thirdparty/wininetpton/win_inet_pton.py +++ b/src/sqlmap-master/thirdparty/wininetpton/win_inet_pton.py @@ -8,14 +8,15 @@ import socket import ctypes import os - +# 定义一个结构体,用于存储socket地址信息 class sockaddr(ctypes.Structure): - _fields_ = [("sa_family", ctypes.c_short), - ("__pad1", ctypes.c_ushort), - ("ipv4_addr", ctypes.c_byte * 4), - ("ipv6_addr", ctypes.c_byte * 16), - ("__pad2", ctypes.c_ulong)] + _fields_ = [("sa_family", ctypes.c_short), # 地址族(例如AF_INET或AF_INET6) + ("__pad1", ctypes.c_ushort), # 填充字段 + ("ipv4_addr", ctypes.c_byte * 4), # IPv4地址(4个字节) + ("ipv6_addr", ctypes.c_byte * 16),# IPv6地址(16个字节) + ("__pad2", ctypes.c_ulong)] # 填充字段 +# 根据操作系统的不同,导入不同的库 if hasattr(ctypes, 'windll'): WSAStringToAddressA = ctypes.windll.ws2_32.WSAStringToAddressA WSAAddressToStringA = ctypes.windll.ws2_32.WSAAddressToStringA @@ -27,12 +28,13 @@ else: WSAStringToAddressA = not_windows WSAAddressToStringA = not_windows 
- +# inet_pton函数:将IP字符串转换为二进制格式 def inet_pton(address_family, ip_string): - addr = sockaddr() - addr.sa_family = address_family - addr_size = ctypes.c_int(ctypes.sizeof(addr)) + addr = sockaddr() # 创建sockaddr实例 + addr.sa_family = address_family # 设置地址族 + addr_size = ctypes.c_int(ctypes.sizeof(addr)) # 获取地址结构体大小 + # 使用WSAStringToAddressA函数将IP字符串转换为地址结构体 if WSAStringToAddressA( ip_string, address_family, @@ -42,6 +44,7 @@ def inet_pton(address_family, ip_string): ) != 0: raise socket.error(ctypes.FormatError()) + # 根据地址族返回对应的二进制IP地址 if address_family == socket.AF_INET: return ctypes.string_at(addr.ipv4_addr, 4) if address_family == socket.AF_INET6: @@ -49,14 +52,15 @@ def inet_pton(address_family, ip_string): raise socket.error('unknown address family') - +# inet_ntop函数:将二进制格式的IP地址转换为字符串 def inet_ntop(address_family, packed_ip): - addr = sockaddr() - addr.sa_family = address_family - addr_size = ctypes.c_int(ctypes.sizeof(addr)) - ip_string = ctypes.create_string_buffer(128) - ip_string_size = ctypes.c_int(ctypes.sizeof(ip_string)) + addr = sockaddr() # 创建sockaddr实例 + addr.sa_family = address_family # 设置地址族 + addr_size = ctypes.c_int(ctypes.sizeof(addr)) # 获取地址结构体大小 + ip_string = ctypes.create_string_buffer(128) # 创建字符串缓冲区 + ip_string_size = ctypes.c_int(ctypes.sizeof(ip_string)) # 获取字符串缓冲区大小 + # 根据地址族将二进制IP地址复制到地址结构体中 if address_family == socket.AF_INET: if len(packed_ip) != ctypes.sizeof(addr.ipv4_addr): raise socket.error('packed IP wrong length for inet_ntoa') @@ -68,6 +72,7 @@ def inet_ntop(address_family, packed_ip): else: raise socket.error('unknown address family') + # 使用WSAAddressToStringA函数将地址结构体转换为IP字符串 if WSAAddressToStringA( ctypes.byref(addr), addr_size, @@ -79,7 +84,7 @@ def inet_ntop(address_family, packed_ip): return ip_string[:ip_string_size.value - 1] -# Adding our two functions to the socket library +# 如果当前操作系统是Windows,将自定义的inet_pton和inet_ntop函数添加到socket库中 if os.name == 'nt': socket.inet_pton = inet_pton - socket.inet_ntop = inet_ntop + 
socket.inet_ntop = inet_ntop \ No newline at end of file