You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
sqlmap/src/sqlmap-master/thirdparty/chardet/utf8prober.py

108 lines
3.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .charsetprober import CharSetProber
from .enums import ProbingState, MachineState
from .codingstatemachine import CodingStateMachine
from .mbcssm import UTF8_SM_MODEL
class UTF8Prober(CharSetProber):
    """Charset prober that reports "utf-8".

    Every byte fed in is pushed through a ``CodingStateMachine`` built from
    ``UTF8_SM_MODEL``.  Each time the machine returns to ``START`` after a
    character whose reported length is at least 2, a counter is bumped; that
    counter drives the value returned by ``get_confidence``.
    """

    # Factor applied once per counted character when computing confidence.
    ONE_CHAR_PROB = 0.5

    def __init__(self):
        """Create the state machine and put the prober in its initial state."""
        super(UTF8Prober, self).__init__()
        self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
        # Placeholder only; reset() assigns the real starting value.
        self._num_mb_chars = None
        self.reset()

    def reset(self):
        """Reset the base prober, the state machine and the character count."""
        super(UTF8Prober, self).reset()
        self.coding_sm.reset()
        self._num_mb_chars = 0

    @property
    def charset_name(self):
        """Name of the charset this prober detects."""
        return "utf-8"

    @property
    def language(self):
        """Language associated with this prober (always the empty string)."""
        return ""

    def feed(self, byte_str):
        """Consume ``byte_str`` and return the prober's resulting state.

        Iteration stops at the first element for which the state machine
        reports ``ERROR`` (state becomes ``NOT_ME``) or ``ITS_ME`` (state
        becomes ``FOUND_IT``).
        """
        for byte in byte_str:
            machine_state = self.coding_sm.next_state(byte)

            if machine_state == MachineState.ERROR:
                self._state = ProbingState.NOT_ME
                break

            if machine_state == MachineState.ITS_ME:
                self._state = ProbingState.FOUND_IT
                break

            # Back at START: count the character just finished, but only
            # when the machine reports a length of two or more.
            if (machine_state == MachineState.START
                    and self.coding_sm.get_current_charlen() >= 2):
                self._num_mb_chars += 1

        # Still undecided: promote to FOUND_IT once the confidence is above
        # the shortcut threshold.
        if (self.state == ProbingState.DETECTING
                and self.get_confidence() > self.SHORTCUT_THRESHOLD):
            self._state = ProbingState.FOUND_IT

        return self.state

    def get_confidence(self):
        """Return the confidence derived from the counted characters.

        With six or more counted characters the result is a flat 0.99.
        Below that it is ``1.0 - 0.99 * ONE_CHAR_PROB ** count``.
        """
        ceiling = 0.99
        if self._num_mb_chars >= 6:
            return ceiling
        return 1.0 - ceiling * self.ONE_CHAR_PROB ** self._num_mb_chars