######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

from .charsetprober import CharSetProber
from .enums import ProbingState, MachineState
from .codingstatemachine import CodingStateMachine
from .mbcssm import UTF8_SM_MODEL


class UTF8Prober(CharSetProber):
    """Prober that reports the charset name "utf-8".

    Input bytes are pushed through a ``CodingStateMachine`` built from
    ``UTF8_SM_MODEL``.  The prober counts how many characters of length
    two or more the machine completes and derives its confidence from
    that count.
    """

    # Factor applied once per counted multi-byte character when computing
    # the confidence (see get_confidence).
    ONE_CHAR_PROB = 0.5

    def __init__(self):
        """Create the state machine and put the prober in its initial state."""
        super(UTF8Prober, self).__init__()
        self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
        # Placeholder only; reset() below assigns the real starting value.
        self._num_mb_chars = None
        self.reset()

    def reset(self):
        """Reset the base prober, the state machine and the character count."""
        super(UTF8Prober, self).reset()
        self.coding_sm.reset()
        self._num_mb_chars = 0

    @property
    def charset_name(self):
        """Name of the charset this prober detects."""
        return "utf-8"

    @property
    def language(self):
        """Language associated with this prober (always the empty string)."""
        return ""

    def feed(self, byte_str):
        """Consume ``byte_str`` and return the resulting probing state.

        Each element of ``byte_str`` is handed to the state machine in
        order.  Processing stops early when the machine reports ERROR
        (state becomes NOT_ME) or ITS_ME (state becomes FOUND_IT).  Every
        time the machine returns to START after a character whose length
        is at least 2, the multi-byte character count is incremented.

        After the loop, a prober that is still DETECTING is promoted to
        FOUND_IT if its confidence exceeds ``SHORTCUT_THRESHOLD``.
        """
        for byte in byte_str:
            machine_state = self.coding_sm.next_state(byte)

            if machine_state == MachineState.ERROR:
                self._state = ProbingState.NOT_ME
                break

            if machine_state == MachineState.ITS_ME:
                self._state = ProbingState.FOUND_IT
                break

            # Back at START means a full character was just consumed;
            # only characters of two or more bytes count as evidence.
            if (machine_state == MachineState.START
                    and self.coding_sm.get_current_charlen() >= 2):
                self._num_mb_chars += 1

        if (self.state == ProbingState.DETECTING
                and self.get_confidence() > self.SHORTCUT_THRESHOLD):
            self._state = ProbingState.FOUND_IT

        return self.state

    def get_confidence(self):
        """Return the confidence derived from the multi-byte character count.

        With fewer than six counted characters the result is
        ``1.0 - 0.99 * ONE_CHAR_PROB ** count``; with six or more it is
        the constant ``0.99``.
        """
        base_unlikelihood = 0.99
        seen = self._num_mb_chars

        if seen < 6:
            return 1.0 - base_unlikelihood * self.ONE_CHAR_PROB ** seen
        return base_unlikelihood