You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
55 lines
2.0 KiB
55 lines
2.0 KiB
5 months ago
|
from typing import Any, Dict, Optional, Union
|
||
|
from warnings import warn
|
||
|
|
||
|
from .api import from_bytes
|
||
|
from .constant import CHARDET_CORRESPONDENCE
|
||
|
|
||
|
|
||
|
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> Dict[str, Optional[Union[str, float]]]:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
    This function is deprecated and should be used to migrate your project easily, consult the documentation for
    further information. Not planned for removal.

    :param byte_str:     The byte sequence to examine (``bytes`` or ``bytearray``).
    :param should_rename_legacy:  Should we rename legacy encodings
                                  to their more modern equivalents?
    :param kwargs:       Any extra keyword argument is ignored; a warning listing
                         the ignored names is emitted.
    :return: A dict with the keys ``"encoding"``, ``"language"`` and ``"confidence"``.
             When no match is found, ``encoding`` and ``confidence`` are ``None``
             and ``language`` is an empty string.
    :raises TypeError: If ``byte_str`` is neither ``bytes`` nor ``bytearray``.
    """
    if kwargs:
        ignored = ",".join(kwargs)
        # stacklevel=2 attributes the warning to the caller that passed the
        # stray arguments rather than to this line inside the library.
        warn(
            f"charset-normalizer disregard arguments '{ignored}' in legacy function detect()",
            stacklevel=2,
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    best_match = from_bytes(byte_str).best()

    # Defaults describe the "nothing detected" outcome.
    encoding: Optional[str] = None
    language: str = ""
    confidence: Optional[float] = None

    if best_match is not None:
        encoding = best_match.encoding
        # "Unknown" is reported as an empty string in the legacy result.
        if best_match.language != "Unknown":
            language = best_match.language
        confidence = 1.0 - best_match.chaos

        # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the
        # detection/normalization process but chardet does return 'utf-8-sig' and it is a
        # valid codec name.
        if encoding == "utf_8" and best_match.bom:
            encoding += "_sig"

    # Identity check kept on purpose: only the literal ``False`` triggers the
    # lookup in CHARDET_CORRESPONDENCE.
    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }
|