httpie-cli/httpie/encoding.py
Batuhan Taskaya e09401b81a
Optimize encoding detection (#1243)
* Optimize encoding detection

* Use a threshold based system
2021-12-23 11:05:58 -08:00

51 lines
1.4 KiB
Python

from typing import Union, Tuple
from charset_normalizer import from_bytes
from charset_normalizer.constant import TOO_SMALL_SEQUENCE
UTF8 = 'utf-8'
ContentBytes = Union[bytearray, bytes]
def detect_encoding(content: ContentBytes) -> str:
"""
We default to UTF-8 if text too short, because the detection
can return a random encoding leading to confusing results
given the `charset_normalizer` version (< 2.0.5).
>>> too_short = ']"foo"'
>>> detected = from_bytes(too_short.encode()).best().encoding
>>> detected
'ascii'
>>> too_short.encode().decode(detected)
']"foo"'
"""
encoding = UTF8
if len(content) > TOO_SMALL_SEQUENCE:
match = from_bytes(bytes(content)).best()
if match:
encoding = match.encoding
return encoding
def smart_decode(content: ContentBytes, encoding: str) -> Tuple[str, str]:
"""Decode `content` using the given `encoding`.
If no `encoding` is provided, the best effort is to guess it from `content`.
Unicode errors are replaced.
"""
if not encoding:
encoding = detect_encoding(content)
return content.decode(encoding, 'replace'), encoding
def smart_encode(content: str, encoding: str) -> bytes:
"""Encode `content` using the given `encoding`.
Unicode errors are replaced.
"""
return content.encode(encoding, 'replace')