httpie-cli/httpie/encoding.py

from typing import Union, Tuple

from charset_normalizer import from_bytes
from charset_normalizer.constant import TOO_SMALL_SEQUENCE

# Name of the fallback encoding used when detection is skipped or finds no match.
UTF8 = 'utf-8'

# Raw content accepted by the helpers in this module: mutable or immutable bytes.
ContentBytes = Union[bytearray, bytes]


def detect_encoding(content: ContentBytes) -> str:
    """
    Guess the text encoding of `content`, falling back to UTF-8.

    Detection is only attempted when `content` is longer than
    `TOO_SMALL_SEQUENCE`; for shorter input the detector can return
    a random encoding leading to confusing results given
    the `charset_normalizer` version (< 2.0.5), so UTF-8 is returned
    directly. UTF-8 is also returned when the detector finds no match.

    >>> too_short = ']"foo"'
    >>> detected = from_bytes(too_short.encode()).best().encoding
    >>> detected
    'ascii'
    >>> too_short.encode().decode(detected)
    ']"foo"'
    """
    # Too little data to trust the detector: use the default right away.
    if len(content) <= TOO_SMALL_SEQUENCE:
        return UTF8

    # The detector is handed an immutable `bytes` copy of the content.
    best_guess = from_bytes(bytes(content)).best()
    return best_guess.encoding if best_guess else UTF8


def smart_decode(content: ContentBytes, encoding: str) -> Tuple[str, str]:
    """Turn raw `content` into text and report which encoding was used.

    When `encoding` is falsy (e.g. an empty string), it is guessed from
    `content` on a best-effort basis via `detect_encoding()`.

    Returns a `(text, encoding)` pair. Undecodable bytes are replaced
    instead of raising a Unicode error.

    """
    effective_encoding = encoding or detect_encoding(content)
    text = content.decode(encoding=effective_encoding, errors='replace')
    return text, effective_encoding


def smart_encode(content: str, encoding: str) -> bytes:
    """Turn the text `content` into bytes using `encoding`.

    Characters that `encoding` cannot represent are replaced
    instead of raising a Unicode error.

    """
    encoded = content.encode(encoding=encoding, errors='replace')
    return encoded