Drop sequence length check

This commit is contained in:
Mickaël Schoentgen 2021-10-05 10:23:08 +02:00
parent 252fe02f74
commit d52a4833e4
3 changed files with 7 additions and 12 deletions

View File

@@ -1,7 +1,6 @@
 from typing import Union
 from charset_normalizer import from_bytes
-from charset_normalizer.constant import TOO_SMALL_SEQUENCE
 UTF8 = 'utf-8'
@@ -16,16 +15,14 @@ def detect_encoding(content: ContentBytes) -> str:
 >>> too_short = ']"foo"'
 >>> detected = from_bytes(too_short.encode()).best().encoding
 >>> detected
-'utf_16_be'
+'ascii'
 >>> too_short.encode().decode(detected)
-'崢景漢'
+']"foo"'
 """
 encoding = UTF8
-if len(content) > TOO_SMALL_SEQUENCE:
-match = from_bytes(bytes(content)).best()
-if match:
-encoding = match.encoding
+match = from_bytes(bytes(content)).best()
+if match:
+encoding = match.encoding
 return encoding

View File

@@ -25,7 +25,7 @@ dev_require = [
 'wheel',
 ]
 install_requires = [
-'charset_normalizer>=2.0.0',
+'charset_normalizer>=2.0.5',
 'defusedxml>=0.6.0',
 'requests[socks]>=2.22.0',
 'Pygments>=2.5.2',

View File

@@ -4,7 +4,6 @@ Various unicode handling related tests.
 """
 import pytest
 import responses
-from charset_normalizer.constant import TOO_SMALL_SEQUENCE
 from httpie.cli.constants import PRETTY_MAP
 from httpie.encoding import UTF8
@@ -13,8 +12,7 @@ from .utils import http, HTTP_OK, URL_EXAMPLE, MockEnvironment, StdinBytesIO
 from .fixtures import UNICODE
-CZECH_TEXT = 'Všichni lidé jsou si rovni. Všichni lidé jsou si rovni.'
-assert len(CZECH_TEXT) > TOO_SMALL_SEQUENCE
+CZECH_TEXT = 'Všichni lidé jsou si rovni.'
 CZECH_TEXT_SPECIFIC_CHARSET = 'windows-1250'
 ENCODINGS = [UTF8, CZECH_TEXT_SPECIFIC_CHARSET]