forked from extern/httpie-cli
Optimize encoding detection (#1243)
* Optimize encoding detection * Use a threshold based system
This commit is contained in:
parent
5a83a9ebc4
commit
e09401b81a
@ -178,9 +178,8 @@ for pretty in ['all', 'none']:
|
|||||||
f'`http --pretty={pretty} pie.dev/stream/1000`',
|
f'`http --pretty={pretty} pie.dev/stream/1000`',
|
||||||
[
|
[
|
||||||
'--print=HBhb',
|
'--print=HBhb',
|
||||||
'--stream',
|
|
||||||
f'--pretty={pretty}',
|
f'--pretty={pretty}',
|
||||||
'httpbin.org/stream/100'
|
'httpbin.org/stream/1000'
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
DownloadRunner('download', '`http --download :/big_file.txt` (3GB)', '3G')
|
DownloadRunner('download', '`http --download :/big_file.txt` (3GB)', '3G')
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from typing import Union
|
from typing import Union, Tuple
|
||||||
|
|
||||||
from charset_normalizer import from_bytes
|
from charset_normalizer import from_bytes
|
||||||
from charset_normalizer.constant import TOO_SMALL_SEQUENCE
|
from charset_normalizer.constant import TOO_SMALL_SEQUENCE
|
||||||
@ -29,7 +29,7 @@ def detect_encoding(content: ContentBytes) -> str:
|
|||||||
return encoding
|
return encoding
|
||||||
|
|
||||||
|
|
||||||
def smart_decode(content: ContentBytes, encoding: str) -> str:
|
def smart_decode(content: ContentBytes, encoding: str) -> Tuple[str, str]:
|
||||||
"""Decode `content` using the given `encoding`.
|
"""Decode `content` using the given `encoding`.
|
||||||
If no `encoding` is provided, the best effort is to guess it from `content`.
|
If no `encoding` is provided, the best effort is to guess it from `content`.
|
||||||
|
|
||||||
@ -38,7 +38,7 @@ def smart_decode(content: ContentBytes, encoding: str) -> str:
|
|||||||
"""
|
"""
|
||||||
if not encoding:
|
if not encoding:
|
||||||
encoding = detect_encoding(content)
|
encoding = detect_encoding(content)
|
||||||
return content.decode(encoding, 'replace')
|
return content.decode(encoding, 'replace'), encoding
|
||||||
|
|
||||||
|
|
||||||
def smart_encode(content: str, encoding: str) -> bytes:
|
def smart_encode(content: str, encoding: str) -> bytes:
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
from abc import ABCMeta, abstractmethod
|
from abc import ABCMeta, abstractmethod
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
from typing import Callable, Iterable, Union
|
from typing import Callable, Iterable, Optional, Union
|
||||||
|
|
||||||
from .processing import Conversion, Formatting
|
from .processing import Conversion, Formatting
|
||||||
from ..context import Environment
|
from ..context import Environment
|
||||||
@ -89,6 +89,9 @@ class RawStream(BaseStream):
|
|||||||
return self.msg.iter_body(self.chunk_size)
|
return self.msg.iter_body(self.chunk_size)
|
||||||
|
|
||||||
|
|
||||||
|
# Minimum number of recorded per-chunk encoding guesses that must exist
# before `EncodedStream.encoding` will consider locking in a guessed encoding.
ENCODING_GUESS_THRESHOLD = 3
|
||||||
|
|
||||||
|
|
||||||
class EncodedStream(BaseStream):
|
class EncodedStream(BaseStream):
|
||||||
"""Encoded HTTP message stream.
|
"""Encoded HTTP message stream.
|
||||||
|
|
||||||
@ -111,7 +114,8 @@ class EncodedStream(BaseStream):
|
|||||||
self.mime = mime_overwrite
|
self.mime = mime_overwrite
|
||||||
else:
|
else:
|
||||||
self.mime, _ = parse_content_type_header(self.msg.content_type)
|
self.mime, _ = parse_content_type_header(self.msg.content_type)
|
||||||
self.encoding = encoding_overwrite or self.msg.encoding
|
self._encoding = encoding_overwrite or self.msg.encoding
|
||||||
|
self._encoding_guesses = []
|
||||||
if env.stdout_isatty:
|
if env.stdout_isatty:
|
||||||
# Use the encoding supported by the terminal.
|
# Use the encoding supported by the terminal.
|
||||||
output_encoding = env.stdout_encoding
|
output_encoding = env.stdout_encoding
|
||||||
@ -125,9 +129,33 @@ class EncodedStream(BaseStream):
|
|||||||
for line, lf in self.msg.iter_lines(self.CHUNK_SIZE):
|
for line, lf in self.msg.iter_lines(self.CHUNK_SIZE):
|
||||||
if b'\0' in line:
|
if b'\0' in line:
|
||||||
raise BinarySuppressedError()
|
raise BinarySuppressedError()
|
||||||
line = smart_decode(line, self.encoding)
|
line = self.decode_chunk(line)
|
||||||
yield smart_encode(line, self.output_encoding) + lf
|
yield smart_encode(line, self.output_encoding) + lf
|
||||||
|
|
||||||
|
def decode_chunk(self, raw_chunk: bytes) -> str:
    """Decode one raw body chunk to text, tracking the encoding that was used.

    The chunk is decoded with the current value of `self.encoding`; when
    that is empty, `smart_decode` guesses an encoding from the chunk itself.
    While no encoding has been settled yet, the encoding reported by
    `smart_decode` is recorded in `self._encoding_guesses` so that the
    `encoding` property can later lock in a guess.

    :param raw_chunk: the undecoded bytes of a single chunk/line.
    :return: the decoded text.
    """
    chunk, guessed_encoding = smart_decode(raw_chunk, self.encoding)
    # Once an encoding is settled, the `encoding` property returns it without
    # consulting the guess history, so recording further entries would only
    # make the list grow by one item per chunk for the rest of the stream.
    if not self._encoding:
        self._encoding_guesses.append(guessed_encoding)
    return chunk
|
||||||
|
|
||||||
|
@property
def encoding(self) -> Optional[str]:
    """Return the encoding to decode with, or `None` if it is still unknown.

    An already settled encoding always wins. Otherwise, once at least
    `ENCODING_GUESS_THRESHOLD` guesses have been recorded, the two most
    recent guesses are compared; if they agree, that encoding is treated as
    reliable, stored, and used for all following chunks.
    """
    settled = self._encoding
    if settled:
        return settled

    history = self._encoding_guesses
    if len(history) >= ENCODING_GUESS_THRESHOLD:
        # A guess that was produced twice in a row is considered reliable,
        # so keep it and stop guessing on later chunks.
        previous, latest = history[-2:]
        if previous == latest:
            self._encoding = previous
            return previous

    return None

@encoding.setter
def encoding(self, value) -> None:
    """Explicitly set the encoding used for decoding."""
    self._encoding = value
|
||||||
|
|
||||||
|
|
||||||
class PrettyStream(EncodedStream):
|
class PrettyStream(EncodedStream):
|
||||||
"""In addition to :class:`EncodedStream` behaviour, this stream applies
|
"""In addition to :class:`EncodedStream` behaviour, this stream applies
|
||||||
@ -178,7 +206,7 @@ class PrettyStream(EncodedStream):
|
|||||||
if not isinstance(chunk, str):
|
if not isinstance(chunk, str):
|
||||||
# Text when a converter has been used,
|
# Text when a converter has been used,
|
||||||
# otherwise it will always be bytes.
|
# otherwise it will always be bytes.
|
||||||
chunk = smart_decode(chunk, self.encoding)
|
chunk = self.decode_chunk(chunk)
|
||||||
chunk = self.formatting.format_body(content=chunk, mime=self.mime)
|
chunk = self.formatting.format_body(content=chunk, mime=self.mime)
|
||||||
return smart_encode(chunk, self.output_encoding)
|
return smart_encode(chunk, self.output_encoding)
|
||||||
|
|
||||||
|
2
tests/fixtures/__init__.py
vendored
2
tests/fixtures/__init__.py
vendored
@ -32,6 +32,8 @@ JSON_FILE_PATH_ARG = patharg(JSON_FILE_PATH)
|
|||||||
# line would be escaped).
|
# line would be escaped).
|
||||||
FILE_CONTENT = FILE_PATH.read_text(encoding=UTF8).strip()
|
FILE_CONTENT = FILE_PATH.read_text(encoding=UTF8).strip()
|
||||||
|
|
||||||
|
# ASCII-only sample text: the literal "random text" repeated 10 times.
ASCII_FILE_CONTENT = "random text" * 10
|
||||||
|
|
||||||
|
|
||||||
JSON_FILE_CONTENT = JSON_FILE_PATH.read_text(encoding=UTF8)
|
JSON_FILE_CONTENT = JSON_FILE_PATH.read_text(encoding=UTF8)
|
||||||
BIN_FILE_CONTENT = BIN_FILE_PATH.read_bytes()
|
BIN_FILE_CONTENT = BIN_FILE_PATH.read_bytes()
|
||||||
|
@ -11,7 +11,12 @@ from httpie.plugins import ConverterPlugin
|
|||||||
from httpie.plugins.registry import plugin_manager
|
from httpie.plugins.registry import plugin_manager
|
||||||
|
|
||||||
from .utils import StdinBytesIO, http, MockEnvironment, DUMMY_URL
|
from .utils import StdinBytesIO, http, MockEnvironment, DUMMY_URL
|
||||||
from .fixtures import BIN_FILE_CONTENT, BIN_FILE_PATH
|
from .fixtures import (
|
||||||
|
ASCII_FILE_CONTENT,
|
||||||
|
BIN_FILE_CONTENT,
|
||||||
|
BIN_FILE_PATH,
|
||||||
|
FILE_CONTENT as UNICODE_FILE_CONTENT
|
||||||
|
)
|
||||||
|
|
||||||
PRETTY_OPTIONS = list(PRETTY_MAP.keys())
|
PRETTY_OPTIONS = list(PRETTY_MAP.keys())
|
||||||
|
|
||||||
@ -133,3 +138,9 @@ def test_auto_streaming(http_server, extras, expected):
|
|||||||
for call_arg in env.stdout.write.call_args_list
|
for call_arg in env.stdout.write.call_args_list
|
||||||
if b'test' in call_arg[0][0]
|
if b'test' in call_arg[0][0]
|
||||||
]) == expected
|
]) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_streaming_encoding_detection(http_server):
    """Stream a body that mixes ASCII and Unicode chunks and check both appear in the output."""
    url = http_server + '/stream/encoding/random'
    response = http('--stream', url)
    for expected_text in (ASCII_FILE_CONTENT, UNICODE_FILE_CONTENT):
        assert expected_text in response
|
||||||
|
@ -52,6 +52,27 @@ def chunked_drip(handler):
|
|||||||
handler.wfile.write('0\r\n\r\n'.encode('utf-8'))
|
handler.wfile.write('0\r\n\r\n'.encode('utf-8'))
|
||||||
|
|
||||||
|
|
||||||
|
@TestHandler.handler('GET', '/stream/encoding/random')
def random_encoding(handler):
    """Respond with a chunked body of two ASCII chunks followed by three Unicode chunks.

    Each chunk is the fixture text plus a trailing newline, framed as
    `<size in hex>\\r\\n<data>\\r\\n`, and the body is closed with a
    zero-length chunk.
    """
    from tests.fixtures import ASCII_FILE_CONTENT, FILE_CONTENT as UNICODE_FILE_CONTENT

    handler.send_response(200)
    handler.send_header('Transfer-Encoding', 'chunked')
    handler.end_headers()

    bodies = [ASCII_FILE_CONTENT] * 2 + [UNICODE_FILE_CONTENT] * 3
    for text in bodies:
        payload = (text + "\n").encode()
        # The chunk size counts encoded bytes, not characters.
        size_line = f'{len(payload):X}\r\n'.encode()
        handler.wfile.write(size_line + payload + '\r\n'.encode())

    handler.wfile.write('0\r\n\r\n'.encode('utf-8'))
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function")
|
@pytest.fixture(scope="function")
|
||||||
def http_server():
|
def http_server():
|
||||||
"""A custom HTTP server implementation for our tests, that is
|
"""A custom HTTP server implementation for our tests, that is
|
||||||
|
Loading…
Reference in New Issue
Block a user