Optimize encoding detection (#1243)

* Optimize encoding detection

* Use a threshold based system
This commit is contained in:
Batuhan Taskaya 2021-12-23 22:05:58 +03:00 committed by GitHub
parent 5a83a9ebc4
commit e09401b81a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 71 additions and 10 deletions

View File

@ -178,9 +178,8 @@ for pretty in ['all', 'none']:
f'`http --pretty={pretty} pie.dev/stream/1000`', f'`http --pretty={pretty} pie.dev/stream/1000`',
[ [
'--print=HBhb', '--print=HBhb',
'--stream',
f'--pretty={pretty}', f'--pretty={pretty}',
'httpbin.org/stream/100' 'httpbin.org/stream/1000'
] ]
) )
DownloadRunner('download', '`http --download :/big_file.txt` (3GB)', '3G') DownloadRunner('download', '`http --download :/big_file.txt` (3GB)', '3G')

View File

@ -1,4 +1,4 @@
from typing import Union from typing import Union, Tuple
from charset_normalizer import from_bytes from charset_normalizer import from_bytes
from charset_normalizer.constant import TOO_SMALL_SEQUENCE from charset_normalizer.constant import TOO_SMALL_SEQUENCE
@ -29,7 +29,7 @@ def detect_encoding(content: ContentBytes) -> str:
return encoding return encoding
def smart_decode(content: ContentBytes, encoding: str) -> str: def smart_decode(content: ContentBytes, encoding: str) -> Tuple[str, str]:
"""Decode `content` using the given `encoding`. """Decode `content` using the given `encoding`.
If no `encoding` is provided, the best effort is to guess it from `content`. If no `encoding` is provided, the best effort is to guess it from `content`.
@ -38,7 +38,7 @@ def smart_decode(content: ContentBytes, encoding: str) -> str:
""" """
if not encoding: if not encoding:
encoding = detect_encoding(content) encoding = detect_encoding(content)
return content.decode(encoding, 'replace') return content.decode(encoding, 'replace'), encoding
def smart_encode(content: str, encoding: str) -> bytes: def smart_encode(content: str, encoding: str) -> bytes:

View File

@ -1,6 +1,6 @@
from abc import ABCMeta, abstractmethod from abc import ABCMeta, abstractmethod
from itertools import chain from itertools import chain
from typing import Callable, Iterable, Union from typing import Callable, Iterable, Optional, Union
from .processing import Conversion, Formatting from .processing import Conversion, Formatting
from ..context import Environment from ..context import Environment
@ -89,6 +89,9 @@ class RawStream(BaseStream):
return self.msg.iter_body(self.chunk_size) return self.msg.iter_body(self.chunk_size)
# Minimum number of per-chunk encoding guesses that must have been recorded
# before `EncodedStream.encoding` will consider trusting a guessed encoding.
ENCODING_GUESS_THRESHOLD = 3
class EncodedStream(BaseStream): class EncodedStream(BaseStream):
"""Encoded HTTP message stream. """Encoded HTTP message stream.
@ -111,7 +114,8 @@ class EncodedStream(BaseStream):
self.mime = mime_overwrite self.mime = mime_overwrite
else: else:
self.mime, _ = parse_content_type_header(self.msg.content_type) self.mime, _ = parse_content_type_header(self.msg.content_type)
self.encoding = encoding_overwrite or self.msg.encoding self._encoding = encoding_overwrite or self.msg.encoding
self._encoding_guesses = []
if env.stdout_isatty: if env.stdout_isatty:
# Use the encoding supported by the terminal. # Use the encoding supported by the terminal.
output_encoding = env.stdout_encoding output_encoding = env.stdout_encoding
@ -125,9 +129,33 @@ class EncodedStream(BaseStream):
for line, lf in self.msg.iter_lines(self.CHUNK_SIZE): for line, lf in self.msg.iter_lines(self.CHUNK_SIZE):
if b'\0' in line: if b'\0' in line:
raise BinarySuppressedError() raise BinarySuppressedError()
line = smart_decode(line, self.encoding) line = self.decode_chunk(line)
yield smart_encode(line, self.output_encoding) + lf yield smart_encode(line, self.output_encoding) + lf
def decode_chunk(self, raw_chunk: bytes) -> str:
    """Decode one raw body chunk and record the encoding that was used.

    The chunk is decoded with ``self.encoding``; when that is falsy,
    ``smart_decode`` guesses an encoding from the chunk itself. The
    encoding it reports back is appended to ``self._encoding_guesses``
    so the ``encoding`` property can later settle on a stable guess.

    :param raw_chunk: undecoded bytes of a line/chunk of the message body.
    :return: the decoded text.
    """
    chunk, guessed_encoding = smart_decode(raw_chunk, self.encoding)
    guesses = self._encoding_guesses
    guesses.append(guessed_encoding)
    # Only the length (up to the threshold) and the two newest entries are
    # ever consulted, so discard older guesses instead of letting the list
    # grow by one entry per chunk for the whole lifetime of a long stream.
    keep = max(ENCODING_GUESS_THRESHOLD, 2)
    del guesses[:-keep]
    return chunk
@property
def encoding(self) -> Optional[str]:
    """Return the encoding to decode the body with, or ``None`` if unknown.

    An explicitly set encoding always wins. Otherwise, once at least
    ``ENCODING_GUESS_THRESHOLD`` guesses have been recorded and the two
    most recent ones agree, that guess is stored and returned from then on.
    """
    settled = self._encoding
    if settled:
        return settled

    history = self._encoding_guesses
    if len(history) < ENCODING_GUESS_THRESHOLD:
        # Not enough samples yet to trust any guess.
        return None

    # If we find a reliable (used consecutively) encoding, then
    # use it for the next iterations.
    previous, latest = history[-2:]
    if previous != latest:
        return None

    self._encoding = previous
    return previous
@encoding.setter
def encoding(self, value) -> None:
    """Store `value` as the encoding returned by the `encoding` property.

    A falsy `value` makes the property fall back to the recorded guesses.
    """
    self._encoding = value
class PrettyStream(EncodedStream): class PrettyStream(EncodedStream):
"""In addition to :class:`EncodedStream` behaviour, this stream applies """In addition to :class:`EncodedStream` behaviour, this stream applies
@ -178,7 +206,7 @@ class PrettyStream(EncodedStream):
if not isinstance(chunk, str): if not isinstance(chunk, str):
# Text when a converter has been used, # Text when a converter has been used,
# otherwise it will always be bytes. # otherwise it will always be bytes.
chunk = smart_decode(chunk, self.encoding) chunk = self.decode_chunk(chunk)
chunk = self.formatting.format_body(content=chunk, mime=self.mime) chunk = self.formatting.format_body(content=chunk, mime=self.mime)
return smart_encode(chunk, self.output_encoding) return smart_encode(chunk, self.output_encoding)

View File

@ -32,6 +32,8 @@ JSON_FILE_PATH_ARG = patharg(JSON_FILE_PATH)
# line would be escaped). # line would be escaped).
FILE_CONTENT = FILE_PATH.read_text(encoding=UTF8).strip() FILE_CONTENT = FILE_PATH.read_text(encoding=UTF8).strip()
# ASCII-only sample body: the phrase "random text" repeated ten times.
ASCII_FILE_CONTENT = "random text" * 10
JSON_FILE_CONTENT = JSON_FILE_PATH.read_text(encoding=UTF8) JSON_FILE_CONTENT = JSON_FILE_PATH.read_text(encoding=UTF8)
BIN_FILE_CONTENT = BIN_FILE_PATH.read_bytes() BIN_FILE_CONTENT = BIN_FILE_PATH.read_bytes()

View File

@ -11,7 +11,12 @@ from httpie.plugins import ConverterPlugin
from httpie.plugins.registry import plugin_manager from httpie.plugins.registry import plugin_manager
from .utils import StdinBytesIO, http, MockEnvironment, DUMMY_URL from .utils import StdinBytesIO, http, MockEnvironment, DUMMY_URL
from .fixtures import BIN_FILE_CONTENT, BIN_FILE_PATH from .fixtures import (
ASCII_FILE_CONTENT,
BIN_FILE_CONTENT,
BIN_FILE_PATH,
FILE_CONTENT as UNICODE_FILE_CONTENT
)
PRETTY_OPTIONS = list(PRETTY_MAP.keys()) PRETTY_OPTIONS = list(PRETTY_MAP.keys())
@ -133,3 +138,9 @@ def test_auto_streaming(http_server, extras, expected):
for call_arg in env.stdout.write.call_args_list for call_arg in env.stdout.write.call_args_list
if b'test' in call_arg[0][0] if b'test' in call_arg[0][0]
]) == expected ]) == expected
def test_streaming_encoding_detection(http_server):
    """Both the ASCII and the Unicode bodies must appear in the streamed
    output of the mixed-content `/stream/encoding/random` endpoint."""
    response = http('--stream', http_server + '/stream/encoding/random')
    for expected_body in (ASCII_FILE_CONTENT, UNICODE_FILE_CONTENT):
        assert expected_body in response

View File

@ -52,6 +52,27 @@ def chunked_drip(handler):
handler.wfile.write('0\r\n\r\n'.encode('utf-8')) handler.wfile.write('0\r\n\r\n'.encode('utf-8'))
@TestHandler.handler('GET', '/stream/encoding/random')
def random_encoding(handler):
    """Stream a chunked response made of two ASCII bodies followed by
    three Unicode bodies, each sent as its own newline-terminated chunk."""
    from tests.fixtures import ASCII_FILE_CONTENT, FILE_CONTENT as UNICODE_FILE_CONTENT

    handler.send_response(200)
    handler.send_header('Transfer-Encoding', 'chunked')
    handler.end_headers()

    bodies = [ASCII_FILE_CONTENT] * 2 + [UNICODE_FILE_CONTENT] * 3
    for text in bodies:
        payload = (text + "\n").encode()
        # Chunk framing: hex size of the encoded payload, CRLF, payload, CRLF.
        handler.wfile.write(b'%X\r\n%b\r\n' % (len(payload), payload))

    # A zero-length chunk terminates the chunked body.
    handler.wfile.write(b'0\r\n\r\n')
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
def http_server(): def http_server():
"""A custom HTTP server implementation for our tests, that is """A custom HTTP server implementation for our tests, that is