Optimize encoding detection (#1243)

* Optimize encoding detection

* Use a threshold-based system
This commit is contained in:
Batuhan Taskaya 2021-12-23 22:05:58 +03:00 committed by GitHub
parent 5a83a9ebc4
commit e09401b81a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 71 additions and 10 deletions

View File

@ -178,9 +178,8 @@ for pretty in ['all', 'none']:
f'`http --pretty={pretty} pie.dev/stream/1000`',
[
'--print=HBhb',
'--stream',
f'--pretty={pretty}',
'httpbin.org/stream/100'
'httpbin.org/stream/1000'
]
)
DownloadRunner('download', '`http --download :/big_file.txt` (3GB)', '3G')

View File

@ -1,4 +1,4 @@
from typing import Union
from typing import Union, Tuple
from charset_normalizer import from_bytes
from charset_normalizer.constant import TOO_SMALL_SEQUENCE
@ -29,7 +29,7 @@ def detect_encoding(content: ContentBytes) -> str:
return encoding
def smart_decode(content: ContentBytes, encoding: str) -> str:
def smart_decode(content: ContentBytes, encoding: str) -> Tuple[str, str]:
"""Decode `content` using the given `encoding`.
If no `encoding` is provided, the best effort is to guess it from `content`.
@ -38,7 +38,7 @@ def smart_decode(content: ContentBytes, encoding: str) -> str:
"""
if not encoding:
encoding = detect_encoding(content)
return content.decode(encoding, 'replace')
return content.decode(encoding, 'replace'), encoding
def smart_encode(content: str, encoding: str) -> bytes:

View File

@ -1,6 +1,6 @@
from abc import ABCMeta, abstractmethod
from itertools import chain
from typing import Callable, Iterable, Union
from typing import Callable, Iterable, Optional, Union
from .processing import Conversion, Formatting
from ..context import Environment
@ -89,6 +89,9 @@ class RawStream(BaseStream):
return self.msg.iter_body(self.chunk_size)
# Minimum number of recorded encoding guesses that must exist before
# `EncodedStream.encoding` will consider settling on a guessed encoding;
# below this count the property keeps returning `None`.
ENCODING_GUESS_THRESHOLD = 3
class EncodedStream(BaseStream):
"""Encoded HTTP message stream.
@ -111,7 +114,8 @@ class EncodedStream(BaseStream):
self.mime = mime_overwrite
else:
self.mime, _ = parse_content_type_header(self.msg.content_type)
self.encoding = encoding_overwrite or self.msg.encoding
self._encoding = encoding_overwrite or self.msg.encoding
self._encoding_guesses = []
if env.stdout_isatty:
# Use the encoding supported by the terminal.
output_encoding = env.stdout_encoding
@ -125,9 +129,33 @@ class EncodedStream(BaseStream):
for line, lf in self.msg.iter_lines(self.CHUNK_SIZE):
if b'\0' in line:
raise BinarySuppressedError()
line = smart_decode(line, self.encoding)
line = self.decode_chunk(line)
yield smart_encode(line, self.output_encoding) + lf
def decode_chunk(self, raw_chunk: Union[bytes, bytearray]) -> str:
    """Decode one raw body chunk to text and record the encoding guess.

    The chunk is handed to `smart_decode()` together with the currently
    known encoding (`self.encoding`, which may be `None`). When no
    encoding is known, `smart_decode()` detects one from the chunk itself
    and reports which encoding it ended up using.

    :param raw_chunk: undecoded bytes of the message body.
    :return: the decoded text of `raw_chunk`.
    """
    chunk, guessed_encoding = smart_decode(raw_chunk, self.encoding)
    if not self._encoding:
        # Record only genuine guesses. Once an encoding is settled,
        # `smart_decode()` merely echoes it back, so storing it would
        # grow the list for the whole stream without adding information.
        self._encoding_guesses.append(guessed_encoding)
        # The `encoding` property only needs the count up to the
        # threshold and the most recent entries, so drop older guesses
        # to keep memory bounded on long-running streams.
        del self._encoding_guesses[:-ENCODING_GUESS_THRESHOLD]
    return chunk
@property
def encoding(self) -> Optional[str]:
    """Return the encoding to decode with, or `None` if still undecided.

    An already settled encoding (`self._encoding`) always wins. Otherwise
    the recorded guesses are consulted: once at least
    `ENCODING_GUESS_THRESHOLD` guesses exist and the two most recent ones
    agree, that encoding is stored and returned.
    """
    settled = self._encoding
    if settled:
        return settled

    history = self._encoding_guesses
    if len(history) >= ENCODING_GUESS_THRESHOLD:
        # If we find a reliable (used consecutively) encoding, then
        # use it for the next iterations.
        previous, latest = history[-2:]
        if previous == latest:
            self._encoding = previous
            return previous

    return None

@encoding.setter
def encoding(self, value) -> None:
    """Store `value` as the settled encoding."""
    self._encoding = value
class PrettyStream(EncodedStream):
"""In addition to :class:`EncodedStream` behaviour, this stream applies
@ -178,7 +206,7 @@ class PrettyStream(EncodedStream):
if not isinstance(chunk, str):
# Text when a converter has been used,
# otherwise it will always be bytes.
chunk = smart_decode(chunk, self.encoding)
chunk = self.decode_chunk(chunk)
chunk = self.formatting.format_body(content=chunk, mime=self.mime)
return smart_encode(chunk, self.output_encoding)

View File

@ -32,6 +32,8 @@ JSON_FILE_PATH_ARG = patharg(JSON_FILE_PATH)
# line would be escaped).
FILE_CONTENT = FILE_PATH.read_text(encoding=UTF8).strip()
# ASCII-only sample text: the phrase "random text" repeated ten times.
ASCII_FILE_CONTENT = "random text" * 10
JSON_FILE_CONTENT = JSON_FILE_PATH.read_text(encoding=UTF8)
BIN_FILE_CONTENT = BIN_FILE_PATH.read_bytes()

View File

@ -11,7 +11,12 @@ from httpie.plugins import ConverterPlugin
from httpie.plugins.registry import plugin_manager
from .utils import StdinBytesIO, http, MockEnvironment, DUMMY_URL
from .fixtures import BIN_FILE_CONTENT, BIN_FILE_PATH
from .fixtures import (
ASCII_FILE_CONTENT,
BIN_FILE_CONTENT,
BIN_FILE_PATH,
FILE_CONTENT as UNICODE_FILE_CONTENT
)
PRETTY_OPTIONS = list(PRETTY_MAP.keys())
@ -133,3 +138,9 @@ def test_auto_streaming(http_server, extras, expected):
for call_arg in env.stdout.write.call_args_list
if b'test' in call_arg[0][0]
]) == expected
def test_streaming_encoding_detection(http_server):
    """Both the ASCII and the Unicode fixture text must appear in the
    output of a `--stream` request to `/stream/encoding/random`."""
    url = http_server + '/stream/encoding/random'
    response = http('--stream', url)
    for expected_text in (ASCII_FILE_CONTENT, UNICODE_FILE_CONTENT):
        assert expected_text in response

View File

@ -52,6 +52,27 @@ def chunked_drip(handler):
handler.wfile.write('0\r\n\r\n'.encode('utf-8'))
@TestHandler.handler('GET', '/stream/encoding/random')
def random_encoding(handler):
    """Respond with a chunked body: two ASCII chunks, then three Unicode ones.

    Each fixture text gets a trailing newline and is written as one
    chunk (hexadecimal byte length, CRLF, payload, CRLF), followed by the
    zero-length chunk that ends the body.
    """
    from tests.fixtures import ASCII_FILE_CONTENT, FILE_CONTENT as UNICODE_FILE_CONTENT

    handler.send_response(200)
    handler.send_header('Transfer-Encoding', 'chunked')
    handler.end_headers()

    texts = [ASCII_FILE_CONTENT] * 2 + [UNICODE_FILE_CONTENT] * 3
    for text in texts:
        # Encode once so the advertised size and the payload written
        # come from the very same bytes.
        payload = (text + "\n").encode()
        size_line = f'{len(payload):X}\r\n'.encode()
        handler.wfile.write(size_line + payload + '\r\n'.encode())

    handler.wfile.write('0\r\n\r\n'.encode('utf-8'))
@pytest.fixture(scope="function")
def http_server():
"""A custom HTTP server implementation for our tests, that is