From e09401b81aec7d630b986eb3a4056b2f387dc3e0 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Thu, 23 Dec 2021 22:05:58 +0300 Subject: [PATCH] Optimize encoding detection (#1243) * Optimize encoding detection * Use a threshold based system --- extras/profiling/benchmarks.py | 3 +-- httpie/encoding.py | 6 +++--- httpie/output/streams.py | 36 ++++++++++++++++++++++++++++++---- tests/fixtures/__init__.py | 2 ++ tests/test_stream.py | 13 +++++++++++- tests/utils/http_server.py | 21 ++++++++++++++++++++ 6 files changed, 71 insertions(+), 10 deletions(-) diff --git a/extras/profiling/benchmarks.py b/extras/profiling/benchmarks.py index 50a53a5a..5d47a3a1 100644 --- a/extras/profiling/benchmarks.py +++ b/extras/profiling/benchmarks.py @@ -178,9 +178,8 @@ for pretty in ['all', 'none']: f'`http --pretty={pretty} pie.dev/stream/1000`', [ '--print=HBhb', - '--stream', f'--pretty={pretty}', - 'httpbin.org/stream/100' + 'httpbin.org/stream/1000' ] ) DownloadRunner('download', '`http --download :/big_file.txt` (3GB)', '3G') diff --git a/httpie/encoding.py b/httpie/encoding.py index 8888743a..f796dde9 100644 --- a/httpie/encoding.py +++ b/httpie/encoding.py @@ -1,4 +1,4 @@ -from typing import Union +from typing import Union, Tuple from charset_normalizer import from_bytes from charset_normalizer.constant import TOO_SMALL_SEQUENCE @@ -29,7 +29,7 @@ def detect_encoding(content: ContentBytes) -> str: return encoding -def smart_decode(content: ContentBytes, encoding: str) -> str: +def smart_decode(content: ContentBytes, encoding: str) -> Tuple[str, str]: """Decode `content` using the given `encoding`. If no `encoding` is provided, the best effort is to guess it from `content`. 
@@ -38,7 +38,7 @@ def smart_decode(content: ContentBytes, encoding: str) -> str: """ if not encoding: encoding = detect_encoding(content) - return content.decode(encoding, 'replace') + return content.decode(encoding, 'replace'), encoding def smart_encode(content: str, encoding: str) -> bytes: diff --git a/httpie/output/streams.py b/httpie/output/streams.py index f9492a21..8cc17d7b 100644 --- a/httpie/output/streams.py +++ b/httpie/output/streams.py @@ -1,6 +1,6 @@ from abc import ABCMeta, abstractmethod from itertools import chain -from typing import Callable, Iterable, Union +from typing import Callable, Iterable, Optional, Union from .processing import Conversion, Formatting from ..context import Environment @@ -89,6 +89,9 @@ class RawStream(BaseStream): return self.msg.iter_body(self.chunk_size) +ENCODING_GUESS_THRESHOLD = 3 + + class EncodedStream(BaseStream): """Encoded HTTP message stream. @@ -111,7 +114,8 @@ class EncodedStream(BaseStream): self.mime = mime_overwrite else: self.mime, _ = parse_content_type_header(self.msg.content_type) - self.encoding = encoding_overwrite or self.msg.encoding + self._encoding = encoding_overwrite or self.msg.encoding + self._encoding_guesses = [] if env.stdout_isatty: # Use the encoding supported by the terminal. output_encoding = env.stdout_encoding @@ -125,9 +129,33 @@ class EncodedStream(BaseStream): for line, lf in self.msg.iter_lines(self.CHUNK_SIZE): if b'\0' in line: raise BinarySuppressedError() - line = smart_decode(line, self.encoding) + line = self.decode_chunk(line) yield smart_encode(line, self.output_encoding) + lf + def decode_chunk(self, raw_chunk: str) -> str: + chunk, guessed_encoding = smart_decode(raw_chunk, self.encoding) + self._encoding_guesses.append(guessed_encoding) + return chunk + + @property + def encoding(self) -> Optional[str]: + if self._encoding: + return self._encoding + + # If we find a reliable (used consecutively) encoding, then + # use it for the next iterations. 
+ if len(self._encoding_guesses) < ENCODING_GUESS_THRESHOLD: + return None + + guess_1, guess_2 = self._encoding_guesses[-2:] + if guess_1 == guess_2: + self._encoding = guess_1 + return guess_1 + + @encoding.setter + def encoding(self, value) -> None: + self._encoding = value + class PrettyStream(EncodedStream): """In addition to :class:`EncodedStream` behaviour, this stream applies @@ -178,7 +206,7 @@ class PrettyStream(EncodedStream): if not isinstance(chunk, str): # Text when a converter has been used, # otherwise it will always be bytes. - chunk = smart_decode(chunk, self.encoding) + chunk = self.decode_chunk(chunk) chunk = self.formatting.format_body(content=chunk, mime=self.mime) return smart_encode(chunk, self.output_encoding) diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py index ade44929..126b1327 100644 --- a/tests/fixtures/__init__.py +++ b/tests/fixtures/__init__.py @@ -32,6 +32,8 @@ JSON_FILE_PATH_ARG = patharg(JSON_FILE_PATH) # line would be escaped). FILE_CONTENT = FILE_PATH.read_text(encoding=UTF8).strip() +ASCII_FILE_CONTENT = "random text" * 10 + JSON_FILE_CONTENT = JSON_FILE_PATH.read_text(encoding=UTF8) BIN_FILE_CONTENT = BIN_FILE_PATH.read_bytes() diff --git a/tests/test_stream.py b/tests/test_stream.py index 55e000a6..fb47378b 100644 --- a/tests/test_stream.py +++ b/tests/test_stream.py @@ -11,7 +11,12 @@ from httpie.plugins import ConverterPlugin from httpie.plugins.registry import plugin_manager from .utils import StdinBytesIO, http, MockEnvironment, DUMMY_URL -from .fixtures import BIN_FILE_CONTENT, BIN_FILE_PATH +from .fixtures import ( + ASCII_FILE_CONTENT, + BIN_FILE_CONTENT, + BIN_FILE_PATH, + FILE_CONTENT as UNICODE_FILE_CONTENT +) PRETTY_OPTIONS = list(PRETTY_MAP.keys()) @@ -133,3 +138,9 @@ def test_auto_streaming(http_server, extras, expected): for call_arg in env.stdout.write.call_args_list if b'test' in call_arg[0][0] ]) == expected + + +def test_streaming_encoding_detection(http_server): + r = http('--stream', 
http_server + '/stream/encoding/random') + assert ASCII_FILE_CONTENT in r + assert UNICODE_FILE_CONTENT in r diff --git a/tests/utils/http_server.py b/tests/utils/http_server.py index f09e06c2..fc8f2b07 100644 --- a/tests/utils/http_server.py +++ b/tests/utils/http_server.py @@ -52,6 +52,27 @@ def chunked_drip(handler): handler.wfile.write('0\r\n\r\n'.encode('utf-8')) +@TestHandler.handler('GET', '/stream/encoding/random') +def random_encoding(handler): + from tests.fixtures import ASCII_FILE_CONTENT, FILE_CONTENT as UNICODE_FILE_CONTENT + + handler.send_response(200) + handler.send_header('Transfer-Encoding', 'chunked') + handler.end_headers() + + for body in [ + ASCII_FILE_CONTENT, + ASCII_FILE_CONTENT, + UNICODE_FILE_CONTENT, + UNICODE_FILE_CONTENT, + UNICODE_FILE_CONTENT, + ]: + body += "\n" + handler.wfile.write(f'{len(body.encode()):X}\r\n{body}\r\n'.encode()) + + handler.wfile.write('0\r\n\r\n'.encode('utf-8')) + + @pytest.fixture(scope="function") def http_server(): """A custom HTTP server implementation for our tests, that is