forked from extern/httpie-cli
Optimize encoding detection (#1243)
* Optimize encoding detection * Use a threshold based system
This commit is contained in:
parent
5a83a9ebc4
commit
e09401b81a
@ -178,9 +178,8 @@ for pretty in ['all', 'none']:
|
||||
f'`http --pretty={pretty} pie.dev/stream/1000`',
|
||||
[
|
||||
'--print=HBhb',
|
||||
'--stream',
|
||||
f'--pretty={pretty}',
|
||||
'httpbin.org/stream/100'
|
||||
'httpbin.org/stream/1000'
|
||||
]
|
||||
)
|
||||
DownloadRunner('download', '`http --download :/big_file.txt` (3GB)', '3G')
|
||||
|
@ -1,4 +1,4 @@
|
||||
from typing import Union
|
||||
from typing import Union, Tuple
|
||||
|
||||
from charset_normalizer import from_bytes
|
||||
from charset_normalizer.constant import TOO_SMALL_SEQUENCE
|
||||
@ -29,7 +29,7 @@ def detect_encoding(content: ContentBytes) -> str:
|
||||
return encoding
|
||||
|
||||
|
||||
def smart_decode(content: ContentBytes, encoding: str) -> str:
|
||||
def smart_decode(content: ContentBytes, encoding: str) -> Tuple[str, str]:
|
||||
"""Decode `content` using the given `encoding`.
|
||||
If no `encoding` is provided, the best effort is to guess it from `content`.
|
||||
|
||||
@ -38,7 +38,7 @@ def smart_decode(content: ContentBytes, encoding: str) -> str:
|
||||
"""
|
||||
if not encoding:
|
||||
encoding = detect_encoding(content)
|
||||
return content.decode(encoding, 'replace')
|
||||
return content.decode(encoding, 'replace'), encoding
|
||||
|
||||
|
||||
def smart_encode(content: str, encoding: str) -> bytes:
|
||||
|
@ -1,6 +1,6 @@
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from itertools import chain
|
||||
from typing import Callable, Iterable, Union
|
||||
from typing import Callable, Iterable, Optional, Union
|
||||
|
||||
from .processing import Conversion, Formatting
|
||||
from ..context import Environment
|
||||
@ -89,6 +89,9 @@ class RawStream(BaseStream):
|
||||
return self.msg.iter_body(self.chunk_size)
|
||||
|
||||
|
||||
ENCODING_GUESS_THRESHOLD = 3
|
||||
|
||||
|
||||
class EncodedStream(BaseStream):
|
||||
"""Encoded HTTP message stream.
|
||||
|
||||
@ -111,7 +114,8 @@ class EncodedStream(BaseStream):
|
||||
self.mime = mime_overwrite
|
||||
else:
|
||||
self.mime, _ = parse_content_type_header(self.msg.content_type)
|
||||
self.encoding = encoding_overwrite or self.msg.encoding
|
||||
self._encoding = encoding_overwrite or self.msg.encoding
|
||||
self._encoding_guesses = []
|
||||
if env.stdout_isatty:
|
||||
# Use the encoding supported by the terminal.
|
||||
output_encoding = env.stdout_encoding
|
||||
@ -125,9 +129,33 @@ class EncodedStream(BaseStream):
|
||||
for line, lf in self.msg.iter_lines(self.CHUNK_SIZE):
|
||||
if b'\0' in line:
|
||||
raise BinarySuppressedError()
|
||||
line = smart_decode(line, self.encoding)
|
||||
line = self.decode_chunk(line)
|
||||
yield smart_encode(line, self.output_encoding) + lf
|
||||
|
||||
def decode_chunk(self, raw_chunk: str) -> str:
|
||||
chunk, guessed_encoding = smart_decode(raw_chunk, self.encoding)
|
||||
self._encoding_guesses.append(guessed_encoding)
|
||||
return chunk
|
||||
|
||||
@property
|
||||
def encoding(self) -> Optional[str]:
|
||||
if self._encoding:
|
||||
return self._encoding
|
||||
|
||||
# If we find a reliable (used consecutively) encoding, then
|
||||
# use it for the next iterations.
|
||||
if len(self._encoding_guesses) < ENCODING_GUESS_THRESHOLD:
|
||||
return None
|
||||
|
||||
guess_1, guess_2 = self._encoding_guesses[-2:]
|
||||
if guess_1 == guess_2:
|
||||
self._encoding = guess_1
|
||||
return guess_1
|
||||
|
||||
@encoding.setter
|
||||
def encoding(self, value) -> None:
|
||||
self._encoding = value
|
||||
|
||||
|
||||
class PrettyStream(EncodedStream):
|
||||
"""In addition to :class:`EncodedStream` behaviour, this stream applies
|
||||
@ -178,7 +206,7 @@ class PrettyStream(EncodedStream):
|
||||
if not isinstance(chunk, str):
|
||||
# Text when a converter has been used,
|
||||
# otherwise it will always be bytes.
|
||||
chunk = smart_decode(chunk, self.encoding)
|
||||
chunk = self.decode_chunk(chunk)
|
||||
chunk = self.formatting.format_body(content=chunk, mime=self.mime)
|
||||
return smart_encode(chunk, self.output_encoding)
|
||||
|
||||
|
2
tests/fixtures/__init__.py
vendored
2
tests/fixtures/__init__.py
vendored
@ -32,6 +32,8 @@ JSON_FILE_PATH_ARG = patharg(JSON_FILE_PATH)
|
||||
# line would be escaped).
|
||||
FILE_CONTENT = FILE_PATH.read_text(encoding=UTF8).strip()
|
||||
|
||||
ASCII_FILE_CONTENT = "random text" * 10
|
||||
|
||||
|
||||
JSON_FILE_CONTENT = JSON_FILE_PATH.read_text(encoding=UTF8)
|
||||
BIN_FILE_CONTENT = BIN_FILE_PATH.read_bytes()
|
||||
|
@ -11,7 +11,12 @@ from httpie.plugins import ConverterPlugin
|
||||
from httpie.plugins.registry import plugin_manager
|
||||
|
||||
from .utils import StdinBytesIO, http, MockEnvironment, DUMMY_URL
|
||||
from .fixtures import BIN_FILE_CONTENT, BIN_FILE_PATH
|
||||
from .fixtures import (
|
||||
ASCII_FILE_CONTENT,
|
||||
BIN_FILE_CONTENT,
|
||||
BIN_FILE_PATH,
|
||||
FILE_CONTENT as UNICODE_FILE_CONTENT
|
||||
)
|
||||
|
||||
PRETTY_OPTIONS = list(PRETTY_MAP.keys())
|
||||
|
||||
@ -133,3 +138,9 @@ def test_auto_streaming(http_server, extras, expected):
|
||||
for call_arg in env.stdout.write.call_args_list
|
||||
if b'test' in call_arg[0][0]
|
||||
]) == expected
|
||||
|
||||
|
||||
def test_streaming_encoding_detection(http_server):
|
||||
r = http('--stream', http_server + '/stream/encoding/random')
|
||||
assert ASCII_FILE_CONTENT in r
|
||||
assert UNICODE_FILE_CONTENT in r
|
||||
|
@ -52,6 +52,27 @@ def chunked_drip(handler):
|
||||
handler.wfile.write('0\r\n\r\n'.encode('utf-8'))
|
||||
|
||||
|
||||
@TestHandler.handler('GET', '/stream/encoding/random')
|
||||
def random_encoding(handler):
|
||||
from tests.fixtures import ASCII_FILE_CONTENT, FILE_CONTENT as UNICODE_FILE_CONTENT
|
||||
|
||||
handler.send_response(200)
|
||||
handler.send_header('Transfer-Encoding', 'chunked')
|
||||
handler.end_headers()
|
||||
|
||||
for body in [
|
||||
ASCII_FILE_CONTENT,
|
||||
ASCII_FILE_CONTENT,
|
||||
UNICODE_FILE_CONTENT,
|
||||
UNICODE_FILE_CONTENT,
|
||||
UNICODE_FILE_CONTENT,
|
||||
]:
|
||||
body += "\n"
|
||||
handler.wfile.write(f'{len(body.encode()):X}\r\n{body}\r\n'.encode())
|
||||
|
||||
handler.wfile.write('0\r\n\r\n'.encode('utf-8'))
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def http_server():
|
||||
"""A custom HTTP server implementation for our tests, that is
|
||||
|
Loading…
Reference in New Issue
Block a user