diff --git a/CHANGELOG.md b/CHANGELOG.md index ca49cefb..df49ee28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ This project adheres to [Semantic Versioning](https://semver.org/). - Added support for formatting & coloring of JSON bodies preceded by non-JSON data (e.g., an XXSI prefix). ([#1130](https://github.com/httpie/httpie/issues/1130)) - Added `--format-options=response.as:CONTENT_TYPE` to allow overriding the response `Content-Type`. ([#1134](https://github.com/httpie/httpie/issues/1134)) - Added `--response-as` shortcut for setting the response `Content-Type`-related `--format-options`. ([#1134](https://github.com/httpie/httpie/issues/1134)) +- Improved handling of prettified responses without correct `Content-Type` encoding. ([#1110](https://github.com/httpie/httpie/issues/1110)) - Installed plugins are now listed in `--debug` output. ([#1165](https://github.com/httpie/httpie/issues/1165)) - Fixed duplicate keys preservation of JSON data. ([#1163](https://github.com/httpie/httpie/issues/1163)) diff --git a/docs/README.md b/docs/README.md index d4cfd533..ed315370 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1249,6 +1249,18 @@ For example, the following request will force the response to be treated as XML: $ http --response-as=application/xml pie.dev/get ``` +And the following requests will force the response to use the [big5](https://docs.python.org/3/library/codecs.html#standard-encodings) encoding: + +```bash +$ http --response-as='charset=big5' pie.dev/get +``` + +```bash +$ http --response-as='text/plain; charset=big5' pie.dev/get +``` + +If the server does not send the encoding, HTTPie will auto-detect it. + ### Binary data Binary data is suppressed for terminal output, which makes it safe to perform requests to URLs that send back binary data. 
diff --git a/httpie/cli/definition.py b/httpie/cli/definition.py index 4fb9c184..77be93a9 100644 --- a/httpie/cli/definition.py +++ b/httpie/cli/definition.py @@ -316,6 +316,8 @@ output_processing.add_argument( Override the response Content-Type for formatting purposes, e.g.: --response-as=application/xml + --response-as=charset=utf-8 + --response-as='application/xml; charset=utf-8' It is a shortcut for: diff --git a/httpie/codec.py b/httpie/codec.py new file mode 100644 index 00000000..61057166 --- /dev/null +++ b/httpie/codec.py @@ -0,0 +1,37 @@ +from typing import Union + +from charset_normalizer import from_bytes + +from .constants import UTF8 + +Bytes = Union[bytearray, bytes] + + +def detect_encoding(content: Bytes) -> str: + """Detect the `content` encoding. + Fallback to UTF-8 when no suitable encoding found. + + """ + match = from_bytes(bytes(content)).best() + return match.encoding if match else UTF8 + + +def decode(content: Bytes, encoding: str) -> str: + """Decode `content` using the given `encoding`. + If no `encoding` is provided, the best effort is to guess it from `content`. + + Unicode errors are replaced. + + """ + if not encoding: + encoding = detect_encoding(content) + return content.decode(encoding, 'replace') + + +def encode(content: str, encoding: str) -> bytes: + """Encode `content` using the given `encoding`. + + Unicode errors are replaced. 
+ + """ + return content.encode(encoding, 'replace') diff --git a/httpie/models.py b/httpie/models.py index f4ddb03b..21034a04 100644 --- a/httpie/models.py +++ b/httpie/models.py @@ -30,11 +30,6 @@ class HTTPMessage(metaclass=ABCMeta): def encoding(self) -> Optional[str]: """Return a `str` with the message's encoding, if known.""" - @property - def body(self) -> bytes: - """Return a `bytes` with the message's body.""" - raise NotImplementedError() - @property def content_type(self) -> str: """Return the message content type.""" @@ -86,12 +81,6 @@ class HTTPResponse(HTTPMessage): def encoding(self): return self._orig.encoding or UTF8 - @property - def body(self): - # Only now the response body is fetched. - # Shouldn't be touched unless the body is actually needed. - return self._orig.content - class HTTPRequest(HTTPMessage): """A :class:`requests.models.Request` wrapper.""" diff --git a/httpie/output/formatters/xml.py b/httpie/output/formatters/xml.py index e5ce5c23..2909f7c0 100644 --- a/httpie/output/formatters/xml.py +++ b/httpie/output/formatters/xml.py @@ -25,7 +25,7 @@ def pretty_xml(document: 'Document', } if standalone is not None and sys.version_info >= (3, 9): kwargs['standalone'] = standalone - body = document.toprettyxml(**kwargs).decode() + body = document.toprettyxml(**kwargs).decode(kwargs['encoding']) # Remove blank lines automatically added by `toprettyxml()`. return '\n'.join(line for line in body.splitlines() if line.strip()) diff --git a/httpie/output/streams.py b/httpie/output/streams.py index 9bb646bf..1c6afaa8 100644 --- a/httpie/output/streams.py +++ b/httpie/output/streams.py @@ -1,7 +1,8 @@ from abc import ABCMeta, abstractmethod from itertools import chain -from typing import Callable, Iterable, Union +from typing import Any, Callable, Dict, Iterable, Tuple, Union +from .. 
import codec from ..cli.constants import EMPTY_FORMAT_OPTION from ..context import Environment from ..constants import UTF8 @@ -114,8 +115,8 @@ class EncodedStream(BaseStream): for line, lf in self.msg.iter_lines(self.CHUNK_SIZE): if b'\0' in line: raise BinarySuppressedError() - yield line.decode(self.msg.encoding) \ - .encode(self.output_encoding, 'replace') + lf + line = codec.decode(line, self.msg.encoding) + yield codec.encode(line, self.output_encoding) + lf class PrettyStream(EncodedStream): @@ -137,15 +138,23 @@ class PrettyStream(EncodedStream): super().__init__(**kwargs) self.formatting = formatting self.conversion = conversion - self.mime = self.get_mime() + self.mime, mime_options = self._get_mime_and_options() + self.encoding = mime_options.get('charset') or '' - def get_mime(self) -> str: - mime = parse_header_content_type(self.msg.content_type)[0] - if isinstance(self.msg, HTTPResponse): - forced_content_type = self.formatting.options['response']['as'] - if forced_content_type != EMPTY_FORMAT_OPTION: - mime = parse_header_content_type(forced_content_type)[0] or mime - return mime + def _get_mime_and_options(self) -> Tuple[str, Dict[str, Any]]: + # Defaults from the `Content-Type` header. + mime, options = parse_header_content_type(self.msg.content_type) + + if not isinstance(self.msg, HTTPResponse): + return mime, options + + # Override from the `--response-as` option. + forced_content_type = self.formatting.options['response']['as'] + if forced_content_type == EMPTY_FORMAT_OPTION: + return mime, options + + forced_mime, forced_options = parse_header_content_type(forced_content_type) + return (forced_mime or mime, forced_options or options) def get_headers(self) -> bytes: return self.formatting.format_headers( @@ -176,9 +185,9 @@ class PrettyStream(EncodedStream): if not isinstance(chunk, str): # Text when a converter has been used, # otherwise it will always be bytes. 
- chunk = chunk.decode(self.msg.encoding, 'replace') + chunk = codec.decode(chunk, self.encoding) chunk = self.formatting.format_body(content=chunk, mime=self.mime) - return chunk.encode(self.output_encoding, 'replace') + return codec.encode(chunk, self.output_encoding) class BufferedPrettyStream(PrettyStream): diff --git a/setup.py b/setup.py index 22c14212..ef2d5e86 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,7 @@ dev_require = [ 'wheel', ] install_requires = [ + 'charset_normalizer>=2.0.0', 'defusedxml>=0.6.0', 'requests[socks]>=2.22.0', 'Pygments>=2.5.2', diff --git a/tests/test_errors.py b/tests/test_errors.py index c33b8f80..abbf7235 100644 --- a/tests/test_errors.py +++ b/tests/test_errors.py @@ -39,3 +39,10 @@ def test_max_headers_limit(httpbin_both): def test_max_headers_no_limit(httpbin_both): assert HTTP_OK in http('--max-headers=0', httpbin_both + '/get') + + +def test_charset_argument_unknown_encoding(httpbin_both): + with raises(LookupError) as e: + http('--response-as', 'charset=foobar', + 'GET', httpbin_both + '/get') + assert 'unknown encoding: foobar' in str(e.value) diff --git a/tests/test_unicode.py b/tests/test_unicode.py index d1ea8172..2a12180e 100644 --- a/tests/test_unicode.py +++ b/tests/test_unicode.py @@ -2,9 +2,17 @@ Various unicode handling related tests. 
""" -from .utils import http, HTTP_OK +import pytest +import responses + +from httpie.cli.constants import PRETTY_MAP +from httpie.constants import UTF8 + +from .utils import http, HTTP_OK, URL_EXAMPLE from .fixtures import UNICODE +ENCODINGS = [UTF8, 'windows-1250'] + def test_unicode_headers(httpbin): # httpbin doesn't interpret UFT-8 headers @@ -109,3 +117,95 @@ def test_unicode_digest_auth(httpbin): http('--auth-type=digest', '--auth', f'test:{UNICODE}', f'{httpbin.url}/digest-auth/auth/test/{UNICODE}') + + +@pytest.mark.parametrize('encoding', ENCODINGS) +@responses.activate +def test_GET_encoding_detection_from_content_type_header(encoding): + responses.add(responses.GET, + URL_EXAMPLE, + body='\nFinanciën'.encode(encoding), + content_type=f'text/xml; charset={encoding.upper()}') + r = http('GET', URL_EXAMPLE) + assert 'Financiën' in r + + +@pytest.mark.parametrize('encoding', ENCODINGS) +@responses.activate +def test_GET_encoding_detection_from_content(encoding): + body = f'\nFinanciën' + responses.add(responses.GET, + URL_EXAMPLE, + body=body.encode(encoding), + content_type='text/xml') + r = http('GET', URL_EXAMPLE) + assert 'Financiën' in r + + +@responses.activate +def test_GET_encoding_provided_by_format_options(): + responses.add(responses.GET, + URL_EXAMPLE, + body='▒▒▒'.encode('johab'), + content_type='text/plain') + r = http('--format-options', 'response.as:text/plain; charset=johab', + 'GET', URL_EXAMPLE) + assert '▒▒▒' in r + + +@responses.activate +def test_GET_encoding_provided_by_shortcut_option(): + responses.add(responses.GET, + URL_EXAMPLE, + body='▒▒▒'.encode('johab'), + content_type='text/plain') + r = http('--response-as', 'text/plain; charset=johab', + 'GET', URL_EXAMPLE) + assert '▒▒▒' in r + + +@pytest.mark.parametrize('encoding', ENCODINGS) +@responses.activate +def test_GET_encoding_provided_by_empty_shortcut_option_should_use_content_detection(encoding): + body = f'\nFinanciën' + responses.add(responses.GET, + URL_EXAMPLE, + 
body=body.encode(encoding), + content_type='text/xml') + r = http('--response-as', '', 'GET', URL_EXAMPLE) + assert 'Financiën' in r + + +@pytest.mark.parametrize('encoding', ENCODINGS) +@responses.activate +def test_POST_encoding_detection_from_content_type_header(encoding): + responses.add(responses.POST, + URL_EXAMPLE, + body='Všichni lidé jsou si rovni.'.encode(encoding), + content_type=f'text/plain; charset={encoding.upper()}') + r = http('--form', 'POST', URL_EXAMPLE) + assert 'Všichni lidé jsou si rovni.' in r + + +@pytest.mark.parametrize('encoding', ENCODINGS) +@responses.activate +def test_POST_encoding_detection_from_content(encoding): + responses.add(responses.POST, + URL_EXAMPLE, + body='Všichni lidé jsou si rovni.'.encode(encoding), + content_type='text/plain') + r = http('--form', 'POST', URL_EXAMPLE) + assert 'Všichni lidé jsou si rovni.' in r + + +@pytest.mark.parametrize('encoding', ENCODINGS) +@pytest.mark.parametrize('pretty', PRETTY_MAP.keys()) +@responses.activate +def test_stream_encoding_detection_from_content_type_header(encoding, pretty): + responses.add(responses.GET, + URL_EXAMPLE, + body='\nFinanciën'.encode(encoding), + stream=True, + content_type=f'text/xml; charset={encoding.upper()}') + r = http('--pretty=' + pretty, '--stream', 'GET', URL_EXAMPLE) + assert 'Financiën' in r