2013-02-26 15:12:33 +01:00
|
|
|
"""
|
|
|
|
Download mode implementation.
|
|
|
|
|
|
|
|
"""
|
2019-08-30 11:32:14 +02:00
|
|
|
import mimetypes
|
2013-02-26 15:12:33 +01:00
|
|
|
import os
|
2013-04-11 07:29:10 +02:00
|
|
|
import re
|
2013-09-24 19:50:37 +02:00
|
|
|
from mailbox import Message
|
2022-04-14 16:43:10 +02:00
|
|
|
from time import monotonic
|
2019-08-30 11:32:14 +02:00
|
|
|
from typing import IO, Optional, Tuple
|
2019-08-29 08:53:56 +02:00
|
|
|
from urllib.parse import urlsplit
|
2013-02-26 15:12:33 +01:00
|
|
|
|
2019-08-30 11:32:14 +02:00
|
|
|
import requests
|
|
|
|
|
2021-12-23 21:13:25 +01:00
|
|
|
from .models import HTTPResponse, OutputOptions
|
2021-05-05 14:13:39 +02:00
|
|
|
from .output.streams import RawStream
|
2022-04-14 16:43:10 +02:00
|
|
|
from .context import Environment
|
2013-02-26 15:12:33 +01:00
|
|
|
|
2013-04-11 09:00:41 +02:00
|
|
|
|
2013-04-11 07:29:10 +02:00
|
|
|
# HTTP 206 status code; a resumed download compares the response status
# against this value before trusting the Content-Range header.
PARTIAL_CONTENT = 206
|
|
|
|
|
2013-04-11 23:51:21 +02:00
|
|
|
|
2013-04-11 08:24:59 +02:00
|
|
|
class ContentRangeError(ValueError):
    """Signals a missing, malformed, or unexpected Content-Range header."""
|
|
|
|
|
|
|
|
|
2019-08-30 11:32:14 +02:00
|
|
|
def parse_content_range(content_range: str, resumed_from: int) -> int:
    """
    Validate a Content-Range header and derive the complete body size.

    <https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html>

    :param content_range: the value of a Content-Range response header
                          eg. "bytes 21010-47021/47022"
    :param resumed_from: first byte pos. from the Range request header
    :return: total size of the response body when fully downloaded.
    :raises ContentRangeError: when the header is absent, does not match
        the expected "bytes first-last/length" form, is internally
        inconsistent, or does not cover the range that was requested.

    """
    if content_range is None:
        raise ContentRangeError('Missing Content-Range')

    match = re.match(
        r'^bytes (?P<first_byte_pos>\d+)-(?P<last_byte_pos>\d+)'
        r'/(\*|(?P<instance_length>\d+))$',
        content_range,
    )
    if match is None:
        raise ContentRangeError(
            f'Invalid Content-Range format {content_range!r}')

    start = int(match.group('first_byte_pos'))
    end = int(match.group('last_byte_pos'))
    # The length group is unset when the server sent "*" (unknown length).
    raw_length = match.group('instance_length')
    length = int(raw_length) if raw_length else None
    length_known = length is not None

    # "A byte-content-range-spec with a byte-range-resp-spec whose
    # last- byte-pos value is less than its first-byte-pos value,
    # or whose instance-length value is less than or equal to its
    # last-byte-pos value, is invalid. The recipient of an invalid
    # byte-content-range- spec MUST ignore it and any content
    # transferred along with it."
    if start > end or (length_known and length <= end):
        raise ContentRangeError(
            f'Invalid Content-Range returned: {content_range!r}')

    # The request was "bytes=<resumed_from>-", so the reply has to begin at
    # that offset and, when the length is known, run to the final byte.
    if start != resumed_from or (length_known and end + 1 != length):
        # Not what we asked for.
        raise ContentRangeError(
            f'Unexpected Content-Range returned ({content_range!r})'
            f' for the requested Range ("bytes={resumed_from}-")'
        )

    return end + 1
|
|
|
|
|
|
|
|
|
2019-08-30 11:32:14 +02:00
|
|
|
def filename_from_content_disposition(
    content_disposition: str
) -> Optional[str]:
    """
    Extract and validate filename from a Content-Disposition header.

    The name is reduced to its final path component. Leading dots and
    surrounding whitespace are then removed repeatedly until nothing more
    can be stripped, so a value such as ``". .bashrc"`` cannot end up as a
    hidden file name.

    :param content_disposition: Content-Disposition value
    :return: the filename if present and valid, otherwise `None`

    """
    # attachment; filename=jakubroztocil-httpie-0.4.1-20-g40bd8f6.tar.gz
    msg = Message(f'Content-Disposition: {content_disposition}')
    filename = msg.get_filename()
    if not filename:
        return None

    # Basic sanitation.
    filename = os.path.basename(filename)
    while True:
        # A single pass is not enough: stripping dots can expose whitespace
        # which in turn can expose more dots.
        stripped = filename.strip().lstrip('.')
        if stripped == filename:
            break
        filename = stripped

    return filename or None
|
2013-04-12 15:19:49 +02:00
|
|
|
|
|
|
|
|
2019-09-03 17:14:39 +02:00
|
|
|
def filename_from_url(url: str, content_type: Optional[str]) -> str:
    """
    Build a file name from the last segment of the URL path.

    When the path is empty, ``index`` is used. When the resulting name has
    no dot in it and a Content-Type is available, an extension guessed from
    the media type is appended.

    :param url: the URL whose path supplies the base name
    :param content_type: Content-Type header value, or `None`
    :return: the derived file name

    """
    fn = urlsplit(url).path.rstrip('/')
    fn = os.path.basename(fn) if fn else 'index'
    if '.' not in fn and content_type:
        # Keep only the media type. Whitespace may precede the ';' and
        # media types are case-insensitive, so normalise before comparing.
        media_type = content_type.split(';')[0].strip().lower()
        if media_type == 'text/plain':
            # mimetypes returns '.ksh'
            ext = '.txt'
        else:
            ext = mimetypes.guess_extension(media_type)

        if ext == '.htm':
            ext = '.html'

        if ext:
            fn += ext

    return fn
|
|
|
|
|
|
|
|
|
2019-08-30 11:32:14 +02:00
|
|
|
def trim_filename(filename: str, max_len: int) -> str:
    """
    Shorten `filename` so that it is at most `max_len` characters long.

    The extension is kept when the part before it is long enough to absorb
    the whole cut; otherwise the name is simply cut from the end.

    :param filename: the name to shorten
    :param max_len: the maximum allowed length
    :return: `filename` unchanged if it already fits, else the shortened name

    """
    excess = len(filename) - max_len
    if excess <= 0:
        return filename

    stem, suffix = os.path.splitext(filename)
    if excess < len(stem):
        # The stem can take the whole cut, so the extension survives.
        return stem[:-excess] + suffix
    return filename[:-excess]
|
|
|
|
|
|
|
|
|
2019-08-30 11:32:14 +02:00
|
|
|
def get_filename_max_length(directory: str) -> int:
    """
    Return the maximum file name length allowed inside `directory`.

    Falls back to 255 when `os.pathconf` or its ``PC_NAME_MAX`` name is not
    available, or when the reported limit is not a positive number.

    :param directory: the directory the file is going to be created in
    :return: the maximum number of characters in a file name

    """
    max_len = 255
    if hasattr(os, 'pathconf') and 'PC_NAME_MAX' in os.pathconf_names:
        limit = os.pathconf(directory, 'PC_NAME_MAX')
        # pathconf() yields -1 when no limit is defined for the path; a
        # negative maximum would make callers trim names down to nothing.
        if limit > 0:
            max_len = limit
    return max_len
|
|
|
|
|
|
|
|
|
2019-08-30 11:32:14 +02:00
|
|
|
def trim_filename_if_needed(filename: str, directory='.', extra=0) -> str:
    """
    Trim `filename` when it exceeds the limit for `directory`.

    :param filename: the name to check
    :param directory: the directory whose file name limit applies
    :param extra: number of characters to keep free below that limit
    :return: `filename`, shortened if it was too long

    """
    limit = get_filename_max_length(directory) - extra
    if len(filename) <= limit:
        return filename
    return trim_filename(filename, limit)
|
|
|
|
|
|
|
|
|
2019-08-30 11:32:14 +02:00
|
|
|
def get_unique_filename(filename: str, exists=os.path.exists) -> str:
    """
    Find a variant of `filename` for which `exists` reports false.

    The plain name is tried first, then the name followed by ``-1``,
    ``-2`` and so on. The name is trimmed beforehand to leave room for the
    suffix.

    :param filename: the preferred file name
    :param exists: predicate telling whether a candidate is already taken
    :return: the first candidate that is not taken

    """
    counter = 0
    suffix = ''
    while True:
        candidate = trim_filename_if_needed(filename, extra=len(suffix))
        candidate += suffix
        if not exists(candidate):
            return candidate
        counter += 1
        suffix = f'-{counter}'
|
|
|
|
|
|
|
|
|
2019-08-30 11:32:14 +02:00
|
|
|
class Downloader:
    """Drives one download: request headers, output file, progress status."""

    def __init__(
        self,
        env: Environment,
        output_file: Optional[IO] = None,
        resume: bool = False
    ):
        """
        :param env: Environment passed on to the `DownloadStatus` that
                    tracks this download.

        :param output_file: The file to store response body in. If not
                            provided, it will be guessed from the response.

        :param resume: Should the download resume if partial download
                       already exists.

        """
        self.finished = False
        self.status = DownloadStatus(env=env)
        self._output_file = output_file
        self._resume = resume
        self._resumed_from = 0

    def pre_request(self, request_headers: dict):
        """Called just before the HTTP request is sent.

        Might alter `request_headers`.

        """
        # Ask the server not to encode the content so that we can resume, etc.
        request_headers['Accept-Encoding'] = 'identity'
        if self._resume:
            bytes_have = os.path.getsize(self._output_file.name)
            if bytes_have:
                # Set ``Range`` header to resume the download
                # TODO: Use "If-Range: mtime" to make sure it's fresh?
                request_headers['Range'] = f'bytes={bytes_have}-'
                self._resumed_from = bytes_have

    def start(
        self,
        initial_url: str,
        final_response: requests.Response
    ) -> Tuple[RawStream, IO]:
        """
        Initiate and return a stream for the `final_response` body with
        progress callback attached. Can be called only once.

        :param initial_url: The original requested URL
        :param final_response: Initiated response object with headers already fetched

        :return: RawStream, output_file

        """
        assert not self.status.time_started

        # FIXME: some servers still might send Content-Encoding: gzip
        # <https://github.com/httpie/httpie/issues/423>
        try:
            total_size = int(final_response.headers['Content-Length'])
        except (KeyError, ValueError, TypeError):
            total_size = None

        if not self._output_file:
            self._output_file = self._get_output_file_from_response(
                initial_url=initial_url,
                final_response=final_response,
            )
        else:
            # `--output, -o` provided
            if self._resume and final_response.status_code == PARTIAL_CONTENT:
                # The server honoured the Range request; the full size comes
                # from Content-Range rather than Content-Length.
                total_size = parse_content_range(
                    final_response.headers.get('Content-Range'),
                    self._resumed_from
                )
            else:
                # Not a partial response: start over from an empty file.
                self._resumed_from = 0
                try:
                    self._output_file.seek(0)
                    self._output_file.truncate()
                except OSError:
                    pass  # stdout

        output_options = OutputOptions.from_message(final_response, headers=False, body=True)
        stream = RawStream(
            msg=HTTPResponse(final_response),
            output_options=output_options,
            on_body_chunk_downloaded=self.chunk_downloaded,
        )

        self.status.started(
            output_file=self._output_file,
            resumed_from=self._resumed_from,
            total_size=total_size
        )

        return stream, self._output_file

    def finish(self):
        """Mark the download as finished. Can be called only once."""
        assert not self.finished
        self.finished = True
        self.status.finished()

    def failed(self):
        """Terminate the status tracking of a download that did not succeed."""
        self.status.terminate()

    @property
    def interrupted(self) -> bool:
        """
        Whether the download finished with a byte count other than the
        known total size. Always false while the total size is unknown.

        """
        # bool() keeps the declared return type; the bare `and` chain would
        # otherwise hand back None or an int.
        return bool(
            self.finished
            and self.status.total_size
            and self.status.total_size != self.status.downloaded
        )

    def chunk_downloaded(self, chunk: bytes):
        """
        A download progress callback.

        :param chunk: A chunk of response body data that has just
                      been downloaded and written to the output.

        """
        self.status.chunk_downloaded(len(chunk))

    @staticmethod
    def _get_output_file_from_response(
        initial_url: str,
        final_response: requests.Response,
    ) -> IO:
        """
        Open an output file named after the response.

        The name comes from the Content-Disposition header when it yields
        one, otherwise from the URL and Content-Type.

        :param initial_url: The original requested URL
        :param final_response: Response whose headers are inspected
        :return: the opened file (unbuffered, binary append mode)

        """
        # Output file not specified. Pick a name that doesn't exist yet.
        filename = None
        if 'Content-Disposition' in final_response.headers:
            filename = filename_from_content_disposition(
                final_response.headers['Content-Disposition'])
        if not filename:
            filename = filename_from_url(
                url=initial_url,
                content_type=final_response.headers.get('Content-Type'),
            )
        unique_filename = get_unique_filename(filename)
        return open(unique_filename, buffering=0, mode='a+b')
|
2019-06-24 12:19:29 +02:00
|
|
|
|
2013-03-24 15:23:18 +01:00
|
|
|
|
2019-08-30 11:32:14 +02:00
|
|
|
class DownloadStatus:
    """Keeps byte counters and timestamps of a download and feeds its display."""

    def __init__(self, env):
        self.env = env
        self.downloaded = 0
        self.total_size = None
        self.resumed_from = 0
        self.time_started = None
        self.time_finished = None

    def started(self, output_file, resumed_from=0, total_size=None):
        """Record the start of the download and bring up the display."""
        assert self.time_started is None
        self.total_size = total_size
        # Bytes already on disk count as downloaded from the outset.
        self.downloaded = resumed_from
        self.resumed_from = resumed_from
        self.time_started = monotonic()
        self.start_display(output_file=output_file)

    def start_display(self, output_file):
        """Pick a display implementation, store it and start it."""
        from httpie.output.ui.rich_progress import (
            DummyDisplay,
            StatusDisplay,
            ProgressDisplay
        )

        message = f'Downloading to {output_file.name}'
        if not self.env.show_displays:
            display_class = DummyDisplay
        elif self.total_size is None:
            # Rich does not support progress bars without a total
            # size given. Instead we use status objects.
            display_class = StatusDisplay
        else:
            display_class = ProgressDisplay
        self.display = display_class(self.env)

        self.display.start(
            total=self.total_size,
            at=self.downloaded,
            description=message
        )

    def chunk_downloaded(self, size):
        """Add `size` bytes to the counter and forward them to the display."""
        assert self.time_finished is None
        self.downloaded += size
        self.display.update(size)

    @property
    def has_finished(self):
        """True once a finish time has been recorded."""
        return self.time_finished is not None

    @property
    def time_spent(self):
        """Finish time minus start time, or None unless both are recorded."""
        if self.time_started is None or self.time_finished is None:
            return None
        return self.time_finished - self.time_started

    def finished(self):
        """Record the finish time and stop the display if one was created."""
        assert self.time_started is not None
        assert self.time_finished is None
        self.time_finished = monotonic()
        self._stop_display()

    def terminate(self):
        """Stop the display, if one was created, without recording a finish time."""
        self._stop_display()

    def _stop_display(self):
        # `display` only exists after start_display() has run.
        if hasattr(self, 'display'):
            self.display.stop(self.time_spent)
|