httpie-cli/httpie/downloads.py

456 lines
14 KiB
Python
Raw Normal View History

"""
Download mode implementation.
"""
import mimetypes
import os
import re
import sys
2013-04-11 23:51:21 +02:00
import threading
from mailbox import Message
from time import sleep, monotonic
from typing import IO, Optional, Tuple
2019-08-29 08:53:56 +02:00
from urllib.parse import urlsplit
import requests
from .models import HTTPResponse
from .output.streams import RawStream
from .utils import humanize_bytes
2013-04-11 09:00:41 +02:00
# HTTP status code of a response that carries only the requested byte range.
PARTIAL_CONTENT = 206

# Carriage return followed by the ANSI "erase to end of line" sequence.
CLEAR_LINE = '\r\033[K'

# Status line template used when the total size is known.
PROGRESS = ''.join((
    '{percentage: 6.2f} %',
    ' {downloaded: >10}',
    ' {speed: >10}/s',
    ' {eta: >8} ETA',
))

# Status line template used when the total size is unknown.
PROGRESS_NO_CONTENT_LENGTH = '{downloaded: >10} {speed: >10}/s'

# Final line written once the download has finished.
SUMMARY = 'Done. {downloaded} in {time:0.5f}s ({speed}/s)\n'

# Frames of the activity spinner, shown one at a time.
SPINNER = '|/-\\'
2013-04-11 08:24:59 +02:00
class ContentRangeError(ValueError):
    """Signals a missing, malformed or unexpected Content-Range header."""
def parse_content_range(content_range: str, resumed_from: int) -> int:
    """
    Check a Content-Range header against the range that was requested.

    <https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html>

    :param content_range: the value of a Content-Range response header
                          eg. "bytes 21010-47021/47022"
    :param resumed_from: first byte pos. from the Range request header
    :return: total size of the response body when fully downloaded.
    :raises ContentRangeError: if the header is missing, malformed,
        internally inconsistent, or does not describe the range from
        `resumed_from` up to the end of the resource.

    """
    if content_range is None:
        raise ContentRangeError('Missing Content-Range')

    parsed = re.match(
        r'^bytes (?P<first_byte_pos>\d+)-(?P<last_byte_pos>\d+)'
        r'/(\*|(?P<instance_length>\d+))$',
        content_range,
    )
    if not parsed:
        raise ContentRangeError(
            f'Invalid Content-Range format {content_range!r}')

    range_start = int(parsed.group('first_byte_pos'))
    range_end = int(parsed.group('last_byte_pos'))
    # The group is unset when the server sent "*" (length unknown).
    length_text = parsed.group('instance_length')
    full_length = int(length_text) if length_text else None
    length_known = full_length is not None

    # "A byte-content-range-spec with a byte-range-resp-spec whose
    # last- byte-pos value is less than its first-byte-pos value,
    # or whose instance-length value is less than or equal to its
    # last-byte-pos value, is invalid. The recipient of an invalid
    # byte-content-range- spec MUST ignore it and any content
    # transferred along with it."
    if range_start > range_end or (length_known and full_length <= range_end):
        raise ContentRangeError(
            f'Invalid Content-Range returned: {content_range!r}')

    starts_where_asked = range_start == resumed_from
    runs_to_the_end = not length_known or range_end + 1 == full_length
    if not (starts_where_asked and runs_to_the_end):
        # Not what we asked for.
        raise ContentRangeError(
            f'Unexpected Content-Range returned ({content_range!r})'
            f' for the requested Range ("bytes={resumed_from}-")'
        )

    return range_end + 1
def filename_from_content_disposition(
    content_disposition: str
) -> Optional[str]:
    """
    Pull a usable filename out of a Content-Disposition header value.

    :param content_disposition: Content-Disposition value
    :return: the filename if present and valid, otherwise `None`

    """
    # attachment; filename=jakubroztocil-httpie-0.4.1-20-g40bd8f6.tar.gz
    header = Message(f'Content-Disposition: {content_disposition}')
    suggested = header.get_filename()
    if not suggested:
        return None
    # Basic sanitation: keep only the final path component, drop leading
    # dots and surrounding whitespace.
    cleaned = os.path.basename(suggested).lstrip('.').strip()
    return cleaned or None
def filename_from_url(url: str, content_type: Optional[str]) -> str:
    """
    Derive a filename from the last path segment of `url`.

    When that segment has no dot in it and a Content-Type is available,
    an extension guessed from the media type is appended.

    :param url: the URL whose path provides the base name; `'index'` is
        used when the path is empty
    :param content_type: Content-Type header value, parameters allowed
        (eg. "text/html; charset=utf-8"), or `None`
    :return: the derived filename

    """
    fn = urlsplit(url).path.rstrip('/')
    fn = os.path.basename(fn) if fn else 'index'
    if '.' not in fn and content_type:
        # Keep only the media type. It is case-insensitive and may be
        # followed by whitespace before the ";", so normalise both or
        # the lookups below miss.
        content_type = content_type.split(';')[0].strip().lower()
        if content_type == 'text/plain':
            # mimetypes returns '.ksh'
            ext = '.txt'
        else:
            ext = mimetypes.guess_extension(content_type)

        if ext == '.htm':
            ext = '.html'

        if ext:
            fn += ext

    return fn
def trim_filename(filename: str, max_len: int) -> str:
    """
    Shorten `filename` to at most `max_len` characters.

    The extension is kept when the part before it is long enough to
    absorb the whole cut; otherwise the tail of the full name is cut.

    """
    excess = len(filename) - max_len
    if excess <= 0:
        return filename
    stem, extension = os.path.splitext(filename)
    if excess < len(stem):
        return stem[:-excess] + extension
    return filename[:-excess]
def get_filename_max_length(directory: str) -> int:
    """
    Return the longest filename allowed inside `directory`.

    Asks `os.pathconf` for `PC_NAME_MAX` where that is available and
    falls back to 255 elsewhere.

    """
    can_query = hasattr(os, 'pathconf') and 'PC_NAME_MAX' in os.pathconf_names
    if not can_query:
        return 255
    return os.pathconf(directory, 'PC_NAME_MAX')
def trim_filename_if_needed(filename: str, directory='.', extra=0) -> str:
    """
    Trim `filename` so that it fits into `directory`.

    :param filename: the candidate filename
    :param directory: directory whose filename length limit applies
    :param extra: number of characters to keep free below that limit
    :return: `filename`, shortened only if it exceeds the limit

    """
    limit = get_filename_max_length(directory) - extra
    if len(filename) <= limit:
        return filename
    return trim_filename(filename, limit)
def get_unique_filename(filename: str, exists=os.path.exists) -> str:
    """
    Return `filename`, or a `-N` suffixed variant of it, that is unused.

    The plain name is tried first, then `-1`, `-2`, ... are appended
    until `exists` reports the candidate as free. The name is trimmed
    beforehand to leave room for the suffix.

    :param filename: the preferred filename
    :param exists: predicate telling whether a path is already taken

    """
    counter = 0
    suffix = ''
    while True:
        candidate = trim_filename_if_needed(filename, extra=len(suffix)) + suffix
        if not exists(candidate):
            return candidate
        counter += 1
        suffix = f'-{counter}'
class Downloader:
    """
    Coordinates one download.

    Adjusts the request headers (including `Range` when resuming),
    selects or prepares the output file, tracks progress in a
    `DownloadStatus` and reports it through a `ProgressReporterThread`.

    """

    def __init__(
        self,
        output_file: Optional[IO] = None,
        resume: bool = False,
        progress_file: IO = sys.stderr
    ):
        """
        :param resume: Should the download resume if partial download
                       already exists.

        :param output_file: The file to store response body in. If not
                            provided, it will be guessed from the response.

        :param progress_file: Where to report download progress.

        """
        self.finished = False
        self.status = DownloadStatus()
        self._output_file = output_file
        self._resume = resume
        self._resumed_from = 0
        self._progress_reporter = ProgressReporterThread(
            status=self.status,
            output=progress_file
        )

    def pre_request(self, request_headers: dict):
        """Called just before the HTTP request is sent.

        Might alter `request_headers`.

        """
        # Ask the server not to encode the content so that we can resume, etc.
        request_headers['Accept-Encoding'] = 'identity'
        if self._resume:
            bytes_have = os.path.getsize(self._output_file.name)
            if bytes_have:
                # Set ``Range`` header to resume the download
                # TODO: Use "If-Range: mtime" to make sure it's fresh?
                request_headers['Range'] = f'bytes={bytes_have}-'
                self._resumed_from = bytes_have

    def start(
        self,
        initial_url: str,
        final_response: requests.Response
    ) -> Tuple[RawStream, IO]:
        """
        Initiate and return a stream for `response` body with progress
        callback attached. Can be called only once.

        :param initial_url: The original requested URL
        :param final_response: Initiated response object with headers already fetched

        :return: RawStream, output_file

        """
        # Same "not started yet" check that `DownloadStatus.started` makes.
        assert self.status.time_started is None

        # FIXME: some servers still might send Content-Encoding: gzip
        # <https://github.com/httpie/httpie/issues/423>
        try:
            total_size = int(final_response.headers['Content-Length'])
        except (KeyError, ValueError, TypeError):
            total_size = None

        if not self._output_file:
            self._output_file = self._get_output_file_from_response(
                initial_url=initial_url,
                final_response=final_response,
            )
        else:
            # `--output, -o` provided
            if self._resume and final_response.status_code == PARTIAL_CONTENT:
                # Content-Length only covers the remaining part; the full
                # size comes from the validated Content-Range.
                total_size = parse_content_range(
                    final_response.headers.get('Content-Range'),
                    self._resumed_from
                )
            else:
                # Not a partial response: start over and discard whatever
                # the file already holds.
                self._resumed_from = 0
                try:
                    self._output_file.seek(0)
                    self._output_file.truncate()
                except OSError:
                    pass  # stdout

        self.status.started(
            resumed_from=self._resumed_from,
            total_size=total_size
        )

        stream = RawStream(
            msg=HTTPResponse(final_response),
            with_headers=False,
            with_body=True,
            on_body_chunk_downloaded=self.chunk_downloaded,
        )

        self._progress_reporter.output.write(
            f'Downloading {humanize_bytes(total_size) + " " if total_size is not None else ""}'
            f'to "{self._output_file.name}"\n'
        )
        self._progress_reporter.start()

        return stream, self._output_file

    def finish(self):
        """Mark the download as finished. Can be called only once."""
        assert not self.finished
        self.finished = True
        self.status.finished()

    def failed(self):
        """Ask the progress reporter to stop."""
        self._progress_reporter.stop()

    @property
    def interrupted(self) -> bool:
        """
        `True` if the download finished with a known total size that
        differs from the number of bytes downloaded.

        """
        # bool() because the bare `and` chain would hand back None or an
        # int rather than the annotated bool.
        return bool(
            self.finished
            and self.status.total_size
            and self.status.total_size != self.status.downloaded
        )

    def chunk_downloaded(self, chunk: bytes):
        """
        A download progress callback.

        :param chunk: A chunk of response body data that has just
                      been downloaded and written to the output.

        """
        self.status.chunk_downloaded(len(chunk))

    @staticmethod
    def _get_output_file_from_response(
        initial_url: str,
        final_response: requests.Response,
    ) -> IO:
        """
        Open a new output file named after the response.

        The name comes from Content-Disposition when that yields a valid
        filename, otherwise from the URL and Content-Type.

        """
        # Output file not specified. Pick a name that doesn't exist yet.
        filename = None
        if 'Content-Disposition' in final_response.headers:
            filename = filename_from_content_disposition(
                final_response.headers['Content-Disposition'])
        if not filename:
            filename = filename_from_url(
                url=initial_url,
                content_type=final_response.headers.get('Content-Type'),
            )
        unique_filename = get_unique_filename(filename)
        return open(unique_filename, buffering=0, mode='a+b')
2013-03-24 15:23:18 +01:00
class DownloadStatus:
    """Holds details about the download status."""

    def __init__(self):
        # Both timestamps are `monotonic()` readings; None until set.
        self.time_started = None
        self.time_finished = None
        # None while the total size is unknown.
        self.total_size = None
        self.resumed_from = 0
        # Byte count including anything present before a resume.
        self.downloaded = 0

    def started(self, resumed_from=0, total_size=None):
        """Record the start of the download. Can be called only once."""
        assert self.time_started is None
        self.total_size = total_size
        self.resumed_from = resumed_from
        self.downloaded = resumed_from
        self.time_started = monotonic()

    def chunk_downloaded(self, size):
        """Add `size` bytes to the downloaded count."""
        assert self.time_finished is None
        self.downloaded += size

    @property
    def has_finished(self):
        """Whether `finished()` has been called."""
        finish_time = self.time_finished
        return finish_time is not None

    def finished(self):
        """Record the end of the download. Can be called only once."""
        assert self.time_started is not None
        assert self.time_finished is None
        self.time_finished = monotonic()
2013-03-24 15:23:18 +01:00
class ProgressReporterThread(threading.Thread):
    """
    Reports download progress based on its status.

    Uses threading to periodically update the status (speed, ETA, etc.).

    """

    def __init__(
        self,
        status: DownloadStatus,
        output: IO,
        tick=.1,
        update_interval=1
    ):
        """
        :param status: the download status to report on
        :param output: stream the progress lines are written to
        :param tick: seconds to sleep between redraws
        :param update_interval: minimum number of seconds between
            recalculations of the speed/ETA figures

        """
        super().__init__()
        self.status = status
        self.output = output
        self._tick = tick
        self._update_interval = update_interval
        self._spinner_pos = 0
        self._status_line = ''
        self._prev_bytes = 0
        self._prev_time = monotonic()
        self._should_stop = threading.Event()

    def stop(self):
        """Stop reporting on next tick."""
        self._should_stop.set()

    def run(self):
        """Redraw the progress line every tick until stopped or finished."""
        # Take the speed baseline when reporting starts. Otherwise the
        # first sample counts bytes already present before a resume as
        # downloaded, and its interval includes the time since this
        # object was constructed.
        self._prev_bytes = self.status.downloaded
        self._prev_time = monotonic()
        while not self._should_stop.is_set():
            if self.status.has_finished:
                self.sum_up()
                break

            self.report_speed()
            sleep(self._tick)

    def report_speed(self):
        """
        Redraw the progress line.

        The figures are recalculated only once `update_interval` seconds
        have passed since the previous calculation; the spinner advances
        on every call.

        """
        now = monotonic()
        elapsed = now - self._prev_time

        # `elapsed > 0` guards the division when update_interval is 0.
        if elapsed > 0 and elapsed >= self._update_interval:
            downloaded = self.status.downloaded
            speed = (downloaded - self._prev_bytes) / elapsed

            if not self.status.total_size:
                self._status_line = PROGRESS_NO_CONTENT_LENGTH.format(
                    downloaded=humanize_bytes(downloaded),
                    speed=humanize_bytes(speed),
                )
            else:
                total_size = self.status.total_size
                percentage = downloaded / total_size * 100

                if not speed:
                    eta = '-:--:--'
                else:
                    # Never show a negative ETA if more bytes arrive than
                    # the announced total.
                    remaining = max(total_size - downloaded, 0)
                    s = int(remaining / speed)
                    h, s = divmod(s, 60 * 60)
                    m, s = divmod(s, 60)
                    eta = f'{h}:{m:0>2}:{s:0>2}'

                self._status_line = PROGRESS.format(
                    percentage=percentage,
                    downloaded=humanize_bytes(downloaded),
                    speed=humanize_bytes(speed),
                    eta=eta,
                )

            self._prev_time = now
            self._prev_bytes = downloaded

        self.output.write(
            f'{CLEAR_LINE} {SPINNER[self._spinner_pos]} {self._status_line}'
        )
        self.output.flush()

        self._spinner_pos = (self._spinner_pos + 1) % len(SPINNER)

    def sum_up(self):
        """Replace the progress line with the final summary."""
        # Bytes present before a resume do not count towards this run.
        actually_downloaded = (
            self.status.downloaded - self.status.resumed_from)
        time_taken = self.status.time_finished - self.status.time_started
        speed = actually_downloaded / time_taken if time_taken else actually_downloaded

        self.output.write(CLEAR_LINE)
        self.output.write(SUMMARY.format(
            downloaded=humanize_bytes(actually_downloaded),
            total=(self.status.total_size
                   and humanize_bytes(self.status.total_size)),
            speed=humanize_bytes(speed),
            time=time_taken,
        ))
        self.output.flush()