Improved async/threading attachment support (#1086)

Chris Caron, 2024-03-29 14:42:28 -04:00 (committed by GitHub)
parent 195d0efe3c · commit 81804704da
3 changed files with 210 additions and 122 deletions

apprise/attachment/AttachBase.py

@@ -253,7 +253,7 @@ class AttachBase(URLBase):
         return self.detected_mimetype \
             if self.detected_mimetype else self.unknown_mimetype
 
-    def exists(self):
+    def exists(self, retrieve_if_missing=True):
        """
        Simply returns true if the object has downloaded and stored the
        attachment AND the attachment has not expired.
@@ -282,7 +282,7 @@ class AttachBase(URLBase):
             # The file is not present
             pass
 
-        return self.download()
+        return False if not retrieve_if_missing else self.download()
 
     def invalidate(self):
        """

apprise/attachment/AttachHTTP.py

@@ -29,6 +29,7 @@
 import re
 import os
 import requests
+import threading
 from tempfile import NamedTemporaryFile
 from .AttachBase import AttachBase
 from ..common import ContentLocation
@@ -56,6 +57,9 @@ class AttachHTTP(AttachBase):
     # Web based requests are remote/external to our current location
     location = ContentLocation.HOSTED
 
+    # thread safe loading
+    _lock = threading.Lock()
+
     def __init__(self, headers=None, **kwargs):
         """
         Initialize HTTP Object
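
Worth noting: `_lock` is a class attribute, so a single lock is shared by every AttachHTTP instance in the process. Apprise's worker threads may all race into `download()` below, but only one of them at a time can perform (or re-check) a retrieval.
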
@@ -96,9 +100,6 @@ class AttachHTTP(AttachBase):
             # our content is inaccessible
             return False
 
-        # Ensure any existing content set has been invalidated
-        self.invalidate()
-
         # prepare header
         headers = {
             'User-Agent': self.app_id,
@@ -117,134 +118,154 @@ class AttachHTTP(AttachBase):
         url += self.fullpath
 
-        self.logger.debug('HTTP POST URL: %s (cert_verify=%r)' % (
-            url, self.verify_certificate,
-        ))
-
         # Where our request object will temporarily live.
         r = None
 
         # Always call throttle before any remote server i/o is made
         self.throttle()
 
-        try:
-            # Make our request
-            with requests.get(
-                    url,
-                    headers=headers,
-                    auth=auth,
-                    params=self.qsd,
-                    verify=self.verify_certificate,
-                    timeout=self.request_timeout,
-                    stream=True) as r:
-
-                # Handle Errors
-                r.raise_for_status()
-
-                # Get our file-size (if known)
-                try:
-                    file_size = int(r.headers.get('Content-Length', '0'))
-
-                except (TypeError, ValueError):
-                    # Handle edge case where Content-Length is a bad value
-                    file_size = 0
-
-                # Perform a little Q/A on file limitations and restrictions
-                if self.max_file_size > 0 and file_size > self.max_file_size:
-
-                    # The content retrieved is to large
-                    self.logger.error(
-                        'HTTP response exceeds allowable maximum file length '
-                        '({}KB): {}'.format(
-                            int(self.max_file_size / 1024),
-                            self.url(privacy=True)))
-
-                    # Return False (signifying a failure)
-                    return False
-
-                # Detect config format based on mime if the format isn't
-                # already enforced
-                self.detected_mimetype = r.headers.get('Content-Type')
-
-                d = r.headers.get('Content-Disposition', '')
-                result = re.search(
-                    "filename=['\"]?(?P<name>[^'\"]+)['\"]?", d, re.I)
-                if result:
-                    self.detected_name = result.group('name').strip()
-
-                # Create a temporary file to work with
-                self._temp_file = NamedTemporaryFile()
-
-                # Get our chunk size
-                chunk_size = self.chunk_size
-
-                # Track all bytes written to disk
-                bytes_written = 0
-
-                # If we get here, we can now safely write our content to disk
-                for chunk in r.iter_content(chunk_size=chunk_size):
-                    # filter out keep-alive chunks
-                    if chunk:
-                        self._temp_file.write(chunk)
-                        bytes_written = self._temp_file.tell()
-
-                        # Prevent a case where Content-Length isn't provided
-                        # we don't want to fetch beyond our limits
-                        if self.max_file_size > 0:
-                            if bytes_written > self.max_file_size:
-                                # The content retrieved is to large
-                                self.logger.error(
-                                    'HTTP response exceeds allowable maximum '
-                                    'file length ({}KB): {}'.format(
-                                        int(self.max_file_size / 1024),
-                                        self.url(privacy=True)))
-
-                                # Invalidate any variables previously set
-                                self.invalidate()
-
-                                # Return False (signifying a failure)
-                                return False
-
-                            elif bytes_written + chunk_size \
-                                    > self.max_file_size:
-                                # Adjust out next read to accomodate up to our
-                                # limit +1. This will prevent us from readig
-                                # to much into our memory buffer
-                                self.max_file_size - bytes_written + 1
-
-                # Ensure our content is flushed to disk for post-processing
-                self._temp_file.flush()
-
-            # Set our minimum requirements for a successful download() call
-            self.download_path = self._temp_file.name
-            if not self.detected_name:
-                self.detected_name = os.path.basename(self.fullpath)
-
-        except requests.RequestException as e:
-            self.logger.error(
-                'A Connection error occurred retrieving HTTP '
-                'configuration from %s.' % self.host)
-            self.logger.debug('Socket Exception: %s' % str(e))
-
-            # Invalidate any variables previously set
-            self.invalidate()
-
-            # Return False (signifying a failure)
-            return False
-
-        except (IOError, OSError):
-            # IOError is present for backwards compatibility with Python
-            # versions older then 3.3.  >= 3.3 throw OSError now.
-
-            # Could not open and/or write the temporary file
-            self.logger.error(
-                'Could not write attachment to disk: {}'.format(
-                    self.url(privacy=True)))
-
-            # Invalidate any variables previously set
-            self.invalidate()
-
-            # Return False (signifying a failure)
-            return False
+        with self._lock:
+            if self.exists(retrieve_if_missing=False):
+                # Due to locking; it's possible a concurrent thread already
+                # handled the retrieval in which case we can safely move on
+                self.logger.trace(
+                    'HTTP Attachment %s already retrieved',
+                    self._temp_file.name)
+                return True
+
+            # Ensure any existing content set has been invalidated
+            self.invalidate()
+
+            self.logger.debug(
+                'HTTP Attachment Fetch URL: %s (cert_verify=%r)' % (
+                    url, self.verify_certificate))
+
+            try:
+                # Make our request
+                with requests.get(
+                        url,
+                        headers=headers,
+                        auth=auth,
+                        params=self.qsd,
+                        verify=self.verify_certificate,
+                        timeout=self.request_timeout,
+                        stream=True) as r:
+
+                    # Handle Errors
+                    r.raise_for_status()
+
+                    # Get our file-size (if known)
+                    try:
+                        file_size = int(r.headers.get('Content-Length', '0'))
+
+                    except (TypeError, ValueError):
+                        # Handle edge case where Content-Length is a bad value
+                        file_size = 0
+
+                    # Perform a little Q/A on file limitations and restrictions
+                    if self.max_file_size > 0 and \
+                            file_size > self.max_file_size:
+
+                        # The content retrieved is to large
+                        self.logger.error(
+                            'HTTP response exceeds allowable maximum file '
+                            'length ({}KB): {}'.format(
+                                int(self.max_file_size / 1024),
+                                self.url(privacy=True)))
+
+                        # Return False (signifying a failure)
+                        return False
+
+                    # Detect config format based on mime if the format isn't
+                    # already enforced
+                    self.detected_mimetype = r.headers.get('Content-Type')
+
+                    d = r.headers.get('Content-Disposition', '')
+                    result = re.search(
+                        "filename=['\"]?(?P<name>[^'\"]+)['\"]?", d, re.I)
+                    if result:
+                        self.detected_name = result.group('name').strip()
+
+                    # Create a temporary file to work with; delete must be set
+                    # to False or it isn't compatible with Microsoft Windows
+                    # instances. In lieu of this, __del__ will clean up the
+                    # file for us.
+                    self._temp_file = NamedTemporaryFile(delete=False)
+
+                    # Get our chunk size
+                    chunk_size = self.chunk_size
+
+                    # Track all bytes written to disk
+                    bytes_written = 0
+
+                    # If we get here, we can now safely write our content to
+                    # disk
+                    for chunk in r.iter_content(chunk_size=chunk_size):
+                        # filter out keep-alive chunks
+                        if chunk:
+                            self._temp_file.write(chunk)
+                            bytes_written = self._temp_file.tell()
+
+                            # Prevent a case where Content-Length isn't
+                            # provided. In this case we don't want to fetch
+                            # beyond our limits
+                            if self.max_file_size > 0:
+                                if bytes_written > self.max_file_size:
+                                    # The content retrieved is to large
+                                    self.logger.error(
+                                        'HTTP response exceeds allowable '
+                                        'maximum file length '
+                                        '({}KB): {}'.format(
+                                            int(self.max_file_size / 1024),
+                                            self.url(privacy=True)))
+
+                                    # Invalidate any variables previously set
+                                    self.invalidate()
+
+                                    # Return False (signifying a failure)
+                                    return False
+
+                                elif bytes_written + chunk_size \
+                                        > self.max_file_size:
+                                    # Adjust out next read to accomodate up to
+                                    # our limit +1. This will prevent us from
+                                    # reading to much into our memory buffer
+                                    self.max_file_size - bytes_written + 1
+
+                    # Ensure our content is flushed to disk for post-processing
+                    self._temp_file.flush()
+
+                # Set our minimum requirements for a successful download()
+                # call
+                self.download_path = self._temp_file.name
+                if not self.detected_name:
+                    self.detected_name = os.path.basename(self.fullpath)
+
+            except requests.RequestException as e:
+                self.logger.error(
+                    'A Connection error occurred retrieving HTTP '
+                    'configuration from %s.' % self.host)
+                self.logger.debug('Socket Exception: %s' % str(e))
+
+                # Invalidate any variables previously set
+                self.invalidate()
+
+                # Return False (signifying a failure)
+                return False
+
+            except (IOError, OSError):
+                # IOError is present for backwards compatibility with Python
+                # versions older then 3.3.  >= 3.3 throw OSError now.
+
+                # Could not open and/or write the temporary file
+                self.logger.error(
+                    'Could not write attachment to disk: {}'.format(
+                        self.url(privacy=True)))
+
+                # Invalidate any variables previously set
+                self.invalidate()
+
+                # Return False (signifying a failure)
+                return False
 
         # Return our success
         return True
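
One pre-existing quirk is carried over verbatim: the body of the final `elif`, `self.max_file_size - bytes_written + 1`, computes a value and discards it, so the next read is never actually adjusted (the oversize check on the following iteration still catches the overrun). Rebinding the local `chunk_size` would not help either, since `iter_content()` fixed its chunk size when the generator was created. If capping the write is the goal, trimming each chunk before writing is one way to get there; a hypothetical sketch, not part of this commit:

    def write_capped(response, out_file, chunk_size, max_file_size):
        """Stream a requests response into out_file, never writing more
        than max_file_size + 1 bytes; returns the byte count written."""
        bytes_written = 0
        for chunk in response.iter_content(chunk_size=chunk_size):
            # filter out keep-alive chunks
            if not chunk:
                continue
            if max_file_size > 0:
                # trim so at most one byte past the cap lands on disk;
                # the caller's oversize check can then fire reliably
                chunk = chunk[:max_file_size + 1 - bytes_written]
            out_file.write(chunk)
            bytes_written += len(chunk)
            if max_file_size > 0 and bytes_written > max_file_size:
                break
        return bytes_written
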
@@ -254,11 +275,30 @@ class AttachHTTP(AttachBase):
         Close our temporary file
         """
         if self._temp_file:
+            self.logger.trace(
+                'Attachment cleanup of %s', self._temp_file.name)
             self._temp_file.close()
+
+            try:
+                # Ensure our file is removed (if it exists)
+                os.unlink(self._temp_file.name)
+
+            except OSError:
+                pass
+
+            # Reset our temporary file to prevent from entering
+            # this block again
             self._temp_file = None
 
         super().invalidate()
 
+    def __del__(self):
+        """
+        Tidy memory if open
+        """
+        with self._lock:
+            self.invalidate()
+
     def url(self, privacy=False, *args, **kwargs):
         """
         Returns the URL built dynamically based on specified arguments.
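
The overall shape is a double-checked cache: `exists()` gives the cheap first look, `download()` takes the lock and looks again before doing any i/o, and cleanup shifts from `NamedTemporaryFile`'s delete-on-close to an explicit `os.unlink()` so the file stays usable on Windows. Distilled into a minimal, self-contained sketch (simplified names; not the verbatim Apprise code):

    import os
    import tempfile
    import threading


    class CachedFetcher:
        # one class-wide lock, mirroring AttachHTTP._lock
        _lock = threading.Lock()

        def __init__(self):
            self.download_path = None

        def exists(self, retrieve_if_missing=True):
            # first check; normally performed outside of the lock
            if self.download_path:
                return True
            return self.download() if retrieve_if_missing else False

        def download(self):
            with self._lock:
                # second check: another thread may have finished the
                # fetch while we waited on the lock; probe-only, so we
                # do not recurse back into download()
                if self.exists(retrieve_if_missing=False):
                    return True

                # delete=False keeps the path usable on Windows;
                # removal is explicit (Apprise does this through
                # invalidate()/__del__)
                with tempfile.NamedTemporaryFile(delete=False) as f:
                    f.write(self._fetch())
                    self.download_path = f.name
                return True

        def _fetch(self):
            # stand-in for the real network i/o
            return b'payload'

        def __del__(self):
            if self.download_path:
                try:
                    os.unlink(self.download_path)
                except OSError:
                    pass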

test/test_attach_http.py

@@ -35,7 +35,7 @@ from os.path import join
 from os.path import dirname
 from os.path import getsize
 from apprise.attachment.AttachHTTP import AttachHTTP
-from apprise import AppriseAttachment
+from apprise import Apprise, AppriseAttachment
 from apprise.NotificationManager import NotificationManager
 from apprise.plugins.NotifyBase import NotifyBase
 from apprise.common import ContentLocation
@@ -113,8 +113,9 @@ def test_attach_http_query_string_dictionary():
     assert re.search(r'[?&]_var=test', obj.url())
 
 
+@mock.patch('requests.post')
 @mock.patch('requests.get')
-def test_attach_http(mock_get):
+def test_attach_http(mock_get, mock_post):
     """
     API: AttachHTTP() object
 
@@ -422,3 +423,50 @@ def test_attach_http(mock_get):
 
     # Restore value
     AttachHTTP.max_file_size = max_file_size
+
+    # Multi Message Testing
+    mock_get.side_effect = None
+    mock_get.return_value = DummyResponse()
+
+    # Prepare our POST response (from notify call)
+    response = requests.Request()
+    response.status_code = requests.codes.ok
+    response.content = ""
+    mock_post.return_value = response
+
+    mock_get.reset_mock()
+    mock_post.reset_mock()
+    assert mock_get.call_count == 0
+
+    apobj = Apprise()
+    assert apobj.add('form://localhost')
+    assert apobj.add('json://localhost')
+    assert apobj.add('xml://localhost')
+    assert len(apobj) == 3
+
+    assert apobj.notify(
+        body='one attachment split 3 times',
+        attach="http://localhost/test.gif",
+    ) is True
+    # We posted 3 times
+    assert mock_post.call_count == 3
+    # We only fetched once and re-used the same fetch for all posts
+    assert mock_get.call_count == 1
+
+    mock_get.reset_mock()
+    mock_post.reset_mock()
+
+    apobj = Apprise()
+    for n in range(10):
+        assert apobj.add(f'json://localhost?:entry={n}&method=post')
+        assert apobj.add(f'form://localhost?:entry={n}&method=post')
+        assert apobj.add(f'xml://localhost?:entry={n}&method=post')
+
+    assert apobj.notify(
+        body='one attachment split 30 times',
+        attach="http://localhost/test.gif",
+    ) is True
+    # We posted 30 times
+    assert mock_post.call_count == 30
+    # We only fetched once and re-used the same fetch for all posts
+    assert mock_get.call_count == 1
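
Stepping back, this is what the new lock buys: any number of workers can race to materialize the same attachment, and only the first performs network i/o. A hedged illustration (instantiating AttachHTTP directly with host/fullpath keyword arguments as the URL base class accepts them; the URL is a stand-in):

    import threading
    from apprise.attachment.AttachHTTP import AttachHTTP

    # hypothetical attachment; host/fullpath mirror what Apprise would
    # parse out of http://localhost/test.gif
    attachment = AttachHTTP(host='localhost', fullpath='/test.gif')

    # ten workers race to materialize the same attachment; the class
    # lock guarantees a single HTTP fetch, every other thread returns
    # the cached copy
    threads = [
        threading.Thread(target=attachment.download) for _ in range(10)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()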