Telegram escaping completely refactored (#386)

2025-06-22 18:51:42 +02:00 · 2021-05-15 16:08:53 -04:00 · 2021-05-15 16:08:53 -04:00 · 59aa5f5d10
commit 59aa5f5d10
parent 7f7ee043d9
2 changed files with 75 additions and 25 deletions
--- a/apprise/plugins/NotifyTelegram.py
+++ b/apprise/plugins/NotifyTelegram.py
@ -524,39 +524,73 @@ class NotifyTelegram(NotifyBase):
                body,
            )

-        elif self.notify_format == NotifyFormat.HTML:
+        else:  # HTML or TEXT
+
+            # Use Telegram's HTML mode
            payload['parse_mode'] = 'HTML'

+            # Telegram's HTML support doesn't like having HTML escaped
+            # characters passed into it.  To handle this situation, we need to
+            # search the body for these sequences and convert them to the
+            # output the user expected.
+            telegram_escape_html_dict = {
                # HTML Spaces (&nbsp;) and tabs (&emsp;) aren't supported
                # See https://core.telegram.org/bots/api#html-style
-            body = re.sub('&nbsp;?', ' ', body, re.I)
+                r'nbsp': ' ',

                # Tabs become 3 spaces
-            body = re.sub('&emsp;?', '   ', body, re.I)
+                r'emsp': '   ',
+
+                # Some characters get re-escaped by the Telegram upstream
+                # service so we need to convert these back,
+                r'apos': '\'',
+                r'quot': '"',
+            }
+
+            # Create a regular expression from the dictionary keys
+            html_regex = re.compile("&(%s);?" % "|".join(
+                map(re.escape, telegram_escape_html_dict.keys())).lower(),
+                re.I)
+
+            # For each match, look up its replacement in the dictionary.
+            # The [1:5] slice skips the leading '&' and keeps only the
+            # 4-character entity name, so '&apos;' and '&apos' (no
+            # semi-colon) both resolve.  NOTE(review): matching is
+            # case-insensitive but the keys are lowercase - confirm.
+            body = html_regex.sub(  # pragma: no branch
+                lambda mo: telegram_escape_html_dict[
+                    mo.string[mo.start():mo.end()][1:5]], body)

            if title:
-                # HTML Spaces (&nbsp;) and tabs (&emsp;) aren't supported
-                # See https://core.telegram.org/bots/api#html-style
-                title = re.sub('&nbsp;?', ' ', title, re.I)
+                # For each match, look-up corresponding value in dictionary
+                # Indexing is explained above (for how the body is parsed)
+                title = html_regex.sub(  # pragma: no branch
+                    lambda mo: telegram_escape_html_dict[
+                        mo.string[mo.start():mo.end()][1:5]], title)

-                # Tabs become 3 spaces
-                title = re.sub('&emsp;?', '   ', title, re.I)
+            if self.notify_format == NotifyFormat.TEXT:
+                telegram_escape_text_dict = {
+                    # We need to escape characters that conflict with html
+                    # entity blocks (< and >) when displaying text
+                    r'>': '&gt;',
+                    r'<': '&lt;',
+                }

-            payload['text'] = '{}{}'.format(
-                '<b>{}</b>\r\n'.format(title) if title else '',
-                body,
-            )
+                # Create a regular expression from the dictionary keys
+                text_regex = re.compile("(%s)" % "|".join(
+                    map(re.escape, telegram_escape_text_dict.keys())).lower(),
+                    re.I)

-        else:  # pass directly as is...
-            payload['parse_mode'] = 'HTML'
+                # For each match, look-up corresponding value in dictionary
+                body = text_regex.sub(  # pragma: no branch
+                    lambda mo: telegram_escape_text_dict[
+                        mo.string[mo.start():mo.end()]], body)

-            # Telegram strangely escapes all HTML characters for us already
-            # but to avoid causing issues with HTML, we escape the < and >
-            # characters
-            title = re.sub('>', '&gt;', title, re.I)
-            title = re.sub('<', '&lt;', title, re.I)
-            body = re.sub('>', '&gt;', body, re.I)
-            body = re.sub('<', '&lt;', body, re.I)
+                if title:
+                    # For each match, look-up corresponding value in dictionary
+                    title = text_regex.sub(  # pragma: no branch
+                        lambda mo: telegram_escape_text_dict[
+                            mo.string[mo.start():mo.end()]], title)

            payload['text'] = '{}{}'.format(
                '<b>{}</b>\r\n'.format(title) if title else '',
--- a/test/test_telegram.py
+++ b/test/test_telegram.py
@ -29,6 +29,7 @@ import pytest
 import mock
 import requests
 from json import dumps
+from json import loads
 from apprise import Apprise
 from apprise import AppriseAttachment
 from apprise import AppriseAsset
@ -202,11 +203,26 @@ def test_notify_telegram_plugin(mock_post, mock_get):
    })
    mock_post.return_value.status_code = requests.codes.ok

-    # Test sending attachments
    obj = plugins.NotifyTelegram(bot_token=bot_token, targets='12345')
    assert len(obj.targets) == 1
    assert obj.targets[0] == '12345'

+    # Test the escaping of characters; Telegram escapes some characters for
+    # us, which we need to take into account
+    mock_post.reset_mock()
+    body = "<p>\'\"This can't\t\r\nfail&nbsp;us\"\'</p>"
+    assert obj.notify(
+        body=body, title='special characters',
+        notify_type=NotifyType.INFO) is True
+    assert mock_post.call_count == 1
+    payload = loads(mock_post.call_args_list[0][1]['data'])
+
+    # Our special characters are escaped properly
+    assert payload['text'] == \
+        '<b>special characters</b>\r\n&lt;p&gt;'\
+        '\'"This can\'t\t\r\nfail us"\'&lt;/p&gt;'
+
+    # Test sending attachments
    attach = AppriseAttachment(os.path.join(TEST_VAR_DIR, 'apprise-test.gif'))
    assert obj.notify(
        body='body', title='title', notify_type=NotifyType.INFO,