Re-worked Telegram HTML/Markdown -> HTML Conversion (#579)

2025-02-18 19:21:06 +01:00 · 2022-05-01 14:43:55 -04:00 · 2022-05-01 14:43:55 -04:00 · fd0cb3ffcc
commit fd0cb3ffcc
parent 3c07d80975
5 changed files with 186 additions and 76 deletions
--- a/apprise/conversion.py
+++ b/apprise/conversion.py
@ -96,7 +96,9 @@ class HTMLConverter(HTMLParser, object):
                  'div', 'td', 'th', 'code', 'pre', 'label', 'li',)

    # the following tags ignore any internal text
-    IGNORE_TAGS = ('style', 'link', 'meta', 'title', 'html', 'head', 'script')
+    IGNORE_TAGS = (
+        'form', 'input', 'textarea', 'select', 'ul', 'ol', 'style', 'link',
+        'meta', 'title', 'html', 'head', 'script')

    # Condense Whitespace
    WS_TRIM = re.compile(r'[\s]+', re.DOTALL | re.MULTILINE)
--- a/apprise/plugins/NotifyTelegram.py
+++ b/apprise/plugins/NotifyTelegram.py
@ -177,44 +177,85 @@ class NotifyTelegram(NotifyBase):
    # characters passed into it.  to handle this situation, we need to
    # search the body for these sequences and convert them to the
    # output the user expected
-    __telegram_escape_html_dict = {
-        # New Lines
-        re.compile(r'<\s*/?br\s*/?>\r*\n?', re.I): '\r\n',
-        re.compile(r'<\s*/(br|p|div|li)[^>]*>\r*\n?', re.I): '\r\n',
-
-        # The following characters can be altered to become supported
-        re.compile(r'<\s*pre[^>]*>', re.I): '<code>',
-        re.compile(r'<\s*/pre[^>]*>', re.I): '</code>',
+    __telegram_escape_html_entries = (
+        # Comments
+        (re.compile(
+            r'\s*<!.+?-->\s*',
+            (re.I | re.M | re.S)), '', {}),

        # the following tags are not supported
-        re.compile(
-            r'<\s*(br|p|div|span|body|script|meta|html|font'
-            r'|label|iframe|li|ol|ul|source|script)[^>]*>', re.I): '',
+        (re.compile(
+            r'\s*<\s*(!?DOCTYPE|p|div|span|body|script|link|'
+            r'meta|html|font|head|label|form|input|textarea|select|iframe|'
+            r'source|script)([^a-z0-9>][^>]*)?>\s*',
+            (re.I | re.M | re.S)), '', {}),

-        re.compile(
-            r'<\s*/(span|body|script|meta|html|font'
-            r'|label|iframe|ol|ul|source|script)[^>]*>', re.I): '',
-
-        # Italic
-        re.compile(r'<\s*(caption|em)[^>]*>', re.I): '<i>',
-        re.compile(r'<\s*/(caption|em)[^>]*>', re.I): '</i>',
+        # All closing tags to be removed are put here
+        (re.compile(
+            r'\s*<\s*/(span|body|script|meta|html|font|head|'
+            r'label|form|input|textarea|select|ol|ul|link|'
+            r'iframe|source|script)([^a-z0-9>][^>]*)?>\s*',
+            (re.I | re.M | re.S)), '', {}),

        # Bold
-        re.compile(r'<\s*(h[1-6]|title|strong)[^>]*>', re.I): '<b>',
-        re.compile(r'<\s*/(h[1-6]|title|strong)[^>]*>', re.I): '</b>',
+        (re.compile(
+            r'<\s*(strong)([^a-z0-9>][^>]*)?>',
+            (re.I | re.M | re.S)), '<b>', {}),
+        (re.compile(
+            r'<\s*/\s*(strong)([^a-z0-9>][^>]*)?>',
+            (re.I | re.M | re.S)), '</b>', {}),
+        (re.compile(
+            r'\s*<\s*(h[1-6]|title)([^a-z0-9>][^>]*)?>\s*',
+            (re.I | re.M | re.S)), '{}<b>', {'html': '\r\n'}),
+        (re.compile(
+            r'\s*<\s*/\s*(h[1-6]|title)([^a-z0-9>][^>]*)?>\s*',
+            (re.I | re.M | re.S)),
+            '</b>{}', {'html': '<br/>'}),
+
+        # Italic
+        (re.compile(
+            r'<\s*(caption|em)([^a-z0-9>][^>]*)?>',
+            (re.I | re.M | re.S)), '<i>', {}),
+        (re.compile(
+            r'<\s*/\s*(caption|em)([^a-z0-9>][^>]*)?>',
+            (re.I | re.M | re.S)), '</i>', {}),
+
+        # Bullet Lists
+        (re.compile(
+            r'<\s*li([^a-z0-9>][^>]*)?>\s*',
+            (re.I | re.M | re.S)), ' -', {}),
+
+        # convert pre tags to code (supported by Telegram)
+        (re.compile(
+            r'<\s*pre([^a-z0-9>][^>]*)?>',
+            (re.I | re.M | re.S)), '{}<code>', {'html': '\r\n'}),
+        (re.compile(
+            r'<\s*/\s*pre([^a-z0-9>][^>]*)?>',
+            (re.I | re.M | re.S)), '</code>{}', {'html': '\r\n'}),
+
+        # New Lines
+        (re.compile(
+            r'\s*<\s*/?\s*(ol|ul|br|hr)\s*/?>\s*',
+            (re.I | re.M | re.S)), '\r\n', {}),
+        (re.compile(
+            r'\s*<\s*/\s*(br|p|hr|li|div)([^a-z0-9>][^>]*)?>\s*',
+            (re.I | re.M | re.S)), '\r\n', {}),

        # HTML Spaces (&nbsp;) and tabs (&emsp;) aren't supported
        # See https://core.telegram.org/bots/api#html-style
-        re.compile(r'\&nbsp;?', re.I): ' ',
+        (re.compile(r'\&nbsp;?', re.I), ' ', {}),

        # Tabs become 3 spaces
-        re.compile(r'\&emsp;?', re.I): '   ',
+        (re.compile(r'\&emsp;?', re.I), '   ', {}),

        # Some characters get re-escaped by the Telegram upstream
        # service so we need to convert these back,
-        re.compile(r'\&apos;?', re.I): '\'',
-        re.compile(r'\&quot;?', re.I): '"',
-    }
+        (re.compile(r'\&apos;?', re.I), '\'', {}),
+        (re.compile(r'\&quot;?', re.I), '"', {}),
+
+        # New line cleanup
+        (re.compile(r'\r*\n[\r\n]+', re.I), '\r\n', {}),
+    )

    # Define our template tokens
    template_tokens = dict(NotifyBase.template_tokens, **{
@ -597,38 +638,19 @@ class NotifyTelegram(NotifyBase):

            # Use Telegram's HTML mode
            payload['parse_mode'] = 'HTML'
-            for r, v in self.__telegram_escape_html_dict.items():
-                body = r.sub(v, body, re.I)
+            for r, v, m in self.__telegram_escape_html_entries:
+
+                if 'html' in m:
+                    # Handle special cases where we need to alter new lines
+                    # for presentation purposes
+                    v = v.format(m['html'] if body_format in (
+                        NotifyFormat.HTML, NotifyFormat.MARKDOWN) else '')
+
+                body = r.sub(v, body)

            # Prepare our payload based on HTML or TEXT
            payload['text'] = body

-        # else:  # self.notify_format == NotifyFormat.TEXT:
-        #     # Use Telegram's HTML mode
-        #     payload['parse_mode'] = 'HTML'
-
-        #     # Further html escaping required...
-        #     telegram_escape_text_dict = {
-        #         # We need to escape characters that conflict with html
-        #         # entity blocks (< and >) when displaying text
-        #         r'>': '&gt;',
-        #         r'<': '&lt;',
-        #         r'\&': '&amp;',
-        #     }
-
-        #     # Create a regular expression from the dictionary keys
-        #     text_regex = re.compile("(%s)" % "|".join(
-        #         map(re.escape, telegram_escape_text_dict.keys())).lower(),
-        #         re.I)
-
-        #     # For each match, look-up corresponding value in dictionary
-        #     body = text_regex.sub(  # pragma: no branch
-        #         lambda mo: telegram_escape_text_dict[
-        #             mo.string[mo.start():mo.end()]], body)
-
-        #     # prepare our payload based on HTML or TEXT
-        #     payload['text'] = body
-
        # Create a copy of the chat_ids list
        targets = list(self.targets)
        while len(targets):
--- a/test/test_conversion.py
+++ b/test/test_conversion.py
@ -22,7 +22,6 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
-
 from apprise import NotifyFormat
 from apprise.conversion import convert_between
 import pytest
--- a/test/test_plugin_telegram.py
+++ b/test/test_plugin_telegram.py
@ -625,11 +625,10 @@ def test_plugin_telegram_formating_py3(mock_post):

    # Test that everything is escaped properly in a TEXT mode
    assert payload['text'] == \
-        '<b>🚨 Change detected&nbsp;for&nbsp;&lt;i&gt;Apprise&nbsp;' \
-        'Test&nbsp;Title&lt;/i&gt;</b>\r\n&lt;a&nbsp;href=' \
-        '"http://localhost"&gt;&lt;i&gt;Apprise&nbsp;Body&nbsp;Title&lt;' \
-        '/i&gt;&lt;/a&gt;&nbsp;had&nbsp;&lt;a&nbsp;href=&quot;http://' \
-        '127.0.0.1&quot;&gt;a&nbsp;change&lt;/a&gt;'
+        '<b>🚨 Change detected for &lt;i&gt;Apprise Test Title&lt;/i&gt;' \
+        '</b>\r\n&lt;a href="http://localhost"&gt;&lt;i&gt;' \
+        'Apprise Body Title&lt;/i&gt;&lt;/a&gt; had &lt;' \
+        'a href="http://127.0.0.1"&gt;a change&lt;/a&gt;'

    # Reset our values
    mock_post.reset_mock()
@ -718,8 +717,9 @@ def test_plugin_telegram_formating_py3(mock_post):

    # Test that everything is escaped properly in a HTML mode
    assert payload['text'] == \
-        '<b><b>🚨 Another Change detected for <i>Apprise Test Title</i>' \
-        '</b></b>\r\n<i><a href="http://localhost">Apprise Body Title</a>' \
+        '<b>\r\n<b>🚨 Another Change detected for ' \
+        '<i>Apprise Test Title</i></b>\r\n</b>\r\n<i>' \
+        '<a href="http://localhost">Apprise Body Title</a>' \
        '</i> had <a href="http://127.0.0.2">a change</a>\r\n'

    # Now we'll test an edge case where a title was defined, but after
@ -881,11 +881,11 @@ def test_plugin_telegram_formating_py2(mock_post):

    # Test that everything is escaped properly in a TEXT mode
    assert payload['text'].encode('utf-8') == \
-        '<b>\xf0\x9f\x9a\xa8 Change detected&nbsp;for&nbsp;&lt;i&gt;' \
-        'Apprise&nbsp;Test&nbsp;Title&lt;/i&gt;</b>\r\n&lt;a&nbsp;' \
-        'href="http://localhost"&gt;&lt;i&gt;Apprise&nbsp;Body&nbsp;' \
-        'Title&lt;/i&gt;&lt;/a&gt;&nbsp;had&nbsp;&lt;a&nbsp;href=&quot;' \
-        'http://127.0.0.1&quot;&gt;a&nbsp;change&lt;/a&gt;'
+        '<b>\xf0\x9f\x9a\xa8 Change detected for &lt;i&gt;' \
+        'Apprise Test Title&lt;/i&gt;</b>\r\n&lt;' \
+        'a href="http://localhost"&gt;&lt;i&gt;Apprise Body Title' \
+        '&lt;/i&gt;&lt;/a&gt; had &lt;a href="http://127.0.0.1"' \
+        '&gt;a change&lt;/a&gt;'

    # Reset our values
    mock_post.reset_mock()
@ -969,9 +969,9 @@ def test_plugin_telegram_formating_py2(mock_post):

    # Test that everything is escaped properly in a HTML mode
    assert payload['text'].encode('utf-8') == \
-        '<b><b>\xf0\x9f\x9a\xa8 Change detected for ' \
-        '<i>Apprise Test Title</i></b></b>\r\n<i>' \
-        '<a href="http://localhost">Apprise Body Title</a>'\
+        '<b>\r\n<b>\xf0\x9f\x9a\xa8 Change detected for ' \
+        '<i>Apprise Test Title</i></b>\r\n</b>\r\n<i>' \
+        '<a href="http://localhost">Apprise Body Title</a>' \
        '</i> had <a href="http://127.0.0.1">a change</a>\r\n'

    # Reset our values
@ -1163,8 +1163,8 @@ def test_plugin_telegram_html_formatting(mock_post):

    # Test that everything is escaped properly in a HTML mode
    assert payload['text'] == \
-        '<b><b>\'information\'</b></b>\r\n<i>"This is in Italic"' \
-        '</i>\r\n<b>      Headings are dropped and converted to bold</b>'
+        '<b>\r\n<b>\'information\'</b>\r\n</b>\r\n<i>"This is in Italic"' \
+        '</i>\r\n<b>      Headings are dropped and converted to bold</b>\r\n'

    mock_post.reset_mock()

@ -1177,7 +1177,28 @@ def test_plugin_telegram_html_formatting(mock_post):

    assert payload['text'] == \
        '<b>&lt;title&gt;&amp;apos;information&amp;apos&lt;/title&gt;</b>' \
-        '\r\n&lt;em&gt;&amp;quot;This is in&nbsp;Italic&amp;quot&lt;/em' \
-        '&gt;&lt;br/&gt;&lt;h5&gt;&amp;emsp;&amp;emspHeadings&amp;nbsp;' \
-        'are&nbsp;dropped&nbsp;and&amp;nbspconverted&nbsp;to&nbsp;bold&lt;' \
-        '/h5&gt;'
+        '\r\n&lt;em&gt;&amp;quot;This is in Italic&amp;quot&lt;/em&gt;&lt;' \
+        'br/&gt;&lt;h5&gt;&amp;emsp;&amp;emspHeadings&amp;nbsp;are ' \
+        'dropped and&amp;nbspconverted to bold&lt;/h5&gt;'
+
+    # Let's test more complex HTML examples now
+    mock_post.reset_mock()
+
+    test_file_01 = os.path.join(
+        TEST_VAR_DIR, '01_test_example.html')
+    with open(test_file_01) as html_file:
+        assert aobj.notify(
+            body=html_file.read(), body_format=NotifyFormat.HTML)
+
+    # owner has already been looked up, so only one call is made
+    assert mock_post.call_count == 1
+
+    payload = loads(mock_post.call_args_list[0][1]['data'])
+    assert payload['text'] == \
+        '\r\n<b>Bootstrap 101 Template</b>\r\n<b>My Title</b>\r\n' \
+        '<b>Heading 1</b>\r\n-Bullet 1\r\n-Bullet 2\r\n-Bullet 3\r\n' \
+        '-Bullet 1\r\n-Bullet 2\r\n-Bullet 3\r\n<b>Heading 2</b>\r\n' \
+        'A div entry\r\nA div entry\r\n<code>A pre entry</code>\r\n' \
+        '<b>Heading 3</b>\r\n<b>Heading 4</b>\r\n<b>Heading 5</b>\r\n' \
+        '<b>Heading 6</b>\r\nA set of text\r\n' \
+        'Another line after the set of text\r\nMore text\r\nlabel'
--- a/test/var/01_test_example.html
+++ b/test/var/01_test_example.html
@ -0,0 +1,66 @@
+<!DOCTYPE html>
+<html lang="en">
+ <head>
+    <meta charset="utf-8">
+    <meta http-equiv="X-UA-Compatible" content="IE=edge">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
+    <title>Bootstrap 101 Template</title>
+
+    <!-- Bootstrap -->
+    <link href="css/bootstrap.min.css" rel="stylesheet">
+
+      <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
+      <!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
+      <!--[if lt IE 9]>
+        <script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
+        <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
+      <![endif]-->
+   </head>
+   <body>
+      <h1>My Title</h1>
+
+      <!-- jQuery (necessary for Bootstrap's JavaScript plugins) -->
+      <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script>
+      <!-- Include all compiled plugins (below), or include individual files as needed -->
+      <script src="js/bootstrap.min.js"></script>
+
+      <h1>Heading 1</h1>
+      <p>
+        <ul>
+          <li>Bullet 1</li>
+          <li>Bullet 2</li>
+          <li>Bullet 3</li>
+        </ul>
+
+        <ol>
+          <li>Bullet 1</li>
+          <li>Bullet 2</li>
+          <li>Bullet 3</li>
+        </ol>
+      </p>
+
+      <h2>Heading 2</h2>
+      <div>A div entry</div>
+      <p>
+        <span>A div entry</span>
+        <pre>A pre entry</pre>
+      </p>
+
+      <h3>Heading 3</h3>
+      <h4>Heading 4</h4>
+      <h5>Heading 5</h5>
+      <h6>Heading 6</h6>
+
+      <p>
+      A set of text <br/>Another line after the set of text
+      <hr/>
+      More text
+      </p>
+      <form>
+        <label>label</label>
+        <input/>
+        <select/>
+      </form>
+   </body>
+</html>