Merge 4c1b5cabbe into 3cc98662f3

2025-03-13 14:28:23 +01:00 · 2025-03-11 23:54:22 +01:00 · 2025-03-11 23:54:22 +01:00 · 8723442129
commit 8723442129
parent 3cc98662f3 4c1b5cabbe
2 changed files with 267 additions and 3 deletions
--- a/apprise/conversion.py
+++ b/apprise/conversion.py
@ -46,8 +46,7 @@ def convert_between(from_format, to_format, content):
        (NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown_to_html,
        (NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html,
        (NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text,
-        # For now; use same converter for Markdown support
+        (NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_markdown,
        (NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_text,
    }
    convert = converters.get((from_format, to_format))
@ -82,12 +81,23 @@ def html_to_text(content):
    return parser.converted
 def html_to_markdown(content):
    """
    Render HTML content as markdown.

    The content is fed through an HTMLMarkDownConverter; once the converter
    has been closed, whatever it stored in its `converted` attribute is
    handed back to the caller.
    """
    converter = HTMLMarkDownConverter()
    converter.feed(content)
    # close() must run before `converted` is read
    converter.close()
    markdown = converter.converted
    return markdown
 class HTMLConverter(HTMLParser, object):
    """An HTML to plain text converter tuned for email messages."""
    # The following tags must start on a new line
    BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
-                  'div', 'td', 'th', 'code', 'pre', 'label', 'li',)
+                  'div', 'td', 'th', 'pre', 'samp', 'label', 'li',)
    # The following tags ignore any internal text
    IGNORE_TAGS = (
@ -198,3 +208,132 @@ class HTMLConverter(HTMLParser, object):
        if tag in self.BLOCK_TAGS:
            self._result.append(self.BLOCK_END)
 class HTMLMarkDownConverter(HTMLConverter):
    """An HTML to markdown converter tuned for email messages.

    The handlers below override those of HTMLConverter so that supported
    tags are emitted as markdown markers.  The class relies on attributes it
    does not define itself (`_result`, `_do_store`, `WS_TRIM`, `BLOCK_TAGS`,
    `IGNORE_TAGS` and `BLOCK_END`); they are expected to come from the base
    class.
    """

    # Escape markdown characters
    MARKDOWN_ESCAPE = re.compile(r'([`*#])', re.DOTALL | re.MULTILINE)

    # Detect Carriage Return
    # NOTE(review): the character class also matches a literal '*'; this
    # looks unintended ('[\r\n]+' was presumably meant).  The pattern is not
    # used inside this class, so it is left untouched here - confirm.
    HAS_CR = re.compile(r'[\r*\n]+', re.DOTALL | re.MULTILINE)

    # Markdown emitted when one of these tags opens
    _OPENING_MARKERS = {
        'li': '- ',
        'br': '\n',
        'blockquote': '> ',
        'h1': '# ',
        'h2': '## ',
        'h3': '### ',
        'h4': '#### ',
        'h5': '##### ',
        'h6': '###### ',
        'strong': '**',
        'b': '**',
        'em': '*',
        'i': '*',
    }

    # Markdown emitted when one of these tags closes
    _CLOSING_MARKERS = {
        'strong': '**',
        'b': '**',
        'em': '*',
        'i': '*',
    }

    def __init__(self, **kwargs):
        """
        Initialize the converter; all keyword arguments are forwarded to the
        parent class.
        """
        super().__init__(**kwargs)

        # Store href value (already wrapped in parentheses); an empty string
        # means that no hyperlink is currently open
        self._link = ""

        # True while inside <code>, <pre> or <samp>: text is kept verbatim
        self._preserver_cr = False

    def handle_data(self, data, *args, **kwargs):
        """
        Store our data if it is not on the ignore list

        Outside of code sections whitespace is condensed and the markdown
        control characters are backslash escaped.  Inside code sections the
        text is stored untouched: markdown renders backslashes literally
        there, so escaping would corrupt the content.
        """
        if not self._do_store:
            return

        if self._preserver_cr:
            content = data

        else:
            # Tidy our whitespace
            content = self.WS_TRIM.sub(' ', data)
            content = self.MARKDOWN_ESCAPE.sub(r'\\\1', content)

        # Add hyperlink
        if self._link:
            content = "[" + content + "]" + self._link

        self._result.append(content)

    def handle_starttag(self, tag, attrs):
        """
        Process our starting HTML Tag

        `attrs` is the list of (name, value) pairs handed over by the
        parser; only the `href` of an <a> tag is looked at.
        """
        # Toggle initial states
        self._do_store = tag not in self.IGNORE_TAGS
        self._link = ""

        if tag in self.BLOCK_TAGS:
            self._result.append(self.BLOCK_END)

        marker = self._OPENING_MARKERS.get(tag)
        if marker is not None:
            self._result.append(marker)

        elif tag == 'hr':
            # Only text can be trimmed; the previous entry may be a block
            # boundary marker (BLOCK_END is defined outside of this class
            # and is not guaranteed to be a string)
            if self._result and isinstance(self._result[-1], str):
                self._result[-1] = self._result[-1].rstrip(' ')
            self._result.append('\n---\n')

        elif tag == 'code':
            self._result.append('`')
            self._preserver_cr = True

        elif tag in ('pre', 'samp'):
            self._result.append('```')
            self._result.append(self.BLOCK_END)
            self._preserver_cr = True

        elif tag == 'a':
            for name, link in attrs:  # pragma: no branch
                if name == 'href':
                    # A valueless attribute (<a href>) is reported with a
                    # value of None; there is nothing to link to then
                    if link is not None:
                        self._link = '(' + link + ')'
                    # Take an early exit for speed (in case there are more
                    # parameters - no need to waste time looking at them)
                    break

    def handle_endtag(self, tag):
        """
        Edge case handling of open/close tags

        Re-enables storing of text, drops any pending hyperlink and emits
        the closing markdown marker of the tag (if it has one).
        """
        self._do_store = True
        self._link = ""

        if tag in self.BLOCK_TAGS:
            self._result.append(self.BLOCK_END)

        marker = self._CLOSING_MARKERS.get(tag)
        if marker is not None:
            self._result.append(marker)

        elif tag == 'code':
            self._result.append('`')
            self._preserver_cr = False

        elif tag in ('pre', 'samp'):
            self._result.append('```')
            self._result.append(self.BLOCK_END)
            self._preserver_cr = False
--- a/test/test_conversion.py
+++ b/test/test_conversion.py
@ -104,6 +104,11 @@ def test_conversion_html_to_text():
                   "<a href='#'>my link</a>") == \
        "test my link"
    # a with missing href entry
    assert to_html("<span></span<<span>test</span> "
                   "<a>my link</a>") == \
        "test my link"
    # </p> missing
    assert to_html("<body><div>line 1 <b>bold</b></div>  "
                   " <a href='#'>my link</a>"
@ -204,6 +209,126 @@ wanted."
        assert to_html(object)
 def test_conversion_html_to_markdown():
    """conversion: Test HTML to markdown
    """
    def to_markdown(body):
        """
        A function to simplify html conversion tests
        """
        return convert_between(NotifyFormat.HTML, NotifyFormat.MARKDOWN, body)

    # Content without any markup is passed through untouched
    assert to_markdown("No HTML code here.") == "No HTML code here."

    # List items become dashed entries
    clist = to_markdown("<ul><li>Lots and lots</li><li>of lists.</li></ul>")
    assert "- Lots and lots" in clist
    assert "- of lists." in clist

    assert "> To be or not to be." == to_markdown(
        "<blockquote>To be or not to be.</blockquote>")

    cspace = to_markdown(
        "<h2>Fancy heading</h2>"
        "<p>And a paragraph too.<br>Plus line break.</p>")
    # Check for the full level 2 prefix; "# Fancy heading" alone would also
    # match a heading of the wrong level
    assert "## Fancy heading" in cspace
    assert "And a paragraph too.\nPlus line break." in cspace

    # <style> content is dropped
    assert to_markdown(
        "<style>body { font: 200%; }</style>"
        "<p>Some obnoxious text here.</p>") == "Some obnoxious text here."

    assert to_markdown(
        "<p>line 1</p>"
        "<p>line 2</p>"
        "<p>line 3</p>") == "line 1\nline 2\nline 3"

    # Case sensitivity
    assert to_markdown(
        "<p>line 1</P>"
        "<P>line 2</P>"
        "<P>line 3</P>") == "line 1\nline 2\nline 3"

    # double new lines (testing <br/> and <br>)
    assert to_markdown(
        "some information<br/><br>and more information") == \
        "some information\n\nand more information"

    #
    # Test bad tags
    #
    # the headings are well formed; the paragraphs that follow use bogus or
    # missing closing tags and are converted as best as possible
    assert to_markdown(
        "<h1>Heading 1</h1>"
        "<h2>Heading 2</h2>"
        "<h3>Heading 3</h3>"
        "<h4>Heading 4</h4>"
        "<h5>Heading 5</h5>"
        "<h6>Heading 6</h6>"
        "<p>line 1</>"
        "<p><em>line 2</em></gar>"
        "<p>line 3>") == \
        "# Heading 1\n## Heading 2\n### Heading 3\n" \
        "#### Heading 4\n##### Heading 5\n###### Heading 6\n" \
        "line 1\n*line 2*\nline 3>"

    # Make sure we ignore fields that aren't important to us
    assert to_markdown(
        "<script>ignore this</script>"
        "<p>line 1</p>"
        "Another line without being enclosed") == \
        "line 1\nAnother line without being enclosed"

    # Test <code> and <pre>; new lines inside of them must survive
    assert to_markdown(
        "<code>multi-line 1\nmulti-line 2</code>more content"
        "<pre>multi-line 1\nmulti-line 2</pre>more content") == \
        '`multi-line 1\nmulti-line 2`more content' \
        '\n```\nmulti-line 1\nmulti-line 2\n```\nmore content'

    # Test cases when there are no new lines (we're dealing with just inline
    # entries); an empty entry as well
    assert to_markdown("<span></span<<span>test</span> "
                       "<a href='#'>my link</a>") == \
           "test [my link](#)"

    # </p> missing
    assert to_markdown("<body><div>line 1 <b>bold</b></div>  "
                       " <a href='/link'>my link</a>"
                       "<p>3rd line</body>") == \
           "line 1 **bold**\n[my link](/link)\n3rd line"

    # <hr/> on its own
    assert to_markdown("<hr/>") == "---"
    assert to_markdown("<hr>") == "---"

    # We need to handle HTML Encodings
    assert to_markdown("""
        <html>
            <title>ignore this entry</title>
        <body>
          Let&apos;s handle&nbsp;special html encoding
          <hr/>
        </body>
        """) == "Let's handle special html encoding\n---"

    # If you give nothing, you get nothing in return
    assert to_markdown("") == ""

    # Invalid input; anything that is not a string must be rejected
    for invalid in (None, 42, object):
        with pytest.raises(TypeError):
            to_markdown(invalid)
 def test_conversion_text_to():
    """conversion: Test Text to all types
    """