From 8d543a5eb32f6f65c6d814f1f00ecfd006376c6b Mon Sep 17 00:00:00 2001
From: Hector Rodriguez Medina <rotec52@gmail.com>
Date: Sun, 20 Aug 2023 18:16:47 -0300
Subject: [PATCH 1/3] Add conversion for HTML to markdown

---
 apprise/conversion.py   | 123 +++++++++++++++++++++++++++++++++++++++-
 test/test_conversion.py | 104 +++++++++++++++++++++++++++++++++
 2 files changed, 225 insertions(+), 2 deletions(-)

diff --git a/apprise/conversion.py b/apprise/conversion.py
index 77c9aa5e..689697f0 100644
--- a/apprise/conversion.py
+++ b/apprise/conversion.py
@@ -50,8 +50,7 @@ def convert_between(from_format, to_format, content):
         (NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown_to_html,
         (NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html,
         (NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text,
-        # For now; use same converter for Markdown support
-        (NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_text,
+        (NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_markdown,
     }
 
     convert = converters.get((from_format, to_format))
@@ -86,6 +85,17 @@ def html_to_text(content):
     return parser.converted
 
 
+def html_to_markdown(content):
+    """
+    Converts a content from HTML to markdown.
+    """
+
+    parser = HTMLMarkDownConverter()
+    parser.feed(content)
+    parser.close()
+    return parser.converted
+
+
 class HTMLConverter(HTMLParser, object):
     """An HTML to plain text converter tuned for email messages."""
 
@@ -200,3 +210,112 @@ class HTMLConverter(HTMLParser, object):
 
         if tag in self.BLOCK_TAGS:
             self._result.append(self.BLOCK_END)
+
+
+class HTMLMarkDownConverter(HTMLConverter):
+    """An HTML to markdown converter tuned for email messages."""
+
+    # Escape markdown characters
+    MARKDOWN_ESCAPE = re.compile(r'([\\`*_{}[\]<>()#+\-.!|])',
+                                 re.DOTALL | re.MULTILINE)
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        # Store href value
+        self._link = ""
+
+    def handle_data(self, data, *args, **kwargs):
+        """
+        Store our data if it is not on the ignore list
+        """
+
+        # Only store data when we are not inside an ignored tag
+        if self._do_store:
+
+            # Tidy our whitespace
+            content = self.WS_TRIM.sub(' ', data)
+            content = self.MARKDOWN_ESCAPE.sub(r'\\\1', content)
+
+            # Add hyperlink
+            if self._link == "":
+                self._result.append(content)
+            else:
+                self._result.append("[" + content + "]" + self._link)
+
+    def handle_starttag(self, tag, attrs):
+        """
+        Process our starting HTML Tag
+        """
+        # Toggle initial states
+        self._do_store = tag not in self.IGNORE_TAGS
+        self._link = ""
+
+        if tag in self.BLOCK_TAGS:
+            self._result.append(self.BLOCK_END)
+
+        if tag == 'li':
+            self._result.append('- ')
+
+        elif tag == 'br':
+            self._result.append('\n')
+
+        elif tag == 'hr':
+            if self._result:
+                self._result[-1] = self._result[-1].rstrip(' ')
+
+            self._result.append('\n---\n')
+
+        elif tag == 'blockquote':
+            self._result.append('> ')
+
+        elif tag == 'h1':
+            self._result.append('# ')
+
+        elif tag == 'h2':
+            self._result.append('## ')
+
+        elif tag == 'h3':
+            self._result.append('### ')
+
+        elif tag == 'h4':
+            self._result.append('#### ')
+
+        elif tag == 'h5':
+            self._result.append('##### ')
+
+        elif tag == 'h6':
+            self._result.append('###### ')
+
+        elif tag in ['strong', 'b']:
+            self._result.append('**')
+
+        elif tag in ['em', 'i']:
+            self._result.append('*')
+
+        elif tag == 'code':
+            self._result.append('`')
+
+        elif tag == 'a':
+            for name, link in attrs:
+                if name == 'href':
+                    self._link = '(' + link + ')'
+
+    def handle_endtag(self, tag):
+        """
+        Process our closing HTML Tag
+        """
+        self._do_store = True
+        self._link = ""
+
+        if tag in self.BLOCK_TAGS:
+            self._result.append(self.BLOCK_END)
+
+        if tag in ['strong', 'b']:
+            self._result.append('**')
+
+        elif tag in ['em', 'i']:
+            self._result.append('*')
+
+        elif tag == 'code':
+            self._result.append('`')
diff --git a/test/test_conversion.py b/test/test_conversion.py
index 103ebea6..0a8230d5 100644
--- a/test/test_conversion.py
+++ b/test/test_conversion.py
@@ -143,6 +143,110 @@ def test_conversion_html_to_text():
         assert to_html(object)
 
 
+def test_conversion_html_to_markdown():
+    """conversion: Test HTML to markdown
+    """
+
+    def to_markdown(body):
+        """
+        A function to simplify html conversion tests
+        """
+        return convert_between(NotifyFormat.HTML, NotifyFormat.MARKDOWN, body)
+
+    assert to_markdown("No HTML code here.") == "No HTML code here\."
+
+    clist = to_markdown("<ul><li>Lots and lots</li><li>of lists.</li></ul>")
+    assert "- Lots and lots" in clist
+    assert "- of lists\." in clist
+
+    assert "> To be or not to be\." == to_markdown(
+        "<blockquote>To be or not to be.</blockquote>")
+
+    cspace = to_markdown(
+        "<h2>Fancy heading</h2>"
+        "<p>And a paragraph too.<br>Plus line break.</p>")
+    assert "# Fancy heading" in cspace
+    assert "And a paragraph too\.\nPlus line break\." in cspace
+
+    assert to_markdown(
+        "<style>body { font: 200%; }</style>"
+        "<p>Some obnoxious text here.</p>") == "Some obnoxious text here\."
+
+    assert to_markdown(
+        "<p>line 1</p>"
+        "<p>line 2</p>"
+        "<p>line 3</p>") == "line 1\nline 2\nline 3"
+
+    # Case sensitivity
+    assert to_markdown(
+        "<p>line 1</P>"
+        "<P>line 2</P>"
+        "<P>line 3</P>") == "line 1\nline 2\nline 3"
+
+    # double new lines (testing <br/> and <br>)
+    assert to_markdown(
+        "some information<br/><br>and more information") == \
+        "some information\n\nand more information"
+
+    #
+    # Test bad tags
+    #
+
+    # first 2 entries are okay, but last will do as best as it can
+    assert to_markdown(
+        "<p>line 1</>"
+        "<p>line 2</gar>"
+        "<p>line 3>") == "line 1\nline 2\nline 3\>"
+
+    # Make sure we ignore fields that aren't important to us
+    assert to_markdown(
+        "<script>ignore this</script>"
+        "<p>line 1</p>"
+        "Another line without being enclosed") == \
+        "line 1\nAnother line without being enclosed"
+
+    # Test cases when there are no new lines (we're dealing with just inline
+    # entries); an empty entry as well
+    assert to_markdown("<span></span<<span>test</span> "
+                       "<a href='#'>my link</a>") == \
+           "test [my link](#)"
+
+    # </p> missing
+    assert to_markdown("<body><div>line 1 <b>bold</b></div>  "
+                       " <a href='/link'>my link</a>"
+                       "<p>3rd line</body>") == \
+           "line 1 **bold**\n[my link](/link)\n3rd line"
+
+    # <hr/> on its own
+    assert to_markdown("<hr/>") == "---"
+    assert to_markdown("<hr>") == "---"
+
+    # We need to handle HTML Encodings
+    assert to_markdown("""
+        <html>
+            <title>ignore this entry</title>
+        <body>
+          Let&apos;s handle&nbsp;special html encoding
+          <hr/>
+        </body>
+        """) == "Let's handle special html encoding\n---"
+
+    # If you give nothing, you get nothing in return
+    assert to_markdown("") == ""
+
+    with pytest.raises(TypeError):
+        # Invalid input
+        assert to_markdown(None)
+
+    with pytest.raises(TypeError):
+        # Invalid input
+        assert to_markdown(42)
+
+    with pytest.raises(TypeError):
+        # Invalid input
+        assert to_markdown(object)
+
+
 def test_conversion_text_to():
     """conversion: Test Text to all types
     """

From f725b3ac75860761dfb6fee41164f63f12c47955 Mon Sep 17 00:00:00 2001
From: Chris Caron <lead2gold@gmail.com>
Date: Fri, 6 Oct 2023 18:08:10 -0400
Subject: [PATCH 2/3] code & test improvements, added more coverage

---
 apprise/conversion.py   | 36 ++++++++++++++++++++++++++++--------
 test/test_conversion.py | 30 +++++++++++++++++++++++-------
 2 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/apprise/conversion.py b/apprise/conversion.py
index 689697f0..89171433 100644
--- a/apprise/conversion.py
+++ b/apprise/conversion.py
@@ -101,7 +101,7 @@ class HTMLConverter(HTMLParser, object):
 
     # The following tags must start on a new line
     BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
-                  'div', 'td', 'th', 'code', 'pre', 'label', 'li',)
+                  'div', 'td', 'th', 'pre', 'samp', 'label', 'li',)
 
     # the folowing tags ignore any internal text
     IGNORE_TAGS = (
@@ -216,8 +216,10 @@ class HTMLMarkDownConverter(HTMLConverter):
     """An HTML to markdown converter tuned for email messages."""
 
     # Escape markdown characters
-    MARKDOWN_ESCAPE = re.compile(r'([\\`*_{}[\]<>()#+\-.!|])',
-                                 re.DOTALL | re.MULTILINE)
+    MARKDOWN_ESCAPE = re.compile(r'([`*#])', re.DOTALL | re.MULTILINE)
+
+    # Detect Carriage Return
+    HAS_CR = re.compile(r'[\r*\n]+', re.DOTALL | re.MULTILINE)
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -225,6 +227,8 @@ class HTMLMarkDownConverter(HTMLConverter):
         # Store href value
         self._link = ""
 
+        self._preserver_cr = False
+
     def handle_data(self, data, *args, **kwargs):
         """
         Store our data if it is not on the ignore list
@@ -234,7 +238,8 @@ class HTMLMarkDownConverter(HTMLConverter):
         if self._do_store:
 
             # Tidy our whitespace
-            content = self.WS_TRIM.sub(' ', data)
+            content = self.WS_TRIM.sub(' ', data) \
+                if not self._preserver_cr else data
             content = self.MARKDOWN_ESCAPE.sub(r'\\\1', content)
 
             # Add hyperlink
@@ -287,19 +292,28 @@ class HTMLMarkDownConverter(HTMLConverter):
         elif tag == 'h6':
             self._result.append('###### ')
 
-        elif tag in ['strong', 'b']:
+        elif tag in ('strong', 'b'):
             self._result.append('**')
 
-        elif tag in ['em', 'i']:
+        elif tag in ('em', 'i'):
             self._result.append('*')
 
         elif tag == 'code':
             self._result.append('`')
+            self._preserver_cr = True
+
+        elif tag in ('pre', 'samp'):
+            self._result.append('```')
+            self._result.append(self.BLOCK_END)
+            self._preserver_cr = True
 
         elif tag == 'a':
             for name, link in attrs:
                 if name == 'href':
                     self._link = '(' + link + ')'
+                    # Take an early exit for speed (in case there are more
+                    # parameters - no need to waste time looking at them)
+                    break
 
     def handle_endtag(self, tag):
         """
@@ -311,11 +325,17 @@ class HTMLMarkDownConverter(HTMLConverter):
         if tag in self.BLOCK_TAGS:
             self._result.append(self.BLOCK_END)
 
-        if tag in ['strong', 'b']:
+        if tag in ('strong', 'b'):
             self._result.append('**')
 
-        elif tag in ['em', 'i']:
+        elif tag in ('em', 'i'):
             self._result.append('*')
 
         elif tag == 'code':
             self._result.append('`')
+            self._preserver_cr = False
+
+        elif tag in ('pre', 'samp'):
+            self._result.append('```')
+            self._result.append(self.BLOCK_END)
+            self._preserver_cr = False
diff --git a/test/test_conversion.py b/test/test_conversion.py
index 0a8230d5..c09eb86e 100644
--- a/test/test_conversion.py
+++ b/test/test_conversion.py
@@ -153,24 +153,24 @@ def test_conversion_html_to_markdown():
         """
         return convert_between(NotifyFormat.HTML, NotifyFormat.MARKDOWN, body)
 
-    assert to_markdown("No HTML code here.") == "No HTML code here\."
+    assert to_markdown("No HTML code here.") == "No HTML code here."
 
     clist = to_markdown("<ul><li>Lots and lots</li><li>of lists.</li></ul>")
     assert "- Lots and lots" in clist
-    assert "- of lists\." in clist
+    assert "- of lists." in clist
 
-    assert "> To be or not to be\." == to_markdown(
+    assert "> To be or not to be." == to_markdown(
         "<blockquote>To be or not to be.</blockquote>")
 
     cspace = to_markdown(
         "<h2>Fancy heading</h2>"
         "<p>And a paragraph too.<br>Plus line break.</p>")
     assert "# Fancy heading" in cspace
-    assert "And a paragraph too\.\nPlus line break\." in cspace
+    assert "And a paragraph too.\nPlus line break." in cspace
 
     assert to_markdown(
         "<style>body { font: 200%; }</style>"
-        "<p>Some obnoxious text here.</p>") == "Some obnoxious text here\."
+        "<p>Some obnoxious text here.</p>") == "Some obnoxious text here."
 
     assert to_markdown(
         "<p>line 1</p>"
@@ -194,9 +194,18 @@ def test_conversion_html_to_markdown():
 
     # first 2 entries are okay, but last will do as best as it can
     assert to_markdown(
+        "<h1>Heading 1</h1>"
+        "<h2>Heading 2</h2>"
+        "<h3>Heading 3</h3>"
+        "<h4>Heading 4</h4>"
+        "<h5>Heading 5</h5>"
+        "<h6>Heading 6</h6>"
         "<p>line 1</>"
-        "<p>line 2</gar>"
-        "<p>line 3>") == "line 1\nline 2\nline 3\>"
+        "<p><em>line 2</em></gar>"
+        "<p>line 3>") == \
+        "# Heading 1\n## Heading 2\n### Heading 3\n" \
+        "#### Heading 4\n##### Heading 5\n###### Heading 6\n" \
+        "line 1\n*line 2*\nline 3>"
 
     # Make sure we ignore fields that aren't important to us
     assert to_markdown(
@@ -205,6 +214,13 @@ def test_conversion_html_to_markdown():
         "Another line without being enclosed") == \
         "line 1\nAnother line without being enclosed"
 
+    # Test <code> and <pre>
+    assert to_markdown(
+        "<code>multi-line 1\nmulti-line 2</code>more content"
+        "<pre>multi-line 1\nmulti-line 2</pre>more content") == \
+        '`multi-line 1\nmulti-line 2`more content' \
+        '\n```\nmulti-line 1\nmulti-line 2\n```\nmore content'
+
     # Test cases when there are no new lines (we're dealing with just inline
     # entries); an empty entry as well
     assert to_markdown("<span></span<<span>test</span> "

From 4c1b5cabbe7bd4adea5f7a6d573eb996865075fe Mon Sep 17 00:00:00 2001
From: Chris Caron <lead2gold@gmail.com>
Date: Fri, 6 Oct 2023 18:11:44 -0400
Subject: [PATCH 3/3] complete coverage of what is there

---
 apprise/conversion.py   | 2 +-
 test/test_conversion.py | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/apprise/conversion.py b/apprise/conversion.py
index 89171433..898f8007 100644
--- a/apprise/conversion.py
+++ b/apprise/conversion.py
@@ -308,7 +308,7 @@ class HTMLMarkDownConverter(HTMLConverter):
             self._preserver_cr = True
 
         elif tag == 'a':
-            for name, link in attrs:
+            for name, link in attrs:  # pragma: no branch
                 if name == 'href':
                     self._link = '(' + link + ')'
                     # Take an early exit for speed (in case there are more
diff --git a/test/test_conversion.py b/test/test_conversion.py
index c09eb86e..406db664 100644
--- a/test/test_conversion.py
+++ b/test/test_conversion.py
@@ -107,6 +107,11 @@ def test_conversion_html_to_text():
                    "<a href='#'>my link</a>") == \
         "test my link"
 
+    # a with missing href entry
+    assert to_html("<span></span<<span>test</span> "
+                   "<a>my link</a>") == \
+        "test my link"
+
     # </p> missing
     assert to_html("<body><div>line 1 <b>bold</b></div>  "
                    " <a href='#'>my link</a>"