diff --git a/apprise/conversion.py b/apprise/conversion.py
index 7f691eae..5bdf5bda 100644
--- a/apprise/conversion.py
+++ b/apprise/conversion.py
@@ -46,8 +46,7 @@ def convert_between(from_format, to_format, content):
         (NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown_to_html,
         (NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html,
         (NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text,
-        # For now; use same converter for Markdown support
-        (NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_text,
+        (NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_markdown,
     }
 
     convert = converters.get((from_format, to_format))
@@ -82,12 +81,23 @@ def html_to_text(content):
     return parser.converted
 
 
+def html_to_markdown(content):
+    """
+    Converts content from HTML to markdown.
+    """
+
+    parser = HTMLMarkDownConverter()
+    parser.feed(content)
+    parser.close()
+    return parser.converted
+
+
 class HTMLConverter(HTMLParser, object):
     """An HTML to plain text converter tuned for email messages."""
 
     # The following tags must start on a new line
     BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
-                  'div', 'td', 'th', 'code', 'pre', 'label', 'li',)
+                  'div', 'td', 'th', 'pre', 'samp', 'label', 'li',)
 
     # the folowing tags ignore any internal text
     IGNORE_TAGS = (
@@ -198,3 +208,128 @@ class HTMLConverter(HTMLParser, object):
 
         if tag in self.BLOCK_TAGS:
             self._result.append(self.BLOCK_END)
+
+
+class HTMLMarkDownConverter(HTMLConverter):
+    """An HTML to markdown converter tuned for email messages."""
+
+    # Escape markdown characters
+    MARKDOWN_ESCAPE = re.compile(r'([`*#])', re.DOTALL | re.MULTILINE)
+
+    # Detect Carriage Return
+    HAS_CR = re.compile(r'[\r\n]+', re.DOTALL | re.MULTILINE)
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        # Store href value
+        self._link = ""
+
+        # True while inside <code>, <pre> or <samp>; text is then stored
+        # verbatim (no whitespace tidying and no markdown escaping)
+        self._preserve_cr = False
+
+    def handle_data(self, data, *args, **kwargs):
+        """
+        Store our data if it is not on the ignore list
+        """
+
+        if not self._do_store:
+            return
+
+        if self._preserve_cr:
+            # Markdown does not honour backslash escapes inside code
+            # spans/fenced blocks, so the text must be stored untouched
+            content = data
+
+        else:
+            # Tidy our whitespace and escape markdown control characters
+            content = self.WS_TRIM.sub(' ', data)
+            content = self.MARKDOWN_ESCAPE.sub(r'\\\1', content)
+
+        # Add hyperlink
+        if self._link == "":
+            self._result.append(content)
+        else:
+            self._result.append("[" + content + "]" + self._link)
+
+    def handle_starttag(self, tag, attrs):
+        """
+        Process our starting HTML Tag
+        """
+        # Toggle initial states
+        self._do_store = tag not in self.IGNORE_TAGS
+        self._link = ""
+
+        if tag in self.BLOCK_TAGS:
+            self._result.append(self.BLOCK_END)
+
+        if tag == 'li':
+            self._result.append('- ')
+
+        elif tag == 'br':
+            self._result.append('\n')
+
+        elif tag == 'hr':
+            if self._result:
+                self._result[-1] = self._result[-1].rstrip(' ')
+
+            self._result.append('\n---\n')
+
+        elif tag == 'blockquote':
+            self._result.append('> ')
+
+        elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
+            # The digit in the tag name is the number of leading hashes
+            self._result.append('#' * int(tag[1]) + ' ')
+
+        elif tag in ('strong', 'b'):
+            self._result.append('**')
+
+        elif tag in ('em', 'i'):
+            self._result.append('*')
+
+        elif tag == 'code':
+            self._result.append('`')
+            self._preserve_cr = True
+
+        elif tag in ('pre', 'samp'):
+            self._result.append('```')
+            self._result.append(self.BLOCK_END)
+            self._preserve_cr = True
+
+        elif tag == 'a':
+            for name, link in attrs:  # pragma: no branch
+                if name == 'href':
+                    # A valueless attribute (<a href>) is reported as None
+                    if link is not None:
+                        self._link = '(' + link + ')'
+
+                    # Take an early exit for speed (in case there are more
+                    # parameters - no need to waste time looking at them)
+                    break
+
+    def handle_endtag(self, tag):
+        """
+        Edge case handling of open/close tags
+        """
+        self._do_store = True
+        self._link = ""
+
+        if tag in self.BLOCK_TAGS:
+            self._result.append(self.BLOCK_END)
+
+        if tag in ('strong', 'b'):
+            self._result.append('**')
+
+        elif tag in ('em', 'i'):
+            self._result.append('*')
+
+        elif tag == 'code':
+            self._result.append('`')
+            self._preserve_cr = False
+
+        elif tag in ('pre', 'samp'):
+            self._result.append('```')
+            self._result.append(self.BLOCK_END)
+            self._preserve_cr = False
diff --git a/test/test_conversion.py b/test/test_conversion.py
index 9872d334..08b6889d 100644
--- a/test/test_conversion.py
+++ b/test/test_conversion.py
@@ -104,6 +104,11 @@ def test_conversion_html_to_text():
                    "my link") == \
         "test my link"
 
+    # a with missing href entry
+    assert to_html("test "
+                   "my link") == \
+        "test my link"
+
     #

missing assert to_html("
line 1 bold
" " my link" @@ -204,6 +209,126 @@ wanted." assert to_html(object) +def test_conversion_html_to_markdown(): + """conversion: Test HTML to markdown + """ + + def to_markdown(body): + """ + A function to simplify html conversion tests + """ + return convert_between(NotifyFormat.HTML, NotifyFormat.MARKDOWN, body) + + assert to_markdown("No HTML code here.") == "No HTML code here." + + clist = to_markdown("") + assert "- Lots and lots" in clist + assert "- of lists." in clist + + assert "> To be or not to be." == to_markdown( + "
To be or not to be.
") + + cspace = to_markdown( + "

Fancy heading

" + "

And a paragraph too.
Plus line break.

") + assert "# Fancy heading" in cspace + assert "And a paragraph too.\nPlus line break." in cspace + + assert to_markdown( + "" + "

Some obnoxious text here.

") == "Some obnoxious text here." + + assert to_markdown( + "

line 1

" + "

line 2

" + "

line 3

") == "line 1\nline 2\nline 3" + + # Case sensitivity + assert to_markdown( + "

line 1

" + "

line 2

" + "

line 3

") == "line 1\nline 2\nline 3" + + # double new lines (testing
and
) + assert to_markdown( + "some information

and more information") == \ + "some information\n\nand more information" + + # + # Test bad tags + # + + # first 2 entries are okay, but last will do as best as it can + assert to_markdown( + "

Heading 1

" + "

Heading 2

" + "

Heading 3

" + "

Heading 4

" + "
Heading 5
" + "
Heading 6
" + "

line 1" + "

line 2" + "

line 3>") == \ + "# Heading 1\n## Heading 2\n### Heading 3\n" \ + "#### Heading 4\n##### Heading 5\n###### Heading 6\n" \ + "line 1\n*line 2*\nline 3>" + + # Make sure we ignore fields that aren't important to us + assert to_markdown( + "" + "

line 1

" + "Another line without being enclosed") == \ + "line 1\nAnother line without being enclosed" + + # Test and
+    assert to_markdown(
+        "multi-line 1\nmulti-line 2more content"
+        "
multi-line 1\nmulti-line 2
more content") == \ + '`multi-line 1\nmulti-line 2`more content' \ + '\n```\nmulti-line 1\nmulti-line 2\n```\nmore content' + + # Test cases when there are no new lines (we're dealing with just inline + # entries); an empty entry as well + assert to_markdown("test " + "my link") == \ + "test [my link](#)" + + #

missing + assert to_markdown("
line 1 bold
" + " my link" + "

3rd line") == \ + "line 1 **bold**\n[my link](/link)\n3rd line" + + #


on it's own + assert to_markdown("
") == "---" + assert to_markdown("
") == "---" + + # We need to handle HTML Encodings + assert to_markdown(""" + + ignore this entry + + Let's handle special html encoding +
+ + """) == "Let's handle special html encoding\n---" + + # If you give nothing, you get nothing in return + assert to_markdown("") == "" + + with pytest.raises(TypeError): + # Invalid input + assert to_markdown(None) + + with pytest.raises(TypeError): + # Invalid input + assert to_markdown(42) + + with pytest.raises(TypeError): + # Invalid input + assert to_markdown(object) + + def test_conversion_text_to(): """conversion: Test Text to all types """