diff --git a/apprise/conversion.py b/apprise/conversion.py
index 7f691eae..5bdf5bda 100644
--- a/apprise/conversion.py
+++ b/apprise/conversion.py
@@ -46,8 +46,7 @@ def convert_between(from_format, to_format, content):
         (NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown_to_html,
         (NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html,
         (NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text,
-        # For now; use same converter for Markdown support
-        (NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_text,
+        (NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_markdown,
     }
 
     convert = converters.get((from_format, to_format))
@@ -82,12 +81,23 @@ def html_to_text(content):
     return parser.converted
 
 
+def html_to_markdown(content):
+    """
+    Converts a content from HTML to markdown.
+    """
+
+    parser = HTMLMarkDownConverter()
+    parser.feed(content)
+    parser.close()
+    return parser.converted
+
+
 class HTMLConverter(HTMLParser, object):
     """An HTML to plain text converter tuned for email messages."""
 
     # The following tags must start on a new line
     BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
-                  'div', 'td', 'th', 'code', 'pre', 'label', 'li',)
+                  'div', 'td', 'th', 'pre', 'samp', 'label', 'li',)
 
     # the folowing tags ignore any internal text
     IGNORE_TAGS = (
@@ -198,3 +208,149 @@ class HTMLConverter(HTMLParser, object):
 
         if tag in self.BLOCK_TAGS:
             self._result.append(self.BLOCK_END)
+
+
+class HTMLMarkDownConverter(HTMLConverter):
+    """An HTML to markdown converter tuned for email messages.
+
+    Builds on HTMLConverter and emits Markdown markers for the tags
+    handled in handle_starttag() and handle_endtag().
+    """
+
+    # Escape markdown characters
+    MARKDOWN_ESCAPE = re.compile(r'([`*#])', re.DOTALL | re.MULTILINE)
+
+    # Detect Carriage Return / Line Feed runs; a '*' placed inside the
+    # character class would match a literal asterisk, so it is left out
+    HAS_CR = re.compile(r'[\r\n]+', re.DOTALL | re.MULTILINE)
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        # Store href value; kept pre-wrapped in parentheses so that it can
+        # be appended directly after the bracketed link text
+        self._link = ""
+
+        # Set while inside <code>, <pre> or <samp>; text is then stored
+        # verbatim (no whitespace tidying, no markdown escaping)
+        self._preserver_cr = False
+
+    def handle_data(self, data, *args, **kwargs):
+        """
+        Store our data if it is not on the ignore list
+        """
+
+        if self._do_store:
+
+            if self._preserver_cr:
+                # Code spans/blocks are literal in markdown: a backslash
+                # escape would be rendered as-is, so keep the data untouched
+                content = data
+
+            else:
+                # Tidy our whitespace and escape markdown characters
+                content = self.MARKDOWN_ESCAPE.sub(
+                    r'\\\1', self.WS_TRIM.sub(' ', data))
+
+            # Add hyperlink
+            if self._link == "":
+                self._result.append(content)
+            else:
+                self._result.append("[" + content + "]" + self._link)
+
+    def handle_starttag(self, tag, attrs):
+        """
+        Process our starting HTML Tag
+        """
+        # Toggle initial states
+        self._do_store = tag not in self.IGNORE_TAGS
+        self._link = ""
+
+        if tag in self.BLOCK_TAGS:
+            self._result.append(self.BLOCK_END)
+
+        if tag == 'li':
+            self._result.append('- ')
+
+        elif tag == 'br':
+            self._result.append('\n')
+
+        elif tag == 'hr':
+            if self._result:
+                self._result[-1] = self._result[-1].rstrip(' ')
+
+            self._result.append('\n---\n')
+
+        elif tag == 'blockquote':
+            self._result.append('> ')
+
+        elif tag == 'h1':
+            self._result.append('# ')
+
+        elif tag == 'h2':
+            self._result.append('## ')
+
+        elif tag == 'h3':
+            self._result.append('### ')
+
+        elif tag == 'h4':
+            self._result.append('#### ')
+
+        elif tag == 'h5':
+            self._result.append('##### ')
+
+        elif tag == 'h6':
+            self._result.append('###### ')
+
+        elif tag in ('strong', 'b'):
+            self._result.append('**')
+
+        elif tag in ('em', 'i'):
+            self._result.append('*')
+
+        elif tag == 'code':
+            self._result.append('`')
+            self._preserver_cr = True
+
+        elif tag in ('pre', 'samp'):
+            self._result.append('```')
+            self._result.append(self.BLOCK_END)
+            self._preserver_cr = True
+
+        elif tag == 'a':
+            for name, link in attrs:  # pragma: no branch
+                if name == 'href':
+                    # A valueless attribute (<a href>) is reported with a
+                    # value of None; treat that as "no link" instead of
+                    # failing on the string concatenation
+                    if link is not None:
+                        self._link = '(' + link + ')'
+
+                    # Take an early exit for speed (in case there are more
+                    # parameters - no need to waste time looking at them)
+                    break
+
+    def handle_endtag(self, tag):
+        """
+        Edge case handling of open/close tags
+        """
+        self._do_store = True
+        self._link = ""
+
+        if tag in self.BLOCK_TAGS:
+            self._result.append(self.BLOCK_END)
+
+        if tag in ('strong', 'b'):
+            self._result.append('**')
+
+        elif tag in ('em', 'i'):
+            self._result.append('*')
+
+        elif tag == 'code':
+            self._result.append('`')
+            self._preserver_cr = False
+
+        elif tag in ('pre', 'samp'):
+            self._result.append('```')
+            self._result.append(self.BLOCK_END)
+            self._preserver_cr = False
diff --git a/test/test_conversion.py b/test/test_conversion.py
index 9872d334..08b6889d 100644
--- a/test/test_conversion.py
+++ b/test/test_conversion.py
@@ -104,6 +104,11 @@ def test_conversion_html_to_text():
                    "my link") == \
         "test my link"
 
+    # a with missing href entry
+    assert to_html("test "
+                   "my link") == \
+        "test my link"
+
     # 
missing assert to_html("To be or not to be.") + + cspace = to_markdown( + "
And a paragraph too.
Plus line break.
Some obnoxious text here.
") == "Some obnoxious text here." + + assert to_markdown( + "line 1
" + "line 2
" + "line 3
") == "line 1\nline 2\nline 3" + + # Case sensitivity + assert to_markdown( + "line 1
" + "line 2
" + "line 3
") == "line 1\nline 2\nline 3" + + # double new lines (testingline 1>" + "
line 2" + "
line 3>") == \ + "# Heading 1\n## Heading 2\n### Heading 3\n" \ + "#### Heading 4\n##### Heading 5\n###### Heading 6\n" \ + "line 1\n*line 2*\nline 3>" + + # Make sure we ignore fields that aren't important to us + assert to_markdown( + "" + "
line 1
" + "Another line without being enclosed") == \ + "line 1\nAnother line without being enclosed" + + # Test and
+ assert to_markdown(
+ "multi-line 1\nmulti-line 2
more content"
+ "multi-line 1\nmulti-line 2
more content") == \
+ '`multi-line 1\nmulti-line 2`more content' \
+ '\n```\nmulti-line 1\nmulti-line 2\n```\nmore content'
+
+ # Test cases when there are no new lines (we're dealing with just inline
+ # entries); an empty entry as well
+ assert to_markdown("test "
+ "my link") == \
+ "test [my link](#)"
+
+ # missing
+ assert to_markdown("line 1 bold "
+ " my link"
+ "3rd line") == \
+ "line 1 **bold**\n[my link](/link)\n3rd line"
+
+ #
on it's own
+ assert to_markdown("
") == "---"
+ assert to_markdown("
") == "---"
+
+ # We need to handle HTML Encodings
+ assert to_markdown("""
+
+ ignore this entry
+
+ Let's handle special html encoding
+
+
+ """) == "Let's handle special html encoding\n---"
+
+ # If you give nothing, you get nothing in return
+ assert to_markdown("") == ""
+
+ with pytest.raises(TypeError):
+ # Invalid input
+ assert to_markdown(None)
+
+ with pytest.raises(TypeError):
+ # Invalid input
+ assert to_markdown(42)
+
+ with pytest.raises(TypeError):
+ # Invalid input
+ assert to_markdown(object)
+
+
def test_conversion_text_to():
"""conversion: Test Text to all types
"""