From 8d543a5eb32f6f65c6d814f1f00ecfd006376c6b Mon Sep 17 00:00:00 2001 From: Hector Rodriguez Medina Date: Sun, 20 Aug 2023 18:16:47 -0300 Subject: [PATCH 1/3] Add conversion for HTML to markdown --- apprise/conversion.py | 123 +++++++++++++++++++++++++++++++++++++++- test/test_conversion.py | 104 +++++++++++++++++++++++++++++++++ 2 files changed, 225 insertions(+), 2 deletions(-) diff --git a/apprise/conversion.py b/apprise/conversion.py index 77c9aa5e..689697f0 100644 --- a/apprise/conversion.py +++ b/apprise/conversion.py @@ -50,8 +50,7 @@ def convert_between(from_format, to_format, content): (NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown_to_html, (NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html, (NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text, - # For now; use same converter for Markdown support - (NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_text, + (NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_markdown, } convert = converters.get((from_format, to_format)) @@ -86,6 +85,17 @@ def html_to_text(content): return parser.converted +def html_to_markdown(content): + """ + Converts a content from HTML to markdown. + """ + + parser = HTMLMarkDownConverter() + parser.feed(content) + parser.close() + return parser.converted + + class HTMLConverter(HTMLParser, object): """An HTML to plain text converter tuned for email messages.""" @@ -200,3 +210,112 @@ class HTMLConverter(HTMLParser, object): if tag in self.BLOCK_TAGS: self._result.append(self.BLOCK_END) + + +class HTMLMarkDownConverter(HTMLConverter): + """An HTML to markdown converter tuned for email messages.""" + + # Escape markdown characters + MARKDOWN_ESCAPE = re.compile(r'([\\`*_{}[\]<>()#+\-.!|])', + re.DOTALL | re.MULTILINE) + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + # Store href value + self._link = "" + + def handle_data(self, data, *args, **kwargs): + """ + Store our data if it is not on the ignore list + """ + + # initialize our previous flag + if self._do_store: + + # Tidy our whitespace + content = self.WS_TRIM.sub(' ', data) + content = self.MARKDOWN_ESCAPE.sub(r'\\\1', content) + + # Add hyperlink + if self._link == "": + self._result.append(content) + else: + self._result.append("[" + content + "]" + self._link) + + def handle_starttag(self, tag, attrs): + """ + Process our starting HTML Tag + """ + # Toggle initial states + self._do_store = tag not in self.IGNORE_TAGS + self._link = "" + + if tag in self.BLOCK_TAGS: + self._result.append(self.BLOCK_END) + + if tag == 'li': + self._result.append('- ') + + elif tag == 'br': + self._result.append('\n') + + elif tag == 'hr': + if self._result: + self._result[-1] = self._result[-1].rstrip(' ') + + self._result.append('\n---\n') + + elif tag == 'blockquote': + self._result.append('> ') + + elif tag == 'h1': + self._result.append('# ') + + elif tag == 'h2': + self._result.append('## ') + + elif tag == 'h3': + self._result.append('### ') + + elif tag == 'h4': + self._result.append('#### ') + + elif tag == 'h5': + self._result.append('##### ') + + elif tag == 'h6': + self._result.append('###### ') + + elif tag in ['strong', 'b']: + self._result.append('**') + + elif tag in ['em', 'i']: + self._result.append('*') + + elif tag == 'code': + self._result.append('`') + + elif tag == 'a': + for name, link in attrs: + if name == 'href': + self._link = '(' + link + ')' + + def handle_endtag(self, tag): + """ + Edge case handling of open/close tags + """ + self._do_store = True + self._link = "" + + if tag in self.BLOCK_TAGS: + self._result.append(self.BLOCK_END) + + if tag in ['strong', 'b']: + self._result.append('**') + + elif tag in ['em', 'i']: + self._result.append('*') + + elif tag == 'code': + self._result.append('`') diff --git a/test/test_conversion.py b/test/test_conversion.py index 103ebea6..0a8230d5 100644 --- a/test/test_conversion.py +++ b/test/test_conversion.py @@ -143,6 +143,110 @@ def test_conversion_html_to_text(): assert to_html(object) +def test_conversion_html_to_markdown(): + """conversion: Test HTML to plain text + """ + + def to_markdown(body): + """ + A function to simply html conversion tests + """ + return convert_between(NotifyFormat.HTML, NotifyFormat.MARKDOWN, body) + + assert to_markdown("No HTML code here.") == "No HTML code here\." + + clist = to_markdown("") + assert "- Lots and lots" in clist + assert "- of lists\." in clist + + assert "> To be or not to be\." == to_markdown( + "
To be or not to be.
") + + cspace = to_markdown( + "

Fancy heading

" + "

And a paragraph too.
Plus line break.

") + assert "# Fancy heading" in cspace + assert "And a paragraph too\.\nPlus line break\." in cspace + + assert to_markdown( + "" + "

Some obnoxious text here.

") == "Some obnoxious text here\." + + assert to_markdown( + "

line 1

" + "

line 2

" + "

line 3

") == "line 1\nline 2\nline 3" + + # Case sensitivity + assert to_markdown( + "

line 1

" + "

line 2

" + "

line 3

") == "line 1\nline 2\nline 3" + + # double new lines (testing
and
) + assert to_markdown( + "some information

and more information") == \ + "some information\n\nand more information" + + # + # Test bad tags + # + + # first 2 entries are okay, but last will do as best as it can + assert to_markdown( + "

line 1" + "

line 2" + "

line 3>") == "line 1\nline 2\nline 3\>" + + # Make sure we ignore fields that aren't important to us + assert to_markdown( + "" + "

line 1

" + "Another line without being enclosed") == \ + "line 1\nAnother line without being enclosed" + + # Test cases when there are no new lines (we're dealing with just inline + # entries); an empty entry as well + assert to_markdown("test " + "my link") == \ + "test [my link](#)" + + #

missing + assert to_markdown("
line 1 bold
" + " my link" + "

3rd line") == \ + "line 1 **bold**\n[my link](/link)\n3rd line" + + #


on it's own + assert to_markdown("
") == "---" + assert to_markdown("
") == "---" + + # We need to handle HTML Encodings + assert to_markdown(""" + + ignore this entry + + Let's handle special html encoding +
+ + """) == "Let's handle special html encoding\n---" + + # If you give nothing, you get nothing in return + assert to_markdown("") == "" + + with pytest.raises(TypeError): + # Invalid input + assert to_markdown(None) + + with pytest.raises(TypeError): + # Invalid input + assert to_markdown(42) + + with pytest.raises(TypeError): + # Invalid input + assert to_markdown(object) + + def test_conversion_text_to(): """conversion: Test Text to all types """ From f725b3ac75860761dfb6fee41164f63f12c47955 Mon Sep 17 00:00:00 2001 From: Chris Caron Date: Fri, 6 Oct 2023 18:08:10 -0400 Subject: [PATCH 2/3] code & test improvements, added more coverage --- apprise/conversion.py | 36 ++++++++++++++++++++++++++++-------- test/test_conversion.py | 30 +++++++++++++++++++++++------- 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/apprise/conversion.py b/apprise/conversion.py index 689697f0..89171433 100644 --- a/apprise/conversion.py +++ b/apprise/conversion.py @@ -101,7 +101,7 @@ class HTMLConverter(HTMLParser, object): # The following tags must start on a new line BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', - 'div', 'td', 'th', 'code', 'pre', 'label', 'li',) + 'div', 'td', 'th', 'pre', 'samp', 'label', 'li',) # the folowing tags ignore any internal text IGNORE_TAGS = ( @@ -216,8 +216,10 @@ class HTMLMarkDownConverter(HTMLConverter): """An HTML to markdown converter tuned for email messages.""" # Escape markdown characters - MARKDOWN_ESCAPE = re.compile(r'([\\`*_{}[\]<>()#+\-.!|])', - re.DOTALL | re.MULTILINE) + MARKDOWN_ESCAPE = re.compile(r'([`*#])', re.DOTALL | re.MULTILINE) + + # Detect Carriage Return + HAS_CR = re.compile(r'[\r*\n]+', re.DOTALL | re.MULTILINE) def __init__(self, **kwargs): super().__init__(**kwargs) @@ -225,6 +227,8 @@ class HTMLMarkDownConverter(HTMLConverter): # Store href value self._link = "" + self._preserver_cr = False + def handle_data(self, data, *args, **kwargs): """ Store our data if it is not on the ignore list @@ -234,7 +238,8 @@ class HTMLMarkDownConverter(HTMLConverter): if self._do_store: # Tidy our whitespace - content = self.WS_TRIM.sub(' ', data) + content = self.WS_TRIM.sub(' ', data) \ + if not self._preserver_cr else data content = self.MARKDOWN_ESCAPE.sub(r'\\\1', content) # Add hyperlink @@ -287,19 +292,28 @@ class HTMLMarkDownConverter(HTMLConverter): elif tag == 'h6': self._result.append('###### ') - elif tag in ['strong', 'b']: + elif tag in ('strong', 'b'): self._result.append('**') - elif tag in ['em', 'i']: + elif tag in ('em', 'i'): self._result.append('*') elif tag == 'code': self._result.append('`') + self._preserver_cr = True + + elif tag in ('pre', 'samp'): + self._result.append('```') + self._result.append(self.BLOCK_END) + self._preserver_cr = True elif tag == 'a': for name, link in attrs: if name == 'href': self._link = '(' + link + ')' + # Take an early exit for speed (in case there are more + # parameters - no need to waste time looking at them) + break def handle_endtag(self, tag): """ @@ -311,11 +325,17 @@ class HTMLMarkDownConverter(HTMLConverter): if tag in self.BLOCK_TAGS: self._result.append(self.BLOCK_END) - if tag in ['strong', 'b']: + if tag in ('strong', 'b'): self._result.append('**') - elif tag in ['em', 'i']: + elif tag in ('em', 'i'): self._result.append('*') elif tag == 'code': self._result.append('`') + self._preserver_cr = False + + elif tag in ('pre', 'samp'): + self._result.append('```') + self._result.append(self.BLOCK_END) + self._preserver_cr = False diff --git a/test/test_conversion.py b/test/test_conversion.py index 0a8230d5..c09eb86e 100644 --- a/test/test_conversion.py +++ b/test/test_conversion.py @@ -153,24 +153,24 @@ def test_conversion_html_to_markdown(): """ return convert_between(NotifyFormat.HTML, NotifyFormat.MARKDOWN, body) - assert to_markdown("No HTML code here.") == "No HTML code here\." + assert to_markdown("No HTML code here.") == "No HTML code here." clist = to_markdown("") assert "- Lots and lots" in clist - assert "- of lists\." in clist + assert "- of lists." in clist - assert "> To be or not to be\." == to_markdown( + assert "> To be or not to be." == to_markdown( "
To be or not to be.
") cspace = to_markdown( "

Fancy heading

" "

And a paragraph too.
Plus line break.

") assert "# Fancy heading" in cspace - assert "And a paragraph too\.\nPlus line break\." in cspace + assert "And a paragraph too.\nPlus line break." in cspace assert to_markdown( "" - "

Some obnoxious text here.

") == "Some obnoxious text here\." + "

Some obnoxious text here.

") == "Some obnoxious text here." assert to_markdown( "

line 1

" @@ -194,9 +194,18 @@ def test_conversion_html_to_markdown(): # first 2 entries are okay, but last will do as best as it can assert to_markdown( + "

Heading 1

" + "

Heading 2

" + "

Heading 3

" + "

Heading 4

" + "
Heading 5
" + "
Heading 6
" "

line 1" - "

line 2" - "

line 3>") == "line 1\nline 2\nline 3\>" + "

line 2" + "

line 3>") == \ + "# Heading 1\n## Heading 2\n### Heading 3\n" \ + "#### Heading 4\n##### Heading 5\n###### Heading 6\n" \ + "line 1\n*line 2*\nline 3>" # Make sure we ignore fields that aren't important to us assert to_markdown( @@ -205,6 +214,13 @@ def test_conversion_html_to_markdown(): "Another line without being enclosed") == \ "line 1\nAnother line without being enclosed" + # Test and

+    assert to_markdown(
+        "multi-line 1\nmulti-line 2more content"
+        "
multi-line 1\nmulti-line 2
more content") == \ + '`multi-line 1\nmulti-line 2`more content' \ + '\n```\nmulti-line 1\nmulti-line 2\n```\nmore content' + # Test cases when there are no new lines (we're dealing with just inline # entries); an empty entry as well assert to_markdown("test " From 4c1b5cabbe7bd4adea5f7a6d573eb996865075fe Mon Sep 17 00:00:00 2001 From: Chris Caron Date: Fri, 6 Oct 2023 18:11:44 -0400 Subject: [PATCH 3/3] complete coverage of what is there --- apprise/conversion.py | 2 +- test/test_conversion.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/apprise/conversion.py b/apprise/conversion.py index 89171433..898f8007 100644 --- a/apprise/conversion.py +++ b/apprise/conversion.py @@ -308,7 +308,7 @@ class HTMLMarkDownConverter(HTMLConverter): self._preserver_cr = True elif tag == 'a': - for name, link in attrs: + for name, link in attrs: # pragma: no branch if name == 'href': self._link = '(' + link + ')' # Take an early exit for speed (in case there are more diff --git a/test/test_conversion.py b/test/test_conversion.py index c09eb86e..406db664 100644 --- a/test/test_conversion.py +++ b/test/test_conversion.py @@ -107,6 +107,11 @@ def test_conversion_html_to_text(): "my link") == \ "test my link" + # a with missing href entry + assert to_html("test " + "my link") == \ + "test my link" + #

missing assert to_html("
line 1 bold
" " my link"