From 8d543a5eb32f6f65c6d814f1f00ecfd006376c6b Mon Sep 17 00:00:00 2001
From: Hector Rodriguez Medina
Date: Sun, 20 Aug 2023 18:16:47 -0300
Subject: [PATCH 1/3] Add conversion for HTML to markdown
---
apprise/conversion.py | 123 +++++++++++++++++++++++++++++++++++++++-
test/test_conversion.py | 104 +++++++++++++++++++++++++++++++++
2 files changed, 225 insertions(+), 2 deletions(-)
diff --git a/apprise/conversion.py b/apprise/conversion.py
index 77c9aa5e..689697f0 100644
--- a/apprise/conversion.py
+++ b/apprise/conversion.py
@@ -50,8 +50,7 @@ def convert_between(from_format, to_format, content):
(NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown_to_html,
(NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html,
(NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text,
- # For now; use same converter for Markdown support
- (NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_text,
+ (NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_markdown,
}
convert = converters.get((from_format, to_format))
@@ -86,6 +85,17 @@ def html_to_text(content):
return parser.converted
+def html_to_markdown(content):
+ """
+ Converts content from HTML to Markdown.
+ """
+
+ parser = HTMLMarkDownConverter()
+ parser.feed(content)
+ parser.close()
+ return parser.converted
+
+
class HTMLConverter(HTMLParser, object):
"""An HTML to plain text converter tuned for email messages."""
@@ -200,3 +210,112 @@ class HTMLConverter(HTMLParser, object):
if tag in self.BLOCK_TAGS:
self._result.append(self.BLOCK_END)
+
+
+class HTMLMarkDownConverter(HTMLConverter):
+ """An HTML to markdown converter tuned for email messages."""
+
+ # Escape markdown characters
+ MARKDOWN_ESCAPE = re.compile(r'([\\`*_{}[\]<>()#+\-.!|])',
+ re.DOTALL | re.MULTILINE)
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ # Store href value
+ self._link = ""
+
+ def handle_data(self, data, *args, **kwargs):
+ """
+ Store our data if it is not on the ignore list
+ """
+
+ # Only store data when we are not inside an ignored tag
+ if self._do_store:
+
+ # Tidy our whitespace
+ content = self.WS_TRIM.sub(' ', data)
+ content = self.MARKDOWN_ESCAPE.sub(r'\\\1', content)
+
+ # Add hyperlink
+ if self._link == "":
+ self._result.append(content)
+ else:
+ self._result.append("[" + content + "]" + self._link)
+
+ def handle_starttag(self, tag, attrs):
+ """
+ Process our starting HTML Tag
+ """
+ # Toggle initial states
+ self._do_store = tag not in self.IGNORE_TAGS
+ self._link = ""
+
+ if tag in self.BLOCK_TAGS:
+ self._result.append(self.BLOCK_END)
+
+ if tag == 'li':
+ self._result.append('- ')
+
+ elif tag == 'br':
+ self._result.append('\n')
+
+ elif tag == 'hr':
+ if self._result:
+ self._result[-1] = self._result[-1].rstrip(' ')
+
+ self._result.append('\n---\n')
+
+ elif tag == 'blockquote':
+ self._result.append('> ')
+
+ elif tag == 'h1':
+ self._result.append('# ')
+
+ elif tag == 'h2':
+ self._result.append('## ')
+
+ elif tag == 'h3':
+ self._result.append('### ')
+
+ elif tag == 'h4':
+ self._result.append('#### ')
+
+ elif tag == 'h5':
+ self._result.append('##### ')
+
+ elif tag == 'h6':
+ self._result.append('###### ')
+
+ elif tag in ['strong', 'b']:
+ self._result.append('**')
+
+ elif tag in ['em', 'i']:
+ self._result.append('*')
+
+ elif tag == 'code':
+ self._result.append('`')
+
+ elif tag == 'a':
+ for name, link in attrs:
+ if name == 'href':
+ self._link = '(' + link + ')'
+
+ def handle_endtag(self, tag):
+ """
+ Process our closing HTML Tag
+ """
+ self._do_store = True
+ self._link = ""
+
+ if tag in self.BLOCK_TAGS:
+ self._result.append(self.BLOCK_END)
+
+ if tag in ['strong', 'b']:
+ self._result.append('**')
+
+ elif tag in ['em', 'i']:
+ self._result.append('*')
+
+ elif tag == 'code':
+ self._result.append('`')
diff --git a/test/test_conversion.py b/test/test_conversion.py
index 103ebea6..0a8230d5 100644
--- a/test/test_conversion.py
+++ b/test/test_conversion.py
@@ -143,6 +143,110 @@ def test_conversion_html_to_text():
assert to_html(object)
+def test_conversion_html_to_markdown():
+ """conversion: Test HTML to markdown
+ """
+
+ def to_markdown(body):
+ """
+ A function to simplify html conversion tests
+ """
+ return convert_between(NotifyFormat.HTML, NotifyFormat.MARKDOWN, body)
+
+ assert to_markdown("No HTML code here.") == "No HTML code here\."
+
+ clist = to_markdown("")
+ assert "- Lots and lots" in clist
+ assert "- of lists\." in clist
+
+ assert "> To be or not to be\." == to_markdown(
+ "To be or not to be.
")
+
+ cspace = to_markdown(
+ "Fancy heading
"
+ "And a paragraph too.
Plus line break.
")
+ assert "# Fancy heading" in cspace
+ assert "And a paragraph too\.\nPlus line break\." in cspace
+
+ assert to_markdown(
+ ""
+ "Some obnoxious text here.
") == "Some obnoxious text here\."
+
+ assert to_markdown(
+ "line 1
"
+ "line 2
"
+ "line 3
") == "line 1\nline 2\nline 3"
+
+ # Case sensitivity
+ assert to_markdown(
+ "line 1
"
+ "line 2
"
+ "line 3
") == "line 1\nline 2\nline 3"
+
+ # double new lines (testing <br> and </br>)
+ assert to_markdown(
+ "some information
and more information") == \
+ "some information\n\nand more information"
+
+ #
+ # Test bad tags
+ #
+
+ # first 2 entries are okay, but last will do as best as it can
+ assert to_markdown(
+ "line 1>"
+ "
line 2"
+ "
line 3>") == "line 1\nline 2\nline 3\>"
+
+ # Make sure we ignore fields that aren't important to us
+ assert to_markdown(
+ ""
+ "
line 1
"
+ "Another line without being enclosed") == \
+ "line 1\nAnother line without being enclosed"
+
+ # Test cases when there are no new lines (we're dealing with just inline
+ # entries); an empty entry as well
+ assert to_markdown("test "
+ "my link") == \
+ "test [my link](#)"
+
+ # </p> missing
+ assert to_markdown("line 1 bold
"
+ " my link"
+ "3rd line") == \
+ "line 1 **bold**\n[my link](/link)\n3rd line"
+
+ # <hr/> on its own
+ assert to_markdown("
") == "---"
+ assert to_markdown("
") == "---"
+
+ # We need to handle HTML Encodings
+ assert to_markdown("""
+
+ ignore this entry
+
+ Let's handle special html encoding
+
+
+ """) == "Let's handle special html encoding\n---"
+
+ # If you give nothing, you get nothing in return
+ assert to_markdown("") == ""
+
+ with pytest.raises(TypeError):
+ # Invalid input
+ assert to_markdown(None)
+
+ with pytest.raises(TypeError):
+ # Invalid input
+ assert to_markdown(42)
+
+ with pytest.raises(TypeError):
+ # Invalid input
+ assert to_markdown(object)
+
+
def test_conversion_text_to():
"""conversion: Test Text to all types
"""
From f725b3ac75860761dfb6fee41164f63f12c47955 Mon Sep 17 00:00:00 2001
From: Chris Caron
Date: Fri, 6 Oct 2023 18:08:10 -0400
Subject: [PATCH 2/3] code & test improvements, added more coverage
---
apprise/conversion.py | 36 ++++++++++++++++++++++++++++--------
test/test_conversion.py | 30 +++++++++++++++++++++++-------
2 files changed, 51 insertions(+), 15 deletions(-)
diff --git a/apprise/conversion.py b/apprise/conversion.py
index 689697f0..89171433 100644
--- a/apprise/conversion.py
+++ b/apprise/conversion.py
@@ -101,7 +101,7 @@ class HTMLConverter(HTMLParser, object):
# The following tags must start on a new line
BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
- 'div', 'td', 'th', 'code', 'pre', 'label', 'li',)
+ 'div', 'td', 'th', 'pre', 'samp', 'label', 'li',)
# the folowing tags ignore any internal text
IGNORE_TAGS = (
@@ -216,8 +216,10 @@ class HTMLMarkDownConverter(HTMLConverter):
"""An HTML to markdown converter tuned for email messages."""
# Escape markdown characters
- MARKDOWN_ESCAPE = re.compile(r'([\\`*_{}[\]<>()#+\-.!|])',
- re.DOTALL | re.MULTILINE)
+ MARKDOWN_ESCAPE = re.compile(r'([`*#])', re.DOTALL | re.MULTILINE)
+
+ # Detect CR/LF runs (NOTE(review): '*' in the class is literal; confirm)
+ HAS_CR = re.compile(r'[\r*\n]+', re.DOTALL | re.MULTILINE)
def __init__(self, **kwargs):
super().__init__(**kwargs)
@@ -225,6 +227,8 @@ class HTMLMarkDownConverter(HTMLConverter):
# Store href value
self._link = ""
+ self._preserver_cr = False
+
def handle_data(self, data, *args, **kwargs):
"""
Store our data if it is not on the ignore list
@@ -234,7 +238,8 @@ class HTMLMarkDownConverter(HTMLConverter):
if self._do_store:
# Tidy our whitespace
- content = self.WS_TRIM.sub(' ', data)
+ content = self.WS_TRIM.sub(' ', data) \
+ if not self._preserver_cr else data
content = self.MARKDOWN_ESCAPE.sub(r'\\\1', content)
# Add hyperlink
@@ -287,19 +292,28 @@ class HTMLMarkDownConverter(HTMLConverter):
elif tag == 'h6':
self._result.append('###### ')
- elif tag in ['strong', 'b']:
+ elif tag in ('strong', 'b'):
self._result.append('**')
- elif tag in ['em', 'i']:
+ elif tag in ('em', 'i'):
self._result.append('*')
elif tag == 'code':
self._result.append('`')
+ self._preserver_cr = True
+
+ elif tag in ('pre', 'samp'):
+ self._result.append('```')
+ self._result.append(self.BLOCK_END)
+ self._preserver_cr = True
elif tag == 'a':
for name, link in attrs:
if name == 'href':
self._link = '(' + link + ')'
+ # Take an early exit for speed (in case there are more
+ # parameters - no need to waste time looking at them)
+ break
def handle_endtag(self, tag):
"""
@@ -311,11 +325,17 @@ class HTMLMarkDownConverter(HTMLConverter):
if tag in self.BLOCK_TAGS:
self._result.append(self.BLOCK_END)
- if tag in ['strong', 'b']:
+ if tag in ('strong', 'b'):
self._result.append('**')
- elif tag in ['em', 'i']:
+ elif tag in ('em', 'i'):
self._result.append('*')
elif tag == 'code':
self._result.append('`')
+ self._preserver_cr = False
+
+ elif tag in ('pre', 'samp'):
+ self._result.append('```')
+ self._result.append(self.BLOCK_END)
+ self._preserver_cr = False
diff --git a/test/test_conversion.py b/test/test_conversion.py
index 0a8230d5..c09eb86e 100644
--- a/test/test_conversion.py
+++ b/test/test_conversion.py
@@ -153,24 +153,24 @@ def test_conversion_html_to_markdown():
"""
return convert_between(NotifyFormat.HTML, NotifyFormat.MARKDOWN, body)
- assert to_markdown("No HTML code here.") == "No HTML code here\."
+ assert to_markdown("No HTML code here.") == "No HTML code here."
clist = to_markdown("")
assert "- Lots and lots" in clist
- assert "- of lists\." in clist
+ assert "- of lists." in clist
- assert "> To be or not to be\." == to_markdown(
+ assert "> To be or not to be." == to_markdown(
"To be or not to be.
")
cspace = to_markdown(
"Fancy heading
"
"And a paragraph too.
Plus line break.
")
assert "# Fancy heading" in cspace
- assert "And a paragraph too\.\nPlus line break\." in cspace
+ assert "And a paragraph too.\nPlus line break." in cspace
assert to_markdown(
""
- "Some obnoxious text here.
") == "Some obnoxious text here\."
+ "Some obnoxious text here.
") == "Some obnoxious text here."
assert to_markdown(
"line 1
"
@@ -194,9 +194,18 @@ def test_conversion_html_to_markdown():
# first 2 entries are okay, but last will do as best as it can
assert to_markdown(
+ "Heading 1
"
+ "Heading 2
"
+ "Heading 3
"
+ "Heading 4
"
+ "Heading 5
"
+ "Heading 6
"
"line 1>"
- "
line 2"
- "
line 3>") == "line 1\nline 2\nline 3\>"
+ "
line 2"
+ "
line 3>") == \
+ "# Heading 1\n## Heading 2\n### Heading 3\n" \
+ "#### Heading 4\n##### Heading 5\n###### Heading 6\n" \
+ "line 1\n*line 2*\nline 3>"
# Make sure we ignore fields that aren't important to us
assert to_markdown(
@@ -205,6 +214,13 @@ def test_conversion_html_to_markdown():
"Another line without being enclosed") == \
"line 1\nAnother line without being enclosed"
+ # Test <code> and <pre>
+ assert to_markdown(
+ "multi-line 1\nmulti-line 2
more content"
+ "multi-line 1\nmulti-line 2
more content") == \
+ '`multi-line 1\nmulti-line 2`more content' \
+ '\n```\nmulti-line 1\nmulti-line 2\n```\nmore content'
+
# Test cases when there are no new lines (we're dealing with just inline
# entries); an empty entry as well
assert to_markdown("test "
From 4c1b5cabbe7bd4adea5f7a6d573eb996865075fe Mon Sep 17 00:00:00 2001
From: Chris Caron
Date: Fri, 6 Oct 2023 18:11:44 -0400
Subject: [PATCH 3/3] complete coverage of what is there
---
apprise/conversion.py | 2 +-
test/test_conversion.py | 5 +++++
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/apprise/conversion.py b/apprise/conversion.py
index 89171433..898f8007 100644
--- a/apprise/conversion.py
+++ b/apprise/conversion.py
@@ -308,7 +308,7 @@ class HTMLMarkDownConverter(HTMLConverter):
self._preserver_cr = True
elif tag == 'a':
- for name, link in attrs:
+ for name, link in attrs: # pragma: no branch
if name == 'href':
self._link = '(' + link + ')'
# Take an early exit for speed (in case there are more
diff --git a/test/test_conversion.py b/test/test_conversion.py
index c09eb86e..406db664 100644
--- a/test/test_conversion.py
+++ b/test/test_conversion.py
@@ -107,6 +107,11 @@ def test_conversion_html_to_text():
"my link") == \
"test my link"
+ # a with missing href entry
+ assert to_html("test "
+ "my link") == \
+ "test my link"
+
# </p> missing
assert to_html("line 1 bold
"
" my link"