mirror of
https://github.com/caronc/apprise.git
synced 2025-02-06 05:19:15 +01:00
HTML to TEXT/MARKDOWN cleanup and refactoring (#530)
This commit is contained in:
parent
8fa146685f
commit
405e26e22d
@ -514,39 +514,19 @@ class Apprise(object):
|
||||
# was set to None), or we did define a tag and the logic above
|
||||
# determined we need to notify the service it's associated with
|
||||
if server.notify_format not in conversion_map:
|
||||
conversion_map[server.notify_format] = \
|
||||
convert_between(body_format, server.notify_format, body)
|
||||
conversion_map[server.notify_format] = convert_between(
|
||||
body_format, server.notify_format, body)
|
||||
|
||||
if interpret_escapes:
|
||||
#
|
||||
# Escape our content
|
||||
#
|
||||
if interpret_escapes:
|
||||
#
|
||||
# Escape our content
|
||||
#
|
||||
|
||||
try:
|
||||
# Added overhead required due to Python 3 Encoding Bug
|
||||
# identified here: https://bugs.python.org/issue21331
|
||||
conversion_map[server.notify_format] = \
|
||||
conversion_map[server.notify_format]\
|
||||
.encode('ascii', 'backslashreplace')\
|
||||
.decode('unicode-escape')
|
||||
|
||||
except UnicodeDecodeError: # pragma: no cover
|
||||
# This occurs using a very old version of Python 2.7 such
|
||||
# as the one that ships with CentOS/RedHat 7.x (v2.7.5).
|
||||
conversion_map[server.notify_format] = \
|
||||
conversion_map[server.notify_format] \
|
||||
.decode('string_escape')
|
||||
|
||||
except AttributeError:
|
||||
# Must be of string type
|
||||
logger.error('Failed to escape message body')
|
||||
raise TypeError
|
||||
|
||||
if title:
|
||||
try:
|
||||
# Added overhead required due to Python 3 Encoding Bug
|
||||
# identified here: https://bugs.python.org/issue21331
|
||||
title = title\
|
||||
conversion_map[server.notify_format] = \
|
||||
conversion_map[server.notify_format]\
|
||||
.encode('ascii', 'backslashreplace')\
|
||||
.decode('unicode-escape')
|
||||
|
||||
@ -554,13 +534,35 @@ class Apprise(object):
|
||||
# This occurs using a very old version of Python 2.7
|
||||
# such as the one that ships with CentOS/RedHat 7.x
|
||||
# (v2.7.5).
|
||||
title = title.decode('string_escape')
|
||||
conversion_map[server.notify_format] = \
|
||||
conversion_map[server.notify_format] \
|
||||
.decode('string_escape')
|
||||
|
||||
except AttributeError:
|
||||
# Must be of string type
|
||||
logger.error('Failed to escape message title')
|
||||
logger.error('Failed to escape message body')
|
||||
raise TypeError
|
||||
|
||||
if title:
|
||||
try:
|
||||
# Added overhead required due to Python 3 Encoding
|
||||
# Bug identified here:
|
||||
# https://bugs.python.org/issue21331
|
||||
title = title\
|
||||
.encode('ascii', 'backslashreplace')\
|
||||
.decode('unicode-escape')
|
||||
|
||||
except UnicodeDecodeError: # pragma: no cover
|
||||
# This occurs using a very old version of Python 2.7
|
||||
# such as the one that ships with CentOS/RedHat 7.x
|
||||
# (v2.7.5).
|
||||
title = title.decode('string_escape')
|
||||
|
||||
except AttributeError:
|
||||
# Must be of string type
|
||||
logger.error('Failed to escape message title')
|
||||
raise TypeError
|
||||
|
||||
yield handler(
|
||||
server,
|
||||
body=conversion_map[server.notify_format],
|
||||
|
@ -27,11 +27,11 @@
|
||||
import re
|
||||
import six
|
||||
from markdown import markdown
|
||||
from os import linesep
|
||||
from .common import NotifyFormat
|
||||
|
||||
if six.PY2:
|
||||
from HTMLParser import HTMLParser
|
||||
|
||||
else:
|
||||
from html.parser import HTMLParser
|
||||
|
||||
@ -46,6 +46,8 @@ def convert_between(from_format, to_format, body):
|
||||
(NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown,
|
||||
(NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html,
|
||||
(NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text,
|
||||
# For now; use same converter for Markdown support
|
||||
(NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_text,
|
||||
}
|
||||
|
||||
convert = converters.get((from_format, to_format))
|
||||
@ -85,7 +87,7 @@ def text_to_html(body):
|
||||
# Execute our map against our body in addition to
|
||||
# swapping out new lines and replacing them with <br/>
|
||||
return re.sub(
|
||||
r'\r*\n', '<br/>\r\n', re_table.sub(lambda x: re_map[x.group()], body))
|
||||
r'\r*\n', '<br/>\n', re_table.sub(lambda x: re_map[x.group()], body))
|
||||
|
||||
|
||||
def html_to_text(body):
|
||||
@ -94,37 +96,134 @@ def html_to_text(body):
|
||||
"""
|
||||
|
||||
parser = HTMLConverter()
|
||||
if six.PY2:
|
||||
# Python 2.7 requires an additional parsing to un-escape characters
|
||||
body = parser.unescape(body)
|
||||
|
||||
parser.feed(body)
|
||||
parser.close()
|
||||
return parser.converted
|
||||
result = parser.converted
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class HTMLConverter(HTMLParser, object):
|
||||
"""An HTML to plain text converter tuned for email messages."""
|
||||
|
||||
# The following tags must start on a new line
|
||||
BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
||||
'div', 'td', 'th', 'code', 'pre', 'label', 'li',)
|
||||
|
||||
# the following tags ignore any internal text
|
||||
IGNORE_TAGS = ('style', 'link', 'meta', 'title', 'html', 'head', 'script')
|
||||
|
||||
# Condense Whitespace
|
||||
WS_TRIM = re.compile(r'[\s]+', re.DOTALL | re.MULTILINE)
|
||||
|
||||
# Sentinel value for block tag boundaries, which may be consolidated into a
|
||||
# single line break.
|
||||
BLOCK_END = {}
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super(HTMLConverter, self).__init__(**kwargs)
|
||||
|
||||
# Should we store the text content or not?
|
||||
self._do_store = True
|
||||
|
||||
# Initialize internal result list
|
||||
self._result = []
|
||||
|
||||
# Initialize public result field (not populated until close() is
|
||||
# called)
|
||||
self.converted = ""
|
||||
|
||||
def close(self):
|
||||
# Removes all html before the last "}". Some HTML can return additional
|
||||
# style information with text output.
|
||||
self.converted = str(self.converted).split('}')[-1].strip()
|
||||
string = ''.join(self._finalize(self._result))
|
||||
self.converted = string.strip()
|
||||
|
||||
def handle_data(self, data):
|
||||
self.converted += data.strip()
|
||||
if six.PY2:
|
||||
# See https://stackoverflow.com/questions/10993612/\
|
||||
# how-to-remove-xa0-from-string-in-python
|
||||
#
|
||||
# This is required since unescape() replaces &nbsp; with \xa0 when
|
||||
# using Python 2.7
|
||||
self.converted = self.converted.replace(u'\xa0', u' ')
|
||||
|
||||
def _finalize(self, result):
|
||||
"""
|
||||
Combines and strips consecutive strings, then converts consecutive
|
||||
block ends into singleton newlines.
|
||||
|
||||
[ {be} " Hello " {be} {be} " World!" ] -> "\nHello\nWorld!"
|
||||
"""
|
||||
|
||||
# None means the last visited item was a block end.
|
||||
accum = None
|
||||
|
||||
for item in result:
|
||||
if item == self.BLOCK_END:
|
||||
# Multiple consecutive block ends; do nothing.
|
||||
if accum is None:
|
||||
continue
|
||||
|
||||
# First block end; yield the current string, plus a newline.
|
||||
yield accum.strip() + '\n'
|
||||
accum = None
|
||||
|
||||
# Multiple consecutive strings; combine them.
|
||||
elif accum is not None:
|
||||
accum += item
|
||||
|
||||
# First consecutive string; store it.
|
||||
else:
|
||||
accum = item
|
||||
|
||||
# Yield the last string if we have not already done so.
|
||||
if accum is not None:
|
||||
yield accum.strip()
|
||||
|
||||
def handle_data(self, data, *args, **kwargs):
|
||||
"""
|
||||
Store our data if it is not on the ignore list
|
||||
"""
|
||||
|
||||
# initialize our previous flag
|
||||
if self._do_store:
|
||||
|
||||
# Tidy our whitespace
|
||||
content = self.WS_TRIM.sub(' ', data)
|
||||
self._result.append(content)
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
"""
|
||||
Process our starting HTML Tag
|
||||
"""
|
||||
# Toggle initial states
|
||||
self._do_store = tag not in self.IGNORE_TAGS
|
||||
|
||||
if tag in self.BLOCK_TAGS:
|
||||
self._result.append(self.BLOCK_END)
|
||||
|
||||
if tag == 'li':
|
||||
self.converted += linesep + '- '
|
||||
elif tag == 'blockquote':
|
||||
self.converted += linesep + linesep + '\t'
|
||||
elif tag in ('p', 'h1', 'h2', 'h3', 'h4', 'tr', 'th'):
|
||||
self.converted += linesep + '\n'
|
||||
self._result.append('- ')
|
||||
|
||||
elif tag == 'br':
|
||||
self.converted += linesep
|
||||
self._result.append('\n')
|
||||
|
||||
elif tag == 'hr':
|
||||
if self._result:
|
||||
self._result[-1] = self._result[-1].rstrip(' ')
|
||||
|
||||
self._result.append('\n---\n')
|
||||
|
||||
elif tag == 'blockquote':
|
||||
self._result.append(' >')
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag == 'blockquote':
|
||||
self.converted += linesep + linesep
|
||||
"""
|
||||
Edge case handling of open/close tags
|
||||
"""
|
||||
self._do_store = True
|
||||
|
||||
if tag in self.BLOCK_TAGS:
|
||||
self._result.append(self.BLOCK_END)
|
||||
|
@ -25,6 +25,7 @@
|
||||
|
||||
from apprise import NotifyFormat
|
||||
from apprise.conversion import convert_between
|
||||
import pytest
|
||||
|
||||
# Disable logging for a cleaner testing output
|
||||
import logging
|
||||
@ -35,24 +36,101 @@ def test_html_to_text():
|
||||
"""conversion: Test HTML to plain text
|
||||
"""
|
||||
|
||||
def convert(body):
|
||||
def to_html(body):
|
||||
"""
|
||||
A function to simplify html conversion tests
|
||||
"""
|
||||
return convert_between(NotifyFormat.HTML, NotifyFormat.TEXT, body)
|
||||
|
||||
assert convert("No HTML code here.") == "No HTML code here."
|
||||
assert to_html("No HTML code here.") == "No HTML code here."
|
||||
|
||||
clist = convert("<ul><li>Lots and lots</li><li>of lists.</li></ul>")
|
||||
clist = to_html("<ul><li>Lots and lots</li><li>of lists.</li></ul>")
|
||||
assert "Lots and lots" in clist
|
||||
assert "of lists." in clist
|
||||
|
||||
assert "To be or not to be." in convert(
|
||||
assert "To be or not to be." in to_html(
|
||||
"<blockquote>To be or not to be.</blockquote>")
|
||||
|
||||
cspace = convert(
|
||||
cspace = to_html(
|
||||
"<h2>Fancy heading</h2>"
|
||||
"<p>And a paragraph too.<br>Plus line break.</p>")
|
||||
assert "Fancy heading" in cspace
|
||||
assert "And a paragraph too.\nPlus line break." in cspace
|
||||
|
||||
assert convert(
|
||||
assert to_html(
|
||||
"<style>body { font: 200%; }</style>"
|
||||
"<p>Some obnoxious text here.</p>") == "Some obnoxious text here."
|
||||
|
||||
assert to_html(
|
||||
"<p>line 1</p>"
|
||||
"<p>line 2</p>"
|
||||
"<p>line 3</p>") == "line 1\nline 2\nline 3"
|
||||
|
||||
# Case sensitivity
|
||||
assert to_html(
|
||||
"<p>line 1</P>"
|
||||
"<P>line 2</P>"
|
||||
"<P>line 3</P>") == "line 1\nline 2\nline 3"
|
||||
|
||||
# double new lines (testing <br/> and <br>)
|
||||
assert to_html(
|
||||
"some information<br/><br>and more information") == \
|
||||
"some information\n\nand more information"
|
||||
|
||||
#
|
||||
# Test bad tags
|
||||
#
|
||||
|
||||
# first 2 entries are okay, but last will do as best as it can
|
||||
assert to_html(
|
||||
"<p>line 1</>"
|
||||
"<p>line 2</gar>"
|
||||
"<p>line 3>") == "line 1\nline 2\nline 3>"
|
||||
|
||||
# Make sure we ignore fields that aren't important to us
|
||||
assert to_html(
|
||||
"<script>ignore this</script>"
|
||||
"<p>line 1</p>"
|
||||
"Another line without being enclosed") == \
|
||||
"line 1\nAnother line without being enclosed"
|
||||
|
||||
# Test cases when there are no new lines (we're dealing with just inline
|
||||
# entries); an empty entry as well
|
||||
assert to_html("<span></span<<span>test</span> "
|
||||
"<a href='#'>my link</a>") == \
|
||||
"test my link"
|
||||
|
||||
# </p> missing
|
||||
assert to_html("<body><div>line 1 <b>bold</b></div> "
|
||||
" <a href='#'>my link</a>"
|
||||
"<p>3rd line</body>") == \
|
||||
"line 1 bold\nmy link\n3rd line"
|
||||
|
||||
# <hr/> on its own
|
||||
assert to_html("<hr/>") == "---"
|
||||
assert to_html("<hr>") == "---"
|
||||
|
||||
# We need to handle HTML Encodings
|
||||
assert to_html("""
|
||||
<html>
|
||||
<title>ignore this entry</title>
|
||||
<body>
|
||||
Let's handle special html encoding
|
||||
<hr/>
|
||||
</body>
|
||||
""") == "Let's handle special html encoding\n---"
|
||||
|
||||
# If you give nothing, you get nothing in return
|
||||
assert to_html("") == ""
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
# Invalid input
|
||||
assert to_html(None)
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
# Invalid input
|
||||
assert to_html(42)
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
# Invalid input
|
||||
assert to_html(object)
|
||||
|
Loading…
Reference in New Issue
Block a user