Re-worked Telegram HTML/Markdown -> HTML Conversion (#579)

This commit is contained in:
Chris Caron 2022-05-01 14:43:55 -04:00 committed by GitHub
parent 3c07d80975
commit fd0cb3ffcc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 186 additions and 76 deletions

View File

@ -96,7 +96,9 @@ class HTMLConverter(HTMLParser, object):
'div', 'td', 'th', 'code', 'pre', 'label', 'li',)
# the following tags ignore any internal text
IGNORE_TAGS = ('style', 'link', 'meta', 'title', 'html', 'head', 'script')
IGNORE_TAGS = (
'form', 'input', 'textarea', 'select', 'ul', 'ol', 'style', 'link',
'meta', 'title', 'html', 'head', 'script')
# Condense Whitespace
WS_TRIM = re.compile(r'[\s]+', re.DOTALL | re.MULTILINE)

View File

@ -177,44 +177,85 @@ class NotifyTelegram(NotifyBase):
# characters passed into it. To handle this situation, we need to
# search the body for these sequences and convert them to the
# output the user expected
__telegram_escape_html_dict = {
# New Lines
re.compile(r'<\s*/?br\s*/?>\r*\n?', re.I): '\r\n',
re.compile(r'<\s*/(br|p|div|li)[^>]*>\r*\n?', re.I): '\r\n',
# The following characters can be altered to become supported
re.compile(r'<\s*pre[^>]*>', re.I): '<code>',
re.compile(r'<\s*/pre[^>]*>', re.I): '</code>',
__telegram_escape_html_entries = (
# Comments
(re.compile(
r'\s*<!.+?-->\s*',
(re.I | re.M | re.S)), '', {}),
# the following tags are not supported
re.compile(
r'<\s*(br|p|div|span|body|script|meta|html|font'
r'|label|iframe|li|ol|ul|source|script)[^>]*>', re.I): '',
(re.compile(
r'\s*<\s*(!?DOCTYPE|p|div|span|body|script|link|'
r'meta|html|font|head|label|form|input|textarea|select|iframe|'
r'source|script)([^a-z0-9>][^>]*)?>\s*',
(re.I | re.M | re.S)), '', {}),
re.compile(
r'<\s*/(span|body|script|meta|html|font'
r'|label|iframe|ol|ul|source|script)[^>]*>', re.I): '',
# Italic
re.compile(r'<\s*(caption|em)[^>]*>', re.I): '<i>',
re.compile(r'<\s*/(caption|em)[^>]*>', re.I): '</i>',
# All closing tags to be removed are put here
(re.compile(
r'\s*<\s*/(span|body|script|meta|html|font|head|'
r'label|form|input|textarea|select|ol|ul|link|'
r'iframe|source|script)([^a-z0-9>][^>]*)?>\s*',
(re.I | re.M | re.S)), '', {}),
# Bold
re.compile(r'<\s*(h[1-6]|title|strong)[^>]*>', re.I): '<b>',
re.compile(r'<\s*/(h[1-6]|title|strong)[^>]*>', re.I): '</b>',
(re.compile(
r'<\s*(strong)([^a-z0-9>][^>]*)?>',
(re.I | re.M | re.S)), '<b>', {}),
(re.compile(
r'<\s*/\s*(strong)([^a-z0-9>][^>]*)?>',
(re.I | re.M | re.S)), '</b>', {}),
(re.compile(
r'\s*<\s*(h[1-6]|title)([^a-z0-9>][^>]*)?>\s*',
(re.I | re.M | re.S)), '{}<b>', {'html': '\r\n'}),
(re.compile(
r'\s*<\s*/\s*(h[1-6]|title)([^a-z0-9>][^>]*)?>\s*',
(re.I | re.M | re.S)),
'</b>{}', {'html': '<br/>'}),
# Italic
(re.compile(
r'<\s*(caption|em)([^a-z0-9>][^>]*)?>',
(re.I | re.M | re.S)), '<i>', {}),
(re.compile(
r'<\s*/\s*(caption|em)([^a-z0-9>][^>]*)?>',
(re.I | re.M | re.S)), '</i>', {}),
# Bullet Lists
(re.compile(
r'<\s*li([^a-z0-9>][^>]*)?>\s*',
(re.I | re.M | re.S)), ' -', {}),
# convert pre tags to code (supported by Telegram)
(re.compile(
r'<\s*pre([^a-z0-9>][^>]*)?>',
(re.I | re.M | re.S)), '{}<code>', {'html': '\r\n'}),
(re.compile(
r'<\s*/\s*pre([^a-z0-9>][^>]*)?>',
(re.I | re.M | re.S)), '</code>{}', {'html': '\r\n'}),
# New Lines
(re.compile(
r'\s*<\s*/?\s*(ol|ul|br|hr)\s*/?>\s*',
(re.I | re.M | re.S)), '\r\n', {}),
(re.compile(
r'\s*<\s*/\s*(br|p|hr|li|div)([^a-z0-9>][^>]*)?>\s*',
(re.I | re.M | re.S)), '\r\n', {}),
# HTML Spaces (&nbsp;) and tabs (&emsp;) aren't supported
# See https://core.telegram.org/bots/api#html-style
re.compile(r'\&nbsp;?', re.I): ' ',
(re.compile(r'\&nbsp;?', re.I), ' ', {}),
# Tabs become 3 spaces
re.compile(r'\&emsp;?', re.I): ' ',
(re.compile(r'\&emsp;?', re.I), ' ', {}),
# Some characters get re-escaped by the Telegram upstream
# service so we need to convert these back.
re.compile(r'\&apos;?', re.I): '\'',
re.compile(r'\&quot;?', re.I): '"',
}
(re.compile(r'\&apos;?', re.I), '\'', {}),
(re.compile(r'\&quot;?', re.I), '"', {}),
# New line cleanup
(re.compile(r'\r*\n[\r\n]+', re.I), '\r\n', {}),
)
# Define our template tokens
template_tokens = dict(NotifyBase.template_tokens, **{
@ -597,38 +638,19 @@ class NotifyTelegram(NotifyBase):
# Use Telegram's HTML mode
payload['parse_mode'] = 'HTML'
for r, v in self.__telegram_escape_html_dict.items():
body = r.sub(v, body, re.I)
for r, v, m in self.__telegram_escape_html_entries:
if 'html' in m:
# Handle special cases where we need to alter new lines
# for presentation purposes
v = v.format(m['html'] if body_format in (
NotifyFormat.HTML, NotifyFormat.MARKDOWN) else '')
body = r.sub(v, body)
# Prepare our payload based on HTML or TEXT
payload['text'] = body
# else: # self.notify_format == NotifyFormat.TEXT:
# # Use Telegram's HTML mode
# payload['parse_mode'] = 'HTML'
# # Further html escaping required...
# telegram_escape_text_dict = {
# # We need to escape characters that conflict with html
# # entity blocks (< and >) when displaying text
# r'>': '&gt;',
# r'<': '&lt;',
# r'\&': '&amp;',
# }
# # Create a regular expression from the dictionary keys
# text_regex = re.compile("(%s)" % "|".join(
# map(re.escape, telegram_escape_text_dict.keys())).lower(),
# re.I)
# # For each match, look-up corresponding value in dictionary
# body = text_regex.sub( # pragma: no branch
# lambda mo: telegram_escape_text_dict[
# mo.string[mo.start():mo.end()]], body)
# # prepare our payload based on HTML or TEXT
# payload['text'] = body
# Create a copy of the chat_ids list
targets = list(self.targets)
while len(targets):

View File

@ -22,7 +22,6 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
from apprise import NotifyFormat
from apprise.conversion import convert_between
import pytest

View File

@ -625,11 +625,10 @@ def test_plugin_telegram_formating_py3(mock_post):
# Test that everything is escaped properly in a TEXT mode
assert payload['text'] == \
'<b>🚨 Change detected&nbsp;for&nbsp;&lt;i&gt;Apprise&nbsp;' \
'Test&nbsp;Title&lt;/i&gt;</b>\r\n&lt;a&nbsp;href=' \
'"http://localhost"&gt;&lt;i&gt;Apprise&nbsp;Body&nbsp;Title&lt;' \
'/i&gt;&lt;/a&gt;&nbsp;had&nbsp;&lt;a&nbsp;href=&quot;http://' \
'127.0.0.1&quot;&gt;a&nbsp;change&lt;/a&gt;'
'<b>🚨 Change detected for &lt;i&gt;Apprise Test Title&lt;/i&gt;' \
'</b>\r\n&lt;a href="http://localhost"&gt;&lt;i&gt;' \
'Apprise Body Title&lt;/i&gt;&lt;/a&gt; had &lt;' \
'a href="http://127.0.0.1"&gt;a change&lt;/a&gt;'
# Reset our values
mock_post.reset_mock()
@ -718,8 +717,9 @@ def test_plugin_telegram_formating_py3(mock_post):
# Test that everything is escaped properly in a HTML mode
assert payload['text'] == \
'<b><b>🚨 Another Change detected for <i>Apprise Test Title</i>' \
'</b></b>\r\n<i><a href="http://localhost">Apprise Body Title</a>' \
'<b>\r\n<b>🚨 Another Change detected for ' \
'<i>Apprise Test Title</i></b>\r\n</b>\r\n<i>' \
'<a href="http://localhost">Apprise Body Title</a>' \
'</i> had <a href="http://127.0.0.2">a change</a>\r\n'
# Now we'll test an edge case where a title was defined, but after
@ -881,11 +881,11 @@ def test_plugin_telegram_formating_py2(mock_post):
# Test that everything is escaped properly in a TEXT mode
assert payload['text'].encode('utf-8') == \
'<b>\xf0\x9f\x9a\xa8 Change detected&nbsp;for&nbsp;&lt;i&gt;' \
'Apprise&nbsp;Test&nbsp;Title&lt;/i&gt;</b>\r\n&lt;a&nbsp;' \
'href="http://localhost"&gt;&lt;i&gt;Apprise&nbsp;Body&nbsp;' \
'Title&lt;/i&gt;&lt;/a&gt;&nbsp;had&nbsp;&lt;a&nbsp;href=&quot;' \
'http://127.0.0.1&quot;&gt;a&nbsp;change&lt;/a&gt;'
'<b>\xf0\x9f\x9a\xa8 Change detected for &lt;i&gt;' \
'Apprise Test Title&lt;/i&gt;</b>\r\n&lt;' \
'a href="http://localhost"&gt;&lt;i&gt;Apprise Body Title' \
'&lt;/i&gt;&lt;/a&gt; had &lt;a href="http://127.0.0.1"' \
'&gt;a change&lt;/a&gt;'
# Reset our values
mock_post.reset_mock()
@ -969,9 +969,9 @@ def test_plugin_telegram_formating_py2(mock_post):
# Test that everything is escaped properly in a HTML mode
assert payload['text'].encode('utf-8') == \
'<b><b>\xf0\x9f\x9a\xa8 Change detected for ' \
'<i>Apprise Test Title</i></b></b>\r\n<i>' \
'<a href="http://localhost">Apprise Body Title</a>'\
'<b>\r\n<b>\xf0\x9f\x9a\xa8 Change detected for ' \
'<i>Apprise Test Title</i></b>\r\n</b>\r\n<i>' \
'<a href="http://localhost">Apprise Body Title</a>' \
'</i> had <a href="http://127.0.0.1">a change</a>\r\n'
# Reset our values
@ -1163,8 +1163,8 @@ def test_plugin_telegram_html_formatting(mock_post):
# Test that everything is escaped properly in a HTML mode
assert payload['text'] == \
'<b><b>\'information\'</b></b>\r\n<i>"This is in Italic"' \
'</i>\r\n<b> Headings are dropped and converted to bold</b>'
'<b>\r\n<b>\'information\'</b>\r\n</b>\r\n<i>"This is in Italic"' \
'</i>\r\n<b> Headings are dropped and converted to bold</b>\r\n'
mock_post.reset_mock()
@ -1177,7 +1177,28 @@ def test_plugin_telegram_html_formatting(mock_post):
assert payload['text'] == \
'<b>&lt;title&gt;&amp;apos;information&amp;apos&lt;/title&gt;</b>' \
'\r\n&lt;em&gt;&amp;quot;This is in&nbsp;Italic&amp;quot&lt;/em' \
'&gt;&lt;br/&gt;&lt;h5&gt;&amp;emsp;&amp;emspHeadings&amp;nbsp;' \
'are&nbsp;dropped&nbsp;and&amp;nbspconverted&nbsp;to&nbsp;bold&lt;' \
'/h5&gt;'
'\r\n&lt;em&gt;&amp;quot;This is in Italic&amp;quot&lt;/em&gt;&lt;' \
'br/&gt;&lt;h5&gt;&amp;emsp;&amp;emspHeadings&amp;nbsp;are ' \
'dropped and&amp;nbspconverted to bold&lt;/h5&gt;'
# Let's test more complex HTML examples now
mock_post.reset_mock()
test_file_01 = os.path.join(
TEST_VAR_DIR, '01_test_example.html')
with open(test_file_01) as html_file:
assert aobj.notify(
body=html_file.read(), body_format=NotifyFormat.HTML)
# owner has already been looked up, so only one call is made
assert mock_post.call_count == 1
payload = loads(mock_post.call_args_list[0][1]['data'])
assert payload['text'] == \
'\r\n<b>Bootstrap 101 Template</b>\r\n<b>My Title</b>\r\n' \
'<b>Heading 1</b>\r\n-Bullet 1\r\n-Bullet 2\r\n-Bullet 3\r\n' \
'-Bullet 1\r\n-Bullet 2\r\n-Bullet 3\r\n<b>Heading 2</b>\r\n' \
'A div entry\r\nA div entry\r\n<code>A pre entry</code>\r\n' \
'<b>Heading 3</b>\r\n<b>Heading 4</b>\r\n<b>Heading 5</b>\r\n' \
'<b>Heading 6</b>\r\nA set of text\r\n' \
'Another line after the set of text\r\nMore text\r\nlabel'

View File

@ -0,0 +1,66 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
<title>Bootstrap 101 Template</title>
<!-- Bootstrap -->
<link href="css/bootstrap.min.css" rel="stylesheet">
<!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
<!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]-->
</head>
<body>
<h1>My Title</h1>
<!-- jQuery (necessary for Bootstrap's JavaScript plugins) -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script>
<!-- Include all compiled plugins (below), or include individual files as needed -->
<script src="js/bootstrap.min.js"></script>
<h1>Heading 1</h1>
<p>
<ul>
<li>Bullet 1</li>
<li>Bullet 2</li>
<li>Bullet 3</li>
</ul>
<ol>
<li>Bullet 1</li>
<li>Bullet 2</li>
<li>Bullet 3</li>
</ol>
</p>
<h2>Heading 2</h2>
<div>A div entry</div>
<p>
<span>A div entry</span>
<pre>A pre entry</pre>
</p>
<h3>Heading 3</h3>
<h4>Heading 4</h4>
<h5>Heading 5</h5>
<h6>Heading 6</h6>
<p>
A set of text <br/>Another line after the set of text
<hr/>
More text
</p>
<form>
<label>label</label>
<input/>
<select/>
</form>
</body>
</html>