Fix UTF-8 decoding bug in email parsing code

For some reason Mozilla Thunderbird sometimes marks email parts as 8bit
even though they are UTF-8. I guess the best way to work around this is
to add a try/except block, because this really cannot be predicted.
This commit is contained in:
Timothy Hobbs 2020-01-23 15:30:08 +01:00
parent af2d0d59b7
commit 9963a3fe5d
3 changed files with 93 additions and 1 deletions

View File

@ -487,13 +487,18 @@ def object_from_message(message, queue, logger):
body.encode('utf-8') body.encode('utf-8')
logger.debug("Discovered plain text MIME part") logger.debug("Discovered plain text MIME part")
else: else:
try:
email_body = encoding.smart_text(part.get_payload(decode=True))
except UnicodeDecodeError:
email_body = encoding.smart_text(part.get_payload(decode=False))
payload = """ payload = """
<html> <html>
<head> <head>
<meta charset="utf-8"/> <meta charset="utf-8"/>
</head> </head>
%s %s
</html>""" % encoding.smart_text(part.get_payload(decode=True)) </html>""" % email_body
files.append( files.append(
SimpleUploadedFile(_("email_html_body.html"), payload.encode("utf-8"), 'text/html') SimpleUploadedFile(_("email_html_body.html"), payload.encode("utf-8"), 'text/html')
) )

View File

@ -0,0 +1,72 @@
Delivered-To: helpdesk@example.cz
Received: by 2002:a17:90a:f983:0:0:0:0 with SMTP id cq3csp4021504pjb;
Tue, 21 Jan 2020 04:28:48 -0800 (PST)
X-Received: by 2002:a05:6000:50:: with SMTP id k16mr4730387wrx.145.1579609728626;
Tue, 21 Jan 2020 04:28:48 -0800 (PST)
X-Received: by 2002:a5d:50d2:: with SMTP id f18mr4914314wrt.366.1579609727642;
Tue, 21 Jan 2020 04:28:47 -0800 (PST)
Return-Path: <john.smith@example.cz>
Received: from [10.0.0.179] (ip-89-176-203-67.net.upcbroadband.cz. [89.176.203.67])
by smtp.gmail.com with ESMTPSA id w83sm3724796wmb.42.2020.01.21.04.28.46
for <helpdesk@example.cz>
(version=TLS1_2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128);
Tue, 21 Jan 2020 04:28:47 -0800 (PST)
Subject: =?UTF-8?Q?Fwd=3a_Cyklozam=c4=9bstnavatel_-_zm=c4=9bna_vyhodnocen?=
=?UTF-8?B?w60=?=
References: <CAK0Q_=uQY=3V5iUSgEN16GLtYoJ-6oQu-vYjsgQ=jv6DwOkuLQ@mail.gmail.com>
To: helpdesk@example.cz
From: John Smith <john.smith@example.cz>
Openpgp: preference=signencrypt
X-Forwarded-Message-Id: <CAK0Q_=uQY=3V5iUSgEN16GLtYoJ-6oQu-vYjsgQ=jv6DwOkuLQ@mail.gmail.com>
Message-ID: <00d73ce5-774a-5ea1-6742-af73ef58c01c@example.cz>
Date: Tue, 21 Jan 2020 13:28:46 +0100
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101
Thunderbird/60.9.0
MIME-Version: 1.0
In-Reply-To: <CAK0Q_=uQY=3V5iUSgEN16GLtYoJ-6oQu-vYjsgQ=jv6DwOkuLQ@mail.gmail.com>
Content-Type: multipart/alternative;
boundary="------------1E8B96489BB357387CBD04A6"
Content-Language: en-US
This is a multi-part message in MIME format.
--------------1E8B96489BB357387CBD04A6
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 8bit
-------- Forwarded Message --------
Subject: Cyklozaměstnavatel - změna vyhodnocení
Date: Thu, 9 Jan 2020 16:24:28 +0100
From: Nikola <nikola@example.cz>
To: John Smith <john.smith@example.cz>, Jiří Houdek
<jiri.houdek@example.cz>
Ahoj Johne,
podle domluvy bych Tě poprosila o změnu vyhodnocení soutěže
Cyklozaměstnavatel. 
Poprosím, aby se ve výsledné tabulce pro každé město zobrazoval
jednotlivý zaměstnavatel *jen jednou s průměrným výsledkem, *který vyjde
po zprůměrování hodnocení všech zaměstnanců tohoto zaměstnavatele. 
Díky moc!
n. 
--------------1E8B96489BB357387CBD04A6
Content-Type: text/html; charset=utf-8
Content-Transfer-Encoding: 8bit
<body>
<p>
prosazuje lepší
prostředí pro
kvalitní život
ve městě.
</p>
</body>
--------------1E8B96489BB357387CBD04A6--

View File

@ -84,6 +84,21 @@ class GetEmailCommonTests(TestCase):
self.assertEqual(ticket.title, "Testovácí email") self.assertEqual(ticket.title, "Testovácí email")
self.assertEqual(ticket.description, "íářčšáíéřášč") self.assertEqual(ticket.description, "íářčšáíéřášč")
def test_email_with_utf_8_non_decodable_sequences(self):
    """
    Tests that emails with utf-8 non-decodable sequences are parsed correctly.

    Loads the ``utf-nondecodable.eml`` fixture, feeds it to
    ``helpdesk.email.object_from_message`` and checks the resulting ticket's
    title and description, then checks that the first attachment of the
    ticket's first follow-up contains the expected text from the HTML part.
    """
    fixture_path = os.path.join(THIS_DIR, "test_files/utf-nondecodable.eml")
    # The fixture is stored as UTF-8; name the encoding explicitly so the
    # read does not depend on the locale of the machine running the tests.
    with open(fixture_path, encoding="utf-8") as fd:
        test_email = fd.read()
    ticket = helpdesk.email.object_from_message(
        test_email, self.queue_public, self.logger
    )
    self.assertEqual(ticket.title, "Fwd: Cyklozaměstnavatel - změna vyhodnocení")
    self.assertIn("prosazuje lepší", ticket.description)
    followups = FollowUp.objects.filter(ticket=ticket)
    followup = followups[0]
    attachments = FollowUpAttachment.objects.filter(followup=followup)
    attachment = attachments[0]
    self.assertIn('prosazuje lepší', attachment.file.read().decode("utf-8"))
class GetEmailParametricTemplate(object): class GetEmailParametricTemplate(object):
"""TestCase that checks basic email functionality across methods and socks configs.""" """TestCase that checks basic email functionality across methods and socks configs."""