From 9963a3fe5d51c20b8e8e4540bcd060f7fb1af95d Mon Sep 17 00:00:00 2001 From: Timothy Hobbs Date: Thu, 23 Jan 2020 15:30:08 +0100 Subject: [PATCH] Fix utf decoding bug in email parsing code For some reason mozilla thunderbird sometimes marks email parts as 8bit even though they are utf-8. I guess the best way to work around this is to add a try-catch block because this really cannot be predicted. --- helpdesk/email.py | 7 +- .../tests/test_files/utf-nondecodable.eml | 72 +++++++++++++++++++ helpdesk/tests/test_get_email.py | 15 ++++ 3 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 helpdesk/tests/test_files/utf-nondecodable.eml diff --git a/helpdesk/email.py b/helpdesk/email.py index 5458e022..a2088be2 100644 --- a/helpdesk/email.py +++ b/helpdesk/email.py @@ -487,13 +487,18 @@ def object_from_message(message, queue, logger): body.encode('utf-8') logger.debug("Discovered plain text MIME part") else: + try: + email_body = encoding.smart_text(part.get_payload(decode=True)) + except UnicodeDecodeError: + email_body = encoding.smart_text(part.get_payload(decode=False)) + payload = """ %s -""" % encoding.smart_text(part.get_payload(decode=True)) +""" % email_body files.append( SimpleUploadedFile(_("email_html_body.html"), payload.encode("utf-8"), 'text/html') ) diff --git a/helpdesk/tests/test_files/utf-nondecodable.eml b/helpdesk/tests/test_files/utf-nondecodable.eml new file mode 100644 index 00000000..6d5a57d5 --- /dev/null +++ b/helpdesk/tests/test_files/utf-nondecodable.eml @@ -0,0 +1,72 @@ +Delivered-To: helpdesk@example.cz +Received: by 2002:a17:90a:f983:0:0:0:0 with SMTP id cq3csp4021504pjb; + Tue, 21 Jan 2020 04:28:48 -0800 (PST) +X-Received: by 2002:a05:6000:50:: with SMTP id k16mr4730387wrx.145.1579609728626; + Tue, 21 Jan 2020 04:28:48 -0800 (PST) +X-Received: by 2002:a5d:50d2:: with SMTP id f18mr4914314wrt.366.1579609727642; + Tue, 21 Jan 2020 04:28:47 -0800 (PST) +Return-Path: +Received: from [10.0.0.179] 
(ip-89-176-203-67.net.upcbroadband.cz. [89.176.203.67]) + by smtp.gmail.com with ESMTPSA id w83sm3724796wmb.42.2020.01.21.04.28.46 + for + (version=TLS1_2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128); + Tue, 21 Jan 2020 04:28:47 -0800 (PST) +Subject: =?UTF-8?Q?Fwd=3a_Cyklozam=c4=9bstnavatel_-_zm=c4=9bna_vyhodnocen?= + =?UTF-8?B?w60=?= +References: +To: helpdesk@example.cz +From: John Smith +Openpgp: preference=signencrypt +X-Forwarded-Message-Id: +Message-ID: <00d73ce5-774a-5ea1-6742-af73ef58c01c@example.cz> +Date: Tue, 21 Jan 2020 13:28:46 +0100 +User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 + Thunderbird/60.9.0 +MIME-Version: 1.0 +In-Reply-To: +Content-Type: multipart/alternative; + boundary="------------1E8B96489BB357387CBD04A6" +Content-Language: en-US + +This is a multi-part message in MIME format. +--------------1E8B96489BB357387CBD04A6 +Content-Type: text/plain; charset=utf-8 +Content-Transfer-Encoding: 8bit + + + + +-------- Forwarded Message -------- +Subject: Cyklozaměstnavatel - změna vyhodnocení +Date: Thu, 9 Jan 2020 16:24:28 +0100 +From: Nikola +To: John Smith , Jiří Houdek + + + + +Ahoj Johne, +podle domluvy bych Tě poprosila o změnu vyhodnocení soutěže +Cyklozaměstnavatel.  +Poprosím, aby se ve výsledné tabulce pro každé město zobrazoval +jednotlivý zaměstnavatel *jen jednou s průměrným výsledkem, *který vyjde +po zprůměrování hodnocení všech zaměstnanců tohoto zaměstnavatele.  +Díky moc! +n.  + + +--------------1E8B96489BB357387CBD04A6 +Content-Type: text/html; charset=utf-8 +Content-Transfer-Encoding: 8bit + + +

+ prosazuje lepší + prostředí pro + kvalitní život + ve městě. +

+ + + +--------------1E8B96489BB357387CBD04A6-- diff --git a/helpdesk/tests/test_get_email.py b/helpdesk/tests/test_get_email.py index 50a5f1e5..9c7b4c11 100644 --- a/helpdesk/tests/test_get_email.py +++ b/helpdesk/tests/test_get_email.py @@ -84,6 +84,21 @@ class GetEmailCommonTests(TestCase): self.assertEqual(ticket.title, "Testovácí email") self.assertEqual(ticket.description, "íářčšáíéřášč") + def test_email_with_utf_8_non_decodable_sequences(self): + """ + Tests that an email with UTF-8 parts marked as 8bit is parsed into a ticket and attachment + """ + with open(os.path.join(THIS_DIR, "test_files/utf-nondecodable.eml"), encoding="utf-8") as fd: + test_email = fd.read() + ticket = helpdesk.email.object_from_message(test_email, self.queue_public, self.logger) + self.assertEqual(ticket.title, "Fwd: Cyklozaměstnavatel - změna vyhodnocení") + self.assertIn("prosazuje lepší", ticket.description) + followups = FollowUp.objects.filter(ticket=ticket) + followup = followups[0] + attachments = FollowUpAttachment.objects.filter(followup=followup) + attachment = attachments[0] + self.assertIn('prosazuje lepší', attachment.file.read().decode("utf-8")) + class GetEmailParametricTemplate(object): """TestCase that checks basic email functionality across methods and socks configs."""