From 23c3b72a432a24f8f374881ffa80ffab57d5120f Mon Sep 17 00:00:00 2001 From: chrisbroderick Date: Sun, 4 Sep 2022 22:01:32 +0100 Subject: [PATCH] Refactor object_from_message to make it more testable and fix some bugs. Extract the from email using the email library instead of pre-decoding which exposes the comma separator causing the email address to be extracted incorrectly if the real name contains a comma. Raise an exception when a message to be ignored is detected to process ignored messages explicitly. --- helpdesk/email.py | 253 ++++++++++++++++++++++++---------------------- 1 file changed, 132 insertions(+), 121 deletions(-) diff --git a/helpdesk/email.py b/helpdesk/email.py index 1a0b7d50..7a4962e2 100644 --- a/helpdesk/email.py +++ b/helpdesk/email.py @@ -20,6 +20,7 @@ import email from email.utils import getaddresses from email_reply_parser import EmailReplyParser from helpdesk import settings +from helpdesk.exceptions import IgnoreTicketException from helpdesk.lib import process_attachments, safe_template_context from helpdesk.models import FollowUp, IgnoreEmail, Queue, Ticket import imaplib @@ -34,6 +35,8 @@ import ssl import sys from time import ctime import typing +from email.message import Message +from typing import Tuple, List # import User model, which may be a custom model @@ -135,8 +138,11 @@ def pop3_sync(q, logger, server): else: full_message = encoding.force_str( "\n".join(raw_content), errors='replace') - ticket = object_from_message( - message=full_message, queue=q, logger=logger) + try: + ticket = object_from_message(message=full_message, queue=q, logger=logger) + except IgnoreTicketException: + logger.warn( + "Message %s was ignored and will be left on POP3 server" % msgNum) if ticket: server.dele(msgNum) @@ -186,9 +192,12 @@ def imap_sync(q, logger, server): data = server.fetch(num, '(RFC822)')[1] full_message = encoding.force_str(data[0][1], errors='replace') try: - ticket = object_from_message( - message=full_message, queue=q, logger=logger) - except TypeError: + ticket = object_from_message(message=full_message, queue=q, logger=logger) + except IgnoreTicketException: + logger.warn("Message %s was ignored and will be left on IMAP server" % num) + return + except TypeError as te: + logger.warn(f"Unexpected error processing message: {te}") ticket = None # hotfix. Need to work out WHY. if ticket: server.store(num, '+FLAGS', '\\Deleted') @@ -282,8 +291,11 @@ def process_queue(q, logger): logger.info("Processing message %d" % i) with open(m, 'r') as f: full_message = encoding.force_str(f.read(), errors='replace') - ticket = object_from_message( - message=full_message, queue=q, logger=logger) + try: + ticket = object_from_message(message=full_message, queue=q, logger=logger) + except IgnoreTicketException: + logger.warn("Message %d was ignored and will be left in local directory", i) + return if ticket: logger.info( "Successfully processed message %d, ticket/comment created.", i) @@ -573,38 +585,122 @@ def get_email_body_from_part_payload(part) -> str: part.get_payload(decode=False) ) +def attempt_body_extract_from_html(message: str) -> str: + mail = BeautifulSoup(str(message), "html.parser") + beautiful_body = mail.find('body') + body = None + full_body = None + if beautiful_body: + try: + body = beautiful_body.text + full_body = body + except AttributeError: + pass + if not body: + body = "" + return body, full_body + +def extract_part_data( + part: Message, + counter: int, + ticket_id: int, + files: List, + logger: logging.Logger + ) -> Tuple[str, str]: + name = part.get_filename() + if name: + name = email.utils.collapse_rfc2231_value(name) + part_body = None + part_full_body=None + if part.get_content_maintype() == 'text' and name is None: + if part.get_content_subtype() == 'plain': + part_body = part.get_payload(decode=True) + # https://github.com/django-helpdesk/django-helpdesk/issues/732 + if part['Content-Transfer-Encoding'] == '8bit' and part.get_content_charset() == 'utf-8': + part_body = part_body.decode('unicode_escape') + part_body = decodeUnknown(part.get_content_charset(), part_body) + # have to use django_settings here so overwritting it works in tests + # the default value is False anyway + if ticket_id is None and getattr(django_settings, 'HELPDESK_FULL_FIRST_MESSAGE_FROM_EMAIL', False): + # first message in thread, we save full body to avoid + # losing forwards and things like that + part_full_body = get_body_from_fragments(part_body) + part_body = EmailReplyParser.parse_reply(part_body) + else: + # second and other reply, save only first part of the + # message + part_body = EmailReplyParser.parse_reply(part_body) + part_full_body = part_body + # workaround to get unicode text out rather than escaped text + part_body = get_encoded_body(part_body) + logger.debug("Discovered plain text MIME part") + else: + email_body = get_email_body_from_part_payload(part) + + if not part_body and not part_full_body: + # no text has been parsed so far - try such deep parsing + # for some messages + altered_body = email_body.replace( + "

", "

\n").replace("{email_body}" + + payload = ( + '' + '' + '' + '' + '%s' + '' + ) % email_body + files.append( + SimpleUploadedFile( + _("email_html_body.html"), payload.encode("utf-8"), 'text/html') + ) + logger.debug("Discovered HTML MIME part") + else: + if not name: + ext = mimetypes.guess_extension(part.get_content_type()) + name = "part-%i%s" % (counter, ext) + else: + name = ("part-%i_" % counter) + name + + files.append(SimpleUploadedFile(name, part.get_payload(decode=True), mimetypes.guess_type(name)[0])) + logger.debug("Found MIME attachment %s" % name) + return part_body, part_full_body def object_from_message(message: str, queue: Queue, logger: logging.Logger ) -> Ticket: - # 'message' must be an RFC822 formatted message. - message = email.message_from_string(message) + # 'message' must be an RFC822 formatted message to correctly parse. + message_obj = email.message_from_string(message) - subject = message.get('subject', _('Comment from e-mail')) + subject = message_obj.get('subject', _('Comment from e-mail')) subject = decode_mail_headers( - decodeUnknown(message.get_charset(), subject)) + decodeUnknown(message_obj.get_charset(), subject)) for affix in STRIPPED_SUBJECT_STRINGS: subject = subject.replace(affix, "") subject = subject.strip() - sender = message.get('from', _('Unknown Sender')) - sender = decode_mail_headers(decodeUnknown(message.get_charset(), sender)) - - # to address bug #832, we wrap all the text in front of the email address in - # double quotes by using replace() on the email string. Then, - # take first item of list, second item of tuple is the actual email address. - # Note that the replace won't work on just an email with no real name, - # but the getaddresses() function seems to be able to handle just unclosed quotes - # correctly. Not ideal, but this seems to work for now. - sender_email = email.utils.getaddresses( - ['\"' + sender.replace('<', '\" <')])[0][1] + # TODO: Should really be assigning a properly formatted fake email. + # Check if anything relies on this being a "real name" formatted string if no sender is found on message_obj. + # Also not sure it should be accepting emails from unknown senders + sender_email = _('Unknown Sender') + sender_hdr = message_obj.get('from') + if sender_hdr: + # Parse the header which extracts the first email address in the list if more than one + # The parseaddr method returns a tuple in the form + # Only need the actual email address from the tuple not the "real name" + # Since the spec requires that all email addresses are ASCII, they will not be encoded + sender_email = email.utils.parseaddr(sender_hdr)[1] for ignore in IgnoreEmail.objects.filter(Q(queues=queue) | Q(queues__isnull=True)): if ignore.test(sender_email): - # By returning 'False' the message will be kept in the mailbox, - # and the 'True' will cause the message to be deleted. - return not ignore.keep_in_mailbox + raise IgnoreTicketException() ticket_id: typing.Optional[int] = get_ticket_id_from_subject_slug( queue.slug, @@ -617,108 +713,23 @@ def object_from_message(message: str, counter = 0 files = [] - for part in message.walk(): + for part in message_obj.walk(): if part.get_content_maintype() == 'multipart': continue - - name = part.get_param("name") - if name: - name = email.utils.collapse_rfc2231_value(name) - - if part.get_content_maintype() == 'text' and name is None: - if part.get_content_subtype() == 'plain': - body = part.get_payload(decode=True) - # https://github.com/django-helpdesk/django-helpdesk/issues/732 - if part['Content-Transfer-Encoding'] == '8bit' and part.get_content_charset() == 'utf-8': - body = body.decode('unicode_escape') - body = decodeUnknown(part.get_content_charset(), body) - # have to use django_settings here so overwritting it works in tests - # the default value is False anyway - if ticket_id is None and getattr(django_settings, 'HELPDESK_FULL_FIRST_MESSAGE_FROM_EMAIL', False): - # first message in thread, we save full body to avoid - # losing forwards and things like that - full_body = get_body_from_fragments(body) - body = EmailReplyParser.parse_reply(body) - else: - # second and other reply, save only first part of the - # message - body = EmailReplyParser.parse_reply(body) - full_body = body - # workaround to get unicode text out rather than escaped text - body = get_encoded_body(body) - logger.debug("Discovered plain text MIME part") - else: - email_body = get_email_body_from_part_payload(part) - - if not body and not full_body: - # no text has been parsed so far - try such deep parsing - # for some messages - altered_body = email_body.replace( - "

", "

\n").replace("{email_body}" - - payload = ( - '' - '' - '' - '' - '%s' - '' - ) % email_body - files.append( - SimpleUploadedFile( - _("email_html_body.html"), payload.encode("utf-8"), 'text/html') - ) - logger.debug("Discovered HTML MIME part") - else: - if not name: - ext = mimetypes.guess_extension(part.get_content_type()) - name = "part-%i%s" % (counter, ext) - else: - name = ("part-%i_" % counter) + name - - # # FIXME: this code gets the paylods, then does something with it and then completely ignores it - # # writing the part.get_payload(decode=True) instead; and then the payload variable is - # # replaced by some dict later. - # # the `payloadToWrite` has been also ignored so was commented - # payload = part.get_payload() - # if isinstance(payload, list): - # payload = payload.pop().as_string() - # # payloadToWrite = payload - # # check version of python to ensure use of only the correct error type - # non_b64_err = TypeError - # try: - # logger.debug("Try to base64 decode the attachment payload") - # # payloadToWrite = base64.decodebytes(payload) - # except non_b64_err: - # logger.debug("Payload was not base64 encoded, using raw bytes") - # # payloadToWrite = payload - files.append(SimpleUploadedFile(name, part.get_payload( - decode=True), mimetypes.guess_type(name)[0])) - logger.debug("Found MIME attachment %s" % name) - + # See email.message_obj.Message.get_filename() + part_body, part_full_body = extract_part_data(part, counter, ticket_id, files, logger) + if part_body: + body = part_body + full_body = part_full_body counter += 1 if not body: - mail = BeautifulSoup(str(message), "html.parser") - beautiful_body = mail.find('body') - if beautiful_body: - try: - body = beautiful_body.text - full_body = body - except AttributeError: - pass - if not body: - body = "" + body, full_body = attempt_body_extract_from_html(message_obj) - add_file_if_always_save_incoming_email_message(files, message) + add_file_if_always_save_incoming_email_message(files, message_obj) - smtp_priority = message.get('priority', '') - smtp_importance = message.get('importance', '') + smtp_priority = message_obj.get('priority', '') + smtp_importance = message_obj.get('importance', '') high_priority_types = {'high', 'important', '1', 'urgent'} priority = 2 if high_priority_types & { smtp_priority, smtp_importance} else 3 @@ -733,4 +744,4 @@ def object_from_message(message: str, 'files': files, } - return create_object_from_email_message(message, ticket_id, payload, files, logger=logger) + return create_object_from_email_message(message_obj, ticket_id, payload, files, logger=logger)