Completely rework the email parsing.

Fixes a number of hacks that have accumulated and makes it more easily understood and easier to enhance in the future.
2025-06-25 12:01:27 +02:00 · 2023-07-23 06:12:32 +01:00 · 2023-07-23 06:12:32 +01:00 · 2e5697c11a
commit 2e5697c11a
parent fad11d53bb
1 changed files with 152 additions and 131 deletions
--- a/helpdesk/email.py
+++ b/helpdesk/email.py
@ -39,7 +39,8 @@ import ssl
 import sys
 from time import ctime
 import typing
-from typing import List, Tuple
+from typing import List
 from email.mime.text import MIMEText
 # import User model, which may be a custom model
@ -53,6 +54,8 @@ STRIPPED_SUBJECT_STRINGS = [
    "Automatic reply: ",
 ]
 HTML_EMAIL_ATTACHMENT_FILENAME = _("email_html_body.html")
 def process_email(quiet=False):
    for q in Queue.objects.filter(
@ -141,7 +144,7 @@ def pop3_sync(q, logger, server):
            full_message = encoding.force_str(
                "\n".join(raw_content), errors='replace')
        try:
-            ticket = object_from_message(message=full_message, queue=q, logger=logger)
+            ticket = extract_email_metadata(message=full_message, queue=q, logger=logger)
        except IgnoreTicketException:
            logger.warn(
                "Message %s was ignored and will be left on POP3 server" % msgNum)
@ -198,7 +201,7 @@ def imap_sync(q, logger, server):
                data = server.fetch(num, '(RFC822)')[1]
                full_message = encoding.force_str(data[0][1], errors='replace')
                try:
-                    ticket = object_from_message(message=full_message, queue=q, logger=logger)
+                    ticket = extract_email_metadata(message=full_message, queue=q, logger=logger)
                except IgnoreTicketException:
                    logger.warn("Message %s was ignored and will be left on IMAP server" % num)
                except DeleteIgnoredTicketException:
@ -285,7 +288,7 @@ def imap_oauth_sync(q, logger, server):
                full_message = encoding.force_str(data[0][1], errors='replace')
                try:
-                    ticket = object_from_message(message=full_message, queue=q, logger=logger)
+                    ticket = extract_email_metadata(message=full_message, queue=q, logger=logger)
                except IgnoreTicketException as itex:
                    logger.warn(f"Message {num} was ignored. {itex}")
@ -405,7 +408,7 @@ def process_queue(q, logger):
            with open(m, 'r') as f:
                full_message = encoding.force_str(f.read(), errors='replace')
                try:
-                    ticket = object_from_message(message=full_message, queue=q, logger=logger)
+                    ticket = extract_email_metadata(message=full_message, queue=q, logger=logger)
                except IgnoreTicketException:
                    logger.warn("Message %d was ignored and will be left in local directory", i)
                except DeleteIgnoredTicketException:
@ -433,7 +436,7 @@ def decodeUnknown(charset, string):
        if not charset:
            try:
                return str(string, encoding='utf-8', errors='replace')
-            except UnicodeError:
+            except UnicodeError as e:
                return str(string, encoding='iso8859-1', errors='replace')
        return str(string, encoding=charset, errors='replace')
    return string
@ -723,133 +726,92 @@ def attempt_body_extract_from_html(message: str) -> str:
    return body, full_body
-def extract_part_data(
+def  extract_mime_content(part: Message,) -> str:
    '''
    Extract the content from the MIME body part
    :param part: the MIME part to extract the content from
    '''
    content_bytes = part.get_payload(decode=True)
    charset = part.get_content_charset()
    # The default for MIME email is 7bit which requires special decoding to utf-8 so make sure we handle the decoding correctly
    if part['Content-Transfer-Encoding'] in [None, '8bit', '7bit'] and (charset == 'utf-8' or charset is None):
        charset = "unicode_escape"
    content = decodeUnknown(charset, content_bytes)
    return content
 def extract_email_message(mime_content: str, is_plain_content_type: bool, is_extract_full_email_msg: bool) -> str:
    email_content = None
    if is_extract_full_email_msg:
        # Take the full content including encapsulated "forwarded" and "reply" sections
        email_content = get_body_from_fragments(mime_content) if is_plain_content_type else mime_content
    else:
        # Just get the primary part of the email and drop off any text below the actually response text
        email_content = EmailReplyParser.parse_reply(mime_content) if is_plain_content_type else mime_content
    return email_content
 def process_as_attachment(
        part: Message,
        counter: int,
        ticket_id: int,
        files: List,
        logger: logging.Logger
-) -> Tuple[str, str]:
+):
    name = part.get_filename()
    if name:
-        name = email.utils.collapse_rfc2231_value(name)
+        name = f"part-{counter}_{email.utils.collapse_rfc2231_value(name)}"
    part_body = None
    formatted_body = None
    if part.get_content_maintype() == 'text' and name is None:
        if part.get_content_subtype() == 'plain':
            part_body = part.get_payload(decode=True)
            # https://github.com/django-helpdesk/django-helpdesk/issues/732
            if part['Content-Transfer-Encoding'] == '8bit' and part.get_content_charset() == 'utf-8':
                part_body = part_body.decode('unicode_escape')
            part_body = decodeUnknown(part.get_content_charset(), part_body)
            # have to use django_settings here so overwriting it works in tests
            # the default value is False anyway
            if ticket_id is None and getattr(django_settings, 'HELPDESK_FULL_FIRST_MESSAGE_FROM_EMAIL', False):
                # first message in thread, we save full body to avoid
                # losing forwards and things like that
                formatted_body = get_body_from_fragments(part_body)
                part_body = EmailReplyParser.parse_reply(part_body)
            else:
                # second and other reply, save only first part of the
                # message
                part_body = EmailReplyParser.parse_reply(part_body)
                formatted_body = part_body
            # workaround to get unicode text out rather than escaped text
            part_body = get_encoded_body(part_body)
            logger.debug("Discovered plain text MIME part")
        else:
            email_body = get_email_body_from_part_payload(part)
            if not part_body and not formatted_body:
                # no text has been parsed so far - try such deep parsing
                # for some messages
                altered_body = email_body.replace(
                    "</p>", "</p>\n").replace("<br", "\n<br")
                mail = BeautifulSoup(str(altered_body), "html.parser")
                formatted_body = mail.get_text()
            if "<body" not in email_body:
                email_body = f"<body>{email_body}</body>"
            payload = (
                '<html>'
                '<head>'
                '<meta charset="utf-8" />'
                '</head>'
                '%s'
                '</html>'
            ) % email_body
            files.append(
                SimpleUploadedFile(
                    _("email_html_body.html"), payload.encode("utf-8"), 'text/html')
            )
            logger.debug("Discovered HTML MIME part")
    else:
-        if not name:
+        ext = mimetypes.guess_extension(part.get_content_type())
-            ext = mimetypes.guess_extension(part.get_content_type())
+        name = f"part-{counter}{ext}"
-            name = f"part-{counter}{ext}"
+    # Extract payload accounting for attached multiparts
-        else:
+    payload = part.as_string() if part.is_multipart() else part.get_payload(decode=True)
-            name = f"part-{counter}_{name}"
+    files.append(SimpleUploadedFile(name, payload, mimetypes.guess_type(name)[0]))
-        payload = part.as_string() if part.is_multipart() else part.get_payload(decode=True)
+    if logger.isEnabledFor(logging.DEBUG):
-        files.append(SimpleUploadedFile(name, payload, mimetypes.guess_type(name)[0]))
+        logger.debug("Processed MIME as attachment: %s", name)
-        logger.debug("Found MIME attachment %s", name)
+    return
    return part_body, formatted_body
-def recurse_multipart(
+def extract_email_subject(email_msg: Message,) -> str:
-        multipart: Message,
+    subject = email_msg.get('subject', _('Comment from e-mail'))
-        counter: int,
+    subject = decode_mail_headers(
-        ticket_id: int,
+        decodeUnknown(email_msg.get_charset(), subject))
-        files: List,
+    for affix in STRIPPED_SUBJECT_STRINGS:
-        logger: logging.Logger
+        subject = subject.replace(affix, "")
-) -> Tuple[str, str]:
+    return subject.strip()
    '''
    The received MIME part could be a multipart with embedded multiparts and therefore requires recursion.
    Recurse through the multipart structures trying to find the 1st body part that
    provides the message body. It will try to find an HTML formatted part (contentType=text/html)
    and a TEXT formatted part (contentType=text/plain) and return both
    :param multipart:
    :param counter:
    :param ticket_id:
    :param files:
    :param logger:
    '''
    plain_msg = None
    formatted_msg = None
    for part in multipart.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        # See email.message_obj.Message.get_filename()
        plain_body, formatted_body = recurse_multipart(
            part, counter, ticket_id, files, logger) if part.get_content_maintype(
        ) == 'multipart' else extract_part_data(part, counter, ticket_id, files, logger)
        # Only update the message variables if they are still empty to handle attached messages overriding the core message
        if plain_msg is None and plain_body:
            plain_msg = plain_body
        if formatted_msg is None and formatted_body:
            formatted_msg = formatted_body
        counter += 1
    return plain_msg, formatted_msg
-def object_from_message(message: str,
+def extract_email_metadata(message: str,
                        queue: Queue,
                        logger: logging.Logger
                        ) -> Ticket:
    '''
    Extracts the text/plain  mime part if there is one as the ticket description and
    stores the text/html part as an attachment if it is present.
    If no text/plain  part is present then it will try to use the text/html part if
    it is present as the ticket description by removing the HTML formatting.
    If neither a text/plain or text/html is present then it will use the first text/*
    MIME part that it finds as the ticket description.
    By default it will always take only the actual message and drop any chained messages
    from replies.
    The HELPDESK_FULL_FIRST_MESSAGE_FROM_EMAIL settings can force the entire message to be
    stored in the ticket if it is a new ticket by setting it to True.
    In this scenario, if it is a reply that is a forwarded message with no actual message,
    then the description will be sourced from the text/html part and the forwarded message
    will be in the FollowUp record aassociated with the ticket.
    It will iterate over every MIME part and store all MIME parts as attachments apart
    from the text/plain part.
    There may be a case for trying to exclude repeated signature images by checking if an
    attachment of the same name already exists as an attachement on the ticket but that is
    not implemented.
    :param message: the raw email message received
    :param queue: the queue that hte 
    :param logger: the logger to be used
    '''
    # 'message' must be an RFC822 formatted message to correctly parse.
-    message_obj = email.message_from_string(message)
+    message_obj: Message = email.message_from_string(message)
-    subject = message_obj.get('subject', _('Comment from e-mail'))
+    subject = extract_email_subject(message_obj)
    subject = decode_mail_headers(
        decodeUnknown(message_obj.get_charset(), subject))
    for affix in STRIPPED_SUBJECT_STRINGS:
        subject = subject.replace(affix, "")
    subject = subject.strip()
    # TODO: Should really be assigning a properly formatted fake email.
    #       Check if anything relies on this being a "real name" formatted string if no sender is found on message_obj.
    #       Also not sure it should be accepting emails from unknown senders
    sender_email = _('Unknown Sender')
    sender_hdr = message_obj.get('from')
    if sender_hdr:
@ -868,27 +830,86 @@ def object_from_message(message: str,
        subject,
        logger
    )
-
+    plain_body: str = None
-    body = None
+    formatted_body: str = None
    full_body = None
    counter = 0
    files = []
    first_mime_non_multipart_content: MIMEText = None
    # Cycle through all MIME parts in the email extracting the plain and formatted messages
    # Algorithm uses the first text parts found as the actual email content and subsequent text parts
    # are made into attachments so they do not get lost
    for part in message_obj.walk():
-        if part.get_content_maintype() == 'multipart':
+        part_main_type = part.get_content_maintype()
        if part_main_type == 'multipart':
            continue
-        # See email.message_obj.Message.get_filename()
+        if part.get_content_disposition() in ['inline', 'attachment']:
-        plain_body, formatted_body = extract_part_data(part, counter, ticket_id, files, logger)
+            process_as_attachment(part, counter, files, logger)
-        if plain_body:
+        else:
-            body = plain_body
+            # Get the content then assign to plain for formatted email message otherwise store the content as an attachment
-        if formatted_body:
+            mime_content = extract_mime_content(part)
-            full_body = formatted_body
+            if first_mime_non_multipart_content is None:
                first_mime_non_multipart_content = mime_content
            if part_main_type == 'text':
                # Could be the body of the email
                part_sub_type = part.get_content_subtype()
                if plain_body is None and part_sub_type == "plain":
                    plain_body = mime_content
                elif formatted_body is None and part_sub_type == "html":
                    formatted_body = mime_content
                    if "<body" not in formatted_body:
                        email_body = f"<body>{formatted_body}</body>"
                    else:
                        email_body = formatted_body
                    payload = (
                        '<html>'
                        '<head>'
                        '<meta charset="utf-8" />'
                        '</head>'
                        '%s'
                        '</html>'
                    ) % email_body
                    files.append(
                        SimpleUploadedFile(
                            HTML_EMAIL_ATTACHMENT_FILENAME, payload.encode("utf-8"), 'text/html')
                    )
                else:
                    # Theoretically should not happen to properly structured emails but process anything else as an attachment
                    process_as_attachment(part, counter, files, logger)
                    logger.debug(f"Text MIME part added as attachment: {part.get_content_type()}")
            else:
                # process anything else as an attachment
                process_as_attachment(part, counter, files, logger)
        counter += 1
-    if not body:
+    # Check if we have at least the plain body
-        body, full_body = attempt_body_extract_from_html(message_obj)
+    if not plain_body:
        if formatted_body:
            # We have a formatted body but no plain text body
            plain_body, _x = attempt_body_extract_from_html(formatted_body)
        else:
            # Something wrong with email or a processing issue so try first part or save full email message
            if first_mime_non_multipart_content:
                plain_body = extract_email_message(first_mime_non_multipart_content, True, True)
            else:
                plain_body = message
    # first message in thread, we save full body to avoid losing forwards and things like that
    include_chained_msgs = True if ticket_id is None and getattr(django_settings, 'HELPDESK_FULL_FIRST_MESSAGE_FROM_EMAIL', False) else False
    message_body = extract_email_message(plain_body, True, include_chained_msgs)
-    add_file_if_always_save_incoming_email_message(files, message_obj)
+    # Only need the full message if the message_body excludes the chained messages
    chained_email_message = None if include_chained_msgs else plain_body
    # Not sure this is valid but a unit test uses a DIFFERENT plain text to html text body
    # where plain text has blank message with forwarded message so.... hack away to support it
    if message_body is not None and len(message_body) == 0 and formatted_body and len(formatted_body) > 0:
        message_body, _x = attempt_body_extract_from_html(formatted_body)
        # Set the chained message to the orignal plain text full message so it is stored in a FollowUp comments field
        if len(plain_body) > 0:
            chained_email_message = plain_body
    add_file_if_always_save_incoming_email_message(files, message)
    smtp_priority = message_obj.get('priority', '')
    smtp_importance = message_obj.get('importance', '')
@ -897,8 +918,8 @@ def object_from_message(message: str,
        smtp_priority, smtp_importance} else 3
    payload = {
-        'body': body,
+        'body': message_body,
-        'full_body': full_body or body,
+        'full_body': chained_email_message,
        'subject': subject,
        'queue': queue,
        'sender_email': sender_email,