From 8006826ddf86acfce608064d5977b5d09efd03d6 Mon Sep 17 00:00:00 2001
From: Christopher Broderick <uhurusurfa@gmail.com>
Date: Tue, 10 Oct 2023 13:41:28 +0100
Subject: [PATCH] Process email content first then focus on attachments. Use
 the EmailMessage class for enhanced processing.

---
 helpdesk/email.py | 245 ++++++++++++++++++++++++++++------------------
 1 file changed, 152 insertions(+), 93 deletions(-)

diff --git a/helpdesk/email.py b/helpdesk/email.py
index 9cdcbdc8..bccb2992 100644
--- a/helpdesk/email.py
+++ b/helpdesk/email.py
@@ -16,7 +16,8 @@ from django.db.models import Q
 from django.utils import encoding, timezone
 from django.utils.translation import gettext as _
 import email
-from email.message import Message
+from email import policy
+from email.message import EmailMessage, MIMEPart
 from email.mime.text import MIMEText
 from email.utils import getaddresses
 from email_reply_parser import EmailReplyParser
@@ -52,6 +53,7 @@ STRIPPED_SUBJECT_STRINGS = [
     "Automatic reply: ",
 ]
 
+# Allow a custom default attached email name for the HTML formatted email if one is found
 HTML_EMAIL_ATTACHMENT_FILENAME = _("email_html_body.html")
 
 
@@ -426,7 +428,7 @@ def process_queue(q, logger):
 
 
 def decodeUnknown(charset, string):
-    if type(string) is not str:
+    if string and not isinstance(string, str):
         if not charset:
             try:
                 return str(string, encoding='utf-8', errors='replace')
@@ -717,7 +719,7 @@ def attempt_body_extract_from_html(message: str) -> str:
     return body, full_body
 
 
-def extract_mime_content(part: Message,) -> str:
+def mime_content_to_string(part: EmailMessage,) -> str:
     '''
     Extract the content from the MIME body part
     :param part: the MIME part to extract the content from
@@ -732,19 +734,82 @@ def extract_mime_content(part: Message,) -> str:
     return content
 
 
-def extract_email_message(mime_content: str, is_plain_content_type: bool, is_extract_full_email_msg: bool) -> str:
-    email_content = None
+def parse_email_content(mime_content: str, is_extract_full_email_msg: bool) -> str:
     if is_extract_full_email_msg:
         # Take the full content including encapsulated "forwarded" and "reply" sections
-        email_content = get_body_from_fragments(mime_content) if is_plain_content_type else mime_content
+        return mime_content
     else:
-        # Just get the primary part of the email and drop off any text below the actually response text
-        email_content = EmailReplyParser.parse_reply(mime_content) if is_plain_content_type else mime_content
-    return email_content
+        # Just get the primary part of the email and drop off any text below the actual response text
+        return EmailReplyParser.parse_reply(mime_content)
+
+
+def extract_email_message_content(
+        part: MIMEPart,
+        files: List,
+        include_chained_msgs: bool,
+) -> (str, str):
+    '''
+    Uses the get_body() method of the email package to extract the email message content.
+    If there is an HTML version of the email message content then it is stored as an attachment.
+    If there is a plain text part then that is used for storing the email content against the ticket.
+    Otherwise if there is just an HTML part then the HTML is parsed to extract a simple text message.
+    There is special handling for the case when a multipart/related part holds the message content when
+    there are multiple attachments to the email.
+    :param part: the base email MIME part to be searched
+    :param files: any MIME parts to be attached are added to this list
+    :param include_chained_msgs: flag to indicate if the entire email message content including past replies must be extracted
+    :returns: the filtered message body and the full message body (both empty when no body part can be found)
+    '''
+    message_part = part.get_body()
+    parent_part = part
+    # Handle the possibility of a related part formatted email
+    if message_part is not None and "multipart/related" == message_part.get_content_type():
+        # We want the actual message text so try again on the related MIME part
+        parent_part = message_part
+        message_part = message_part.get_body(preferencelist=["html", "plain",])
+    if message_part is None:
+        # get_body() returns None when there is no body candidate (e.g. attachment only emails)
+        return "", ""
+    content_type = message_part.get_content_type()
+    formatted_body = None  # Retain the original content by using a secondary variable if the HTML needs wrapping
+    if "text/html" == content_type:
+        # add the HTML message as an attachment wrapping if necessary
+        mime_content = mime_content_to_string(message_part)
+        if "<body" not in mime_content:
+            formatted_body = f"<body>{mime_content}</body>"
+        if "<html" not in mime_content:
+            formatted_body = f"<html><head><meta charset=\"utf-8\" /></head>{mime_content if formatted_body is None else formatted_body}</html>"
+        files.append(
+            SimpleUploadedFile(
+                 HTML_EMAIL_ATTACHMENT_FILENAME, (mime_content if formatted_body is None else formatted_body).encode("utf-8"), 'text/html')
+        )
+        # Try to get a plain part message (compare to None as a MIME part without headers is falsy)
+        plain_message_part = parent_part.get_body(preferencelist=["plain",])
+        if plain_message_part is not None:
+            # Replace mime_content with the plain text part content
+            mime_content = mime_content_to_string(plain_message_part)
+        else:
+            # Try to constitute the HTML response as plain text
+            mime_content, _x = attempt_body_extract_from_html(mime_content if formatted_body is None else formatted_body)
+    else:
+        # Is either text/plain or some random content-type so just decode the part content and store as is
+        mime_content = mime_content_to_string(message_part)
+    # We should now have the mime content
+    filtered_body = parse_email_content(mime_content, include_chained_msgs)
+    if not filtered_body or "" == filtered_body.strip():
+        # A unit test that has a different HTML content to plain text which seems an invalid case as email
+        # tools should retain the HTML to be consistent with the plain text but manage this as a special case
+        # Try to constitute the HTML response as plain text
+        if formatted_body:
+            filtered_body, _x = attempt_body_extract_from_html(formatted_body)
+        else:
+            filtered_body = mime_content
+    # Only need the full message if the message_body excludes the chained messages
+    return filtered_body, mime_content
 
 
 def process_as_attachment(
-        part: Message,
+        part: MIMEPart,
         counter: int,
         files: List,
         logger: logging.Logger
@@ -756,14 +821,14 @@ def process_as_attachment(
         ext = mimetypes.guess_extension(part.get_content_type())
         name = f"part-{counter}{ext}"
     # Extract payload accounting for attached multiparts
-    payload = part.as_string() if part.is_multipart() else part.get_payload(decode=True)
-    files.append(SimpleUploadedFile(name, payload, mimetypes.guess_type(name)[0]))
+    payload_bytes = part.as_bytes() if part.is_multipart() else part.get_payload(decode=True)
+    files.append(SimpleUploadedFile(name, payload_bytes, mimetypes.guess_type(name)[0]))
     if logger.isEnabledFor(logging.DEBUG):
         logger.debug("Processed MIME as attachment: %s", name)
     return
 
 
-def extract_email_subject(email_msg: Message,) -> str:
+def extract_email_subject(email_msg: EmailMessage,) -> str:
     subject = email_msg.get('subject', _('Comment from e-mail'))
     subject = decode_mail_headers(
         decodeUnknown(email_msg.get_charset(), subject))
@@ -772,6 +837,62 @@ def extract_email_subject(email_msg: Message,) -> str:
     return subject.strip()
 
 
+def extract_attachments(
+        target_part: MIMEPart,
+        files: List,
+        logger: logging.Logger,
+        counter: int = 1,
+        content_parts_excluded: bool = False,
+) -> (int, bool):
+    '''
+    If the MIME part is a multipart and not identified as inline or attachment then
+    iterate over the sub parts recursively.
+    Otherwise extract the MIME part content and add it as an attachment.
+    It will recursively descend as appropriate ensuring that all parts not part of the message content
+    are added to the list of files to be attached. To cater for the possibility of text/plain and text/html parts
+    that are further down in the multipart hierarchy than the ones meant to provide the message content,
+    iterators are selectively used.
+    :param target_part: the email MIME part to be processed
+    :param files: any MIME part content or MIME parts to be attached are added to this list
+    :param logger: the logger to use for this MIME part processing
+    :param counter: the running number passed on when attaching a part; incremented after each part is attached
+    :param content_parts_excluded: the MIME part(s) that provided the message content have been excluded
+    :returns: the updated counter and a boolean indicating if the content parts have been excluded
+    '''
+    content_type = target_part.get_content_type()
+    content_maintype = target_part.get_content_maintype()
+    if "multipart" == content_maintype and target_part.get_content_disposition() not in ['inline', 'attachment']:
+        # Cycle through all MIME parts in the email extracting the attachments that were not part of the message body
+        # If this is a "related" multipart then we can use the message part excluder iterator directly
+        if "multipart/related" == content_type:
+            if content_parts_excluded:
+                # This should really never happen in a properly constructed email message but...
+                logger.warning("WARNING! Content type MIME parts have been excluded but a multipart/related has been encountered. there may be missing information in attachments.")
+            else:
+                content_parts_excluded = True
+            # Use the iterator that automatically excludes message content parts
+            for part in target_part.iter_attachments():
+                counter, content_parts_excluded = extract_attachments(part, files, logger, counter, content_parts_excluded)
+        # The iterator must be different depending on whether we have already excluded message content parts
+        else:
+            # Content part might be 1 or 2 parts but will be at same level so track finding at least 1
+            content_part_detected = False
+            for part in target_part.iter_parts():
+                if not content_parts_excluded and part.get_content_type() in ["text/plain", "text/html"]:
+                    content_part_detected = True
+                    continue
+                # Recurse into the part to process embedded parts
+                counter, content_parts_excluded = extract_attachments(part, files, logger, counter, content_parts_excluded)
+            # If we have found 1 or more content parts then flag that the content parts have been omitted
+            # to ensure that other text/* parts are attached
+            if content_part_detected:
+                content_parts_excluded = True
+    else:
+        process_as_attachment(target_part, counter, files, logger)
+        counter += 1
+    return (counter, content_parts_excluded)
+    
+
 def extract_email_metadata(message: str,
                            queue: Queue,
                            logger: logging.Logger
@@ -789,18 +910,19 @@ def extract_email_metadata(message: str,
     stored in the ticket if it is a new ticket by setting it to True.
     In this scenario, if it is a reply that is a forwarded message with no actual message,
     then the description will be sourced from the text/html part and the forwarded message
-    will be in the FollowUp record aassociated with the ticket.
+    will be in the FollowUp record associated with the ticket.
     It will iterate over every MIME part and store all MIME parts as attachments apart
     from the text/plain part.
     There may be a case for trying to exclude repeated signature images by checking if an
-    attachment of the same name already exists as an attachement on the ticket but that is
+    attachment of the same name already exists as an attachment on the ticket but that is
     not implemented.
     :param message: the raw email message received
-    :param queue: the queue that hte
+    :param queue: the queue that the message is assigned to
     :param logger: the logger to be used
     '''
     # 'message' must be an RFC822 formatted message to correctly parse.
-    message_obj: Message = email.message_from_string(message)
+    # Not sure why but policy explicitly set to default is required for any messages with attachments in them
+    message_obj: EmailMessage = email.message_from_string(message, EmailMessage, policy=policy.default)
 
     subject = extract_email_subject(message_obj)
 
@@ -822,84 +944,21 @@ def extract_email_metadata(message: str,
         subject,
         logger
     )
-    plain_body: str = None
-    formatted_body: str = None
-    counter = 0
     files = []
-    first_mime_non_multipart_content: MIMEText = None
-    # Cycle through all MIME parts in the email extracting the plain and formatted messages
-    # Algorithm uses the first text parts found as the actual email content and subsequent text parts
-    # are made into attachments so they do not get lost
-    for part in message_obj.walk():
-        part_main_type = part.get_content_maintype()
-        if part_main_type == 'multipart':
-            continue
-        if part.get_content_disposition() in ['inline', 'attachment']:
-            process_as_attachment(part, counter, files, logger)
-        else:
-            # Get the content then assign to plain for formatted email message otherwise store the
-            # content as an attachment
-            mime_content = extract_mime_content(part)
-            if first_mime_non_multipart_content is None:
-                first_mime_non_multipart_content = mime_content
-            if part_main_type == 'text':
-                # Could be the body of the email
-                part_sub_type = part.get_content_subtype()
-                if plain_body is None and part_sub_type == "plain":
-                    plain_body = mime_content
-                elif formatted_body is None and part_sub_type == "html":
-                    formatted_body = mime_content
-                    if "<body" not in formatted_body:
-                        email_body = f"<body>{formatted_body}</body>"
-                    else:
-                        email_body = formatted_body
-
-                    payload = (
-                        '<html>'
-                        '<head>'
-                        '<meta charset="utf-8" />'
-                        '</head>'
-                        '%s'
-                        '</html>'
-                    ) % email_body
-                    files.append(
-                        SimpleUploadedFile(
-                            HTML_EMAIL_ATTACHMENT_FILENAME, payload.encode("utf-8"), 'text/html')
-                    )
-                else:
-                    # Theoretically should not happen to properly structured emails but process anything
-                    # else as an attachment
-                    process_as_attachment(part, counter, files, logger)
-                    logger.debug(f"Text MIME part added as attachment: {part.get_content_type()}")
-            else:
-                # process anything else as an attachment
-                process_as_attachment(part, counter, files, logger)
-        counter += 1
-    # Check if we have at least the plain body
-    if not plain_body:
-        if formatted_body:
-            # We have a formatted body but no plain text body
-            plain_body, _x = attempt_body_extract_from_html(formatted_body)
-        else:
-            # Something wrong with email or a processing issue so try first part or save full email message
-            if first_mime_non_multipart_content:
-                plain_body = extract_email_message(first_mime_non_multipart_content, True, True)
-            else:
-                plain_body = message
     # first message in thread, we save full body to avoid losing forwards and things like that
     include_chained_msgs = True if ticket_id is None and getattr(
         django_settings, 'HELPDESK_FULL_FIRST_MESSAGE_FROM_EMAIL', False) else False
-    message_body = extract_email_message(plain_body, True, include_chained_msgs)
-    # Only need the full message if the message_body excludes the chained messages
-    chained_email_message = None if include_chained_msgs else plain_body
-    # Not sure this is valid but a unit test uses a DIFFERENT plain text to html text body
-    # where plain text has blank message with forwarded message so.... hack away to support it
-    if message_body is not None and len(message_body) == 0 and formatted_body and len(formatted_body) > 0:
-        message_body, _x = attempt_body_extract_from_html(formatted_body)
-        # Set the chained message to the orignal plain text full message so it is stored in a FollowUp comments field
-        if len(plain_body) > 0:
-            chained_email_message = plain_body
-
+    filtered_body, full_body = extract_email_message_content(message_obj, files, include_chained_msgs)
+    # If the base part is not a multipart then it will have already been processed as the body content so
+    # no need to process attachments
+    if "multipart" == message_obj.get_content_maintype():
+        # Find and attach all other parts or part contents as attachments
+        counter, content_parts_excluded = extract_attachments(message_obj, files, logger)
+        if not content_parts_excluded:
+            # Unexpected situation and may mean there is a hole in the email processing logic
+            logger.warning("Failed to exclude email content when parsing all MIME parts in the multipart. Verify that there were no text/* parts containing message content.")
+        if logger.isEnabledFor(logging.DEBUG):
+            logger.debug("Email parsed and %s attachments were found and attached.", counter)
     add_file_if_always_save_incoming_email_message(files, message)
 
     smtp_priority = message_obj.get('priority', '')
@@ -909,8 +968,8 @@ def extract_email_metadata(message: str,
         smtp_priority, smtp_importance} else 3
 
     payload = {
-        'body': message_body,
-        'full_body': chained_email_message,
+        'body': filtered_body,
+        'full_body': full_body,
         'subject': subject,
         'queue': queue,
         'sender_email': sender_email,