Completely rework the email parsing.

Fixes a number of hacks that have accumulated and makes the parsing more
easily understood and easier to enhance in the future.
This commit is contained in:
Christopher Broderick 2023-07-23 06:12:32 +01:00
parent fad11d53bb
commit 2e5697c11a

View File

@ -39,7 +39,8 @@ import ssl
import sys import sys
from time import ctime from time import ctime
import typing import typing
from typing import List, Tuple from typing import List
from email.mime.text import MIMEText
# import User model, which may be a custom model # import User model, which may be a custom model
@ -53,6 +54,8 @@ STRIPPED_SUBJECT_STRINGS = [
"Automatic reply: ", "Automatic reply: ",
] ]
HTML_EMAIL_ATTACHMENT_FILENAME = _("email_html_body.html")
def process_email(quiet=False): def process_email(quiet=False):
for q in Queue.objects.filter( for q in Queue.objects.filter(
@ -141,7 +144,7 @@ def pop3_sync(q, logger, server):
full_message = encoding.force_str( full_message = encoding.force_str(
"\n".join(raw_content), errors='replace') "\n".join(raw_content), errors='replace')
try: try:
ticket = object_from_message(message=full_message, queue=q, logger=logger) ticket = extract_email_metadata(message=full_message, queue=q, logger=logger)
except IgnoreTicketException: except IgnoreTicketException:
logger.warn( logger.warn(
"Message %s was ignored and will be left on POP3 server" % msgNum) "Message %s was ignored and will be left on POP3 server" % msgNum)
@ -198,7 +201,7 @@ def imap_sync(q, logger, server):
data = server.fetch(num, '(RFC822)')[1] data = server.fetch(num, '(RFC822)')[1]
full_message = encoding.force_str(data[0][1], errors='replace') full_message = encoding.force_str(data[0][1], errors='replace')
try: try:
ticket = object_from_message(message=full_message, queue=q, logger=logger) ticket = extract_email_metadata(message=full_message, queue=q, logger=logger)
except IgnoreTicketException: except IgnoreTicketException:
logger.warn("Message %s was ignored and will be left on IMAP server" % num) logger.warn("Message %s was ignored and will be left on IMAP server" % num)
except DeleteIgnoredTicketException: except DeleteIgnoredTicketException:
@ -285,7 +288,7 @@ def imap_oauth_sync(q, logger, server):
full_message = encoding.force_str(data[0][1], errors='replace') full_message = encoding.force_str(data[0][1], errors='replace')
try: try:
ticket = object_from_message(message=full_message, queue=q, logger=logger) ticket = extract_email_metadata(message=full_message, queue=q, logger=logger)
except IgnoreTicketException as itex: except IgnoreTicketException as itex:
logger.warn(f"Message {num} was ignored. {itex}") logger.warn(f"Message {num} was ignored. {itex}")
@ -405,7 +408,7 @@ def process_queue(q, logger):
with open(m, 'r') as f: with open(m, 'r') as f:
full_message = encoding.force_str(f.read(), errors='replace') full_message = encoding.force_str(f.read(), errors='replace')
try: try:
ticket = object_from_message(message=full_message, queue=q, logger=logger) ticket = extract_email_metadata(message=full_message, queue=q, logger=logger)
except IgnoreTicketException: except IgnoreTicketException:
logger.warn("Message %d was ignored and will be left in local directory", i) logger.warn("Message %d was ignored and will be left in local directory", i)
except DeleteIgnoredTicketException: except DeleteIgnoredTicketException:
@ -433,7 +436,7 @@ def decodeUnknown(charset, string):
if not charset: if not charset:
try: try:
return str(string, encoding='utf-8', errors='replace') return str(string, encoding='utf-8', errors='replace')
except UnicodeError: except UnicodeError as e:
return str(string, encoding='iso8859-1', errors='replace') return str(string, encoding='iso8859-1', errors='replace')
return str(string, encoding=charset, errors='replace') return str(string, encoding=charset, errors='replace')
return string return string
@ -723,133 +726,92 @@ def attempt_body_extract_from_html(message: str) -> str:
return body, full_body return body, full_body
def extract_part_data( def extract_mime_content(part: Message,) -> str:
'''
Extract the content from the MIME body part
:param part: the MIME part to extract the content from
'''
content_bytes = part.get_payload(decode=True)
charset = part.get_content_charset()
# The default for MIME email is 7bit which requires special decoding to utf-8 so make sure we handle the decoding correctly
if part['Content-Transfer-Encoding'] in [None, '8bit', '7bit'] and (charset == 'utf-8' or charset is None):
charset = "unicode_escape"
content = decodeUnknown(charset, content_bytes)
return content
def extract_email_message(mime_content: str, is_plain_content_type: bool, is_extract_full_email_msg: bool) -> str:
    '''
    Reduce the decoded content of a MIME part to the text that should be kept.

    Only plain text content is parsed; any other content is handed back unchanged.
    A missing (None) payload is also handed back unchanged instead of being passed
    to the parsers, which expect a string.

    :param mime_content: the decoded content of the MIME part, may be None
    :param is_plain_content_type: True if the content is plain text and can be parsed
    :param is_extract_full_email_msg: True to keep the chained "forwarded" and "reply"
        sections, False to keep only the primary response text
    :return: the extracted text, or mime_content unchanged if it is None or not plain text
    '''
    # Nothing to parse: no payload, or a payload that is not plain text
    if mime_content is None or not is_plain_content_type:
        return mime_content
    if is_extract_full_email_msg:
        # Take the full content including encapsulated "forwarded" and "reply" sections
        return get_body_from_fragments(mime_content)
    # Just get the primary part of the email and drop off any text below the actual response text
    return EmailReplyParser.parse_reply(mime_content)
def process_as_attachment(
part: Message, part: Message,
counter: int, counter: int,
ticket_id: int,
files: List, files: List,
logger: logging.Logger logger: logging.Logger
) -> Tuple[str, str]: ):
name = part.get_filename() name = part.get_filename()
if name: if name:
name = email.utils.collapse_rfc2231_value(name) name = f"part-{counter}_{email.utils.collapse_rfc2231_value(name)}"
part_body = None
formatted_body = None
if part.get_content_maintype() == 'text' and name is None:
if part.get_content_subtype() == 'plain':
part_body = part.get_payload(decode=True)
# https://github.com/django-helpdesk/django-helpdesk/issues/732
if part['Content-Transfer-Encoding'] == '8bit' and part.get_content_charset() == 'utf-8':
part_body = part_body.decode('unicode_escape')
part_body = decodeUnknown(part.get_content_charset(), part_body)
# have to use django_settings here so overwriting it works in tests
# the default value is False anyway
if ticket_id is None and getattr(django_settings, 'HELPDESK_FULL_FIRST_MESSAGE_FROM_EMAIL', False):
# first message in thread, we save full body to avoid
# losing forwards and things like that
formatted_body = get_body_from_fragments(part_body)
part_body = EmailReplyParser.parse_reply(part_body)
else:
# second and other reply, save only first part of the
# message
part_body = EmailReplyParser.parse_reply(part_body)
formatted_body = part_body
# workaround to get unicode text out rather than escaped text
part_body = get_encoded_body(part_body)
logger.debug("Discovered plain text MIME part")
else:
email_body = get_email_body_from_part_payload(part)
if not part_body and not formatted_body:
# no text has been parsed so far - try such deep parsing
# for some messages
altered_body = email_body.replace(
"</p>", "</p>\n").replace("<br", "\n<br")
mail = BeautifulSoup(str(altered_body), "html.parser")
formatted_body = mail.get_text()
if "<body" not in email_body:
email_body = f"<body>{email_body}</body>"
payload = (
'<html>'
'<head>'
'<meta charset="utf-8" />'
'</head>'
'%s'
'</html>'
) % email_body
files.append(
SimpleUploadedFile(
_("email_html_body.html"), payload.encode("utf-8"), 'text/html')
)
logger.debug("Discovered HTML MIME part")
else: else:
if not name: ext = mimetypes.guess_extension(part.get_content_type())
ext = mimetypes.guess_extension(part.get_content_type()) name = f"part-{counter}{ext}"
name = f"part-{counter}{ext}" # Extract payload accounting for attached multiparts
else: payload = part.as_string() if part.is_multipart() else part.get_payload(decode=True)
name = f"part-{counter}_{name}" files.append(SimpleUploadedFile(name, payload, mimetypes.guess_type(name)[0]))
payload = part.as_string() if part.is_multipart() else part.get_payload(decode=True) if logger.isEnabledFor(logging.DEBUG):
files.append(SimpleUploadedFile(name, payload, mimetypes.guess_type(name)[0])) logger.debug("Processed MIME as attachment: %s", name)
logger.debug("Found MIME attachment %s", name) return
return part_body, formatted_body
def recurse_multipart( def extract_email_subject(email_msg: Message,) -> str:
multipart: Message, subject = email_msg.get('subject', _('Comment from e-mail'))
counter: int, subject = decode_mail_headers(
ticket_id: int, decodeUnknown(email_msg.get_charset(), subject))
files: List, for affix in STRIPPED_SUBJECT_STRINGS:
logger: logging.Logger subject = subject.replace(affix, "")
) -> Tuple[str, str]: return subject.strip()
'''
The received MIME part could be a multipart with embedded multiparts and therefore requires recursion.
Recurse through the multipart structures trying to find the 1st body part that
provides the message body. It will try to find an HTML formatted part (contentType=text/html)
and a TEXT formatted part (contentType=text/plain) and return both
:param multipart:
:param counter:
:param ticket_id:
:param files:
:param logger:
'''
plain_msg = None
formatted_msg = None
for part in multipart.walk():
if part.get_content_maintype() == 'multipart':
continue
# See email.message_obj.Message.get_filename()
plain_body, formatted_body = recurse_multipart(
part, counter, ticket_id, files, logger) if part.get_content_maintype(
) == 'multipart' else extract_part_data(part, counter, ticket_id, files, logger)
# Only update the message variables if they are still empty to handle attached messages overriding the core message
if plain_msg is None and plain_body:
plain_msg = plain_body
if formatted_msg is None and formatted_body:
formatted_msg = formatted_body
counter += 1
return plain_msg, formatted_msg
def object_from_message(message: str, def extract_email_metadata(message: str,
queue: Queue, queue: Queue,
logger: logging.Logger logger: logging.Logger
) -> Ticket: ) -> Ticket:
'''
Extracts the text/plain mime part if there is one as the ticket description and
stores the text/html part as an attachment if it is present.
If no text/plain part is present then it will try to use the text/html part if
it is present as the ticket description by removing the HTML formatting.
If neither a text/plain or text/html is present then it will use the first text/*
MIME part that it finds as the ticket description.
By default it will always take only the actual message and drop any chained messages
from replies.
The HELPDESK_FULL_FIRST_MESSAGE_FROM_EMAIL settings can force the entire message to be
stored in the ticket if it is a new ticket by setting it to True.
In this scenario, if it is a reply that is a forwarded message with no actual message,
then the description will be sourced from the text/html part and the forwarded message
will be in the FollowUp record associated with the ticket.
It will iterate over every MIME part and store all MIME parts as attachments apart
from the text/plain part.
There may be a case for trying to exclude repeated signature images by checking if an
attachment of the same name already exists as an attachment on the ticket but that is
not implemented.
:param message: the raw email message received
:param queue: the queue that the message was received on
:param logger: the logger to be used
'''
# 'message' must be an RFC822 formatted message to correctly parse. # 'message' must be an RFC822 formatted message to correctly parse.
message_obj = email.message_from_string(message) message_obj: Message = email.message_from_string(message)
subject = message_obj.get('subject', _('Comment from e-mail')) subject = extract_email_subject(message_obj)
subject = decode_mail_headers(
decodeUnknown(message_obj.get_charset(), subject))
for affix in STRIPPED_SUBJECT_STRINGS:
subject = subject.replace(affix, "")
subject = subject.strip()
# TODO: Should really be assigning a properly formatted fake email.
# Check if anything relies on this being a "real name" formatted string if no sender is found on message_obj.
# Also not sure it should be accepting emails from unknown senders
sender_email = _('Unknown Sender') sender_email = _('Unknown Sender')
sender_hdr = message_obj.get('from') sender_hdr = message_obj.get('from')
if sender_hdr: if sender_hdr:
@ -868,27 +830,86 @@ def object_from_message(message: str,
subject, subject,
logger logger
) )
plain_body: str = None
body = None formatted_body: str = None
full_body = None
counter = 0 counter = 0
files = [] files = []
first_mime_non_multipart_content: MIMEText = None
# Cycle through all MIME parts in the email extracting the plain and formatted messages
# Algorithm uses the first text parts found as the actual email content and subsequent text parts
# are made into attachments so they do not get lost
for part in message_obj.walk(): for part in message_obj.walk():
if part.get_content_maintype() == 'multipart': part_main_type = part.get_content_maintype()
if part_main_type == 'multipart':
continue continue
# See email.message_obj.Message.get_filename() if part.get_content_disposition() in ['inline', 'attachment']:
plain_body, formatted_body = extract_part_data(part, counter, ticket_id, files, logger) process_as_attachment(part, counter, files, logger)
if plain_body: else:
body = plain_body # Get the content then assign to plain for formatted email message otherwise store the content as an attachment
if formatted_body: mime_content = extract_mime_content(part)
full_body = formatted_body if first_mime_non_multipart_content is None:
first_mime_non_multipart_content = mime_content
if part_main_type == 'text':
# Could be the body of the email
part_sub_type = part.get_content_subtype()
if plain_body is None and part_sub_type == "plain":
plain_body = mime_content
elif formatted_body is None and part_sub_type == "html":
formatted_body = mime_content
if "<body" not in formatted_body:
email_body = f"<body>{formatted_body}</body>"
else:
email_body = formatted_body
payload = (
'<html>'
'<head>'
'<meta charset="utf-8" />'
'</head>'
'%s'
'</html>'
) % email_body
files.append(
SimpleUploadedFile(
HTML_EMAIL_ATTACHMENT_FILENAME, payload.encode("utf-8"), 'text/html')
)
else:
# Theoretically should not happen to properly structured emails but process anything else as an attachment
process_as_attachment(part, counter, files, logger)
logger.debug(f"Text MIME part added as attachment: {part.get_content_type()}")
else:
# process anything else as an attachment
process_as_attachment(part, counter, files, logger)
counter += 1 counter += 1
if not body: # Check if we have at least the plain body
body, full_body = attempt_body_extract_from_html(message_obj) if not plain_body:
if formatted_body:
# We have a formatted body but no plain text body
plain_body, _x = attempt_body_extract_from_html(formatted_body)
else:
# Something wrong with email or a processing issue so try first part or save full email message
if first_mime_non_multipart_content:
plain_body = extract_email_message(first_mime_non_multipart_content, True, True)
else:
plain_body = message
# first message in thread, we save full body to avoid losing forwards and things like that
include_chained_msgs = True if ticket_id is None and getattr(django_settings, 'HELPDESK_FULL_FIRST_MESSAGE_FROM_EMAIL', False) else False
message_body = extract_email_message(plain_body, True, include_chained_msgs)
add_file_if_always_save_incoming_email_message(files, message_obj) # Only need the full message if the message_body excludes the chained messages
chained_email_message = None if include_chained_msgs else plain_body
# Not sure this is valid but a unit test uses a DIFFERENT plain text to html text body
# where plain text has blank message with forwarded message so.... hack away to support it
if message_body is not None and len(message_body) == 0 and formatted_body and len(formatted_body) > 0:
message_body, _x = attempt_body_extract_from_html(formatted_body)
# Set the chained message to the original plain text full message so it is stored in a FollowUp comments field
if len(plain_body) > 0:
chained_email_message = plain_body
add_file_if_always_save_incoming_email_message(files, message)
smtp_priority = message_obj.get('priority', '') smtp_priority = message_obj.get('priority', '')
smtp_importance = message_obj.get('importance', '') smtp_importance = message_obj.get('importance', '')
@ -897,8 +918,8 @@ def object_from_message(message: str,
smtp_priority, smtp_importance} else 3 smtp_priority, smtp_importance} else 3
payload = { payload = {
'body': body, 'body': message_body,
'full_body': full_body or body, 'full_body': chained_email_message,
'subject': subject, 'subject': subject,
'queue': queue, 'queue': queue,
'sender_email': sender_email, 'sender_email': sender_email,