mirror of
https://github.com/django-helpdesk/django-helpdesk.git
synced 2025-01-23 14:28:55 +01:00
Completely rework the email parsing.
Fixes a number of hacks that have accumulated and makes it is more easily understood and easier to enhance in the future.
This commit is contained in:
parent
fad11d53bb
commit
2e5697c11a
@ -39,7 +39,8 @@ import ssl
|
|||||||
import sys
|
import sys
|
||||||
from time import ctime
|
from time import ctime
|
||||||
import typing
|
import typing
|
||||||
from typing import List, Tuple
|
from typing import List
|
||||||
|
from email.mime.text import MIMEText
|
||||||
|
|
||||||
|
|
||||||
# import User model, which may be a custom model
|
# import User model, which may be a custom model
|
||||||
@ -53,6 +54,8 @@ STRIPPED_SUBJECT_STRINGS = [
|
|||||||
"Automatic reply: ",
|
"Automatic reply: ",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
HTML_EMAIL_ATTACHMENT_FILENAME = _("email_html_body.html")
|
||||||
|
|
||||||
|
|
||||||
def process_email(quiet=False):
|
def process_email(quiet=False):
|
||||||
for q in Queue.objects.filter(
|
for q in Queue.objects.filter(
|
||||||
@ -141,7 +144,7 @@ def pop3_sync(q, logger, server):
|
|||||||
full_message = encoding.force_str(
|
full_message = encoding.force_str(
|
||||||
"\n".join(raw_content), errors='replace')
|
"\n".join(raw_content), errors='replace')
|
||||||
try:
|
try:
|
||||||
ticket = object_from_message(message=full_message, queue=q, logger=logger)
|
ticket = extract_email_metadata(message=full_message, queue=q, logger=logger)
|
||||||
except IgnoreTicketException:
|
except IgnoreTicketException:
|
||||||
logger.warn(
|
logger.warn(
|
||||||
"Message %s was ignored and will be left on POP3 server" % msgNum)
|
"Message %s was ignored and will be left on POP3 server" % msgNum)
|
||||||
@ -198,7 +201,7 @@ def imap_sync(q, logger, server):
|
|||||||
data = server.fetch(num, '(RFC822)')[1]
|
data = server.fetch(num, '(RFC822)')[1]
|
||||||
full_message = encoding.force_str(data[0][1], errors='replace')
|
full_message = encoding.force_str(data[0][1], errors='replace')
|
||||||
try:
|
try:
|
||||||
ticket = object_from_message(message=full_message, queue=q, logger=logger)
|
ticket = extract_email_metadata(message=full_message, queue=q, logger=logger)
|
||||||
except IgnoreTicketException:
|
except IgnoreTicketException:
|
||||||
logger.warn("Message %s was ignored and will be left on IMAP server" % num)
|
logger.warn("Message %s was ignored and will be left on IMAP server" % num)
|
||||||
except DeleteIgnoredTicketException:
|
except DeleteIgnoredTicketException:
|
||||||
@ -285,7 +288,7 @@ def imap_oauth_sync(q, logger, server):
|
|||||||
full_message = encoding.force_str(data[0][1], errors='replace')
|
full_message = encoding.force_str(data[0][1], errors='replace')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
ticket = object_from_message(message=full_message, queue=q, logger=logger)
|
ticket = extract_email_metadata(message=full_message, queue=q, logger=logger)
|
||||||
|
|
||||||
except IgnoreTicketException as itex:
|
except IgnoreTicketException as itex:
|
||||||
logger.warn(f"Message {num} was ignored. {itex}")
|
logger.warn(f"Message {num} was ignored. {itex}")
|
||||||
@ -405,7 +408,7 @@ def process_queue(q, logger):
|
|||||||
with open(m, 'r') as f:
|
with open(m, 'r') as f:
|
||||||
full_message = encoding.force_str(f.read(), errors='replace')
|
full_message = encoding.force_str(f.read(), errors='replace')
|
||||||
try:
|
try:
|
||||||
ticket = object_from_message(message=full_message, queue=q, logger=logger)
|
ticket = extract_email_metadata(message=full_message, queue=q, logger=logger)
|
||||||
except IgnoreTicketException:
|
except IgnoreTicketException:
|
||||||
logger.warn("Message %d was ignored and will be left in local directory", i)
|
logger.warn("Message %d was ignored and will be left in local directory", i)
|
||||||
except DeleteIgnoredTicketException:
|
except DeleteIgnoredTicketException:
|
||||||
@ -433,7 +436,7 @@ def decodeUnknown(charset, string):
|
|||||||
if not charset:
|
if not charset:
|
||||||
try:
|
try:
|
||||||
return str(string, encoding='utf-8', errors='replace')
|
return str(string, encoding='utf-8', errors='replace')
|
||||||
except UnicodeError:
|
except UnicodeError as e:
|
||||||
return str(string, encoding='iso8859-1', errors='replace')
|
return str(string, encoding='iso8859-1', errors='replace')
|
||||||
return str(string, encoding=charset, errors='replace')
|
return str(string, encoding=charset, errors='replace')
|
||||||
return string
|
return string
|
||||||
@ -723,133 +726,92 @@ def attempt_body_extract_from_html(message: str) -> str:
|
|||||||
return body, full_body
|
return body, full_body
|
||||||
|
|
||||||
|
|
||||||
def extract_part_data(
|
def extract_mime_content(part: Message,) -> str:
|
||||||
|
'''
|
||||||
|
Extract the content from the MIME body part
|
||||||
|
:param part: the MIME part to extract the content from
|
||||||
|
'''
|
||||||
|
content_bytes = part.get_payload(decode=True)
|
||||||
|
charset = part.get_content_charset()
|
||||||
|
# The default for MIME email is 7bit which requires special decoding to utf-8 so make sure we handle the decoding correctly
|
||||||
|
if part['Content-Transfer-Encoding'] in [None, '8bit', '7bit'] and (charset == 'utf-8' or charset is None):
|
||||||
|
charset = "unicode_escape"
|
||||||
|
content = decodeUnknown(charset, content_bytes)
|
||||||
|
return content
|
||||||
|
|
||||||
|
|
||||||
|
def extract_email_message(mime_content: str, is_plain_content_type: bool, is_extract_full_email_msg: bool) -> str:
|
||||||
|
email_content = None
|
||||||
|
if is_extract_full_email_msg:
|
||||||
|
# Take the full content including encapsulated "forwarded" and "reply" sections
|
||||||
|
email_content = get_body_from_fragments(mime_content) if is_plain_content_type else mime_content
|
||||||
|
else:
|
||||||
|
# Just get the primary part of the email and drop off any text below the actually response text
|
||||||
|
email_content = EmailReplyParser.parse_reply(mime_content) if is_plain_content_type else mime_content
|
||||||
|
return email_content
|
||||||
|
|
||||||
|
|
||||||
|
def process_as_attachment(
|
||||||
part: Message,
|
part: Message,
|
||||||
counter: int,
|
counter: int,
|
||||||
ticket_id: int,
|
|
||||||
files: List,
|
files: List,
|
||||||
logger: logging.Logger
|
logger: logging.Logger
|
||||||
) -> Tuple[str, str]:
|
):
|
||||||
name = part.get_filename()
|
name = part.get_filename()
|
||||||
if name:
|
if name:
|
||||||
name = email.utils.collapse_rfc2231_value(name)
|
name = f"part-{counter}_{email.utils.collapse_rfc2231_value(name)}"
|
||||||
part_body = None
|
|
||||||
formatted_body = None
|
|
||||||
if part.get_content_maintype() == 'text' and name is None:
|
|
||||||
if part.get_content_subtype() == 'plain':
|
|
||||||
part_body = part.get_payload(decode=True)
|
|
||||||
# https://github.com/django-helpdesk/django-helpdesk/issues/732
|
|
||||||
if part['Content-Transfer-Encoding'] == '8bit' and part.get_content_charset() == 'utf-8':
|
|
||||||
part_body = part_body.decode('unicode_escape')
|
|
||||||
part_body = decodeUnknown(part.get_content_charset(), part_body)
|
|
||||||
# have to use django_settings here so overwriting it works in tests
|
|
||||||
# the default value is False anyway
|
|
||||||
if ticket_id is None and getattr(django_settings, 'HELPDESK_FULL_FIRST_MESSAGE_FROM_EMAIL', False):
|
|
||||||
# first message in thread, we save full body to avoid
|
|
||||||
# losing forwards and things like that
|
|
||||||
formatted_body = get_body_from_fragments(part_body)
|
|
||||||
part_body = EmailReplyParser.parse_reply(part_body)
|
|
||||||
else:
|
|
||||||
# second and other reply, save only first part of the
|
|
||||||
# message
|
|
||||||
part_body = EmailReplyParser.parse_reply(part_body)
|
|
||||||
formatted_body = part_body
|
|
||||||
# workaround to get unicode text out rather than escaped text
|
|
||||||
part_body = get_encoded_body(part_body)
|
|
||||||
logger.debug("Discovered plain text MIME part")
|
|
||||||
else:
|
|
||||||
email_body = get_email_body_from_part_payload(part)
|
|
||||||
|
|
||||||
if not part_body and not formatted_body:
|
|
||||||
# no text has been parsed so far - try such deep parsing
|
|
||||||
# for some messages
|
|
||||||
altered_body = email_body.replace(
|
|
||||||
"</p>", "</p>\n").replace("<br", "\n<br")
|
|
||||||
mail = BeautifulSoup(str(altered_body), "html.parser")
|
|
||||||
formatted_body = mail.get_text()
|
|
||||||
|
|
||||||
if "<body" not in email_body:
|
|
||||||
email_body = f"<body>{email_body}</body>"
|
|
||||||
|
|
||||||
payload = (
|
|
||||||
'<html>'
|
|
||||||
'<head>'
|
|
||||||
'<meta charset="utf-8" />'
|
|
||||||
'</head>'
|
|
||||||
'%s'
|
|
||||||
'</html>'
|
|
||||||
) % email_body
|
|
||||||
files.append(
|
|
||||||
SimpleUploadedFile(
|
|
||||||
_("email_html_body.html"), payload.encode("utf-8"), 'text/html')
|
|
||||||
)
|
|
||||||
logger.debug("Discovered HTML MIME part")
|
|
||||||
else:
|
else:
|
||||||
if not name:
|
ext = mimetypes.guess_extension(part.get_content_type())
|
||||||
ext = mimetypes.guess_extension(part.get_content_type())
|
name = f"part-{counter}{ext}"
|
||||||
name = f"part-{counter}{ext}"
|
# Extract payload accounting for attached multiparts
|
||||||
else:
|
payload = part.as_string() if part.is_multipart() else part.get_payload(decode=True)
|
||||||
name = f"part-{counter}_{name}"
|
files.append(SimpleUploadedFile(name, payload, mimetypes.guess_type(name)[0]))
|
||||||
payload = part.as_string() if part.is_multipart() else part.get_payload(decode=True)
|
if logger.isEnabledFor(logging.DEBUG):
|
||||||
files.append(SimpleUploadedFile(name, payload, mimetypes.guess_type(name)[0]))
|
logger.debug("Processed MIME as attachment: %s", name)
|
||||||
logger.debug("Found MIME attachment %s", name)
|
return
|
||||||
return part_body, formatted_body
|
|
||||||
|
|
||||||
|
|
||||||
def recurse_multipart(
|
def extract_email_subject(email_msg: Message,) -> str:
|
||||||
multipart: Message,
|
subject = email_msg.get('subject', _('Comment from e-mail'))
|
||||||
counter: int,
|
subject = decode_mail_headers(
|
||||||
ticket_id: int,
|
decodeUnknown(email_msg.get_charset(), subject))
|
||||||
files: List,
|
for affix in STRIPPED_SUBJECT_STRINGS:
|
||||||
logger: logging.Logger
|
subject = subject.replace(affix, "")
|
||||||
) -> Tuple[str, str]:
|
return subject.strip()
|
||||||
'''
|
|
||||||
The received MIME part could be a multipart with embedded multiparts and therefore requires recursion.
|
|
||||||
Recurse through the multipart structures trying to find the 1st body part that
|
|
||||||
provides the message body. It will try to find an HTML formatted part (contentType=text/html)
|
|
||||||
and a TEXT formatted part (contentType=text/plain) and return both
|
|
||||||
:param multipart:
|
|
||||||
:param counter:
|
|
||||||
:param ticket_id:
|
|
||||||
:param files:
|
|
||||||
:param logger:
|
|
||||||
'''
|
|
||||||
plain_msg = None
|
|
||||||
formatted_msg = None
|
|
||||||
|
|
||||||
for part in multipart.walk():
|
|
||||||
if part.get_content_maintype() == 'multipart':
|
|
||||||
continue
|
|
||||||
# See email.message_obj.Message.get_filename()
|
|
||||||
plain_body, formatted_body = recurse_multipart(
|
|
||||||
part, counter, ticket_id, files, logger) if part.get_content_maintype(
|
|
||||||
) == 'multipart' else extract_part_data(part, counter, ticket_id, files, logger)
|
|
||||||
# Only update the message variables if they are still empty to handle attached messages overriding the core message
|
|
||||||
if plain_msg is None and plain_body:
|
|
||||||
plain_msg = plain_body
|
|
||||||
if formatted_msg is None and formatted_body:
|
|
||||||
formatted_msg = formatted_body
|
|
||||||
counter += 1
|
|
||||||
return plain_msg, formatted_msg
|
|
||||||
|
|
||||||
|
|
||||||
def object_from_message(message: str,
|
def extract_email_metadata(message: str,
|
||||||
queue: Queue,
|
queue: Queue,
|
||||||
logger: logging.Logger
|
logger: logging.Logger
|
||||||
) -> Ticket:
|
) -> Ticket:
|
||||||
|
'''
|
||||||
|
Extracts the text/plain mime part if there is one as the ticket description and
|
||||||
|
stores the text/html part as an attachment if it is present.
|
||||||
|
If no text/plain part is present then it will try to use the text/html part if
|
||||||
|
it is present as the ticket description by removing the HTML formatting.
|
||||||
|
If neither a text/plain or text/html is present then it will use the first text/*
|
||||||
|
MIME part that it finds as the ticket description.
|
||||||
|
By default it will always take only the actual message and drop any chained messages
|
||||||
|
from replies.
|
||||||
|
The HELPDESK_FULL_FIRST_MESSAGE_FROM_EMAIL settings can force the entire message to be
|
||||||
|
stored in the ticket if it is a new ticket by setting it to True.
|
||||||
|
In this scenario, if it is a reply that is a forwarded message with no actual message,
|
||||||
|
then the description will be sourced from the text/html part and the forwarded message
|
||||||
|
will be in the FollowUp record aassociated with the ticket.
|
||||||
|
It will iterate over every MIME part and store all MIME parts as attachments apart
|
||||||
|
from the text/plain part.
|
||||||
|
There may be a case for trying to exclude repeated signature images by checking if an
|
||||||
|
attachment of the same name already exists as an attachement on the ticket but that is
|
||||||
|
not implemented.
|
||||||
|
:param message: the raw email message received
|
||||||
|
:param queue: the queue that hte
|
||||||
|
:param logger: the logger to be used
|
||||||
|
'''
|
||||||
# 'message' must be an RFC822 formatted message to correctly parse.
|
# 'message' must be an RFC822 formatted message to correctly parse.
|
||||||
message_obj = email.message_from_string(message)
|
message_obj: Message = email.message_from_string(message)
|
||||||
|
|
||||||
subject = message_obj.get('subject', _('Comment from e-mail'))
|
subject = extract_email_subject(message_obj)
|
||||||
subject = decode_mail_headers(
|
|
||||||
decodeUnknown(message_obj.get_charset(), subject))
|
|
||||||
for affix in STRIPPED_SUBJECT_STRINGS:
|
|
||||||
subject = subject.replace(affix, "")
|
|
||||||
subject = subject.strip()
|
|
||||||
|
|
||||||
# TODO: Should really be assigning a properly formatted fake email.
|
|
||||||
# Check if anything relies on this being a "real name" formatted string if no sender is found on message_obj.
|
|
||||||
# Also not sure it should be accepting emails from unknown senders
|
|
||||||
sender_email = _('Unknown Sender')
|
sender_email = _('Unknown Sender')
|
||||||
sender_hdr = message_obj.get('from')
|
sender_hdr = message_obj.get('from')
|
||||||
if sender_hdr:
|
if sender_hdr:
|
||||||
@ -868,27 +830,86 @@ def object_from_message(message: str,
|
|||||||
subject,
|
subject,
|
||||||
logger
|
logger
|
||||||
)
|
)
|
||||||
|
plain_body: str = None
|
||||||
body = None
|
formatted_body: str = None
|
||||||
full_body = None
|
|
||||||
counter = 0
|
counter = 0
|
||||||
files = []
|
files = []
|
||||||
|
first_mime_non_multipart_content: MIMEText = None
|
||||||
|
|
||||||
|
# Cycle through all MIME parts in the email extracting the plain and formatted messages
|
||||||
|
# Algorithm uses the first text parts found as the actual email content and subsequent text parts
|
||||||
|
# are made into attachments so they do not get lost
|
||||||
for part in message_obj.walk():
|
for part in message_obj.walk():
|
||||||
if part.get_content_maintype() == 'multipart':
|
part_main_type = part.get_content_maintype()
|
||||||
|
if part_main_type == 'multipart':
|
||||||
continue
|
continue
|
||||||
# See email.message_obj.Message.get_filename()
|
if part.get_content_disposition() in ['inline', 'attachment']:
|
||||||
plain_body, formatted_body = extract_part_data(part, counter, ticket_id, files, logger)
|
process_as_attachment(part, counter, files, logger)
|
||||||
if plain_body:
|
else:
|
||||||
body = plain_body
|
# Get the content then assign to plain for formatted email message otherwise store the content as an attachment
|
||||||
if formatted_body:
|
mime_content = extract_mime_content(part)
|
||||||
full_body = formatted_body
|
if first_mime_non_multipart_content is None:
|
||||||
|
first_mime_non_multipart_content = mime_content
|
||||||
|
if part_main_type == 'text':
|
||||||
|
# Could be the body of the email
|
||||||
|
part_sub_type = part.get_content_subtype()
|
||||||
|
if plain_body is None and part_sub_type == "plain":
|
||||||
|
plain_body = mime_content
|
||||||
|
elif formatted_body is None and part_sub_type == "html":
|
||||||
|
formatted_body = mime_content
|
||||||
|
if "<body" not in formatted_body:
|
||||||
|
email_body = f"<body>{formatted_body}</body>"
|
||||||
|
else:
|
||||||
|
email_body = formatted_body
|
||||||
|
|
||||||
|
payload = (
|
||||||
|
'<html>'
|
||||||
|
'<head>'
|
||||||
|
'<meta charset="utf-8" />'
|
||||||
|
'</head>'
|
||||||
|
'%s'
|
||||||
|
'</html>'
|
||||||
|
) % email_body
|
||||||
|
files.append(
|
||||||
|
SimpleUploadedFile(
|
||||||
|
HTML_EMAIL_ATTACHMENT_FILENAME, payload.encode("utf-8"), 'text/html')
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Theoretically should not happen to properly structured emails but process anything else as an attachment
|
||||||
|
process_as_attachment(part, counter, files, logger)
|
||||||
|
logger.debug(f"Text MIME part added as attachment: {part.get_content_type()}")
|
||||||
|
else:
|
||||||
|
# process anything else as an attachment
|
||||||
|
process_as_attachment(part, counter, files, logger)
|
||||||
counter += 1
|
counter += 1
|
||||||
|
|
||||||
if not body:
|
# Check if we have at least the plain body
|
||||||
body, full_body = attempt_body_extract_from_html(message_obj)
|
if not plain_body:
|
||||||
|
if formatted_body:
|
||||||
|
# We have a formatted body but no plain text body
|
||||||
|
plain_body, _x = attempt_body_extract_from_html(formatted_body)
|
||||||
|
else:
|
||||||
|
# Something wrong with email or a processing issue so try first part or save full email message
|
||||||
|
if first_mime_non_multipart_content:
|
||||||
|
plain_body = extract_email_message(first_mime_non_multipart_content, True, True)
|
||||||
|
else:
|
||||||
|
plain_body = message
|
||||||
|
# first message in thread, we save full body to avoid losing forwards and things like that
|
||||||
|
include_chained_msgs = True if ticket_id is None and getattr(django_settings, 'HELPDESK_FULL_FIRST_MESSAGE_FROM_EMAIL', False) else False
|
||||||
|
message_body = extract_email_message(plain_body, True, include_chained_msgs)
|
||||||
|
|
||||||
add_file_if_always_save_incoming_email_message(files, message_obj)
|
# Only need the full message if the message_body excludes the chained messages
|
||||||
|
chained_email_message = None if include_chained_msgs else plain_body
|
||||||
|
|
||||||
|
# Not sure this is valid but a unit test uses a DIFFERENT plain text to html text body
|
||||||
|
# where plain text has blank message with forwarded message so.... hack away to support it
|
||||||
|
if message_body is not None and len(message_body) == 0 and formatted_body and len(formatted_body) > 0:
|
||||||
|
message_body, _x = attempt_body_extract_from_html(formatted_body)
|
||||||
|
# Set the chained message to the orignal plain text full message so it is stored in a FollowUp comments field
|
||||||
|
if len(plain_body) > 0:
|
||||||
|
chained_email_message = plain_body
|
||||||
|
|
||||||
|
add_file_if_always_save_incoming_email_message(files, message)
|
||||||
|
|
||||||
smtp_priority = message_obj.get('priority', '')
|
smtp_priority = message_obj.get('priority', '')
|
||||||
smtp_importance = message_obj.get('importance', '')
|
smtp_importance = message_obj.get('importance', '')
|
||||||
@ -897,8 +918,8 @@ def object_from_message(message: str,
|
|||||||
smtp_priority, smtp_importance} else 3
|
smtp_priority, smtp_importance} else 3
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
'body': body,
|
'body': message_body,
|
||||||
'full_body': full_body or body,
|
'full_body': chained_email_message,
|
||||||
'subject': subject,
|
'subject': subject,
|
||||||
'queue': queue,
|
'queue': queue,
|
||||||
'sender_email': sender_email,
|
'sender_email': sender_email,
|
||||||
|
Loading…
Reference in New Issue
Block a user