From 92a2fe14f33bcb89b07505c5a367272cc67f2fb4 Mon Sep 17 00:00:00 2001 From: gwiedeman Date: Wed, 13 Apr 2022 14:53:37 -0400 Subject: [PATCH] make parsing EML and MBOX parts into a helper, also made error handling into a helper. Minor attachment fixes as well --- mailbag/controller.py | 4 +- mailbag/formats/eml.py | 80 +++++++++-------------------- mailbag/formats/mbox.py | 83 +++++++++--------------------- mailbag/formats/msg.py | 45 ++++++++-------- mailbag/formats/pst.py | 51 ++++++++----------- mailbag/helper.py | 110 ++++++++++++++++++++++++++++++++++++++-- 6 files changed, 196 insertions(+), 177 deletions(-) diff --git a/mailbag/controller.py b/mailbag/controller.py index 3c6cc1d..da7e8ce 100644 --- a/mailbag/controller.py +++ b/mailbag/controller.py @@ -93,9 +93,7 @@ def generate_mailbag(self): mailbag_message_id += 1 message.Mailbag_Message_ID = mailbag_message_id - # if message.Attachments is None: - # print("None") - if len(message.Attachments)>0: + if len(message.Attachments) > 0: helper.saveAttachmentOnDisk(self.args.dry_run,attachments_dir,message) # Setting up CSV data diff --git a/mailbag/formats/eml.py b/mailbag/formats/eml.py index 8ab8175..7ebecd8 100644 --- a/mailbag/formats/eml.py +++ b/mailbag/formats/eml.py @@ -1,6 +1,5 @@ import datetime import json -import traceback from os.path import join import mailbag.helper as helper import mailbox @@ -41,60 +40,29 @@ def messages(self): originalFile = helper.relativePath(self.file, filePath) attachments = [] - error = [] - stack_trace=[] + errors = {} + errors["msg"] = [] + errors["stack_trace"] = [] try: with open(filePath, 'rb') as f: msg = email.message_from_binary_file(f, policy=policy.default) try: # Parse message bodies - html_body = None - text_body = None - html_encoding = None - text_encoding = None + bodies = {} + bodies["html_body"] = None + bodies["text_body"] = None + bodies["html_encoding"] = None + bodies["text_encoding"] = None if msg.is_multipart(): for part in msg.walk(): - 
content_type = part.get_content_type() - content_disposition = part.get_content_disposition() - if content_type == "text/html" and content_disposition != "attachment": - html_encoding = part.get_charsets()[0] - html_body = part.get_payload(decode=True).decode(html_encoding) - if content_type == "text/plain" and content_disposition != "attachment": - text_encoding = part.get_charsets()[0] - text_body = part.get_payload(decode=True).decode(text_encoding) - - # Extract Attachment using walk - if part.get_content_maintype() == 'multipart': continue - if content_disposition is None: continue - try: - attachmentName = part.get_filename() - attachmentFile = part.get_payload(decode=True) - attachment = Attachment( - Name=attachmentName if attachmentName else str(len(attachments)), - File=attachmentFile, - MimeType=helper.mimeType(attachmentName) - ) - attachments.append(attachment) - except Exception as e: - log.error(e) - error.append("Error parsing attachments.") + bodies, attachments, errors = helper.parse_part(part, bodies, attachments, errors) else: - content_type = msg.get_content_type() - content_disposition = msg.get_content_disposition() - if content_type == "text/html" and content_disposition != "attachment": - html_encoding = part.get_charsets()[0] - html_body = part.get_payload(decode=True).decode(html_encoding) - if content_type == "text/plain" and content_disposition != "attachment": - text_encoding = part.get_charsets()[0] - text_body = part.get_payload(decode=True).decode(text_encoding) + bodies, attachments, errors = helper.parse_part(msg, bodies, attachments, errors) + except Exception as e: desc = "Error parsing message parts" - error_msg = desc + ": " + repr(e) - error.append(error_msg) - stack_trace.append(traceback.format_exc()) - log.error(error_msg) - + errors = helper.handle_error(errors, e, desc) # Look for message arrangement try: @@ -102,12 +70,11 @@ def messages(self): unsafePath = os.path.join(os.path.dirname(originalFile), messagePath) 
derivativesPath = helper.normalizePath(unsafePath) except Exception as e: - log.error(e) - error.append("Error reading message path from headers.") - + desc = "Error reading message path from headers" + errors = helper.handle_error(errors, e, desc) message = Email( - Error=error, + Error=errors["msg"], Message_ID=msg["message-id"].strip(), Original_File=originalFile, Message_Path=messagePath, @@ -118,23 +85,22 @@ def messages(self): Subject=msg["subject"], Content_Type=msg.get_content_type(), Headers=msg, - HTML_Body=html_body, - HTML_Encoding=html_encoding, - Text_Body=text_body, - Text_Encoding=text_encoding, + HTML_Body=bodies["html_body"], + HTML_Encoding=bodies["html_encoding"], + Text_Body=bodies["text_body"], + Text_Encoding=bodies["text_encoding"], Message=msg, Attachments=attachments, - StackTrace=stack_trace + StackTrace=errors["stack_trace"] ) except (email.errors.MessageParseError, Exception) as e: desc = 'Error parsing message' - error_msg = desc + ": " + repr(e) + errors = helper.handle_error(errors, e, desc) message = Email( - Error=error.append(error_msg), - StackTrace=stack_trace.append(traceback.format_exc()) + Error=errors["msg"], + StackTrace=errors["stack_trace"] ) - log.error(error_msg) # Move EML to new mailbag directory structure diff --git a/mailbag/formats/mbox.py b/mailbag/formats/mbox.py index 6655b22..cae5ac2 100644 --- a/mailbag/formats/mbox.py +++ b/mailbag/formats/mbox.py @@ -1,6 +1,5 @@ import email import mailbox -import traceback from structlog import get_logger from pathlib import Path @@ -8,7 +7,7 @@ import email.errors from mailbag.email_account import EmailAccount -from mailbag.models import Email,Attachment +from mailbag.models import Email, Attachment import mailbag.helper as helper log = get_logger() @@ -47,58 +46,27 @@ def messages(self): for mail in data.itervalues(): attachments = [] - error = [] - stack_trace=[] + errors = {} + errors["msg"] = [] + errors["stack_trace"] = [] try: mailObject = 
email.message_from_bytes(mail.as_bytes(),policy=email.policy.default) # Try to parse content try: - html_body = None - text_body = None - html_encoding = None - text_encoding = None + bodies = {} + bodies["html_body"] = None + bodies["text_body"] = None + bodies["html_encoding"] = None + bodies["text_encoding"] = None if mailObject.is_multipart(): for part in mailObject.walk(): - content_type = part.get_content_type() - content_disposition = part.get_content_disposition() - if content_type == "text/html" and content_disposition != "attachment": - html_encoding = part.get_charsets()[0] - html_body = part.get_payload(decode=True).decode(html_encoding) - if content_type == "text/plain" and content_disposition != "attachment": - text_encoding = part.get_charsets()[0] - text_body = part.get_payload(decode=True).decode(text_encoding) - - # Extract Attachment using walk - if part.get_content_maintype() == 'multipart': continue - if content_disposition is None: continue - try: - attachmentName = part.get_filename() - attachmentFile = part.get_payload(decode=True) - attachment = Attachment( - Name=attachmentName if attachmentName else str(len(attachments)), - File=attachmentFile, - MimeType=helper.mimeType(attachmentName) - ) - attachments.append(attachment) - except Exception as e: - log.error(e) - error.append("Error parsing attachments.") + bodies, attachments, errors = helper.parse_part(part, bodies, attachments, errors) else: - content_type = mailObject.get_content_type() - content_disposition = mailObject.get_content_disposition() - if content_type == "text/html" and content_disposition != "attachment": - html_encoding = part.get_charsets()[0] - html_body = part.get_payload(decode=True).decode(html_encoding) - if content_type == "text/plain" and content_disposition != "attachment": - text_encoding = part.get_charsets()[0] - text_body = part.get_payload(decode=True).decode(text_encoding) + bodies, attachments, errors = helper.parse_part(mailObject, bodies, attachments, 
errors) except Exception as e: - desc = "Error parsing message body" - error_msg = desc + ": " + repr(e) - error.append(error_msg) - stack_trace.append(traceback.format_exc()) - log.error(error_msg) + desc = "Error parsing message parts" + errors = helper.handle_error(errors, e, desc) # Look for message arrangement try: @@ -107,14 +75,10 @@ def messages(self): derivativesPath = helper.normalizePath(unsafePath) except Exception as e: desc = "Error reading message path from headers" - error_msg = desc + ": " + repr(e) - error.append(error_msg) - stack_trace.append(traceback.format_exc()) - log.error(error_msg) + errors = helper.handle_error(errors, e, desc) - message = Email( - Error=error, + Error=errors["msg"], Message_ID=mail['Message-ID'].strip(), Original_File=originalFile, Message_Path=messagePath, @@ -127,20 +91,19 @@ def messages(self): Subject=mail['Subject'], Content_Type=mailObject.get_content_type(), Headers=mail, - HTML_Body=html_body, - HTML_Encoding=html_encoding, - Text_Body=text_body, - Text_Encoding=text_encoding, + HTML_Body=bodies["html_body"], + HTML_Encoding=bodies["html_encoding"], + Text_Body=bodies["text_body"], + Text_Encoding=bodies["text_encoding"], Message=mailObject, Attachments=attachments, - StackTrace = stack_trace + StackTrace = errors["stack_trace"] ) except (email.errors.MessageParseError, Exception) as e: desc = 'Error parsing message' - error_msg = desc + ": " + repr(e) + errors = helper.handle_error(errors, e, desc) message = Email( - Error=error.append(error_msg), - StackTrace=stack_trace.append(traceback.format_exc()) + Error=errors["msg"], + StackTrace=errors["stack_trace"] ) - log.error(error_msg) diff --git a/mailbag/formats/msg.py b/mailbag/formats/msg.py index 3f2fd95..012edc7 100644 --- a/mailbag/formats/msg.py +++ b/mailbag/formats/msg.py @@ -4,7 +4,6 @@ from structlog import get_logger from RTFDE.deencapsulate import DeEncapsulator import email.errors -import traceback from mailbag.email_account import EmailAccount from 
mailbag.models import Email, Attachment import mailbag.helper as helper @@ -37,8 +36,9 @@ def messages(self): originalFile = helper.relativePath(self.file, filePath) attachments = [] - error = [] - stack_trace = [] + errors = {} + errors["msg"] = [] + errors["stack_trace"] = [] try: mail = extract_msg.openMsg(filePath) @@ -56,10 +56,7 @@ def messages(self): text_encoding = mail.stringEncoding except Exception as e: desc = "Error parsing message body" - error_msg = desc + ": " + repr(e) - error.append(error_msg) - stack_trace.append(traceback.format_exc()) - log.error(error_msg) + errors = helper.handle_error(errors, e, desc) # Look for message arrangement try: @@ -67,35 +64,38 @@ def messages(self): unsafePath = os.path.join(os.path.dirname(originalFile), messagePath) derivativesPath = helper.normalizePath(unsafePath) except Exception as e: - log.error(e) - error.append("Error reading message path from headers.") + desc = "Error reading message path from headers" + errors = helper.handle_error(errors, e, desc) try: for mailAttachment in mail.attachments: - if mailAttachment.longFilename: + if mailAttachment.getFilename(): + attachmentName = mailAttachment.getFilename() + elif mailAttachment.longFilename: attachmentName = mailAttachment.longFilename elif mailAttachment.shortFilename: - attachmentNames = mailAttachment.shortFilename + attachmentName = mailAttachment.shortFilename else: - attachmentNames = str(len(attachments)) + attachmentName = str(len(attachments)) + nameMsg = "No filename found for attachment " + attachmentName + \ + " in file " + str(originalFile) + log.error(nameMsg) + errors["msg"].append(nameMsg) attachment = Attachment( Name=attachmentName, File=mailAttachment.data, - MimeType=helper.mimeType(attachmentName) + MimeType=helper.guessMimeType(attachmentName) ) attachments.append(attachment) except Exception as e: desc = "Error parsing attachments" - error_msg = desc + ": " + repr(e) - error.append(error_msg) - 
stack_trace.append(traceback.format_exc()) - log.error(error_msg) + errors = helper.handle_error(errors, e, desc) message = Email( - Error = error, + Error = errors["msg"], Message_ID = mail.messageId.strip(), Original_File=originalFile, Message_Path=messagePath, @@ -116,19 +116,18 @@ def messages(self): # Doesn't look like we can feasibly get a full email.message.Message object for .msg Message=None, Attachments=attachments, - StackTrace=stack_trace + StackTrace=errors["stack_trace"] ) # Make sure the MSG file is closed mail.close() except (email.errors.MessageParseError, Exception) as e: desc = 'Error parsing message' - error_msg = desc + ": " + repr(e) + errors = helper.handle_error(errors, e, desc) message = Email( - Error=error.append(error_msg), - StackTrace=stack_trace.append(traceback.format_exc()) + Error=errors["msg"], + StackTrace=errors["stack_trace"] ) - log.error(error_msg) # Make sure the MSG file is closed mail.close() diff --git a/mailbag/formats/pst.py b/mailbag/formats/pst.py index fdf4375..e7a5950 100644 --- a/mailbag/formats/pst.py +++ b/mailbag/formats/pst.py @@ -1,11 +1,9 @@ import os, glob import mailbox -import traceback import chardet from structlog import get_logger from email import parser -import mimetypes from mailbag.email_account import EmailAccount from mailbag.models import Email, Attachment import mailbag.helper as helper @@ -47,10 +45,10 @@ def folders(self, folder, path, originalFile): path.append(folder.name) for index in range(folder.number_of_sub_messages): - stack_trace=[] - error = [] attachments = [] - + errors = {} + errors["msg"] = [] + errors["stack_trace"] = [] try: messageObj = folder.get_sub_message(index) @@ -59,10 +57,7 @@ def folders(self, folder, path, originalFile): headers = headerParser.parsestr(messageObj.transport_headers) except Exception as e: desc = "Error parsing message body" - error_msg = desc + ": " + repr(e) - error.append(error_msg) - stack_trace.append(traceback.format_exc()) - log.error(error_msg) 
+ errors = helper.handle_error(errors, e, desc) try: @@ -79,10 +74,7 @@ def folders(self, folder, path, originalFile): text_body = messageObj.plain_text_body.decode(text_encoding) except Exception as e: desc = "Error parsing message body" - error_msg = desc + ": " + repr(e) - error.append(error_msg) - stack_trace.append(traceback.format_exc()) - log.error(error_msg) + errors = helper.handle_error(errors, e, desc) # Build message and derivatives paths try: @@ -90,34 +82,36 @@ def folders(self, folder, path, originalFile): derivativesPath = helper.normalizePath(messagePath) except Exception as e: desc = "Error reading message path" - error_msg = desc + ": " + repr(e) - error.append(error_msg) - stack_trace.append(traceback.format_exc()) - log.error(error_msg) + errors = helper.handle_error(errors, e, desc) try: total_attachment_size_bytes = 0 for attachmentObj in messageObj.attachments: total_attachment_size_bytes = total_attachment_size_bytes + attachmentObj.get_size() attachment_content = attachmentObj.read_buffer(attachmentObj.get_size()) + + try: + attachmentName = attachmentObj.get_name() + except Exception as e: + attachmentName = str(len(attachments)) + desc = "No filename found for attachment " + attachmentName + \ + " in file " + str(originalFile) + errors = helper.handle_error(errors, e, desc) attachment = Attachment( - Name=attachmentObj.get_name(), + Name=attachmentName, File=attachment_content, - MimeType=helper.mimeType(attachmentObj.get_name()) + MimeType=helper.guessMimeType(attachmentName) ) attachments.append(attachment) except Exception as e: desc = "Error parsing attachments" - error_msg = desc + ": " + repr(e) - error.append(error_msg) - stack_trace.append(traceback.format_exc()) - log.error(error_msg) + errors = helper.handle_error(errors, e, desc) message = Email( - Error=error, + Error=errors["msg"], Message_ID=headers['Message-ID'].strip(), Original_File=originalFile, Message_Path=messagePath, @@ -136,17 +130,16 @@ def folders(self, 
folder, path, originalFile): Text_Encoding=text_encoding, Message=None, Attachments=attachments, - StackTrace=stack_trace + StackTrace=errors["stack_trace"] ) except (Exception) as e: desc = 'Error parsing message' - error_msg = desc + ": " + repr(e) + errors = helper.handle_error(errors, e, desc) message = Email( - Error=error.append(error_msg), - StackTrace=stack_trace.append(traceback.format_exc()) + Error=errors["msg"], + StackTrace=errors["stack_trace"] ) - log.error(error_msg) yield message diff --git a/mailbag/helper.py b/mailbag/helper.py index 2a371ec..53c134b 100644 --- a/mailbag/helper.py +++ b/mailbag/helper.py @@ -2,7 +2,9 @@ from pathlib import Path import os, shutil, glob from structlog import get_logger +from mailbag.models import Attachment import mimetypes +import traceback log = get_logger() @@ -96,12 +98,101 @@ def moveWithDirectoryStructure(dry_run, mainPath, mailbag_name, input, file): return file_new_path -def saveAttachments(part): - return (part.get_filename(),part.get_payload(decode=True)) +def handle_error(errors, exception, desc): + """ + Is called when an exception is raised in the parsers. + returns a dict of readable and full trace errors that can be appended to. 
+ + Parameters: + errors (dict): + "msg" contains a list of human readable error messages + "stack_trace" contains a list of full stack traces + exception (Exception): The exception raised + desc (String): A short human readable description of what failed + Returns: + errors (dict): + "msg" contains a list of human readable error messages + "stack_trace" contains a list of full stack traces + """ + error_msg = desc + ": " + repr(exception) + errors["msg"].append(error_msg) + errors["stack_trace"].append(traceback.format_exc()) + log.error(error_msg) + + return errors + +def parse_part(part, bodies, attachments, errors): + """ + Used for EML and MBOX parsers + Parses a part of an email message for multipart messages or a full message with a single part + + Parameters: + part (email.message.Message): + one part yielded by walk() for a multipart message, + or the whole message when it has a single part + bodies (dict): + "html_body" and "text_body" contain the decoded message bodies, or None + "html_encoding" and "text_encoding" contain the charsets used to decode them, or None + attachments (list): a list of attachment object as defined in models.py + errors (dict): + "msg" contains a list of human readable error messages + "stack_trace" contains a list of full stack traces + Returns: + bodies (dict): + "html_body" and "text_body" contain the decoded message bodies, or None + "html_encoding" and "text_encoding" contain the charsets used to decode them, or None + attachments (list): a list of attachment object as defined in models.py + errors (dict): + "msg" contains a list of human readable error messages + "stack_trace" contains a list of full stack traces + """ + content_type = part.get_content_type() + content_disposition = part.get_content_disposition() + + # Extract body + try: + if content_type == "text/html" and content_disposition != "attachment": + bodies["html_encoding"] = part.get_charsets()[0] + bodies["html_body"] = 
part.get_payload(decode=True).decode(bodies["html_encoding"]) + if content_type == "text/plain" and content_disposition != 
"attachment": + bodies["text_encoding"] = part.get_charsets()[0] + bodies["text_body"] = part.get_payload(decode=True).decode(bodies["text_encoding"]) + except Exception as e: + desc = "Error parsing message body" + errors = handle_error(errors, e, desc) + # Extract attachments + if part.get_content_maintype() == 'multipart': + pass + elif content_disposition is None: + pass + else: + try: + attachmentName = part.get_filename() + attachmentFile = part.get_payload(decode=True) + attachment = Attachment( + Name=attachmentName if attachmentName else str(len(attachments)), + File=attachmentFile, + MimeType=content_type + ) + attachments.append(attachment) + except Exception as e: + desc = "Error parsing attachments" + errors = handle_error(errors, e, desc) + + return bodies, attachments, errors def saveAttachmentOnDisk(dry_run,attachments_dir,message): - + """ + Takes an email message object and writes any attachments in the model + to the attachments subdirectory according to the mailbag spec + + Parameters: + dry_run (Boolean): Option to do a test run without writing changes + attachments_dir (Path): Path to the attachments subdirectory + message (Email): A full email message object described in models.py + """ + if not dry_run: message_attachments_dir = os.path.join(attachments_dir,str(message.Mailbag_Message_ID)) os.mkdir(message_attachments_dir) @@ -115,8 +206,17 @@ def saveAttachmentOnDisk(dry_run,attachments_dir,message): f.write(attachment.File) f.close() -def mimeType(file): - return mimetypes.guess_type(file)[0] +def guessMimeType(filename): + """ + Takes a file name and uses mimetypes to guess the mime type + + Parameters: + filename (String): Attachment filename + + Returns: + Mimetype (String) + """ + return mimetypes.guess_type(filename)[0] def normalizePath(path): # this is not sufficent yet