Skip to content

Commit

Permalink
make parsing EML and MBOX parts into a helper, also made error handli…
Browse files Browse the repository at this point in the history
…ng into a helper. Minor attachment fixes as well
  • Loading branch information
gwiedeman committed Apr 13, 2022
1 parent 4ca0c7d commit 92a2fe1
Show file tree
Hide file tree
Showing 6 changed files with 196 additions and 176 deletions.
4 changes: 1 addition & 3 deletions mailbag/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,7 @@ def generate_mailbag(self):
mailbag_message_id += 1
message.Mailbag_Message_ID = mailbag_message_id

# if message.Attachments is None:
# print("None")
if len(message.Attachments)>0:
if len(message.Attachments) > 0:
helper.saveAttachmentOnDisk(self.args.dry_run,attachments_dir,message)

# Setting up CSV data
Expand Down
80 changes: 23 additions & 57 deletions mailbag/formats/eml.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import datetime
import json
import traceback
from os.path import join
import mailbag.helper as helper
import mailbox
Expand Down Expand Up @@ -41,73 +40,41 @@ def messages(self):
originalFile = helper.relativePath(self.file, filePath)

attachments = []
error = []
stack_trace=[]
errors = {}
errors["msg"] = []
errors["stack_trace"] = []
try:
with open(filePath, 'rb') as f:
msg = email.message_from_binary_file(f, policy=policy.default)

try:
# Parse message bodies
html_body = None
text_body = None
html_encoding = None
text_encoding = None
bodies = {}
bodies["html_body"] = None
bodies["text_body"] = None
bodies["html_encoding"] = None
bodies["text_encoding"] = None
if msg.is_multipart():
for part in msg.walk():
content_type = part.get_content_type()
content_disposition = part.get_content_disposition()
if content_type == "text/html" and content_disposition != "attachment":
html_encoding = part.get_charsets()[0]
html_body = part.get_payload(decode=True).decode(html_encoding)
if content_type == "text/plain" and content_disposition != "attachment":
text_encoding = part.get_charsets()[0]
text_body = part.get_payload(decode=True).decode(text_encoding)

# Extract Attachment using walk
if part.get_content_maintype() == 'multipart': continue
if content_disposition is None: continue
try:
attachmentName = part.get_filename()
attachmentFile = part.get_payload(decode=True)
attachment = Attachment(
Name=attachmentName if attachmentName else str(len(attachments)),
File=attachmentFile,
MimeType=helper.mimeType(attachmentName)
)
attachments.append(attachment)
except Exception as e:
log.error(e)
error.append("Error parsing attachments.")
bodies, attachments, errors = helper.parse_part(part, bodies, attachments, errors)
else:
content_type = msg.get_content_type()
content_disposition = msg.get_content_disposition()
if content_type == "text/html" and content_disposition != "attachment":
html_encoding = part.get_charsets()[0]
html_body = part.get_payload(decode=True).decode(html_encoding)
if content_type == "text/plain" and content_disposition != "attachment":
text_encoding = part.get_charsets()[0]
text_body = part.get_payload(decode=True).decode(text_encoding)
bodies, attachments, errors = helper.parse_part(part, bodies, attachments, errors)

except Exception as e:
desc = "Error parsing message parts"
error_msg = desc + ": " + repr(e)
error.append(error_msg)
stack_trace.append(traceback.format_exc())
log.error(error_msg)

errors = helper.handle_error(errors, e, desc)

# Look for message arrangement
try:
messagePath = helper.messagePath(msg)
unsafePath = os.path.join(os.path.dirname(originalFile), messagePath)
derivativesPath = helper.normalizePath(unsafePath)
except Exception as e:
log.error(e)
error.append("Error reading message path from headers.")

desc = "Error reading message path from headers"
errors = helper.handle_error(errors, e, desc)

message = Email(
Error=error,
Error=errors["msg"],
Message_ID=msg["message-id"].strip(),
Original_File=originalFile,
Message_Path=messagePath,
Expand All @@ -118,23 +85,22 @@ def messages(self):
Subject=msg["subject"],
Content_Type=msg.get_content_type(),
Headers=msg,
HTML_Body=html_body,
HTML_Encoding=html_encoding,
Text_Body=text_body,
Text_Encoding=text_encoding,
HTML_Body=bodies["html_body"],
HTML_Encoding=bodies["html_encoding"],
Text_Body=bodies["text_body"],
Text_Encoding=bodies["text_encoding"],
Message=msg,
Attachments=attachments,
StackTrace=stack_trace
StackTrace=errors["stack_trace"]
)

except (email.errors.MessageParseError, Exception) as e:
desc = 'Error parsing message'
error_msg = desc + ": " + repr(e)
errors = helper.handle_error(errors, e, desc)
message = Email(
Error=error.append(error_msg),
StackTrace=stack_trace.append(traceback.format_exc())
Error=errors["msg"],
StackTrace=errors["stack_trace"]
)
log.error(error_msg)


# Move EML to new mailbag directory structure
Expand Down
82 changes: 23 additions & 59 deletions mailbag/formats/mbox.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import email
import mailbox
import traceback

from structlog import get_logger
from pathlib import Path
import os, shutil, glob
import email.errors

from mailbag.email_account import EmailAccount
from mailbag.models import Email,Attachment
from mailbag.models import Email, Attachment
import mailbag.helper as helper

log = get_logger()
Expand Down Expand Up @@ -47,58 +46,27 @@ def messages(self):
for mail in data.itervalues():

attachments = []
error = []
stack_trace=[]
errors = {}
errors["msg"] = []
errors["stack_trace"] = []
try:
mailObject = email.message_from_bytes(mail.as_bytes(),policy=email.policy.default)

# Try to parse content
try:
html_body = None
text_body = None
html_encoding = None
text_encoding = None
bodies = {}
bodies["html_body"] = None
bodies["text_body"] = None
bodies["html_encoding"] = None
bodies["text_encoding"] = None
if mailObject.is_multipart():
for part in mailObject.walk():
content_type = part.get_content_type()
content_disposition = part.get_content_disposition()
if content_type == "text/html" and content_disposition != "attachment":
html_encoding = part.get_charsets()[0]
html_body = part.get_payload(decode=True).decode(html_encoding)
if content_type == "text/plain" and content_disposition != "attachment":
text_encoding = part.get_charsets()[0]
text_body = part.get_payload(decode=True).decode(text_encoding)

# Extract Attachment using walk
if part.get_content_maintype() == 'multipart': continue
if content_disposition is None: continue
try:
attachmentName = part.get_filename()
attachmentFile = part.get_payload(decode=True)
attachment = Attachment(
Name=attachmentName if attachmentName else str(len(attachments)),
File=attachmentFile,
MimeType=helper.mimeType(attachmentName)
)
attachments.append(attachment)
except Exception as e:
log.error(e)
error.append("Error parsing attachments.")
bodies, attachments, errors = helper.parse_part(part, bodies, attachments, errors)
else:
content_type = mailObject.get_content_type()
content_disposition = mailObject.get_content_disposition()
if content_type == "text/html" and content_disposition != "attachment":
html_encoding = part.get_charsets()[0]
html_body = part.get_payload(decode=True).decode(html_encoding)
if content_type == "text/plain" and content_disposition != "attachment":
text_encoding = part.get_charsets()[0]
text_body = part.get_payload(decode=True).decode(text_encoding)
bodies, attachments, errors = helper.parse_part(part, bodies, attachments, errors)
except Exception as e:
desc = "Error parsing message body"
error_msg = desc + ": " + repr(e)
error.append(error_msg)
stack_trace.append(traceback.format_exc())
log.error(error_msg)
desc = "Error parsing message parts"
errors = helper.handle_error(errors, e, desc)

# Look for message arrangement
try:
Expand All @@ -107,14 +75,10 @@ def messages(self):
derivativesPath = helper.normalizePath(unsafePath)
except Exception as e:
desc = "Error reading message path from headers"
error_msg = desc + ": " + repr(e)
error.append(error_msg)
stack_trace.append(traceback.format_exc())
log.error(error_msg)
errors = helper.handle_error(errors, e, desc)


message = Email(
Error=error,
Error=errors["msg"],
Message_ID=mail['Message-ID'].strip(),
Original_File=originalFile,
Message_Path=messagePath,
Expand All @@ -127,20 +91,20 @@ def messages(self):
Subject=mail['Subject'],
Content_Type=mailObject.get_content_type(),
Headers=mail,
HTML_Body=html_body,
HTML_Encoding=html_encoding,
Text_Body=text_body,
Text_Encoding=text_encoding,
HTML_Body=bodies["html_body"],
HTML_Encoding=bodies["html_encoding"],
Text_Body=bodies["text_body"],
Text_Encoding=bodies["text_encoding"],
Message=mailObject,
Attachments=attachments,
StackTrace = stack_trace
StackTrace = errors["stack_trace"]
)
except (email.errors.MessageParseError, Exception) as e:
desc = 'Error parsing message'
error_msg = desc + ": " + repr(e)
errors = helper.handle_error(errors, e, desc)
message = Email(
Error=error.append(error_msg),
StackTrace=stack_trace.append(traceback.format_exc())
Error=errors["msg"],
StackTrace=errors["stack_trace"]
)
log.error(error_msg)

Expand Down
45 changes: 22 additions & 23 deletions mailbag/formats/msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from structlog import get_logger
from RTFDE.deencapsulate import DeEncapsulator
import email.errors
import traceback
from mailbag.email_account import EmailAccount
from mailbag.models import Email, Attachment
import mailbag.helper as helper
Expand Down Expand Up @@ -37,8 +36,9 @@ def messages(self):
originalFile = helper.relativePath(self.file, filePath)

attachments = []
error = []
stack_trace = []
errors = {}
errors["msg"] = []
errors["stack_trace"] = []
try:
mail = extract_msg.openMsg(filePath)

Expand All @@ -56,46 +56,46 @@ def messages(self):
text_encoding = mail.stringEncoding
except Exception as e:
desc = "Error parsing message body"
error_msg = desc + ": " + repr(e)
error.append(error_msg)
stack_trace.append(traceback.format_exc())
log.error(error_msg)
errors = helper.handle_error(errors, e, desc)

# Look for message arrangement
try:
messagePath = helper.messagePath(mail.header)
unsafePath = os.path.join(os.path.dirname(originalFile), messagePath)
derivativesPath = helper.normalizePath(unsafePath)
except Exception as e:
log.error(e)
error.append("Error reading message path from headers.")
desc = "Error reading message path from headers"
errors = helper.handle_error(errors, e, desc)

try:
for mailAttachment in mail.attachments:
if mailAttachment.longFilename:
if mailAttachment.getFilename():

This comment has been minimized.

Copy link
@baibhavr

baibhavr Apr 14, 2022

Collaborator

I'm seeing an error because of getFilename().
[error ] Error parsing attachments: AttributeError("'Attachment' object has no attribute 'getFilename'")

attachmentName = mailAttachment.getFilename()
elif mailAttachment.longFilename:
attachmentName = mailAttachment.longFilename
elif mailAttachment.shortFilename:
attachmentNames = mailAttachment.shortFilename
attachmentName = mailAttachment.shortFilename
else:
attachmentNames = str(len(attachments))
attachmentName = str(len(attachments))
nameMsg = "No filename found for attachment " + attachmentName + \
" for message " + str(message.Mailbag_Message_ID)
log.error(nameMsg)
error.append(nameMsg)

attachment = Attachment(
Name=attachmentName,
File=mailAttachment.data,
MimeType=helper.mimeType(attachmentName)
MimeType=helper.guessMimeType(attachmentName)
)
attachments.append(attachment)

except Exception as e:
desc = "Error parsing attachments"
error_msg = desc + ": " + repr(e)
error.append(error_msg)
stack_trace.append(traceback.format_exc())
log.error(error_msg)
errors = helper.handle_error(errors, e, desc)


message = Email(
Error = error,
Error = errors["msg"],
Message_ID = mail.messageId.strip(),
Original_File=originalFile,
Message_Path=messagePath,
Expand All @@ -116,19 +116,18 @@ def messages(self):
# Doesn't look like we can feasibly get a full email.message.Message object for .msg
Message=None,
Attachments=attachments,
StackTrace=stack_trace
StackTrace=errors["stack_trace"]
)
# Make sure the MSG file is closed
mail.close()

except (email.errors.MessageParseError, Exception) as e:
desc = 'Error parsing message'
error_msg = desc + ": " + repr(e)
errors = helper.handle_error(errors, e, desc)
message = Email(
Error=error.append(error_msg),
StackTrace=stack_trace.append(traceback.format_exc())
Error=errors["msg"],
StackTrace=errors["stack_trace"]
)
log.error(error_msg)
# Make sure the MSG file is closed
mail.close()

Expand Down
Loading

0 comments on commit 92a2fe1

Please sign in to comment.