Skip to content

Commit

Permalink
Merge pull request #251 from UAlbanyArchives/bad_rtf
Browse files: browse the repository at this point in the history
Better handles RTF bodies for PST files
  • Loading branch information
gwiedeman authored May 2, 2024
2 parents 808a32e + 092b9e2 commit a8ac6d1
Showing 1 changed file with 27 additions and 9 deletions.
36 changes: 27 additions & 9 deletions mailbagit/formats/pst.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def __init__(self, args, source_parent_dir, mailbag_dir, mailbag_name, **kwargs)
self.source_parent_dir = source_parent_dir
self.companion_files = args.companion_files
log.info("Reading: " + self.path)
self.count = 0

@property
def account_data(self):
Expand All @@ -57,7 +58,7 @@ def number_of_messages(self):
count += 1
return count

def folders(self, folder, path, originalFile, errors, iteration_only=False):
def folders(self, folder, path, originalFile, iteration_only=False):
# recursive function that calls itself on any subfolders and
# returns a generator of messages
# path is the email folder path of the message, separated by "/"
Expand All @@ -69,6 +70,7 @@ def folders(self, folder, path, originalFile, errors, iteration_only=False):
yield None
continue
attachments = []
errors = []
try:
messageObj = folder.get_sub_message(index)

Expand Down Expand Up @@ -124,11 +126,28 @@ def folders(self, folder, path, originalFile, errors, iteration_only=False):
rtf_body = rtf_body[:-1]
# decode it before using DeEncapsulator
rtf_string, html_encoding, errors = format.safely_decode("HTML", rtf_body, encodings, errors)
deencapsulated_body = DeEncapsulator(rtf_string)
deencapsulated_body.deencapsulate()
html_body = deencapsulated_body.html
except:
pass

# Some sort of encoding issue can cause multiple EOF (\x1a) characters, which makes the RTF malformed
"""
eof_index = rtf_string.find('\x1a')
self.count += 1
if eof_index != -1:
print (rtf_string.count('\x1a'))
"""

try:
deencapsulated_body = DeEncapsulator(rtf_body)
deencapsulated_body.deencapsulate()
html_body, html_encoding, errors = format.safely_decode(
"HTML", deencapsulated_body.html, encodings, errors
)
# html_body = deencapsulated_body.html.decode(html_encoding)
except Exception as e:
desc = "Error parsing RTF body"
errors = common.handle_error(errors, e, desc)
except Exception as e:
desc = "Error parsing HTML or RTF body"
errors = common.handle_error(errors, e, desc)
if messageObj.plain_text_body:
encodings[len(encodings.keys()) + 1] = {
"name": "utf-8",
Expand Down Expand Up @@ -282,7 +301,7 @@ def folders(self, folder, path, originalFile, errors, iteration_only=False):
if folder.number_of_sub_folders:
for folder_index in range(folder.number_of_sub_folders):
subfolder = folder.get_sub_folder(folder_index)
yield from self.folders(subfolder, path + "/" + subfolder.name, originalFile, errors, iteration_only=iteration_only)
yield from self.folders(subfolder, path + "/" + subfolder.name, originalFile, iteration_only=iteration_only)
else:
if not iteration_only:
if not folder.number_of_sub_messages:
Expand Down Expand Up @@ -320,11 +339,10 @@ def messages(self, iteration_only=False):
pst = pypff.file()
pst.open(filePath)
root = pst.get_root_folder()
errors = []
for folder in root.sub_folders:
if folder.number_of_sub_folders:
# call recursive function to parse email folder
yield from self.folders(folder, folder.name, originalFile, errors, iteration_only=iteration_only)
yield from self.folders(folder, folder.name, originalFile, iteration_only=iteration_only)
else:
if not iteration_only:
# This is an email folder that does not contain any messages.
Expand Down

0 comments on commit a8ac6d1

Please sign in to comment.