From afecc0c4c4b0bb1ff3493d5621ebb64425ea6a4f Mon Sep 17 00:00:00 2001 From: waltyon Date: Wed, 9 Jun 2021 11:45:10 +0200 Subject: [PATCH] Improve MsgParser analyzer adding attachment and IOC as observables * Add refactor modules by using external libraries * Explore attachments to import into observables and the lists with their hashes (md5, sha1, sha256) * Search for possible IOC in the mail (IP, Hash, url, email address..) * The code structure was inspired by EmlParser analyzer --- analyzers/MsgParser/Msg_Parser.json | 6 +- analyzers/MsgParser/lib/__init__.py | 1 - analyzers/MsgParser/lib/msgParser.py | 300 ------------------ analyzers/MsgParser/parse.py | 118 +++++-- analyzers/MsgParser/requirements.txt | 3 + thehive-templates/Msg_Parser_3_0/short.html | 3 - .../long.html | 48 ++- thehive-templates/Msg_Parser_4_0/short.html | 4 + 8 files changed, 138 insertions(+), 345 deletions(-) delete mode 100755 analyzers/MsgParser/lib/__init__.py delete mode 100755 analyzers/MsgParser/lib/msgParser.py delete mode 100644 thehive-templates/Msg_Parser_3_0/short.html rename thehive-templates/{Msg_Parser_3_0 => Msg_Parser_4_0}/long.html (62%) create mode 100644 thehive-templates/Msg_Parser_4_0/short.html diff --git a/analyzers/MsgParser/Msg_Parser.json b/analyzers/MsgParser/Msg_Parser.json index 06d5acccc..905a9692d 100644 --- a/analyzers/MsgParser/Msg_Parser.json +++ b/analyzers/MsgParser/Msg_Parser.json @@ -1,11 +1,11 @@ { "name": "Msg_Parser", - "version": "3.0", - "author": "CERT-BDF", + "version": "4.0", + "author": "Waltyon", "url": "https://github.com/TheHive-Project/Cortex-Analyzers", "license": "AGPL-V3", "description": "Parse Outlook MSG files and extract the main artifacts.", "dataTypeList": ["file"], "baseConfig": "MsgParser", "command": "MsgParser/parse.py" -} +} \ No newline at end of file diff --git a/analyzers/MsgParser/lib/__init__.py b/analyzers/MsgParser/lib/__init__.py deleted file mode 100755 index 35e377ec3..000000000 --- a/analyzers/MsgParser/lib/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# __import__('pkg_resources').declare_namespace(__name__) diff --git a/analyzers/MsgParser/lib/msgParser.py b/analyzers/MsgParser/lib/msgParser.py deleted file mode 100755 index 00ad5356d..000000000 --- a/analyzers/MsgParser/lib/msgParser.py +++ /dev/null @@ -1,300 +0,0 @@ -#!/usr/bin/env python -# encoding: utf-8 - -# --- LICENSE ----------------------------------------------------------------- -# -# Copyright 2013 Matthew Walker -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -import json -import os -import sys -import glob -import traceback -from email.parser import Parser as EmailParser -import email.utils -import olefile as OleFile - - -class Attachment: - - def __init__(self, msg, dir_): - - # print dir_ - - # Get long filename - self.longFilename = msg._getStringStream([dir_, '__substg1.0_3707']) - # print self.longFilename - - # Get short filename - self.shortFilename = msg._getStringStream([dir_, '__substg1.0_3704']) - - # Get attachment data - self.data = msg._getStream([dir_, '__substg1.0_37010102']) - - # Get short mimeTag - self.mimeTag = msg._getStringStream([dir_, '__substg1.0_370E']) - - # Get extension - self.extension = msg._getStringStream([dir_, '__substg1.0_3703']) - - def save(self): - # Use long filename as first preference - filename = self.longFilename - - # Otherwise use the short filename - if filename is None: - filename = self.shortFilename - # Otherwise just make something up! - if filename is None: - import random - import string - filename = 'UnknownFilename ' + \ - ''.join(random.choice(string.ascii_uppercase + string.digits) - for _ in range(5)) + ".bin" - #f = open("/tmp/" + filename, 'wb') - # if self.data is None: - #f.write(("Pas de PJ")) - # f.close() - # else: - # f.write((self.data)) - # f.close() - # return filename - - -def windowsUnicode(string): - if string is None: - return None - if sys.version_info[0] >= 3: # Python 3 - return str(string, 'utf_16_le') - else: # Python 2 - return unicode(string, 'utf_16_le') - - -class Message(OleFile.OleFileIO): - - def __init__(self, filename): - OleFile.OleFileIO.__init__(self, filename) - - def _getStream(self, filename): - if self.exists(filename): - stream = self.openstream(filename) - return stream.read() - else: - return None - - def _getStringStream(self, filename, prefer='unicode'): - """Gets a string representation of the requested filename. - Checks for both ASCII and Unicode representations and returns - a value if possible. If there are both ASCII and Unicode - versions, then the parameter /prefer/ specifies which will be - returned. - """ - - if isinstance(filename, list): - # Join with slashes to make it easier to append the type - filename = "/".join(filename) - - asciiVersion = self._getStream(filename + '001E') - unicodeVersion = windowsUnicode(self._getStream(filename + '001F')) - if asciiVersion is None: - return unicodeVersion - elif unicodeVersion is None: - return asciiVersion.decode('ascii', 'ignore') - else: - if prefer == 'unicode': - return unicodeVersion - else: - return asciiVersion.decode('ascii', 'ignore') - - @property - def subject(self): - return self._getStringStream('__substg1.0_0037') - - @property - def header(self): - try: - return self._header - except Exception: - headerText = self._getStringStream('__substg1.0_007D') - if headerText is not None: - self._header = EmailParser().parsestr(headerText) - else: - self._header = None - return self._header - - @property - def date(self): - # Get the message's header and extract the date - if self.header is None: - return None - else: - return self.header['date'] - - @property - def parsedDate(self): - return email.utils.parsedate(self.date) - - @property - def attachments(self): - try: - return self._attachments - except Exception: - # Get the attachments - attachmentDirs = [] - - for dir_ in self.listdir(): - if dir_[0].startswith('__attach') and dir_[0] not in attachmentDirs: - attachmentDirs.append(dir_[0]) - - self._attachments = [] - - for attachmentDir in attachmentDirs: - self._attachments.append(Attachment(self, attachmentDir)) - - return self._attachments - - @property - def sender(self): - try: - return self._sender - except Exception: - # Check header first - if self.header is not None: - headerResult = self.header["from"] - if headerResult is not None: - self._sender = headerResult - return headerResult - - # Extract from other fields - text = self._getStringStream('__substg1.0_0C1A') - email = self._getStringStream('__substg1.0_0C1F') - result = None - if text is None: - result = email - else: - result = text - if email is not None: - result = result + " <" + email + ">" - - self._sender = result - return result - - @property - def to(self): - try: - return self._to - except Exception: - # Check header first - if self.header is not None: - headerResult = self.header["to"] - if headerResult is not None: - self._to = headerResult - return headerResult - - # Extract from other fields - # TODO: This should really extract data from the recip folders, - # but how do you know which is to/cc/bcc? - display = self._getStringStream('__substg1.0_0E04') - self._to = display - return display - - @property - def cc(self): - try: - return self._cc - except Exception: - # Check header first - if self.header is not None: - headerResult = self.header["cc"] - if headerResult is not None: - self._cc = headerResult - return headerResult - - # Extract from other fields - # TODO: This should really extract data from the recip folders, - # but how do you know which is to/cc/bcc? - display = self._getStringStream('__substg1.0_0E03') - self._cc = display - return display - - @property - def body(self): - return self._getStringStream('__substg1.0_1000') - - @property - def sujet(self): - return self._getStringStream('__substg1.0_0037') - - @property - def recupar(self): - return self._getStringStream('__substg1.0_0040') - - @property - def nomaffichefrom(self): - return self._getStringStream('__substg1.0_0042') - - @property - def Recupar(self): - return self._getStringStream('__substg1.0_0044') - - @property - def Lesender(self): - return self._getStringStream('__substg1.0_0065') - - @property - def lobjet(self): - return self._getStringStream('__substg1.0_0070') - - @property - def lentete(self): - return self._getStringStream('__substg1.0_007d') - - @property - def bcc(self): - return self._getStringStream('__substg1.0_0E02') - - @property - def displayto(self): - return self._getStringStream('__substg1.0_0E04') - - def dump(self): - # Prints out a summary of the message - print('Message') - print('Subject:', self.subject) - print('Date:', self.date) - print('Body:') - print(self.body) - print('Recu par: ', self.recupar) - print('Nom affiche dans le from: %s' % self.nomaffichefrom) - print('Le sender: ', self.Lesender) - print('lobjet: ', self.lobjet) - print('lentete: ', self.lentete) - print('bcc: ', self.bcc) - print('display to: ', self.displayto) - - def getReport(self): - result = {"subject": self.subject, "date": self.date, "receivers": self.recupar, "displayFrom": self.nomaffichefrom, - "sender": self.Lesender, "topic": self.lobjet, "bcc": self.bcc, "displayTo": self.displayto, - "headers": self.lentete, "body": self.body} - - attachments = [] - for attachment in self.attachments: - attachments.append({"filename": attachment.longFilename, - "mime": attachment.mimeTag, "extension": attachment.extension}) - - result["attachments"] = attachments - - return result diff --git a/analyzers/MsgParser/parse.py b/analyzers/MsgParser/parse.py index d12c71343..de498a905 100755 --- a/analyzers/MsgParser/parse.py +++ b/analyzers/MsgParser/parse.py @@ -1,40 +1,116 @@ #!/usr/bin/env python3 # encoding: utf-8 -from lib.msgParser import Message from cortexutils.analyzer import Analyzer - +from outlook_msg import Message +import iocextract +import extract_msg +import tempfile +import hashlib class MsgParserAnalyzer(Analyzer): - + def __init__(self): Analyzer.__init__(self) - - self.filename = self.get_param('filename', 'noname.ext') - self.filepath = self.get_param('file', None, 'File is missing') + self.filepath = self.get_param('file', None, 'File is missing') def summary(self, raw): taxonomies = [] - level = "info" - namespace = "MsgParser" - predicate = "Attachments" - value = "0" - if "attachments" in raw: - value = len(raw["attachments"]) - taxonomies.append(self.build_taxonomy(level, namespace, predicate, value)) + if 'attachments' in raw: + taxonomies.append(self.build_taxonomy('info', 'MsgParser', 'Attachments', len(raw['attachments']))) + + return { 'taxonomies': taxonomies } + + # @brief Bringing up observables from the mail to TheHive + def artifacts(self, raw): + artifacts = [] + urls = list(set(iocextract.extract_urls(str(raw)))) + ipv4s = list(set(iocextract.extract_ipv4s(str(raw)))) + mail_addresses = list(set(iocextract.extract_emails(str(raw)))) + hashes = list(set(iocextract.extract_hashes(str(raw)))) + + # Extract each attachment to send as an observable + for attachment in self.attachments_paths: + artifacts.append(self.build_artifact('file', attachment, tlp=3)) + + for u in urls: + artifacts.append(self.build_artifact('url', str(u))) + + for i in ipv4s: + artifacts.append(self.build_artifact('ip', str(i))) + + for e in mail_addresses: + artifacts.append(self.build_artifact('mail', str(e))) + + for h in hashes: + artifacts.append(self.build_artifact('hash', str(h))) + + # Cleanup the temporary folder + self.temp_dir.cleanup() + + return artifacts + + + # @brief Returns the hash of the input file + # @param data_bytes: content of the file readed + # @param mode: Hash algorithms mode + def get_hash(self, data_bytes, mode='md5'): + h = hashlib.new(mode) + h.update(data_bytes) + digest = h.hexdigest() + return digest - return {"taxonomies": taxonomies} + # @brief Main function to retrieve mail information and attachments + def parseMsg(self): + + # Extract all information from the mail with extract_msg + msg = extract_msg.Message(self.filepath) + + result = dict() + result['subject'] = str(msg.subject) + result['date'] = str(msg.date) + result['receivers'] = str(msg.to) + result['sender'] = str(msg.sender) + result['bcc'] = str(msg.bcc) + result['headers'] = str(msg.header) + result['body'] = str(msg.body) + result['MessageID'] = str(msg.messageId) + result['XoriginatingIP'] = str(msg.header.get('x-originating-ip')) + + result['attachments'] = list() + + # Retrieves the list of attachments and saves them in a temporary folder. + # Then for each attachment, calculates the different Hash of the attachment + self.attachments_paths = [] + self.temp_dir = tempfile.TemporaryDirectory() + + with open(self.filepath) as msg_file: + msg = Message(msg_file) + + for an_attachment in msg.attachments: + attachment_name = '{}/{}'.format(str(self.temp_dir.name), str(an_attachment.filename)) + self.attachments_paths.append(attachment_name) + + with an_attachment.open() as attachment_fp, open(attachment_name, 'wb') as output_fp: + data = attachment_fp.read() + output_fp.write(data) + attachment_sum_up = dict() + attachment_sum_up['filename'] = attachment_name.split('/')[-1] + # Calculates the hash of each attachment + attachment_sum_up['md5'] = self.get_hash(data, 'md5') + attachment_sum_up['sha1'] = self.get_hash(data, 'sha1') + attachment_sum_up['sha256'] = self.get_hash(data, 'sha256') + result['attachments'].append(attachment_sum_up) + + return result def run(self): - if self.data_type == 'file': - try: - self.report(Message(self.filepath).getReport()) - except Exception as e: - self.unexpectedError(e) + if self.data_type == 'file': + parsingResult = self.parseMsg() + self.report(parsingResult) else: self.notSupported() - - + if __name__ == '__main__': MsgParserAnalyzer().run() diff --git a/analyzers/MsgParser/requirements.txt b/analyzers/MsgParser/requirements.txt index 1a17a0ad5..227395cd9 100644 --- a/analyzers/MsgParser/requirements.txt +++ b/analyzers/MsgParser/requirements.txt @@ -1,2 +1,5 @@ cortexutils olefile +extract-msg +iocextract +outlook-msg \ No newline at end of file diff --git a/thehive-templates/Msg_Parser_3_0/short.html b/thehive-templates/Msg_Parser_3_0/short.html deleted file mode 100644 index 5fc0dabfb..000000000 --- a/thehive-templates/Msg_Parser_3_0/short.html +++ /dev/null @@ -1,3 +0,0 @@ - - {{t.namespace}}:{{t.predicate}}="{{t.value}}" - diff --git a/thehive-templates/Msg_Parser_3_0/long.html b/thehive-templates/Msg_Parser_4_0/long.html similarity index 62% rename from thehive-templates/Msg_Parser_3_0/long.html rename to thehive-templates/Msg_Parser_4_0/long.html index 1291294fc..6a1a62520 100644 --- a/thehive-templates/Msg_Parser_3_0/long.html +++ b/thehive-templates/Msg_Parser_4_0/long.html @@ -7,7 +7,6 @@ -
Email message details @@ -18,44 +17,58 @@
From
-
{{content.displayFrom}} ({{content.sender}})
+
{{content.sender}}
To
-
{{content.displayTo}} ({{content.receivers}})
+
{{content.receivers}}
Subject
{{content.subject || '-'}}
-
Topic
-
{{content.topic || '-'}}
+
Date
+
{{content.date || '-'}}
+
+
+
X-Originating-IP
+
{{content.XoriginatingIP || '-'}}
+
+
+
Message-ID
+
{{content.MessageID || '-'}}
Bcc
{{content.bcc || '-'}}
+
Attachments
This message file includes
- +
- - - - - + + + + + - - - - - - + + + + + + + + + + +
FilenameMime TypeExtension
FilenameFile information
{{a.filename}}{{a.mime}}{{a.extension}}
{{a.filename}}[MD5]: {{a.md5}}
[SHA1]: {{a.sha1}}
[SHA256]: {{a.sha256}}
@@ -74,3 +87,4 @@
+ diff --git a/thehive-templates/Msg_Parser_4_0/short.html b/thehive-templates/Msg_Parser_4_0/short.html new file mode 100644 index 000000000..41a60f314 --- /dev/null +++ b/thehive-templates/Msg_Parser_4_0/short.html @@ -0,0 +1,4 @@ + + {{t.namespace}}:{{t.predicate}}={{t.value}}