From afecc0c4c4b0bb1ff3493d5621ebb64425ea6a4f Mon Sep 17 00:00:00 2001
From: waltyon <waltyon@hotmail.fr>
Date: Wed, 9 Jun 2021 11:45:10 +0200
Subject: [PATCH] Improve MsgParser analyzer by adding attachments and IOCs as
 observables

* Refactor the module to use external libraries
* Extract attachments, import them as observables and list them with their hashes (md5, sha1, sha256)
* Search for possible IOCs in the mail (IPs, hashes, URLs, email addresses, ...)
* The code structure was inspired by the EmlParser analyzer
---
 analyzers/MsgParser/Msg_Parser.json           |   6 +-
 analyzers/MsgParser/lib/__init__.py           |   1 -
 analyzers/MsgParser/lib/msgParser.py          | 300 ------------------
 analyzers/MsgParser/parse.py                  | 118 +++++--
 analyzers/MsgParser/requirements.txt          |   3 +
 thehive-templates/Msg_Parser_3_0/short.html   |   3 -
 .../long.html                                 |  48 ++-
 thehive-templates/Msg_Parser_4_0/short.html   |   4 +
 8 files changed, 138 insertions(+), 345 deletions(-)
 delete mode 100755 analyzers/MsgParser/lib/__init__.py
 delete mode 100755 analyzers/MsgParser/lib/msgParser.py
 delete mode 100644 thehive-templates/Msg_Parser_3_0/short.html
 rename thehive-templates/{Msg_Parser_3_0 => Msg_Parser_4_0}/long.html (62%)
 create mode 100644 thehive-templates/Msg_Parser_4_0/short.html

diff --git a/analyzers/MsgParser/Msg_Parser.json b/analyzers/MsgParser/Msg_Parser.json
index 06d5acccc..905a9692d 100644
--- a/analyzers/MsgParser/Msg_Parser.json
+++ b/analyzers/MsgParser/Msg_Parser.json
@@ -1,11 +1,11 @@
 {
   "name": "Msg_Parser",
-  "version": "3.0",
-  "author": "CERT-BDF",
+  "version": "4.0",
+  "author": "Waltyon",
   "url": "https://github.com/TheHive-Project/Cortex-Analyzers",
   "license": "AGPL-V3",
   "description": "Parse Outlook MSG files and extract the main artifacts.",
   "dataTypeList": ["file"],
   "baseConfig": "MsgParser",
   "command": "MsgParser/parse.py"
-}
+}
\ No newline at end of file
diff --git a/analyzers/MsgParser/lib/__init__.py b/analyzers/MsgParser/lib/__init__.py
deleted file mode 100755
index 35e377ec3..000000000
--- a/analyzers/MsgParser/lib/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# __import__('pkg_resources').declare_namespace(__name__)
diff --git a/analyzers/MsgParser/lib/msgParser.py b/analyzers/MsgParser/lib/msgParser.py
deleted file mode 100755
index 00ad5356d..000000000
--- a/analyzers/MsgParser/lib/msgParser.py
+++ /dev/null
@@ -1,300 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-
-# --- LICENSE -----------------------------------------------------------------
-#
-#    Copyright 2013 Matthew Walker
-#
-#    This program is free software: you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation, either version 3 of the License, or
-#    (at your option) any later version.
-#
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-#
-#    You should have received a copy of the GNU General Public License
-#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-import json
-import os
-import sys
-import glob
-import traceback
-from email.parser import Parser as EmailParser
-import email.utils
-import olefile as OleFile
-
-
-class Attachment:
-
-    def __init__(self, msg, dir_):
-
-        # print dir_
-
-        # Get long filename
-        self.longFilename = msg._getStringStream([dir_, '__substg1.0_3707'])
-        # print  self.longFilename
-
-        # Get short filename
-        self.shortFilename = msg._getStringStream([dir_, '__substg1.0_3704'])
-
-        # Get attachment data
-        self.data = msg._getStream([dir_, '__substg1.0_37010102'])
-
-        # Get short mimeTag
-        self.mimeTag = msg._getStringStream([dir_, '__substg1.0_370E'])
-
-        # Get extension
-        self.extension = msg._getStringStream([dir_, '__substg1.0_3703'])
-
-    def save(self):
-        # Use long filename as first preference
-        filename = self.longFilename
-
-        # Otherwise use the short filename
-        if filename is None:
-            filename = self.shortFilename
-        # Otherwise just make something up!
-        if filename is None:
-            import random
-            import string
-            filename = 'UnknownFilename ' + \
-                       ''.join(random.choice(string.ascii_uppercase + string.digits)
-                               for _ in range(5)) + ".bin"
-            #f = open("/tmp/" + filename, 'wb')
-            # if self.data is None:
-            #f.write(("Pas de PJ"))
-            # f.close()
-            # else:
-            # f.write((self.data))
-            # f.close()
-            # return filename
-
-
-def windowsUnicode(string):
-    if string is None:
-        return None
-    if sys.version_info[0] >= 3:  # Python 3
-        return str(string, 'utf_16_le')
-    else:  # Python 2
-        return unicode(string, 'utf_16_le')
-
-
-class Message(OleFile.OleFileIO):
-
-    def __init__(self, filename):
-        OleFile.OleFileIO.__init__(self, filename)
-
-    def _getStream(self, filename):
-        if self.exists(filename):
-            stream = self.openstream(filename)
-            return stream.read()
-        else:
-            return None
-
-    def _getStringStream(self, filename, prefer='unicode'):
-        """Gets a string representation of the requested filename.
-        Checks for both ASCII and Unicode representations and returns
-        a value if possible.  If there are both ASCII and Unicode
-        versions, then the parameter /prefer/ specifies which will be
-        returned.
-        """
-
-        if isinstance(filename, list):
-            # Join with slashes to make it easier to append the type
-            filename = "/".join(filename)
-
-        asciiVersion = self._getStream(filename + '001E')
-        unicodeVersion = windowsUnicode(self._getStream(filename + '001F'))
-        if asciiVersion is None:
-            return unicodeVersion
-        elif unicodeVersion is None:
-            return asciiVersion.decode('ascii', 'ignore')
-        else:
-            if prefer == 'unicode':
-                return unicodeVersion
-            else:
-                return asciiVersion.decode('ascii', 'ignore')
-
-    @property
-    def subject(self):
-        return self._getStringStream('__substg1.0_0037')
-
-    @property
-    def header(self):
-        try:
-            return self._header
-        except Exception:
-            headerText = self._getStringStream('__substg1.0_007D')
-            if headerText is not None:
-                self._header = EmailParser().parsestr(headerText)
-            else:
-                self._header = None
-            return self._header
-
-    @property
-    def date(self):
-        # Get the message's header and extract the date
-        if self.header is None:
-            return None
-        else:
-            return self.header['date']
-
-    @property
-    def parsedDate(self):
-        return email.utils.parsedate(self.date)
-
-    @property
-    def attachments(self):
-        try:
-            return self._attachments
-        except Exception:
-            # Get the attachments
-            attachmentDirs = []
-
-            for dir_ in self.listdir():
-                if dir_[0].startswith('__attach') and dir_[0] not in attachmentDirs:
-                    attachmentDirs.append(dir_[0])
-
-            self._attachments = []
-
-            for attachmentDir in attachmentDirs:
-                self._attachments.append(Attachment(self, attachmentDir))
-
-            return self._attachments
-
-    @property
-    def sender(self):
-        try:
-            return self._sender
-        except Exception:
-            # Check header first
-            if self.header is not None:
-                headerResult = self.header["from"]
-                if headerResult is not None:
-                    self._sender = headerResult
-                    return headerResult
-
-            # Extract from other fields
-            text = self._getStringStream('__substg1.0_0C1A')
-            email = self._getStringStream('__substg1.0_0C1F')
-            result = None
-            if text is None:
-                result = email
-            else:
-                result = text
-                if email is not None:
-                    result = result + " <" + email + ">"
-
-            self._sender = result
-            return result
-
-    @property
-    def to(self):
-        try:
-            return self._to
-        except Exception:
-            # Check header first
-            if self.header is not None:
-                headerResult = self.header["to"]
-                if headerResult is not None:
-                    self._to = headerResult
-                    return headerResult
-
-            # Extract from other fields
-            # TODO: This should really extract data from the recip folders,
-            # but how do you know which is to/cc/bcc?
-            display = self._getStringStream('__substg1.0_0E04')
-            self._to = display
-            return display
-
-    @property
-    def cc(self):
-        try:
-            return self._cc
-        except Exception:
-            # Check header first
-            if self.header is not None:
-                headerResult = self.header["cc"]
-                if headerResult is not None:
-                    self._cc = headerResult
-                    return headerResult
-
-            # Extract from other fields
-            # TODO: This should really extract data from the recip folders,
-            # but how do you know which is to/cc/bcc?
-            display = self._getStringStream('__substg1.0_0E03')
-            self._cc = display
-            return display
-
-    @property
-    def body(self):
-        return self._getStringStream('__substg1.0_1000')
-
-    @property
-    def sujet(self):
-        return self._getStringStream('__substg1.0_0037')
-
-    @property
-    def recupar(self):
-        return self._getStringStream('__substg1.0_0040')
-
-    @property
-    def nomaffichefrom(self):
-        return self._getStringStream('__substg1.0_0042')
-
-    @property
-    def Recupar(self):
-        return self._getStringStream('__substg1.0_0044')
-
-    @property
-    def Lesender(self):
-        return self._getStringStream('__substg1.0_0065')
-
-    @property
-    def lobjet(self):
-        return self._getStringStream('__substg1.0_0070')
-
-    @property
-    def lentete(self):
-        return self._getStringStream('__substg1.0_007d')
-
-    @property
-    def bcc(self):
-        return self._getStringStream('__substg1.0_0E02')
-
-    @property
-    def displayto(self):
-        return self._getStringStream('__substg1.0_0E04')
-
-    def dump(self):
-        # Prints out a summary of the message
-        print('Message')
-        print('Subject:', self.subject)
-        print('Date:', self.date)
-        print('Body:')
-        print(self.body)
-        print('Recu par: ', self.recupar)
-        print('Nom affiche dans le from: %s' % self.nomaffichefrom)
-        print('Le sender: ', self.Lesender)
-        print('lobjet: ', self.lobjet)
-        print('lentete: ', self.lentete)
-        print('bcc: ', self.bcc)
-        print('display to: ', self.displayto)
-
-    def getReport(self):
-        result = {"subject": self.subject, "date": self.date, "receivers": self.recupar, "displayFrom": self.nomaffichefrom,
-                  "sender": self.Lesender, "topic": self.lobjet, "bcc": self.bcc, "displayTo": self.displayto,
-                  "headers": self.lentete, "body": self.body}
-
-        attachments = []
-        for attachment in self.attachments:
-            attachments.append({"filename": attachment.longFilename,
-                                "mime": attachment.mimeTag, "extension": attachment.extension})
-
-        result["attachments"] = attachments
-
-        return result
diff --git a/analyzers/MsgParser/parse.py b/analyzers/MsgParser/parse.py
index d12c71343..de498a905 100755
--- a/analyzers/MsgParser/parse.py
+++ b/analyzers/MsgParser/parse.py
@@ -1,40 +1,116 @@
 #!/usr/bin/env python3
 # encoding: utf-8
 
-from lib.msgParser import Message
 from cortexutils.analyzer import Analyzer
-
+from outlook_msg import Message
+import iocextract
+import extract_msg
+import tempfile
+import hashlib
 
 class MsgParserAnalyzer(Analyzer):
-
+ 
     def __init__(self):
         Analyzer.__init__(self)
-
-        self.filename = self.get_param('filename', 'noname.ext')
-        self.filepath = self.get_param('file', None, 'File is missing')
+        self.filepath = self.get_param('file', None, 'File is missing')        
 
     def summary(self, raw):
         taxonomies = []
-        level = "info"
-        namespace = "MsgParser"
-        predicate = "Attachments"
-        value = "0"
 
-        if "attachments" in raw:
-            value = len(raw["attachments"])
-            taxonomies.append(self.build_taxonomy(level, namespace, predicate, value))
+        if 'attachments' in raw:
+            taxonomies.append(self.build_taxonomy('info', 'MsgParser', 'Attachments', len(raw['attachments'])))
+
+        return { 'taxonomies': taxonomies }
+
+    # @brief Builds the list of observables found in the mail to send to TheHive
+    def artifacts(self, raw):
+        artifacts = []
+        urls = list(set(iocextract.extract_urls(str(raw))))
+        ipv4s = list(set(iocextract.extract_ipv4s(str(raw))))
+        mail_addresses = list(set(iocextract.extract_emails(str(raw))))
+        hashes = list(set(iocextract.extract_hashes(str(raw))))
+        
+        # Add each saved attachment as a file observable
+        for attachment in self.attachments_paths:
+            artifacts.append(self.build_artifact('file', attachment, tlp=3))
+
+        for u in urls:
+            artifacts.append(self.build_artifact('url', str(u)))
+    
+        for i in ipv4s:
+            artifacts.append(self.build_artifact('ip', str(i)))
+    
+        for e in mail_addresses:
+            artifacts.append(self.build_artifact('mail', str(e)))
+        
+        for h in hashes:
+            artifacts.append(self.build_artifact('hash', str(h)))
+
+        # Cleanup the temporary folder
+        self.temp_dir.cleanup()
+
+        return artifacts
+    
+  
+    # @brief Returns the hexadecimal digest of the given bytes
+    # @param data_bytes: content of the file that was read
+    # @param mode: name of the hash algorithm passed to hashlib.new (default: 'md5')
+    def get_hash(self, data_bytes, mode='md5'):
+        h = hashlib.new(mode)
+        h.update(data_bytes)
+        digest = h.hexdigest()
+        return digest
 
-        return {"taxonomies": taxonomies}
+    # @brief Main function to retrieve mail information and attachments
+    def parseMsg(self):
+
+        # Extract all information from the mail with extract_msg
+        msg = extract_msg.Message(self.filepath)
+
+        result = dict()
+        result['subject'] = str(msg.subject)
+        result['date'] = str(msg.date)
+        result['receivers'] = str(msg.to)
+        result['sender'] = str(msg.sender)
+        result['bcc'] = str(msg.bcc)
+        result['headers'] = str(msg.header)
+        result['body'] = str(msg.body)
+        result['MessageID'] = str(msg.messageId)
+        result['XoriginatingIP'] = str(msg.header.get('x-originating-ip'))
+        
+        result['attachments'] = list()
+
+        # Retrieve the list of attachments and save them in a temporary folder.
+        # Then, for each attachment, compute its hashes (md5, sha1, sha256)
+        self.attachments_paths = []
+        self.temp_dir = tempfile.TemporaryDirectory()
+
+        with open(self.filepath) as msg_file:
+            msg = Message(msg_file)
+        
+        for an_attachment in msg.attachments:
+            attachment_name = '{}/{}'.format(str(self.temp_dir.name), str(an_attachment.filename)) 
+            self.attachments_paths.append(attachment_name)
+            
+            with an_attachment.open() as attachment_fp, open(attachment_name, 'wb') as output_fp:
+                data = attachment_fp.read()
+                output_fp.write(data)    
+                attachment_sum_up = dict()
+                attachment_sum_up['filename'] =  attachment_name.split('/')[-1]
+                # Calculates the hash of each attachment
+                attachment_sum_up['md5'] = self.get_hash(data, 'md5')
+                attachment_sum_up['sha1'] = self.get_hash(data, 'sha1')
+                attachment_sum_up['sha256'] = self.get_hash(data, 'sha256')
+                result['attachments'].append(attachment_sum_up)
+
+        return result
 
     def run(self):
-        if self.data_type == 'file':
-            try:
-                self.report(Message(self.filepath).getReport())
-            except Exception as e:
-                self.unexpectedError(e)
+        if self.data_type == 'file':       
+            parsingResult = self.parseMsg()
+            self.report(parsingResult)
         else:
             self.notSupported()
-
-
+        
 if __name__ == '__main__':
     MsgParserAnalyzer().run()
diff --git a/analyzers/MsgParser/requirements.txt b/analyzers/MsgParser/requirements.txt
index 1a17a0ad5..227395cd9 100644
--- a/analyzers/MsgParser/requirements.txt
+++ b/analyzers/MsgParser/requirements.txt
@@ -1,2 +1,5 @@
 cortexutils
 olefile
+extract-msg
+iocextract
+outlook-msg
\ No newline at end of file
diff --git a/thehive-templates/Msg_Parser_3_0/short.html b/thehive-templates/Msg_Parser_3_0/short.html
deleted file mode 100644
index 5fc0dabfb..000000000
--- a/thehive-templates/Msg_Parser_3_0/short.html
+++ /dev/null
@@ -1,3 +0,0 @@
-<span class="label" ng-repeat="t in content.taxonomies" ng-class="{'info': 'label-info', 'safe': 'label-success', 'suspicious': 'label-warning', 'malicious':'label-danger'}[t.level]">
-    {{t.namespace}}:{{t.predicate}}="{{t.value}}"
-</span>
diff --git a/thehive-templates/Msg_Parser_3_0/long.html b/thehive-templates/Msg_Parser_4_0/long.html
similarity index 62%
rename from thehive-templates/Msg_Parser_3_0/long.html
rename to thehive-templates/Msg_Parser_4_0/long.html
index 1291294fc..6a1a62520 100644
--- a/thehive-templates/Msg_Parser_3_0/long.html
+++ b/thehive-templates/Msg_Parser_4_0/long.html
@@ -7,7 +7,6 @@
     </div>
 </div>
 
-
 <div class="panel panel-info" ng-if="success">
     <div class="panel-heading">
         Email message details
@@ -18,44 +17,58 @@
 
         <dl class="dl-horizontal">
             <dt>From</dt>
-            <dd>{{content.displayFrom}} ({{content.sender}})</dd>
+            <dd>{{content.sender}}</dd>
         </dl>
         <dl class="dl-horizontal">
             <dt>To</dt>
-            <dd>{{content.displayTo}} ({{content.receivers}})</dd>
+            <dd>{{content.receivers}}</dd>
         </dl>
         <dl class="dl-horizontal">
             <dt>Subject</dt>
             <dd>{{content.subject || '-'}}</dd>
         </dl>
         <dl class="dl-horizontal">
-            <dt>Topic</dt>
-            <dd>{{content.topic || '-'}}</dd>
+            <dt>Date</dt>
+            <dd>{{content.date || '-'}}</dd>
+        </dl>
+        <dl class="dl-horizontal">
+            <dt>X-Originating-IP</dt>
+            <dd>{{content.XoriginatingIP || '-'}}</dd>
+        </dl>
+        <dl class="dl-horizontal">
+            <dt>Message-ID</dt>
+            <dd>{{content.MessageID || '-'}}</dd>
         </dl>
         <dl class="dl-horizontal">
             <dt>Bcc</dt>
             <dd>{{content.bcc || '-'}}</dd>
         </dl>
+        
         <dl class="dl-horizontal" ng-if="content.attachments && content.attachments.length !== 0">
             <dt>Attachments</dt>
             <dd>
                 <div class="bm10">This message file includes
                     <ng-pluralize count="content.attachments.length" when="{'1': '1 attachment', 'other': '{} attachments'}"></ng-pluralize>
                 </div>
-                <table class="table table-hover">
+                <table class="table table-striped table-bordered">
                     <thead>
-                    <tr>
-                        <th>Filename</th>
-                        <th>Mime Type</th>
-                        <th>Extension</th>
-                    </tr>
+                        <tr>
+                            <th class="col-md-2">Filename</th>
+                            <th>File information</th>
+
+                        </tr>
                     </thead>
-                    <tbody>
-                    <tr ng-repeat="a in content.attachments">
-                        <td class="wrap">{{a.filename}}</td>
-                        <td class="wrap">{{a.mime}}</td>
-                        <td>{{a.extension}}</td>
-                    </tr>
+                    <tbody ng-repeat="a in content.attachments">
+                        <tr>
+                            <td rowspan=4>{{a.filename}}</td>
+                            <td>[MD5]: {{a.md5}}</td>
+                        </tr>
+                        <tr>
+                            <td>[SHA1]: {{a.sha1}}</td>
+                        </tr>
+                        <tr>
+                            <td>[SHA256]: {{a.sha256}}</td>
+                        </tr>
                     </tbody>
                 </table>
             </dd>
@@ -74,3 +87,4 @@
         </dl>
     </div>
 </div>
+
diff --git a/thehive-templates/Msg_Parser_4_0/short.html b/thehive-templates/Msg_Parser_4_0/short.html
new file mode 100644
index 000000000..41a60f314
--- /dev/null
+++ b/thehive-templates/Msg_Parser_4_0/short.html
@@ -0,0 +1,4 @@
+<span class="label" ng-repeat="t in content.taxonomies" ng-class="{'info': 
+'label-info', 'safe': 'label-success', 'suspicious': 'label-warning', 
+'malicious':'label-danger'}[t.level]">
+    {{t.namespace}}:{{t.predicate}}={{t.value}} </span>