Skip to content

Commit

Permalink
fix: [logarchive] fix json parsing - needs new unifiedlogs_parser_jso…
Browse files Browse the repository at this point in the history
…n code
  • Loading branch information
cvandeplas committed Jun 12, 2024
1 parent f5632a7 commit 0cc2625
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 47 deletions.
14 changes: 7 additions & 7 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -54,27 +54,27 @@
"cwd": "${workspaceFolder}/"
},
{
"name": "Python Debugger: logarchive.py",
"name": "Python Debugger: parsing.py list all",
"type": "debugpy",
"request": "launch",
"program": "${workspaceFolder}/parsers/logarchive.py",
"args": "-i data/1/sysdiagnose_2023.08.25_15-14-14+0200_iPhone-OS_iPhone_21A5319a/system_logs.logarchive",
"program": "${workspaceFolder}/parsing.py",
"args": "list all",
"cwd": "${workspaceFolder}/"
},
{
"name": "Python Debugger: parsing.py list all",
"name": "Python Debugger: parsing.py parse demo_parser",
"type": "debugpy",
"request": "launch",
"program": "${workspaceFolder}/parsing.py",
"args": "list all",
"args": "parse demo_parser 1",
"cwd": "${workspaceFolder}/"
},
{
"name": "Python Debugger: parsing.py parse demo_parser",
"name": "Python Debugger: parsing.py parse logarchive",
"type": "debugpy",
"request": "launch",
"program": "${workspaceFolder}/parsing.py",
"args": "parse demo_parser 1",
"args": "parse logarchive 1",
"cwd": "${workspaceFolder}/"
},
{
Expand Down
70 changes: 31 additions & 39 deletions analysers/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,12 @@ def analyse_path(case_folder: str, output_file: str = 'apps.json') -> bool:
else:
apps[entry['bundle_id']]['found'].append('itunesstore')

if file_in_dir.endswith('logarchive'):
elif file_in_dir.endswith('logarchive'):
re_bundle_id_pattern = r'(([a-zA-Z0-9-_]+\.)+[a-zA-Z0-9-_]+)'
# list files in here
for file_in_logarchive_dir in os.listdir(file_in_dir):
file_in_logarchive_dir = os.path.join(file_in_dir, file_in_logarchive_dir)
# logarchive/logarchive.json is a ijson multiline json - generated by native unifiedlog parser
# logarchive/logarchive.json is a multiline json - generated by native unifiedlog parser
if file_in_logarchive_dir.endswith('logarchive.json'): # apple unified log format
print(f"Found apple logarchive.json: {file_in_logarchive_dir}")
# TODO fix the parser to ensure the same result is given for native and non-native unifiedlog parser ? or just catch it this way
Expand All @@ -96,50 +96,42 @@ def analyse_path(case_folder: str, output_file: str = 'apps.json') -> bool:
apps[entry['subsystem']] = {'found': ['logarchive']}
else:
apps[entry['subsystem']]['found'].append('logarchive')
elif file_in_logarchive_dir.endswith('dataFoundInMultipleLogFiles.json'):
# FIXME see https://github.com/mandiant/macos-UnifiedLogs/blob/main/examples/unifiedlog_parser_json/src/main.rs#L342
# Long story short:
# - invalid json, ['a']['b'] is not valid json, should be ['a', 'b']
# - or multiline json
# - potential workaround is to make a dirty function to parse, take the offset of the error, extract, parse again, and so on
# See also https://github.com/mandiant/macos-UnifiedLogs/issues/16
continue
else:
print(f"Non-native file in logarchive folder: {file_in_logarchive_dir}")
# mandiant unifiedlog parser is multiline json format
print(f"Found non-native logarchive file: {file_in_logarchive_dir}")
with open(file_in_logarchive_dir, 'r') as f:
json_data = json.load(f)
# directly going to the list of apps
for entry in json_data:
# skip empty entries
if entry['subsystem'] == '':
continue
# extract app/bundle id or process name from the subsystem field
if not re.search(r'^' + re_bundle_id_pattern + r'$', entry['subsystem']):
                            # extract foo.bar.hello from the substring if it is in that format
matches = re.findall(re_bundle_id_pattern, entry['subsystem'])
if matches:
new_term = matches[0][0]
else:
# below are not really apps...more processes.
# TODO decide if we want to keep them or not.
matches = re.findall(r'\[([a-zA-Z0-9-_]+)\]', entry['subsystem'])
for line in f: # jsonl format
entry = json.loads(line)
# skip empty entries
if entry['subsystem'] == '':
continue
# extract app/bundle id or process name from the subsystem field
if not re.search(r'^' + re_bundle_id_pattern + r'$', entry['subsystem']):
                            # extract foo.bar.hello from the substring if it is in that format
matches = re.findall(re_bundle_id_pattern, entry['subsystem'])
if matches:
new_term = matches[0]
new_term = matches[0][0]
else:
matches = re.findall(r'^([a-zA-Z0-9-_]+)$', entry['subsystem'])
# below are not really apps...more processes.
# TODO decide if we want to keep them or not.
matches = re.findall(r'\[([a-zA-Z0-9-_]+)\]', entry['subsystem'])
if matches:
new_term = matches[0]
else:
# print(f"Skipping entry: {entry['subsystem']}")
continue
# print(f"New entry: {new_term} - was: {entry['subsystem']}")
entry['subsystem'] = new_term
# add it to the list
if entry['subsystem'] not in apps:
apps[entry['subsystem']] = {'found': ['logarchive']}
else:
if 'logarchive' not in apps[entry['subsystem']]['found']:
apps[entry['subsystem']]['found'].append('logarchive')
matches = re.findall(r'^([a-zA-Z0-9-_]+)$', entry['subsystem'])
if matches:
new_term = matches[0]
else:
# print(f"Skipping entry: {entry['subsystem']}")
continue
# print(f"New entry: {new_term} - was: {entry['subsystem']}")
entry['subsystem'] = new_term
# add it to the list
if entry['subsystem'] not in apps:
apps[entry['subsystem']] = {'found': ['logarchive']}
else:
if 'logarchive' not in apps[entry['subsystem']]['found']:
apps[entry['subsystem']]['found'].append('logarchive')
# logarchive/*.json are separate json files - generated by non-native unifiedlog parser

with open(output_file, 'w') as f:
Expand Down
3 changes: 2 additions & 1 deletion parsers/logarchive.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ def get_logs_on_linux(filename, output):
for fname in os.listdir(tmp_outpath):
with open(os.path.join(tmp_outpath, fname), 'r') as f:
try:
json_data = json.load(f)
# jsonl format - one json object per line
json_data = [json.loads(line) for line in f]
data.append(json_data)
except json.JSONDecodeError as e:
print(f"WARNING: error parsing JSON {fname}: {str(e)}")
Expand Down

0 comments on commit 0cc2625

Please sign in to comment.