Skip to content

Commit

Permalink
fix: [logarchive] fix json parsing - needs new unifiedlogs_parser_jso…
Browse files Browse the repository at this point in the history
…n code
  • Loading branch information
cvandeplas committed Jun 12, 2024
1 parent f5632a7 commit 0cc2625
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 47 deletions.
14 changes: 7 additions & 7 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -54,27 +54,27 @@
"cwd": "${workspaceFolder}/"
},
{
"name": "Python Debugger: logarchive.py",
"name": "Python Debugger: parsing.py list all",
"type": "debugpy",
"request": "launch",
"program": "${workspaceFolder}/parsers/logarchive.py",
"args": "-i data/1/sysdiagnose_2023.08.25_15-14-14+0200_iPhone-OS_iPhone_21A5319a/system_logs.logarchive",
"program": "${workspaceFolder}/parsing.py",
"args": "list all",
"cwd": "${workspaceFolder}/"
},
{
"name": "Python Debugger: parsing.py list all",
"name": "Python Debugger: parsing.py parse demo_parser",
"type": "debugpy",
"request": "launch",
"program": "${workspaceFolder}/parsing.py",
"args": "list all",
"args": "parse demo_parser 1",
"cwd": "${workspaceFolder}/"
},
{
"name": "Python Debugger: parsing.py parse demo_parser",
"name": "Python Debugger: parsing.py parse logarchive",
"type": "debugpy",
"request": "launch",
"program": "${workspaceFolder}/parsing.py",
"args": "parse demo_parser 1",
"args": "parse logarchive 1",
"cwd": "${workspaceFolder}/"
},
{
Expand Down
70 changes: 31 additions & 39 deletions analysers/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,12 @@ def analyse_path(case_folder: str, output_file: str = 'apps.json') -> bool:
else:
apps[entry['bundle_id']]['found'].append('itunesstore')

if file_in_dir.endswith('logarchive'):
elif file_in_dir.endswith('logarchive'):
re_bundle_id_pattern = r'(([a-zA-Z0-9-_]+\.)+[a-zA-Z0-9-_]+)'
# list files in here
for file_in_logarchive_dir in os.listdir(file_in_dir):
file_in_logarchive_dir = os.path.join(file_in_dir, file_in_logarchive_dir)
# logarchive/logarchive.json is a ijson multiline json - generated by native unifiedlog parser
# logarchive/logarchive.json is a multiline json - generated by native unifiedlog parser
if file_in_logarchive_dir.endswith('logarchive.json'): # apple unified log format
print(f"Found apple logarchive.json: {file_in_logarchive_dir}")
# TODO fix the parser to ensure the same result is given for native and non-native unifiedlog parser ? or just catch it this way
Expand All @@ -96,50 +96,42 @@ def analyse_path(case_folder: str, output_file: str = 'apps.json') -> bool:
apps[entry['subsystem']] = {'found': ['logarchive']}
else:
apps[entry['subsystem']]['found'].append('logarchive')
elif file_in_logarchive_dir.endswith('dataFoundInMultipleLogFiles.json'):
# FIXME see https://github.com/mandiant/macos-UnifiedLogs/blob/main/examples/unifiedlog_parser_json/src/main.rs#L342
# Long story short:
# - invalid json, ['a']['b'] is not valid json, should be ['a', 'b']
# - or multiline json
# - potential workaround is to make a dirty function to parse, take the offset of the error, extract, parse again, and so on
# See also https://github.com/mandiant/macos-UnifiedLogs/issues/16
continue
else:
print(f"Non-native file in logarchive folder: {file_in_logarchive_dir}")
# mandiant unifiedlog parser is multiline json format
print(f"Found non-native logarchive file: {file_in_logarchive_dir}")
with open(file_in_logarchive_dir, 'r') as f:
json_data = json.load(f)
# directly going to the list of apps
for entry in json_data:
# skip empty entries
if entry['subsystem'] == '':
continue
# extract app/bundle id or process name from the subsystem field
if not re.search(r'^' + re_bundle_id_pattern + r'$', entry['subsystem']):
                            # extract foo.bar.hello from the substring if it is in that format
matches = re.findall(re_bundle_id_pattern, entry['subsystem'])
if matches:
new_term = matches[0][0]
else:
# below are not really apps...more processes.
# TODO decide if we want to keep them or not.
matches = re.findall(r'\[([a-zA-Z0-9-_]+)\]', entry['subsystem'])
for line in f: # jsonl format
entry = json.loads(line)
# skip empty entries
if entry['subsystem'] == '':
continue
# extract app/bundle id or process name from the subsystem field
if not re.search(r'^' + re_bundle_id_pattern + r'$', entry['subsystem']):
                            # extract foo.bar.hello from the substring if it is in that format
matches = re.findall(re_bundle_id_pattern, entry['subsystem'])
if matches:
new_term = matches[0]
new_term = matches[0][0]
else:
matches = re.findall(r'^([a-zA-Z0-9-_]+)$', entry['subsystem'])
# below are not really apps...more processes.
# TODO decide if we want to keep them or not.
matches = re.findall(r'\[([a-zA-Z0-9-_]+)\]', entry['subsystem'])
if matches:
new_term = matches[0]
else:
# print(f"Skipping entry: {entry['subsystem']}")
continue
# print(f"New entry: {new_term} - was: {entry['subsystem']}")
entry['subsystem'] = new_term
# add it to the list
if entry['subsystem'] not in apps:
apps[entry['subsystem']] = {'found': ['logarchive']}
else:
if 'logarchive' not in apps[entry['subsystem']]['found']:
apps[entry['subsystem']]['found'].append('logarchive')
matches = re.findall(r'^([a-zA-Z0-9-_]+)$', entry['subsystem'])
if matches:
new_term = matches[0]
else:
# print(f"Skipping entry: {entry['subsystem']}")
continue
# print(f"New entry: {new_term} - was: {entry['subsystem']}")
entry['subsystem'] = new_term
# add it to the list
if entry['subsystem'] not in apps:
apps[entry['subsystem']] = {'found': ['logarchive']}
else:
if 'logarchive' not in apps[entry['subsystem']]['found']:
apps[entry['subsystem']]['found'].append('logarchive')
# logarchive/*.json are separate json files - generated by non-native unifiedlog parser

with open(output_file, 'w') as f:
Expand Down
3 changes: 2 additions & 1 deletion parsers/logarchive.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ def get_logs_on_linux(filename, output):
for fname in os.listdir(tmp_outpath):
with open(os.path.join(tmp_outpath, fname), 'r') as f:
try:
json_data = json.load(f)
# jsonl format - one json object per line
json_data = [json.loads(line) for line in f]
data.append(json_data)
except json.JSONDecodeError as e:
print(f"WARNING: error parsing JSON {fname}: {str(e)}")
Expand Down

0 comments on commit 0cc2625

Please sign in to comment.