From 27f91d7edd1a006ae43f7b500c022d97bdb5d40b Mon Sep 17 00:00:00 2001 From: Onewbiek Date: Thu, 24 Aug 2023 13:22:38 -0400 Subject: [PATCH 01/19] Init .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2ed7bdc --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +build/ +dist/ +*.egg-info/ \ No newline at end of file From 79e61cb4caa7b3c044a0dced5188aac748627204 Mon Sep 17 00:00:00 2001 From: Onewbiek Date: Thu, 24 Aug 2023 23:37:39 -0400 Subject: [PATCH 02/19] extend drishti to support Recorder traces --- drishti/{main.py => handle_darshan.py} | 397 +------- drishti/handle_recorder.py | 1185 ++++++++++++++++++++++++ drishti/includes.py | 192 ++++ drishti/reporter.py | 132 +++ requirements.txt | 1 + setup.py | 5 +- 6 files changed, 1563 insertions(+), 349 deletions(-) rename drishti/{main.py => handle_darshan.py} (82%) create mode 100644 drishti/handle_recorder.py create mode 100644 drishti/includes.py create mode 100644 drishti/reporter.py diff --git a/drishti/main.py b/drishti/handle_darshan.py similarity index 82% rename from drishti/main.py rename to drishti/handle_darshan.py index 3afb96c..6d4e70f 100644 --- a/drishti/main.py +++ b/drishti/handle_darshan.py @@ -9,7 +9,6 @@ import shlex import shutil import datetime -import argparse import subprocess import pandas as pd @@ -17,230 +16,17 @@ import darshan import darshan.backend.cffi_backend as darshanll -from rich import print, box, rule -from rich.console import Console, Group +from rich import print, box +from rich.console import Group from rich.padding import Padding -from rich.text import Text from rich.syntax import Syntax from rich.panel import Panel from rich.terminal_theme import TerminalTheme from rich.terminal_theme import MONOKAI -from subprocess import call from packaging import version - -RECOMMENDATIONS = 0 -HIGH = 1 -WARN = 2 -INFO = 3 -OK = 4 - -ROOT = os.path.abspath(os.path.dirname(__file__)) - -TARGET_USER = 1 -TARGET_DEVELOPER = 2 -TARGET_SYSTEM = 3 - -insights_operation = [] -insights_metadata = [] -insights_dxt = [] - -insights_total = dict() - -insights_total[HIGH] = 0 -insights_total[WARN] = 0 -insights_total[RECOMMENDATIONS] = 0 - -THRESHOLD_OPERATION_IMBALANCE = 0.1 -THRESHOLD_SMALL_REQUESTS = 0.1 -THRESHOLD_SMALL_REQUESTS_ABSOLUTE = 1000 -THRESHOLD_MISALIGNED_REQUESTS = 0.1 -THRESHOLD_METADATA = 0.1 -THRESHOLD_METADATA_TIME_RANK = 30 # seconds -THRESHOLD_RANDOM_OPERATIONS = 0.2 -THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE = 1000 -THRESHOLD_STRAGGLERS = 0.15 -THRESHOLD_IMBALANCE = 0.30 -THRESHOLD_INTERFACE_STDIO = 0.1 -THRESHOLD_COLLECTIVE_OPERATIONS = 0.5 -THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE = 1000 - -INSIGHTS_STDIO_HIGH_USAGE = 'S01' -INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE = 'P01' -INSIGHTS_POSIX_READ_COUNT_INTENSIVE = 'P02' -INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE = 'P03' -INSIGHTS_POSIX_READ_SIZE_INTENSIVE = 'P04' -INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE = 'P05' -INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE = 'P06' -INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE = 'P07' -INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE = 'P08' -INSIGHTS_POSIX_REDUNDANT_READ_USAGE = 'P09' -INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE = 'P10' -INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE = 'P11' -INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE = 'P12' -INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE = 'P13' -INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE = 'P14' -INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE = 'P15' 
-INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE = 'P16' -INSIGHTS_POSIX_HIGH_METADATA_TIME = 'P17' -INSIGHTS_POSIX_SIZE_IMBALANCE = 'P18' -INSIGHTS_POSIX_TIME_IMBALANCE = 'P19' -INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE = 'P21' -INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE = 'P22' -INSIGHTS_MPI_IO_NO_USAGE = 'M01' -INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE = 'M02' -INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE = 'M03' -INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE = 'M04' -INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE = 'M05' -INSIGHTS_MPI_IO_BLOCKING_READ_USAGE = 'M06' -INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE = 'M07' -INSIGHTS_MPI_IO_AGGREGATORS_INTRA = 'M08' -INSIGHTS_MPI_IO_AGGREGATORS_INTER = 'M09' -INSIGHTS_MPI_IO_AGGREGATORS_OK = 'M10' - -# TODO: need to verify the threashold to be between 0 and 1 -# TODO: read thresholds from file - -parser = argparse.ArgumentParser( - description='Drishti: ' -) - -parser.add_argument( - 'darshan', - help='Input .darshan file' -) - -parser.add_argument( - '--issues', - default=False, - action='store_true', - dest='only_issues', - help='Only displays the detected issues and hides the recommendations' -) - -parser.add_argument( - '--html', - default=False, - action='store_true', - dest='export_html', - help='Export the report as an HTML page' -) - -parser.add_argument( - '--svg', - default=False, - action='store_true', - dest='export_svg', - help='Export the report as an SVG image' -) - -parser.add_argument( - '--light', - default=False, - action='store_true', - dest='export_theme_light', - help='Use a light theme for the report when generating files' -) - -parser.add_argument( - '--size', - default=False, - dest='export_size', - help='Console width used for the report and generated files' -) - -parser.add_argument( - '--verbose', - default=False, - action='store_true', - dest='verbose', - help='Display extended details for the recommendations' -) - -parser.add_argument( - '--code', - default=False, - action='store_true', - dest='code', - help='Display insights identification code' -) - -parser.add_argument( - '--path', - default=False, - action='store_true', - dest='full_path', - help='Display the full file path for the files that triggered the issue' -) - -parser.add_argument( - '--csv', - default=False, - action='store_true', - dest='export_csv', - help='Export a CSV with the code of all issues that were triggered' -) - -parser.add_argument( - '--json', - default=False, - dest='json', - help=argparse.SUPPRESS) - -args = parser.parse_args() - -if args.export_size: - console = Console(record=True, width=int(args.export_size)) -else: - console = Console(record=True) - -csv_report = [] - - -def validate_thresholds(): - """ - Validate thresholds defined by the user. - """ - assert(THRESHOLD_OPERATION_IMBALANCE >= 0.0 and THRESHOLD_OPERATION_IMBALANCE <= 1.0) - assert(THRESHOLD_SMALL_REQUESTS >= 0.0 and THRESHOLD_SMALL_REQUESTS <= 1.0) - assert(THRESHOLD_MISALIGNED_REQUESTS >= 0.0 and THRESHOLD_MISALIGNED_REQUESTS <= 1.0) - assert(THRESHOLD_METADATA >= 0.0 and THRESHOLD_METADATA <= 1.0) - assert(THRESHOLD_RANDOM_OPERATIONS >= 0.0 and THRESHOLD_RANDOM_OPERATIONS <= 1.0) - - assert(THRESHOLD_METADATA_TIME_RANK >= 0.0) - - -def clear(): - """ - Clear the screen with the comment call based on the operating system. - """ - _ = call('clear' if os.name == 'posix' else 'cls') - - -def convert_bytes(bytes_number): - """ - Convert bytes into formatted string. 
- """ - tags = [ - 'bytes', - 'KB', - 'MB', - 'GB', - 'TB', - 'PB', - 'EB' - ] - - i = 0 - double_bytes = bytes_number - - while (i < len(tags) and bytes_number >= 1024): - double_bytes = bytes_number / 1024.0 - i = i + 1 - bytes_number = bytes_number / 1024 - - return str(round(double_bytes, 2)) + ' ' + tags[i] +from .includes import * def is_available(name): @@ -249,71 +35,6 @@ def is_available(name): return shutil.which(name) is not None -def message(code, target, level, issue, recommendations=None, details=None): - """ - Display the message on the screen with level, issue, and recommendation. - """ - icon = ':arrow_forward:' - - if level in (HIGH, WARN): - insights_total[level] += 1 - - if level == HIGH: - color = '[red]' - elif level == WARN: - color = '[orange1]' - elif level == OK: - color = '[green]' - else: - color = '' - - messages = [ - '{}{}{} {}'.format( - color, - icon, - ' [' + code + ']' if args.code else '', - issue - ) - ] - - if args.export_csv: - csv_report.append(code) - - if details: - for detail in details: - messages.append(' {}:left_arrow_curving_right: {}'.format( - color, - detail['message'] - ) - ) - - if recommendations: - if not args.only_issues: - messages.append(' [white]:left_arrow_curving_right: [b]Recommendations:[/b]') - - for recommendation in recommendations: - messages.append(' :left_arrow_curving_right: {}'.format(recommendation['message'])) - - if args.verbose and 'sample' in recommendation: - messages.append( - Padding( - Panel( - recommendation['sample'], - title='Solution Example Snippet', - title_align='left', - padding=(1, 2) - ), - (1, 0, 1, 7) - ) - ) - - insights_total[RECOMMENDATIONS] += len(recommendations) - - return Group( - *messages - ) - - def check_log_version(file, log_version, library_version): use_file = file @@ -363,18 +84,13 @@ def check_log_version(file, log_version, library_version): return use_file -def main(): - if not os.path.isfile(args.darshan): - print('Unable to open .darshan file.') - - sys.exit(os.EX_NOINPUT) - - # clear() +def handler(args): + init_console(args) validate_thresholds() insights_start_time = time.time() - log = darshanll.log_open(args.darshan) + log = darshanll.log_open(args.log_path) modules = darshanll.log_get_modules(log) @@ -384,7 +100,7 @@ def main(): library_version = darshanll.darshan.backend.cffi_backend.get_lib_version() # Make sure log format is of the same version - filename = check_log_version(args.darshan, log_version, library_version) + filename = check_log_version(args.log_path, log_version, library_version) darshanll.log_close(log) @@ -491,8 +207,6 @@ def main(): 'mpiio': uses_mpiio } - df_posix_files = df_posix - if total_size and total_size_stdio / total_size > THRESHOLD_INTERFACE_STDIO: issue = 'Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})'.format( total_size_stdio / total_size * 100.0, @@ -506,7 +220,7 @@ def main(): ] insights_operation.append( - message(INSIGHTS_STDIO_HIGH_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message(args, INSIGHTS_STDIO_HIGH_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) ) if 'MPI-IO' not in modules: @@ -519,7 +233,7 @@ def main(): ] insights_operation.append( - message(INSIGHTS_MPI_IO_NO_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + message(args, INSIGHTS_MPI_IO_NO_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) ) 
######################################################################################################################################################################### @@ -527,10 +241,6 @@ def main(): if 'POSIX' in report.records: df = report.records['POSIX'].to_df() - #print(df) - #print(df['counters'].columns) - #print(df['fcounters'].columns) - ######################################################################################################################################################################### # Get number of write/read operations @@ -547,7 +257,7 @@ def main(): ) insights_metadata.append( - message(INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + message(args, INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) ) if total_reads > total_writes and total_operations and abs(total_writes - total_reads) / total_operations > THRESHOLD_OPERATION_IMBALANCE: @@ -556,7 +266,7 @@ def main(): ) insights_metadata.append( - message(INSIGHTS_POSIX_READ_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + message(args, INSIGHTS_POSIX_READ_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) ) total_read_size = df['counters']['POSIX_BYTES_READ'].sum() @@ -564,22 +274,22 @@ def main(): total_size = total_written_size + total_read_size - if total_written_size > total_read_size and abs(total_written_size - total_read_size) / (total_written_size + total_read_size) > THRESHOLD_OPERATION_IMBALANCE: + if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > THRESHOLD_OPERATION_IMBALANCE: issue = 'Application is write size intensive ({:.2f}% write vs. {:.2f}% read)'.format( - total_written_size / (total_written_size + total_read_size) * 100.0, total_read_size / (total_written_size + total_read_size) * 100.0 + total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 ) insights_metadata.append( - message(INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + message(args, INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) ) - if total_read_size > total_written_size and abs(total_written_size - total_read_size) / (total_written_size + total_read_size) > THRESHOLD_OPERATION_IMBALANCE: + if total_read_size > total_written_size and abs(total_written_size - total_read_size) / total_size > THRESHOLD_OPERATION_IMBALANCE: issue = 'Application is read size intensive ({:.2f}% write vs. 
{:.2f}% read)'.format( - total_written_size / (total_written_size + total_read_size) * 100.0, total_read_size / (total_written_size + total_read_size) * 100.0 + total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 ) insights_metadata.append( - message(INSIGHTS_POSIX_READ_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + message(args, INSIGHTS_POSIX_READ_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) ) ######################################################################################################################################################################### @@ -657,7 +367,7 @@ def main(): ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message(args, INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) ) # Get the number of small I/O operations (less than the stripe size) @@ -710,7 +420,7 @@ def main(): ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message(args, INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) ) ######################################################################################################################################################################### @@ -726,7 +436,7 @@ def main(): ) insights_metadata.append( - message(INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, TARGET_DEVELOPER, HIGH, issue, None) + message(args, INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, TARGET_DEVELOPER, HIGH, issue, None) ) if total_operations and total_file_not_aligned / total_operations > THRESHOLD_MISALIGNED_REQUESTS: @@ -760,7 +470,7 @@ def main(): ) insights_metadata.append( - message(INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message(args, INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) ) ######################################################################################################################################################################### @@ -773,7 +483,7 @@ def main(): issue = 'Application might have redundant read traffic (more data read than the highest offset)' insights_metadata.append( - message(INSIGHTS_POSIX_REDUNDANT_READ_USAGE, TARGET_DEVELOPER, WARN, issue, None) + message(args, INSIGHTS_POSIX_REDUNDANT_READ_USAGE, TARGET_DEVELOPER, WARN, issue, None) ) max_write_offset = df['counters']['POSIX_MAX_BYTE_WRITTEN'].max() @@ -782,7 +492,7 @@ def main(): issue = 'Application might have redundant write traffic (more data written than the highest offset)' insights_metadata.append( - message(INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, None) + message(args, INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, None) ) ######################################################################################################################################################################### @@ -792,6 +502,7 @@ def main(): read_consecutive = df['counters']['POSIX_CONSEC_READS'].sum() #print('READ Consecutive: {} ({:.2f}%)'.format(read_consecutive, read_consecutive / total_reads * 100)) + read_sequential = df['counters']['POSIX_SEQ_READS'].sum() read_sequential -= read_consecutive #print('READ Sequential: {} ({:.2f}%)'.format(read_sequential, read_sequential / total_reads * 100)) @@ -812,7 +523,7 @@ def 
main(): ] insights_operation.append( - message(INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message(args, INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) ) else: issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) read requests'.format( @@ -821,15 +532,13 @@ def main(): ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, TARGET_DEVELOPER, OK, issue, None) + message(args, INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, TARGET_DEVELOPER, OK, issue, None) ) write_consecutive = df['counters']['POSIX_CONSEC_WRITES'].sum() - #print('WRITE Consecutive: {} ({:.2f}%)'.format(write_consecutive, write_consecutive / total_writes * 100)) write_sequential = df['counters']['POSIX_SEQ_WRITES'].sum() write_sequential -= write_consecutive - #print('WRITE Sequential: {} ({:.2f}%)'.format(write_sequential, write_sequential / total_writes * 100)) write_random = total_writes - write_consecutive - write_sequential #print('WRITE Random: {} ({:.2f}%)'.format(write_random, write_random / total_writes * 100)) @@ -847,7 +556,7 @@ def main(): ] insights_operation.append( - message(INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message(args, INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) ) else: issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) write requests'.format( @@ -856,13 +565,12 @@ def main(): ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, TARGET_DEVELOPER, OK, issue, None) + message(args, INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, TARGET_DEVELOPER, OK, issue, None) ) ######################################################################################################################################################################### # Shared file with small operations - # print(df['counters'].loc[(df['counters']['rank'] == -1)]) shared_files = df['counters'].loc[(df['counters']['rank'] == -1)] @@ -913,7 +621,7 @@ def main(): ] insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message(args, INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) ) total_shared_writes = shared_files['POSIX_WRITES'].sum() @@ -960,7 +668,7 @@ def main(): ] insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message(args, INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) ) ######################################################################################################################################################################### @@ -991,7 +699,7 @@ def main(): ) insights_metadata.append( - message(INSIGHTS_POSIX_HIGH_METADATA_TIME, TARGET_DEVELOPER, HIGH, issue, recommendation) + message(args, INSIGHTS_POSIX_HIGH_METADATA_TIME, TARGET_DEVELOPER, HIGH, issue, recommendation) ) # We already have a single line for each shared-file access @@ -1046,7 +754,7 @@ def main(): ] insights_operation.append( - message(INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) + message(args, INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) ) # POSIX_F_FASTEST_RANK_TIME @@ -1101,7 
+809,7 @@ def main(): ] insights_operation.append( - message(INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) + message(args, INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) ) aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][ @@ -1164,7 +872,7 @@ def main(): ] insights_operation.append( - message(INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message(args, INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) ) imbalance_count = 0 @@ -1214,7 +922,7 @@ def main(): ] insights_operation.append( - message(INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message(args, INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) ) ######################################################################################################################################################################### @@ -1225,8 +933,6 @@ def main(): df_mpiio['counters'] = df_mpiio['counters'].assign(id=lambda d: d['id'].astype(str)) - #print(df_mpiio) - # Get the files responsible detected_files = [] @@ -1265,7 +971,7 @@ def main(): ] insights_operation.append( - message(INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message(args, INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) ) else: issue = 'Application uses MPI-IO and read data using {} ({:.2f}%) collective operations'.format( @@ -1274,7 +980,7 @@ def main(): ) insights_operation.append( - message(INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, OK, issue) + message(args, INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, OK, issue) ) df_mpiio_collective_writes = df_mpiio['counters'] #.loc[(df_mpiio['counters']['MPIIO_COLL_WRITES'] > 0)] @@ -1312,7 +1018,7 @@ def main(): ] insights_operation.append( - message(INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message(args, INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) ) else: issue = 'Application uses MPI-IO and write data using {} ({:.2f}%) collective operations'.format( @@ -1321,7 +1027,7 @@ def main(): ) insights_operation.append( - message(INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, OK, issue) + message(args, INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, OK, issue) ) ######################################################################################################################################################################### @@ -1358,7 +1064,7 @@ def main(): ) insights_operation.append( - message(INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + message(args, INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) ) if df_mpiio['counters']['MPIIO_NB_WRITES'].sum() == 0: @@ -1383,7 +1089,7 @@ def main(): ) insights_operation.append( - message(INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + message(args, INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) ) ######################################################################################################################################################################### @@ -1448,21 +1154,21 @@ def 
main(): ] insights_operation.append( - message(INSIGHTS_MPI_IO_AGGREGATORS_INTER, TARGET_USER, HIGH, issue, recommendation) + message(args, INSIGHTS_MPI_IO_AGGREGATORS_INTER, TARGET_USER, HIGH, issue, recommendation) ) if cb_nodes < NUMBER_OF_COMPUTE_NODES: issue = 'Application is using intra-node aggregators' insights_operation.append( - message(INSIGHTS_MPI_IO_AGGREGATORS_INTRA, TARGET_USER, OK, issue) + message(args, INSIGHTS_MPI_IO_AGGREGATORS_INTRA, TARGET_USER, OK, issue) ) if cb_nodes == NUMBER_OF_COMPUTE_NODES: issue = 'Application is using one aggregator per compute node' insights_operation.append( - message(INSIGHTS_MPI_IO_AGGREGATORS_OK, TARGET_USER, OK, issue) + message(args, INSIGHTS_MPI_IO_AGGREGATORS_OK, TARGET_USER, OK, issue) ) @@ -1491,7 +1197,7 @@ def main(): recommendation.append(new_message) insights_dxt.append( - message(code, TARGET_DEVELOPER, level, issue, recommendation) + message(args, code, TARGET_DEVELOPER, level, issue, recommendation) ) ######################################################################################################################################################################### @@ -1518,7 +1224,7 @@ def main(): job['exe'].split()[0] ), ' [b]DARSHAN[/b]: [white]{}[/white]'.format( - os.path.basename(args.darshan) + os.path.basename(args.log_path) ), ' [b]EXECUTION TIME[/b]: [white]{} to {} ({:.2f} hours)[/white]'.format( job_start, @@ -1541,7 +1247,7 @@ def main(): ' '.join(hints) ) ]), - title='[b][slate_blue3]DRISHTI[/slate_blue3] v.0.3[/b]', + title='[b][slate_blue3]DRISHTI[/slate_blue3] v.0.5[/b]', title_align='left', subtitle='[red][b]{} critical issues[/b][/red], [orange1][b]{} warnings[/b][/orange1], and [white][b]{} recommendations[/b][/white]'.format( insights_total[HIGH], @@ -1638,14 +1344,14 @@ def main(): if args.export_html: console.save_html( - '{}.html'.format(args.darshan), + '{}.html'.format(args.log_path), theme=export_theme, clear=False ) if args.export_svg: console.save_svg( - '{}.svg'.format(args.darshan), + '{}.svg'.format(args.log_path), title='Drishti', theme=export_theme, clear=False @@ -1697,7 +1403,7 @@ def main(): detected_issues[report] = True filename = '{}-summary.csv'.format( - args.darshan.replace('.darshan', '') + args.log_path.replace('.darshan', '') ) with open(filename, 'w') as f: @@ -1705,6 +1411,3 @@ def main(): w.writerow(detected_issues.keys()) w.writerow(detected_issues.values()) - -if __name__ == '__main__': - main() diff --git a/drishti/handle_recorder.py b/drishti/handle_recorder.py new file mode 100644 index 0000000..18136f3 --- /dev/null +++ b/drishti/handle_recorder.py @@ -0,0 +1,1185 @@ +#!/usr/bin/env python3 + +import os +import csv +import time +import json + +import datetime + +import pandas as pd + +from rich import print, box +from rich.console import Group +from rich.padding import Padding +from rich.syntax import Syntax +from rich.panel import Panel +from rich.terminal_theme import TerminalTheme +from rich.terminal_theme import MONOKAI + +from recorder_utils import RecorderReader +from recorder_utils.build_offset_intervals import build_offset_intervals + +from .includes import * + + +def get_modules(reader): + func_list = reader.funcs + ranks = reader.GM.total_ranks + modules = set() + + for rank in range(ranks): + for i in range(reader.LMs[rank].total_records): + record = reader.records[rank][i] + func_name = func_list[record.func_id] + if 'MPI_File' in func_name: + modules.add('MPI-IO') + elif 'MPI' in func_name: + modules.add('MPI') + elif 'H5' in func_name: + 
modules.add('H5F') + else: modules.add('POSIX') + + return modules + + +def get_accessed_files(reader): + ranks = reader.GM.total_ranks + filemap = {} + for rank in range(ranks): + filemap.update(reader.LMs[rank].filemap) + + return filemap + + +def init_df_posix_recordes(reader): + func_list = reader.funcs + ranks = reader.GM.total_ranks + records = [] + for rank in range(ranks): + for i in range(reader.LMs[rank].total_records): + record = reader.records[rank][i] + func_name = func_list[record.func_id] + + if 'MPI' not in func_name and 'H5' not in func_name: + records.append( [rank, func_name, record.tstart, record.tend] ) + + head = ['rank', 'function', 'start', 'end'] + df_posix_records = pd.DataFrame(records, columns=head) + return df_posix_records + + +def handler(args): + init_console(args) + validate_thresholds() + + insights_start_time = time.time() + + reader = RecorderReader(args.log_path) + df_intervals = build_offset_intervals(reader) + df_posix_records = init_df_posix_recordes(reader) + + modules = get_modules(reader) + unique_files = get_accessed_files(reader) + + def add_api(row): + if 'MPI' in row['function']: + return 'MPIIO' + elif 'H5' in row['function']: + return 'H5F' + else: + return 'POSIX' + + df_intervals['api'] = df_intervals.apply(add_api, axis=1) + + def add_duration(row): + return row['end'] - row['start'] + + df_intervals['duration'] = df_intervals.apply(add_duration, axis=1) + df_posix_records['duration'] = df_posix_records.apply(add_duration, axis=1) + + ######################################################################################################################################################################### + + # Check usage of POSIX, and MPI-IO per file + total_size_stdio = 0 + total_size_posix = 0 + total_size_mpiio = 0 + total_size = 0 + + total_files = len(unique_files) + total_files_stdio = 0 + total_files_posix = 0 + total_files_mpiio = 0 + + for fid in unique_files.keys(): + df_intervals_in_one_file = df_intervals[(df_intervals['file_id'] == fid)] + df_stdio_intervals_in_one_file = df_intervals_in_one_file[(df_intervals_in_one_file['api'] == 'STDIO')] + df_posix_intervals_in_one_file = df_intervals_in_one_file[(df_intervals_in_one_file['api'] == 'POSIX')] + df_mpiio_intervals_in_one_file = df_intervals_in_one_file[(df_intervals_in_one_file['api'] == 'MPIIO')] + + if len(df_stdio_intervals_in_one_file): + total_files_stdio += 1 + total_size_stdio += df_stdio_intervals_in_one_file['size'].sum() + + if len(df_posix_intervals_in_one_file): + total_files_posix += 1 + total_size_posix += df_posix_intervals_in_one_file['size'].sum() + + if len(df_mpiio_intervals_in_one_file): + total_files_mpiio += 1 + total_size_mpiio += df_mpiio_intervals_in_one_file['size'].sum() + + + # Since POSIX will capture both POSIX-only accesses and those comming from MPI-IO, we can subtract those + if total_size_posix > 0 and total_size_posix >= total_size_mpiio: + total_size_posix -= total_size_mpiio + + total_size = total_size_stdio + total_size_posix + total_size_mpiio + + assert(total_size_stdio >= 0) + assert(total_size_posix >= 0) + assert(total_size_mpiio >= 0) + + if total_size and total_size_stdio / total_size > THRESHOLD_INTERFACE_STDIO: + issue = 'Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})'.format( + total_size_stdio / total_size * 100.0, + convert_bytes(total_size_stdio) + ) + + recommendation = [ + { + 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO' + } + ] + + 
insights_operation.append( + message(args, INSIGHTS_STDIO_HIGH_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + + if 'MPI-IO' not in modules: + issue = 'Application is using low-performance interface' + + recommendation = [ + { + 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO' + } + ] + + insights_operation.append( + message(args, INSIGHTS_MPI_IO_NO_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + ) + + ######################################################################################################################################################################### + + if df_intervals['api'].eq('POSIX').any(): + df_posix = df_intervals[(df_intervals['api'] == 'POSIX')] + + ######################################################################################################################################################################### + + # Get number of write/read operations + total_reads = len(df_posix[(df_posix['function'].str.contains('read'))]) + total_writes = len(df_posix[~(df_posix['function'].str.contains('read'))]) + + # Get total number of I/O operations + total_operations = total_writes + total_reads + + # To check whether the application is write-intersive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance + if total_writes > total_reads and total_operations and abs(total_writes - total_reads) / total_operations > THRESHOLD_OPERATION_IMBALANCE: + issue = 'Application is write operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( + total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 + ) + + insights_metadata.append( + message(args, INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + ) + + if total_reads > total_writes and total_operations and abs(total_writes - total_reads) / total_operations > THRESHOLD_OPERATION_IMBALANCE: + issue = 'Application is read operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( + total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 + ) + + insights_metadata.append( + message(args, INSIGHTS_POSIX_READ_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + ) + + total_read_size = df_posix[(df_posix['function'].str.contains('read'))]['size'].sum() + total_written_size = df_posix[~(df_posix['function'].str.contains('read'))]['size'].sum() + + total_size = total_written_size + total_read_size + + if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > THRESHOLD_OPERATION_IMBALANCE: + issue = 'Application is write size intensive ({:.2f}% write vs. {:.2f}% read)'.format( + total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 + ) + + insights_metadata.append( + message(args, INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + ) + + if total_read_size > total_written_size and abs(total_written_size - total_read_size) / total_size > THRESHOLD_OPERATION_IMBALANCE: + issue = 'Application is read size intensive ({:.2f}% write vs. 
{:.2f}% read)'.format(
+                total_written_size / total_size * 100.0, total_read_size / total_size * 100.0
+            )
+
+            insights_metadata.append(
+                message(args, INSIGHTS_POSIX_READ_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None)
+            )
+
+        #########################################################################################################################################################################
+
+        # Get the number of small I/O operations (less than 1 MB)
+
+        total_reads_small = len(df_posix[(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)])
+        total_writes_small = len(df_posix[~(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)])
+
+        detected_files = [] # [fname, num of read, num of write]
+        for fid in unique_files.keys():
+            read_cnt = len(df_posix[(df_posix['file_id'] == fid) & (df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)])
+            write_cnt = len(df_posix[(df_posix['file_id'] == fid) & ~(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)])
+            detected_files.append([unique_files[fid], read_cnt, write_cnt])
+
+        if total_reads_small and total_reads_small / total_reads > THRESHOLD_SMALL_REQUESTS and total_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE:
+            issue = 'Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests'.format(
+                total_reads_small, total_reads_small / total_reads * 100.0
+            )
+
+            detail = []
+            recommendation = []
+
+            for file in detected_files:
+                if file[1] > (total_reads * THRESHOLD_SMALL_REQUESTS / 2):
+                    detail.append(
+                        {
+                            'message': '{} ({:.2f}%) small read requests are to "{}"'.format(
+                                file[1],
+                                file[1] / total_reads * 100.0,
+                                file[0] if args.full_path else os.path.basename(file[0])
+                            )
+                        }
+                    )
+
+            recommendation.append(
+                {
+                    'message': 'Consider buffering read operations into larger more contiguous ones'
+                }
+            )
+
+            if 'MPI-IO' in modules:
+                recommendation.append(
+                    {
+                        'message': 'Since the application already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_read_all() or MPI_File_read_at_all()) to aggregate requests into larger ones',
+                        'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default')
+                    }
+                )
+            else:
+                recommendation.append(
+                    {
+                        'message': 'Application does not use MPI-IO for operations, consider using this interface instead to harness collective operations'
+                    }
+                )
+
+            insights_operation.append(
+                message(args, INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail)
+            )
+
+        if total_writes_small and total_writes_small / total_writes > THRESHOLD_SMALL_REQUESTS and total_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE:
+            issue = 'Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all write requests'.format(
+                total_writes_small, total_writes_small / total_writes * 100.0
+            )
+
+            detail = []
+            recommendation = []
+
+            for file in detected_files:
+                if file[2] > (total_writes * THRESHOLD_SMALL_REQUESTS / 2):
+                    detail.append(
+                        {
+                            'message': '{} ({:.2f}%) small write requests are to "{}"'.format(
+                                file[2],
+                                file[2] / total_writes * 100.0,
+                                file[0] if args.full_path else os.path.basename(file[0])
+                            )
+                        }
+                    )
+
+            recommendation.append(
+                {
+                    'message': 'Consider buffering write operations into larger more contiguous ones'
+                }
+            )
+
+            if 'MPI-IO' in modules:
+                recommendation.append(
+                    {
+                        'message': 'Since the application already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_write_all() or MPI_File_write_at_all()) to aggregate requests into larger ones',
+                        'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default')
+                    }
+                )
+            else:
+                recommendation.append(
+                    {
+                        'message': 'Application does not use MPI-IO for operations, consider using this interface instead to harness collective operations'
+                    }
+                )
+
+            insights_operation.append(
+                message(args, INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail)
+            )
+
+        #########################################################################################################################################################################
+
+        # How many requests are misaligned?
+ # TODO: + + ######################################################################################################################################################################### + + # Redundant read-traffic (based on Phill) + # POSIX_MAX_BYTE_READ (Highest offset in the file that was read) + max_read_offset = df_posix[(df_posix['function'].str.contains('read'))]['offset'].max() + + if max_read_offset > total_read_size: + issue = 'Application might have redundant read traffic (more data read than the highest offset)' + + insights_metadata.append( + message(args, INSIGHTS_POSIX_REDUNDANT_READ_USAGE, TARGET_DEVELOPER, WARN, issue, None) + ) + + max_write_offset = df_posix[~(df_posix['function'].str.contains('read'))]['offset'].max() + + if max_write_offset > total_written_size: + issue = 'Application might have redundant write traffic (more data written than the highest offset)' + + insights_metadata.append( + message(args, INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, None) + ) + + ######################################################################################################################################################################### + + # Check for a lot of random operations + + grp_posix_by_fid = df_posix.groupby('file_id') + + read_consecutive = 0 + read_sequential = 0 + read_random = 0 + + for fid, df_filtered in grp_posix_by_fid: + df_filtered = df_filtered[(df_filtered['function'].str.contains('read'))].sort_values('start') + + for i in range(len(df_filtered) - 1): + curr_interval = df_filtered.iloc[i] + next_interval = df_filtered.iloc[i + 1] + if curr_interval['offset'] + curr_interval['size'] == next_interval['offset']: + read_consecutive += 1 + elif curr_interval['offset'] + curr_interval['size'] < next_interval['offset']: + read_sequential += 1 + else: + read_random += 1 + + if total_reads: + if read_random and read_random / total_reads > THRESHOLD_RANDOM_OPERATIONS and read_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE: + issue = 'Application is issuing a high number ({}) of random read operations ({:.2f}%)'.format( + read_random, read_random / total_reads * 100.0 + ) + + recommendation = [ + { + 'message': 'Consider changing your data model to have consecutive or sequential reads' + } + ] + + insights_operation.append( + message(args, INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + else: + issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) read requests'.format( + read_consecutive / total_reads * 100.0, + read_sequential / total_reads * 100.0 + ) + + insights_operation.append( + message(args, INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, TARGET_DEVELOPER, OK, issue, None) + ) + + write_consecutive = 0 + write_sequential = 0 + write_random = 0 + + for fid, df_filtered in grp_posix_by_fid: + df_filtered = df_filtered[~(df_filtered['function'].str.contains('read'))].sort_values('start') + + for i in range(len(df_filtered) - 1): + curr_interval = df_filtered.iloc[i] + next_interval = df_filtered.iloc[i + 1] + if curr_interval['offset'] + curr_interval['size'] == next_interval['offset']: + write_consecutive += 1 + elif curr_interval['offset'] + curr_interval['size'] < next_interval['offset']: + write_sequential += 1 + else: + write_random += 1 + + if total_writes: + if write_random and write_random / total_writes > THRESHOLD_RANDOM_OPERATIONS and write_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE: + issue = 'Application is issuing a high number ({}) of random write 
operations ({:.2f}%)'.format( + write_random, write_random / total_writes * 100.0 + ) + + recommendation = [ + { + 'message': 'Consider changing your data model to have consecutive or sequential writes' + } + ] + + insights_operation.append( + message(args, INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + else: + issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) write requests'.format( + write_consecutive / total_writes * 100.0, + write_sequential / total_writes * 100.0 + ) + + insights_operation.append( + message(args, INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, TARGET_DEVELOPER, OK, issue, None) + ) + + ######################################################################################################################################################################### + + # Shared file with small operations + + # A file is shared if it's been read/written by more than 1 rank + detected_files = grp_posix_by_fid['rank'].nunique() + shared_files = set(detected_files[detected_files > 1].index) + + total_shared_reads = 0 + total_shared_reads_small = 0 + total_shared_writes = 0 + total_shared_writes_small = 0 + + detected_files = [] # [fname, num of read, num of write] + for fid in shared_files: + total_shared_reads += len(df_posix[(df_posix['file_id'] == fid) & (df_posix['function'].str.contains('read'))]) + total_shared_writes += len(df_posix[(df_posix['file_id'] == fid) & ~(df_posix['function'].str.contains('read'))]) + + read_cnt = len(df_posix[(df_posix['file_id'] == fid) + & (df_posix['function'].str.contains('read')) + & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + write_cnt = len(df_posix[(df_posix['file_id'] == fid) + & ~(df_posix['function'].str.contains('read')) + & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + detected_files.append([unique_files[fid], read_cnt, write_cnt]) + + total_shared_reads_small += read_cnt + total_shared_writes_small += write_cnt + + if total_shared_reads and total_shared_reads_small / total_shared_reads > THRESHOLD_SMALL_REQUESTS and total_shared_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: + issue = 'Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests'.format( + total_shared_reads_small, total_shared_reads_small / total_shared_reads * 100.0 + ) + + detail = [] + + for file in detected_files: + if file[1] > (total_reads * THRESHOLD_SMALL_REQUESTS / 2): + detail.append( + { + 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( + file[1], + file[1] / total_reads * 100.0, + file[0] if args.full_path else os.path.basename(file[0]) + ) + } + ) + + recommendation = [ + { + 'message': 'Consider coalesceing read requests into larger more contiguous ones using MPI-IO collective operations', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(args, INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + ) + + if total_shared_writes and total_shared_writes_small / total_shared_writes > THRESHOLD_SMALL_REQUESTS and total_shared_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: + issue = 'Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file write requests'.format( + total_shared_writes_small, 
total_shared_writes_small / total_shared_writes * 100.0
+            )
+
+            detail = []
+
+            for file in detected_files:
+                if file[2] > (total_writes * THRESHOLD_SMALL_REQUESTS / 2):
+                    detail.append(
+                        {
+                            'message': '{} ({:.2f}%) small write requests are to "{}"'.format(
+                                file[2],
+                                file[2] / total_writes * 100.0,
+                                file[0] if args.full_path else os.path.basename(file[0])
+                            )
+                        }
+                    )
+
+            recommendation = [
+                {
+                    'message': 'Consider coalescing write requests into larger more contiguous ones using MPI-IO collective operations',
+                    'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default')
+                }
+            ]
+
+            insights_operation.append(
+                message(args, INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail)
+            )
+
+        #########################################################################################################################################################################
+
+        # TODO: Here I assume all operations other than write/read are metadata operations
+        df_posix_metadata = df_posix_records[~(df_posix_records['function'].str.contains('read|write|print', na=False))]
+        df_detected = df_posix_metadata.groupby('rank')['duration'].sum().reset_index()
+        has_long_metadata = df_detected[(df_detected['duration'] > THRESHOLD_METADATA_TIME_RANK)]
+
+        if not has_long_metadata.empty:
+            issue = 'There are {} ranks where metadata operations take over {} seconds'.format(
+                len(has_long_metadata), THRESHOLD_METADATA_TIME_RANK
+            )
+
+            recommendation = [
+                {
+                    'message': 'Attempt to combine files, reduce, or cache metadata operations'
+                }
+            ]
+
+            if 'H5F' in modules:
+                recommendation.append(
+                    {
+                        'message': 'Since your application uses HDF5, try enabling collective metadata calls with H5Pset_coll_metadata_write() and H5Pset_all_coll_metadata_ops()',
+                        'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-collective-metadata.c'), line_numbers=True, background_color='default')
+                    },
+                    {
+                        'message': 'Since your application uses HDF5, try using metadata cache to defer metadata operations',
+                        'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-cache.c'), line_numbers=True, background_color='default')
+                    }
+                )
+
+            insights_metadata.append(
+                message(args, INSIGHTS_POSIX_HIGH_METADATA_TIME, TARGET_DEVELOPER, HIGH, issue, recommendation)
+            )
+
+        # We already have a single line for each shared-file access
+        # To check for stragglers, we can check the difference between the
+
+        # POSIX_FASTEST_RANK_BYTES
+        # POSIX_SLOWEST_RANK_BYTES
+        # POSIX_VARIANCE_RANK_BYTES
+
+        stragglers_count = 0
+
+        detected_files = []
+
+        for fid in shared_files:
+            df_posix_in_one_file = df_posix[(df_posix['file_id'] == fid)]
+            total_transfer_size = df_posix_in_one_file['size'].sum()
+
+            df_detected = df_posix_in_one_file.groupby('rank').agg({'size': 'sum', 'duration': 'sum'}).reset_index()
+            slowest_rank_bytes = df_detected.loc[df_detected['duration'].idxmax(), 'size']
+            fastest_rank_bytes = df_detected.loc[df_detected['duration'].idxmin(), 'size']
+
+            if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > THRESHOLD_STRAGGLERS:
+                stragglers_count += 1
+
+                detected_files.append([
+                    unique_files[fid], abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size * 100
+                ])
+
+        if stragglers_count:
+            issue = 'Detected data transfer imbalance caused by stragglers when accessing {} shared file.'.format(
+                stragglers_count
+            )
+
+            detail = []
+
+            for file in
detected_files: + detail.append( + { + 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + file[1], + file[0] if args.full_path else os.path.basename(file[0]) + ) + } + ) + + recommendation = [ + { + 'message': 'Consider better balancing the data transfer between the application ranks' + }, + { + 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(args, INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) + ) + + # POSIX_F_FASTEST_RANK_TIME + # POSIX_F_SLOWEST_RANK_TIME + # POSIX_F_VARIANCE_RANK_TIME + + stragglers_count = 0 + + detected_files = [] + + for fid in shared_files: + df_posix_in_one_file = df_posix[(df_posix['file_id'] == fid)] + total_transfer_time = df_posix_in_one_file['duration'].sum() + + df_detected = df_posix_in_one_file.groupby('rank')['duration'].sum().reset_index() + + slowest_rank_time = df_detected['duration'].max() + fastest_rank_time = df_detected['duration'].min() + + if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > THRESHOLD_STRAGGLERS: + stragglers_count += 1 + + detected_files.append([ + unique_files[fid], abs(slowest_rank_time - fastest_rank_time) / total_transfer_time * 100 + ]) + + if stragglers_count: + issue = 'Detected time imbalance caused by stragglers when accessing {} shared file.'.format( + stragglers_count + ) + + detail = [] + + for file in detected_files: + detail.append( + { + 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + file[1], + file[0] if args.full_path else os.path.basename(file[0]) + ) + } + ) + + recommendation = [ + { + 'message': 'Consider better distributing the data in the parallel file system' # needs to review what suggestion to give + }, + { + 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(args, INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) + ) + + # Get the individual files responsible for imbalance + imbalance_count = 0 + + detected_files = [] + + for fid in unique_files.keys(): + if fid in shared_files: continue + df_detected = df_posix[(df_posix['file_id'] == fid) & ~(df_posix['function'].str.contains('read'))] + + max_bytes_written = df_detected['size'].max() + min_bytes_written = df_detected['size'].min() + + if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > THRESHOLD_IMBALANCE: + imbalance_count += 1 + + detected_files.append([ + unique_files[fid], abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 + ]) + + if imbalance_count: + issue = 'Detected write imbalance when accessing {} individual files'.format( + imbalance_count + ) + + detail = [] + + for file in detected_files: + detail.append( + { + 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + file[1], + file[0] if args.full_path else os.path.basename(file[0]) + ) + } + ) + + recommendation = [ + { + 'message': 'Consider better balancing the data transfer between the application ranks' + }, + { + 'message': 'Consider tuning the stripe size 
and count to better distribute the data', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + }, + { + 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + }, + { + 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' + } + ] + + insights_operation.append( + message(args, INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + ) + + imbalance_count = 0 + + detected_files = [] + + for fid in shared_files: + df_detected = df_posix[(df_posix['file_id'] == fid) & (df_posix['function'].str.contains('read'))] + + max_bytes_read = df_detected['size'].max() + min_bytes_read = df_detected['size'].min() + + if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > THRESHOLD_IMBALANCE: + imbalance_count += 1 + + detected_files.append([ + unique_files[fid], abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 + ]) + + if imbalance_count: + issue = 'Detected read imbalance when accessing {} individual files.'.format( + imbalance_count + ) + + detail = [] + + for file in detected_files: + detail.append( + { + 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + file[1], + file[0] if args.full_path else os.path.basename(file[0]) + ) + } + ) + + recommendation = [ + { + 'message': 'Consider better balancing the data transfer between the application ranks' + }, + { + 'message': 'Consider tuning the stripe size and count to better distribute the data', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + }, + { + 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + }, + { + 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' + } + ] + + insights_operation.append( + message(args, INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + ) + + ######################################################################################################################################################################### + + if df_intervals['api'].eq('MPIIO').any(): + df_mpiio = df_intervals[(df_intervals['api'] == 'MPIIO')] + + df_mpiio_reads = df_mpiio[(df_mpiio['function'].str.contains('read'))] + mpiio_indp_reads = len(df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all'))]) + mpiio_coll_reads = len(df_mpiio_reads[(df_mpiio_reads['function'].str.contains('_all'))]) + total_mpiio_read_operations = mpiio_indp_reads + mpiio_coll_reads + + df_mpiio_writes = df_mpiio[~(df_mpiio['function'].str.contains('read'))] + mpiio_indp_writes = len(df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all'))]) + mpiio_coll_writes = len(df_mpiio_writes[(df_mpiio_writes['function'].str.contains('_all'))]) + total_mpiio_write_operations = mpiio_indp_writes + mpiio_coll_writes + + detected_files = [] # [fname, total_read, total_write] + for fid in unique_files.keys(): + read_cnt = len(df_mpiio_reads[(df_mpiio_reads['file_id'] == fid) & (df_mpiio_reads['function'].str.contains('read'))]) + 
write_cnt = len(df_mpiio_reads[(df_mpiio_reads['file_id'] == fid) & ~(df_mpiio_reads['function'].str.contains('read'))]) + detected_files.append([unique_files[fid], read_cnt, write_cnt]) + + if mpiio_coll_reads == 0: + if total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls'.format( + mpiio_indp_reads, + mpiio_indp_reads / (total_mpiio_read_operations) * 100 + ) + + detail = [] + + for file in detected_files: + total_cnt = file[1] + file[2] + if total_cnt and file[1] / total_cnt > THRESHOLD_COLLECTIVE_OPERATIONS and total_cnt > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + detail.append( + { + 'message': '{} ({}%) of independent reads to "{}"'.format( + file[1], + file[1] / total_cnt * 100, + file[0] if args.full_path else os.path.basename(file[0]) + ) + } + ) + + recommendation = [ + { + 'message': 'Use collective read operations (e.g. MPI_File_read_all() or MPI_File_read_at_all()) and set one aggregator per compute node', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(args, INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + ) + else: + issue = 'Application uses MPI-IO and read data using {} ({:.2f}%) collective operations'.format( + mpiio_coll_reads, + mpiio_coll_reads / total_mpiio_read_operations * 100 + ) + + insights_operation.append( + message(args, INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, OK, issue) + ) + + if mpiio_coll_writes == 0: + if total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls'.format( + mpiio_indp_writes, + mpiio_indp_writes / (total_mpiio_write_operations) * 100 + ) + + detail = [] + + for file in detected_files: + total_cnt = file[1] + file[2] + if total_cnt and file[2] / total_cnt > THRESHOLD_COLLECTIVE_OPERATIONS and total_cnt > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + detail.append( + { + 'message': '{} ({}%) of independent writes to "{}"'.format( + file[2], + file[2] / total_cnt * 100, + file[0] if args.full_path else os.path.basename(file[0]) + ) + } + ) + + recommendation = [ + { + 'message': 'Use collective write operations (e.g. 
MPI_File_write_all() or MPI_File_write_at_all()) and set one aggregator per compute node', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(args, INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + ) + + else: + issue = 'Application uses MPI-IO and write data using {} ({:.2f}%) collective operations'.format( + mpiio_coll_writes, + mpiio_coll_writes / total_mpiio_write_operations * 100 + ) + + insights_operation.append( + message(args, INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, OK, issue) + ) + + ######################################################################################################################################################################### + + # Look for usage of non-block operations + + # Look for HDF5 file extension + + has_hdf5_extension = False + + for fid in unique_files.keys(): + fname = unique_files[fid] + if fname.endswith('.h5') or fname.endswith('.hdf5'): + has_hdf5_extension = True + + if len(df_mpiio_reads[(df_mpiio_reads['function'].str.contains('iread|begin|end'))]) == 0: + issue = 'Application could benefit from non-blocking (asynchronous) reads' + + recommendation = [] + + if 'H5F' in modules or has_hdf5_extension: + recommendation.append( + { + 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-read.c'), line_numbers=True, background_color='default') + } + ) + + if 'MPI-IO' in modules: + recommendation.append( + { + 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iread(), MPI_File_read_all_begin/end(), or MPI_File_read_at_all_begin/end())', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iread.c'), line_numbers=True, background_color='default') + } + ) + + insights_operation.append( + message(args, INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + ) + + if len(df_mpiio_writes[(df_mpiio_writes['function'].str.contains('iwrite|begin|end'))]) == 0: + issue = 'Application could benefit from non-blocking (asynchronous) writes' + + recommendation = [] + + if 'H5F' in modules or has_hdf5_extension: + recommendation.append( + { + 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-write.c'), line_numbers=True, background_color='default') + } + ) + + if 'MPI-IO' in modules: + recommendation.append( + { + 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iwrite(), MPI_File_write_all_begin/end(), or MPI_File_write_at_all_begin/end())', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iwrite.c'), line_numbers=True, background_color='default') + } + ) + + insights_operation.append( + message(args, INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + ) + + ######################################################################################################################################################################### + + # Nodes and MPI-IO aggregators + # If the application uses collective reads or collective writes, look for the number of aggregators + # TODO: + + 
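The aggregator check above is left as a TODO even though the M08-M10 insight codes already exist, so the follow-up only needs to compare the aggregator count against the compute-node count once both are known. Below is a minimal sketch of how that comparison could be wired up, assuming the aggregator count were available (e.g. from ROMIO's cb_nodes hint) and the node count were known; the helper name and the mapping of cases to the M08/M09/M10 codes are illustrative assumptions, not part of this patch.

def check_mpiio_aggregators(args, cb_nodes, compute_nodes):
    # Without both values there is nothing to classify
    if not cb_nodes or not compute_nodes:
        return None

    if cb_nodes == compute_nodes:
        # One aggregator per compute node is the commonly recommended setting
        return message(args, INSIGHTS_MPI_IO_AGGREGATORS_OK, TARGET_USER, OK,
                       'Application is using one MPI-IO aggregator per compute node')

    # Assumed mapping: more aggregators than nodes -> intra-node code, fewer -> inter-node code
    code = INSIGHTS_MPI_IO_AGGREGATORS_INTRA if cb_nodes > compute_nodes else INSIGHTS_MPI_IO_AGGREGATORS_INTER
    issue = 'Application is using {} MPI-IO aggregators across {} compute nodes, consider setting one aggregator per compute node'.format(
        cb_nodes, compute_nodes
    )
    return message(args, code, TARGET_USER, HIGH, issue)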
######################################################################################################################################################################### + + NUMBER_OF_COMPUTE_NODES = 0 + + ######################################################################################################################################################################### + + codes = [] + if args.json: + f = open(args.json) + data = json.load(f) + + for key, values in data.items(): + for value in values: + code = value['code'] + codes.append(code) + + level = value['level'] + issue = value['issue'] + recommendation = [] + for rec in value['recommendations']: + new_message = {'message': rec} + recommendation.append(new_message) + + insights_dxt.append( + message(args, code, TARGET_DEVELOPER, level, issue, recommendation) + ) + + ######################################################################################################################################################################### + + + insights_end_time = time.time() + + console.print() + + console.print( + Panel( + '\n'.join([ + ' [b]RECORDER[/b]: [white]{}[/white]'.format( + os.path.basename(args.log_path) + ), + ' [b]FILES[/b]: [white]{} files ({} use STDIO, {} use POSIX, {} use MPI-IO)[/white]'.format( + total_files, + total_files_stdio, + total_files_posix - total_files_mpiio, # Since MPI-IO files will always use POSIX, we can decrement to get a unique count + total_files_mpiio + ), + ' [b]COMPUTE NODES[/b] [white]{}[/white]'.format( + NUMBER_OF_COMPUTE_NODES + ), + ' [b]PROCESSES[/b] [white]{}[/white]'.format( + reader.GM.total_ranks + ), + ]), + title='[b][slate_blue3]DRISHTI[/slate_blue3] v.0.5[/b]', + title_align='left', + subtitle='[red][b]{} critical issues[/b][/red], [orange1][b]{} warnings[/b][/orange1], and [white][b]{} recommendations[/b][/white]'.format( + insights_total[HIGH], + insights_total[WARN], + insights_total[RECOMMENDATIONS], + ), + subtitle_align='left', + padding=1 + ) + ) + + console.print() + + if insights_metadata: + console.print( + Panel( + Padding( + Group( + *insights_metadata + ), + (1, 1) + ), + title='METADATA', + title_align='left' + ) + ) + + if insights_operation: + console.print( + Panel( + Padding( + Group( + *insights_operation + ), + (1, 1) + ), + title='OPERATIONS', + title_align='left' + ) + ) + + if insights_dxt: + console.print( + Panel( + Padding( + Group( + *insights_dxt + ), + (1, 1) + ), + title='DXT', + title_align='left' + ) + ) + + console.print( + Panel( + ' {} | [white]LBNL[/white] | [white]Drishti report generated at {} in[/white] {:.3f} seconds'.format( + datetime.datetime.now().year, + datetime.datetime.now(), + insights_end_time - insights_start_time + ), + box=box.SIMPLE + ) + ) + + if args.export_theme_light: + export_theme = TerminalTheme( + (255, 255, 255), + (0, 0, 0), + [ + (26, 26, 26), + (244, 0, 95), + (152, 224, 36), + (253, 151, 31), + (157, 101, 255), + (244, 0, 95), + (88, 209, 235), + (120, 120, 120), + (98, 94, 76), + ], + [ + (244, 0, 95), + (152, 224, 36), + (224, 213, 97), + (157, 101, 255), + (244, 0, 95), + (88, 209, 235), + (246, 246, 239), + ], + ) + else: + export_theme = MONOKAI + + if args.export_html: + console.save_html( + '{}.html'.format(args.log_path), + theme=export_theme, + clear=False + ) + + if args.export_svg: + console.save_svg( + '{}.svg'.format(args.log_path), + title='Drishti', + theme=export_theme, + clear=False + ) + + if args.export_csv: + issues = [ + 'JOB', + INSIGHTS_STDIO_HIGH_USAGE, + 
INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, + INSIGHTS_POSIX_READ_COUNT_INTENSIVE, + INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, + INSIGHTS_POSIX_READ_SIZE_INTENSIVE, + INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, + INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, + INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, + INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, + INSIGHTS_POSIX_REDUNDANT_READ_USAGE, + INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, + INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, + INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, + INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, + INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, + INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, + INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, + INSIGHTS_POSIX_HIGH_METADATA_TIME, + INSIGHTS_POSIX_SIZE_IMBALANCE, + INSIGHTS_POSIX_TIME_IMBALANCE, + INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, + INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, + INSIGHTS_MPI_IO_NO_USAGE, + INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, + INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, + INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE, + INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE, + INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, + INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, + INSIGHTS_MPI_IO_AGGREGATORS_INTRA, + INSIGHTS_MPI_IO_AGGREGATORS_INTER, + INSIGHTS_MPI_IO_AGGREGATORS_OK + ] + if codes: + issues.extend(codes) + + detected_issues = dict.fromkeys(issues, False) + detected_issues['JOB'] = None + + for report in csv_report: + detected_issues[report] = True + + filename = '{}-summary.csv'.format( + args.log_path + ) + + with open(filename, 'w') as f: + w = csv.writer(f) + w.writerow(detected_issues.keys()) + w.writerow(detected_issues.values()) + + diff --git a/drishti/includes.py b/drishti/includes.py new file mode 100644 index 0000000..e801cdf --- /dev/null +++ b/drishti/includes.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 + +import os + +from rich.console import Console, Group +from rich.padding import Padding +from rich.panel import Panel + + +RECOMMENDATIONS = 0 +HIGH = 1 +WARN = 2 +INFO = 3 +OK = 4 + +ROOT = os.path.abspath(os.path.dirname(__file__)) + +TARGET_USER = 1 +TARGET_DEVELOPER = 2 +TARGET_SYSTEM = 3 + +insights_operation = [] +insights_metadata = [] +insights_dxt = [] + +insights_total = dict() + +insights_total[HIGH] = 0 +insights_total[WARN] = 0 +insights_total[RECOMMENDATIONS] = 0 + +THRESHOLD_OPERATION_IMBALANCE = 0.1 +THRESHOLD_SMALL_REQUESTS = 0.1 +THRESHOLD_SMALL_REQUESTS_ABSOLUTE = 1000 +THRESHOLD_MISALIGNED_REQUESTS = 0.1 +THRESHOLD_METADATA = 0.1 +THRESHOLD_METADATA_TIME_RANK = 30 # seconds +THRESHOLD_RANDOM_OPERATIONS = 0.2 +THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE = 1000 +THRESHOLD_STRAGGLERS = 0.15 +THRESHOLD_IMBALANCE = 0.30 +THRESHOLD_INTERFACE_STDIO = 0.1 +THRESHOLD_COLLECTIVE_OPERATIONS = 0.5 +THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE = 1000 +THRESHOLD_SMALL_BYTES = 1048576 # 1 MB + +INSIGHTS_STDIO_HIGH_USAGE = 'S01' +INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE = 'P01' +INSIGHTS_POSIX_READ_COUNT_INTENSIVE = 'P02' +INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE = 'P03' +INSIGHTS_POSIX_READ_SIZE_INTENSIVE = 'P04' +INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE = 'P05' +INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE = 'P06' +INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE = 'P07' +INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE = 'P08' +INSIGHTS_POSIX_REDUNDANT_READ_USAGE = 'P09' +INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE = 'P10' +INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE = 'P11' +INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE = 'P12' +INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE = 'P13' 
+INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE = 'P14' +INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE = 'P15' +INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE = 'P16' +INSIGHTS_POSIX_HIGH_METADATA_TIME = 'P17' +INSIGHTS_POSIX_SIZE_IMBALANCE = 'P18' +INSIGHTS_POSIX_TIME_IMBALANCE = 'P19' +INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE = 'P21' +INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE = 'P22' +INSIGHTS_MPI_IO_NO_USAGE = 'M01' +INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE = 'M02' +INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE = 'M03' +INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE = 'M04' +INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE = 'M05' +INSIGHTS_MPI_IO_BLOCKING_READ_USAGE = 'M06' +INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE = 'M07' +INSIGHTS_MPI_IO_AGGREGATORS_INTRA = 'M08' +INSIGHTS_MPI_IO_AGGREGATORS_INTER = 'M09' +INSIGHTS_MPI_IO_AGGREGATORS_OK = 'M10' + +# TODO: need to verify the threashold to be between 0 and 1 +# TODO: read thresholds from file + + +console = Console(record=True) +csv_report = [] + + +def init_console(args): + if args.export_size: console.width = int(args.export_size) + + +def validate_thresholds(): + """ + Validate thresholds defined by the user. + """ + assert(THRESHOLD_OPERATION_IMBALANCE >= 0.0 and THRESHOLD_OPERATION_IMBALANCE <= 1.0) + assert(THRESHOLD_SMALL_REQUESTS >= 0.0 and THRESHOLD_SMALL_REQUESTS <= 1.0) + assert(THRESHOLD_MISALIGNED_REQUESTS >= 0.0 and THRESHOLD_MISALIGNED_REQUESTS <= 1.0) + assert(THRESHOLD_METADATA >= 0.0 and THRESHOLD_METADATA <= 1.0) + assert(THRESHOLD_RANDOM_OPERATIONS >= 0.0 and THRESHOLD_RANDOM_OPERATIONS <= 1.0) + + assert(THRESHOLD_METADATA_TIME_RANK >= 0.0) + + +def convert_bytes(bytes_number): + """ + Convert bytes into formatted string. + """ + tags = [ + 'bytes', + 'KB', + 'MB', + 'GB', + 'TB', + 'PB', + 'EB' + ] + + i = 0 + double_bytes = bytes_number + + while (i < len(tags) and bytes_number >= 1024): + double_bytes = bytes_number / 1024.0 + i = i + 1 + bytes_number = bytes_number / 1024 + + return str(round(double_bytes, 2)) + ' ' + tags[i] + + +def message(args, code, target, level, issue, recommendations=None, details=None): + """ + Display the message on the screen with level, issue, and recommendation. 
+ """ + icon = ':arrow_forward:' + + if level in (HIGH, WARN): + insights_total[level] += 1 + + if level == HIGH: + color = '[red]' + elif level == WARN: + color = '[orange1]' + elif level == OK: + color = '[green]' + else: + color = '' + + messages = [ + '{}{}{} {}'.format( + color, + icon, + ' [' + code + ']' if args.code else '', + issue + ) + ] + + if args.export_csv: + csv_report.append(code) + + if details: + for detail in details: + messages.append(' {}:left_arrow_curving_right: {}'.format( + color, + detail['message'] + ) + ) + + if recommendations: + if not args.only_issues: + messages.append(' [white]:left_arrow_curving_right: [b]Recommendations:[/b]') + + for recommendation in recommendations: + messages.append(' :left_arrow_curving_right: {}'.format(recommendation['message'])) + + if args.verbose and 'sample' in recommendation: + messages.append( + Padding( + Panel( + recommendation['sample'], + title='Solution Example Snippet', + title_align='left', + padding=(1, 2) + ), + (1, 0, 1, 7) + ) + ) + + insights_total[RECOMMENDATIONS] += len(recommendations) + + return Group( + *messages + ) diff --git a/drishti/reporter.py b/drishti/reporter.py new file mode 100644 index 0000000..f1ab847 --- /dev/null +++ b/drishti/reporter.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse + +from subprocess import call + + +LOG_TYPE_DARSHAN = 0 +LOG_TYPE_RECORDER = 1 + +parser = argparse.ArgumentParser( + description='Drishti: ' +) + +parser.add_argument( + 'log_path', + help='Input .darshan file or recorder folder' +) + +parser.add_argument( + '--issues', + default=False, + action='store_true', + dest='only_issues', + help='Only displays the detected issues and hides the recommendations' +) + +parser.add_argument( + '--html', + default=False, + action='store_true', + dest='export_html', + help='Export the report as an HTML page' +) + +parser.add_argument( + '--svg', + default=False, + action='store_true', + dest='export_svg', + help='Export the report as an SVG image' +) + +parser.add_argument( + '--light', + default=False, + action='store_true', + dest='export_theme_light', + help='Use a light theme for the report when generating files' +) + +parser.add_argument( + '--size', + default=False, + dest='export_size', + help='Console width used for the report and generated files' +) + +parser.add_argument( + '--verbose', + default=False, + action='store_true', + dest='verbose', + help='Display extended details for the recommendations' +) + +parser.add_argument( + '--code', + default=False, + action='store_true', + dest='code', + help='Display insights identification code' +) + +parser.add_argument( + '--path', + default=False, + action='store_true', + dest='full_path', + help='Display the full file path for the files that triggered the issue' +) + +parser.add_argument( + '--csv', + default=False, + action='store_true', + dest='export_csv', + help='Export a CSV with the code of all issues that were triggered' +) + +parser.add_argument( + '--json', + default=False, + dest='json', + help=argparse.SUPPRESS) + +args = parser.parse_args() + + +def clear(): + """ + Clear the screen with the comment call based on the operating system. 
+ """ + _ = call('clear' if os.name == 'posix' else 'cls') + + +def check_log_type(path): + if path.endswith('.darshan'): + if not os.path.isfile(path): + print('Unable to open .darshan file.') + sys.exit(os.EX_NOINPUT) + else: return LOG_TYPE_DARSHAN + else: # check whether is a valid recorder log + if not os.path.isdir(path): + print('Unable to open recorder folder.') + sys.exit(os.EX_NOINPUT) + else: return LOG_TYPE_RECORDER + + +def main(): + log_type = check_log_type(args.log_path) + + if log_type == LOG_TYPE_DARSHAN: + from . import handle_darshan + handle_darshan.handler(args) + + elif log_type == LOG_TYPE_RECORDER: + from . import handle_recorder + handle_recorder.handler(args) + diff --git a/requirements.txt b/requirements.txt index 1f1dc56..467f761 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ argparse darshan pandas rich==12.5.1 +recorder-utils diff --git a/setup.py b/setup.py index e680e7b..dd18cb6 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ setuptools.setup( name="drishti-io", keywords="drishti", - version="0.4", + version="0.5", author="Jean Luca Bez, Suren Byna", author_email="jlbez@lbl.gov, sbyna@lbl.gov", description="", @@ -21,6 +21,7 @@ 'pandas', 'darshan', 'rich ==12.5.1', + 'recorder-utils', ], packages=[ 'drishti' @@ -33,7 +34,7 @@ include_package_data=True, entry_points={ "console_scripts": [ - "drishti=drishti.main:main" + "drishti=drishti.reporter:main" ] }, classifiers=[ From ae1dde1e3cf412e0a71a30dbcbf0c7f672820c53 Mon Sep 17 00:00:00 2001 From: Onewbiek Date: Fri, 3 Nov 2023 00:39:38 -0400 Subject: [PATCH 03/19] Limit details to 10 lines --- drishti/includes.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drishti/includes.py b/drishti/includes.py index e801cdf..3b921aa 100644 --- a/drishti/includes.py +++ b/drishti/includes.py @@ -77,6 +77,8 @@ INSIGHTS_MPI_IO_AGGREGATORS_INTER = 'M09' INSIGHTS_MPI_IO_AGGREGATORS_OK = 'M10' +DETAILS_MAX_SIZE = 10 + # TODO: need to verify the threashold to be between 0 and 1 # TODO: read thresholds from file @@ -88,6 +90,15 @@ def init_console(args): if args.export_size: console.width = int(args.export_size) + insights_operation.clear() + insights_metadata.clear() + insights_dxt.clear() + + insights_total[HIGH] = 0 + insights_total[WARN] = 0 + insights_total[RECOMMENDATIONS] = 0 + + csv_report.clear() def validate_thresholds(): """ @@ -158,7 +169,7 @@ def message(args, code, target, level, issue, recommendations=None, details=None csv_report.append(code) if details: - for detail in details: + for detail in details[:DETAILS_MAX_SIZE]: messages.append(' {}:left_arrow_curving_right: {}'.format( color, detail['message'] From a221578c35216baf154b219d458541538b4b5b64 Mon Sep 17 00:00:00 2001 From: Onewbiek Date: Fri, 3 Nov 2023 00:41:39 -0400 Subject: [PATCH 04/19] Add new function to split report for each file been tracked --- drishti/handle_recorder.py | 33 +- drishti/handle_recorder_split.py | 982 +++++++++++++++++++++++++++++++ drishti/reporter.py | 19 +- 3 files changed, 1005 insertions(+), 29 deletions(-) create mode 100644 drishti/handle_recorder_split.py diff --git a/drishti/handle_recorder.py b/drishti/handle_recorder.py index 18136f3..59462af 100644 --- a/drishti/handle_recorder.py +++ b/drishti/handle_recorder.py @@ -23,26 +23,6 @@ from .includes import * -def get_modules(reader): - func_list = reader.funcs - ranks = reader.GM.total_ranks - modules = set() - - for rank in range(ranks): - for i in range(reader.LMs[rank].total_records): - record = 
reader.records[rank][i] - func_name = func_list[record.func_id] - if 'MPI_File' in func_name: - modules.add('MPI-IO') - elif 'MPI' in func_name: - modules.add('MPI') - elif 'H5' in func_name: - modules.add('H5F') - else: modules.add('POSIX') - - return modules - - def get_accessed_files(reader): ranks = reader.GM.total_ranks filemap = {} @@ -79,7 +59,6 @@ def handler(args): df_intervals = build_offset_intervals(reader) df_posix_records = init_df_posix_recordes(reader) - modules = get_modules(reader) unique_files = get_accessed_files(reader) def add_api(row): @@ -98,6 +77,8 @@ def add_duration(row): df_intervals['duration'] = df_intervals.apply(add_duration, axis=1) df_posix_records['duration'] = df_posix_records.apply(add_duration, axis=1) + modules = set(df_intervals['api'].unique()) + ######################################################################################################################################################################### # Check usage of POSIX, and MPI-IO per file @@ -156,7 +137,7 @@ def add_duration(row): message(args, INSIGHTS_STDIO_HIGH_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) ) - if 'MPI-IO' not in modules: + if 'MPIIO' not in modules: issue = 'Application is using low-performance interface' recommendation = [ @@ -264,7 +245,7 @@ def add_duration(row): } ) - if 'MPI-IO' in modules: + if 'MPIIO' in modules: recommendation.append( { 'message': 'Since the appplication already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_read_all() or MPI_File_read_at_all()) to aggregate requests into larger ones', @@ -308,7 +289,7 @@ def add_duration(row): } ) - if 'MPI-IO' in modules: + if 'MPIIO' in modules: recommendation.append( { 'message': 'Since the application already uses MPI-IO, consider using collective I/O calls (e.g. 
MPI_File_write_all() or MPI_File_write_at_all()) to aggregate requests into larger ones', @@ -922,7 +903,7 @@ def add_duration(row): } ) - if 'MPI-IO' in modules: + if 'MPIIO' in modules: recommendation.append( { 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iread(), MPI_File_read_all_begin/end(), or MPI_File_read_at_all_begin/end())', @@ -947,7 +928,7 @@ def add_duration(row): } ) - if 'MPI-IO' in modules: + if 'MPIIO' in modules: recommendation.append( { 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iwrite(), MPI_File_write_all_begin/end(), or MPI_File_write_at_all_begin/end())', diff --git a/drishti/handle_recorder_split.py b/drishti/handle_recorder_split.py new file mode 100644 index 0000000..74bc899 --- /dev/null +++ b/drishti/handle_recorder_split.py @@ -0,0 +1,982 @@ +#!/usr/bin/env python3 + +import os +import csv +import time +import json + +import datetime + +import pandas as pd + +from rich import print, box +from rich.console import Group +from rich.padding import Padding +from rich.syntax import Syntax +from rich.panel import Panel +from rich.terminal_theme import TerminalTheme +from rich.terminal_theme import MONOKAI + +from recorder_utils import RecorderReader +from recorder_utils.build_offset_intervals import build_offset_intervals + +from .includes import * + + +def get_accessed_files(reader): + ranks = reader.GM.total_ranks + filemap = {} + for rank in range(ranks): + filemap.update(reader.LMs[rank].filemap) + + return filemap + + +def init_df_posix_recordes(reader): + func_list = reader.funcs + ranks = reader.GM.total_ranks + records = [] + for rank in range(ranks): + for i in range(reader.LMs[rank].total_records): + record = reader.records[rank][i] + func_name = func_list[record.func_id] + + if 'MPI' not in func_name and 'H5' not in func_name: + filename = None + if "open" in func_name or "close" in func_name or "creat" in func_name \ + or "seek" in func_name or "sync" in func_name: + fstr = record.args[0] + filename = fstr if type(fstr)==str else fstr.decode('utf-8') + filename = filename.replace('./', '') + + records.append( [filename, rank, func_name, record.tstart, record.tend] ) + + head = ['fname', 'rank', 'function', 'start', 'end'] + df_posix_records = pd.DataFrame(records, columns=head) + return df_posix_records + + +def handler(args): + reader = RecorderReader(args.log_path) + df_intervals = build_offset_intervals(reader) + df_posix_records = init_df_posix_recordes(reader) + + unique_files = get_accessed_files(reader) + + def add_api(row): + if 'MPI' in row['function']: + return 'MPIIO' + elif 'H5' in row['function']: + return 'H5F' + else: + return 'POSIX' + + df_intervals['api'] = df_intervals.apply(add_api, axis=1) + + def add_duration(row): + return row['end'] - row['start'] + + df_intervals['duration'] = df_intervals.apply(add_duration, axis=1) + df_posix_records['duration'] = df_posix_records.apply(add_duration, axis=1) + + ######################################################################################################################################################################### + for fid, fname in unique_files.items(): + console = Console(record=True) + init_console(args) + validate_thresholds() + insights_start_time = time.time() + + df_intervals_temp = df_intervals[(df_intervals['file_id'] == fid)] + if not len(df_intervals_temp): continue + + df_posix_records = df_posix_records[(df_posix_records['fname'] == fname)] + modules 
= set(df_intervals_temp['api'].unique()) + + # Check usage of POSIX, and MPI-IO per file + total_size_stdio = 0 + total_size_posix = 0 + total_size_mpiio = 0 + total_size = 0 + + df_stdio_intervals = df_intervals_temp[(df_intervals_temp['api'] == 'STDIO')] + df_posix_intervals = df_intervals_temp[(df_intervals_temp['api'] == 'POSIX')] + df_mpiio_intervals = df_intervals_temp[(df_intervals_temp['api'] == 'MPIIO')] + + if len(df_stdio_intervals): + total_size_stdio += df_stdio_intervals['size'].sum() + + if len(df_posix_intervals): + total_size_posix += df_posix_intervals['size'].sum() + + if len(df_mpiio_intervals): + total_size_mpiio += df_mpiio_intervals['size'].sum() + + + # Since POSIX will capture both POSIX-only accesses and those coming from MPI-IO, we can subtract those + if total_size_posix > 0 and total_size_posix >= total_size_mpiio: + total_size_posix -= total_size_mpiio + + total_size = total_size_stdio + total_size_posix + total_size_mpiio + + assert(total_size_stdio >= 0) + assert(total_size_posix >= 0) + assert(total_size_mpiio >= 0) + + if total_size and total_size_stdio / total_size > THRESHOLD_INTERFACE_STDIO: + issue = 'Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})'.format( + total_size_stdio / total_size * 100.0, + convert_bytes(total_size_stdio) + ) + + recommendation = [ + { + 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO' + } + ] + + insights_operation.append( + message(args, INSIGHTS_STDIO_HIGH_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + + if 'MPIIO' not in modules: + issue = 'Application is using low-performance interface' + + recommendation = [ + { + 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO' + } + ] + + insights_operation.append( + message(args, INSIGHTS_MPI_IO_NO_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + ) + + ######################################################################################################################################################################### + + if df_intervals_temp['api'].eq('POSIX').any(): + df_posix = df_intervals_temp[(df_intervals_temp['api'] == 'POSIX')] + + ######################################################################################################################################################################### + + # Get number of write/read operations + total_reads = len(df_posix[(df_posix['function'].str.contains('read'))]) + total_writes = len(df_posix[~(df_posix['function'].str.contains('read'))]) + + # Get total number of I/O operations + total_operations = total_writes + total_reads + + # To check whether the application is write-intensive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance + if total_writes > total_reads and total_operations and abs(total_writes - total_reads) / total_operations > THRESHOLD_OPERATION_IMBALANCE: + issue = 'Application is write operation intensive ({:.2f}% writes vs. 
{:.2f}% reads)'.format( + total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 + ) + + insights_metadata.append( + message(args, INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + ) + + if total_reads > total_writes and total_operations and abs(total_writes - total_reads) / total_operations > THRESHOLD_OPERATION_IMBALANCE: + issue = 'Application is read operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( + total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 + ) + + insights_metadata.append( + message(args, INSIGHTS_POSIX_READ_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + ) + + total_read_size = df_posix[(df_posix['function'].str.contains('read'))]['size'].sum() + total_written_size = df_posix[~(df_posix['function'].str.contains('read'))]['size'].sum() + + total_size = total_written_size + total_read_size + + if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > THRESHOLD_OPERATION_IMBALANCE: + issue = 'Application is write size intensive ({:.2f}% write vs. {:.2f}% read)'.format( + total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 + ) + + insights_metadata.append( + message(args, INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + ) + + if total_read_size > total_written_size and abs(total_written_size - total_read_size) / total_size > THRESHOLD_OPERATION_IMBALANCE: + issue = 'Application is read size intensive ({:.2f}% write vs. {:.2f}% read)'.format( + total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 + ) + + insights_metadata.append( + message(args, INSIGHTS_POSIX_READ_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + ) + + ######################################################################################################################################################################### + + # Get the number of small I/O operations (less than 1 MB) + + total_reads_small = len(df_posix[(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + total_writes_small = len(df_posix[~(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + + if total_reads_small and total_reads_small / total_reads > THRESHOLD_SMALL_REQUESTS and total_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: + issue = 'Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests'.format( + total_reads_small, total_reads_small / total_reads * 100.0 + ) + + recommendation = [] + + recommendation.append( + { + 'message': 'Consider buffering read operations into larger more contiguous ones' + } + ) + + if 'MPIIO' in modules: + recommendation.append( + { + 'message': 'Since the application already uses MPI-IO, consider using collective I/O calls (e.g. 
MPI_File_read_all() or MPI_File_read_at_all()) to aggregate requests into larger ones', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + } + ) + else: + recommendation.append( + { + 'message': 'Application does not use MPI-IO for operations, consider using this interface instead to harness collective operations' + } + ) + + insights_operation.append( + message(args, INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + + if total_writes_small and total_writes_small / total_writes > THRESHOLD_SMALL_REQUESTS and total_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: + issue = 'Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all write requests'.format( + total_writes_small, total_writes_small / total_writes * 100.0 + ) + + recommendation = [] + + recommendation.append( + { + 'message': 'Consider buffering write operations into larger more contiguous ones' + } + ) + + if 'MPIIO' in modules: + recommendation.append( + { + 'message': 'Since the application already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_write_all() or MPI_File_write_at_all()) to aggregate requests into larger ones', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') + } + ) + else: + recommendation.append( + { + 'message': 'Application does not use MPI-IO for operations, consider using this interface instead to harness collective operations' + } + ) + + insights_operation.append( + message(args, INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + + ######################################################################################################################################################################### + + # How many requests are misaligned? 
+ # TODO: + + ######################################################################################################################################################################### + + # Redundant read-traffic (based on Phill) + # POSIX_MAX_BYTE_READ (Highest offset in the file that was read) + max_read_offset = df_posix[(df_posix['function'].str.contains('read'))]['offset'].max() + + if max_read_offset > total_read_size: + issue = 'Application might have redundant read traffic (more data read than the highest offset)' + + insights_metadata.append( + message(args, INSIGHTS_POSIX_REDUNDANT_READ_USAGE, TARGET_DEVELOPER, WARN, issue, None) + ) + + max_write_offset = df_posix[~(df_posix['function'].str.contains('read'))]['offset'].max() + + if max_write_offset > total_written_size: + issue = 'Application might have redundant write traffic (more data written than the highest offset)' + + insights_metadata.append( + message(args, INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, None) + ) + + ######################################################################################################################################################################### + + # Check for a lot of random operations + + read_consecutive = 0 + read_sequential = 0 + read_random = 0 + + df_filtered = df_posix[(df_posix['function'].str.contains('read'))].sort_values('start') + + for i in range(len(df_filtered) - 1): + curr_interval = df_filtered.iloc[i] + next_interval = df_filtered.iloc[i + 1] + if curr_interval['offset'] + curr_interval['size'] == next_interval['offset']: + read_consecutive += 1 + elif curr_interval['offset'] + curr_interval['size'] < next_interval['offset']: + read_sequential += 1 + else: + read_random += 1 + + if total_reads: + if read_random and read_random / total_reads > THRESHOLD_RANDOM_OPERATIONS and read_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE: + issue = 'Application is issuing a high number ({}) of random read operations ({:.2f}%)'.format( + read_random, read_random / total_reads * 100.0 + ) + + recommendation = [ + { + 'message': 'Consider changing your data model to have consecutive or sequential reads' + } + ] + + insights_operation.append( + message(args, INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + else: + issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) read requests'.format( + read_consecutive / total_reads * 100.0, + read_sequential / total_reads * 100.0 + ) + + insights_operation.append( + message(args, INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, TARGET_DEVELOPER, OK, issue, None) + ) + + write_consecutive = 0 + write_sequential = 0 + write_random = 0 + + + df_filtered = df_posix[~(df_posix['function'].str.contains('read'))].sort_values('start') + + for i in range(len(df_filtered) - 1): + curr_interval = df_filtered.iloc[i] + next_interval = df_filtered.iloc[i + 1] + if curr_interval['offset'] + curr_interval['size'] == next_interval['offset']: + write_consecutive += 1 + elif curr_interval['offset'] + curr_interval['size'] < next_interval['offset']: + write_sequential += 1 + else: + write_random += 1 + + if total_writes: + if write_random and write_random / total_writes > THRESHOLD_RANDOM_OPERATIONS and write_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE: + issue = 'Application is issuing a high number ({}) of random write operations ({:.2f}%)'.format( + write_random, write_random / total_writes * 100.0 + ) + + recommendation = [ + { + 'message': 'Consider changing your data 
model to have consecutive or sequential writes' + } + ] + + insights_operation.append( + message(args, INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + else: + issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) write requests'.format( + write_consecutive / total_writes * 100.0, + write_sequential / total_writes * 100.0 + ) + + insights_operation.append( + message(args, INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, TARGET_DEVELOPER, OK, issue, None) + ) + + ######################################################################################################################################################################### + + # Shared file with small operations + + # A file is shared if it's been read/written by more than 1 rank + detected_files = df_posix['rank'].nunique() + + total_shared_reads = 0 + total_shared_reads_small = 0 + total_shared_writes = 0 + total_shared_writes_small = 0 + + if df_posix['rank'].nunique() > 1: + total_shared_reads += len(df_posix[(df_posix['function'].str.contains('read'))]) + total_shared_writes += len(df_posix[~(df_posix['function'].str.contains('read'))]) + + total_shared_reads_small = len(df_posix[(df_posix['function'].str.contains('read')) + & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + total_shared_writes_small = len(df_posix[~(df_posix['function'].str.contains('read')) + & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + + if total_shared_reads and total_shared_reads_small / total_shared_reads > THRESHOLD_SMALL_REQUESTS and total_shared_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: + issue = 'Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests'.format( + total_shared_reads_small, total_shared_reads_small / total_shared_reads * 100.0 + ) + + recommendation = [ + { + 'message': 'Consider coalescing read requests into larger more contiguous ones using MPI-IO collective operations', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(args, INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + + if total_shared_writes and total_shared_writes_small / total_shared_writes > THRESHOLD_SMALL_REQUESTS and total_shared_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: + issue = 'Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file write requests'.format( + total_shared_writes_small, total_shared_writes_small / total_shared_writes * 100.0 + ) + + recommendation = [ + { + 'message': 'Consider coalescing write requests into larger more contiguous ones using MPI-IO collective operations', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(args, INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + + ######################################################################################################################################################################### + + # TODO: Here I assume all operations other than write/read are metadata operations + df_posix_metadata = 
df_posix_records[~(df_posix_records['function'].str.contains('read|write|print', na=False))] + df_detected = df_posix_metadata.groupby('rank')['duration'].sum().reset_index() + has_long_metadata = df_detected[(df_detected['duration'] > THRESHOLD_METADATA_TIME_RANK)] + + if not has_long_metadata.empty: + issue = 'There are {} ranks where metadata operations take over {} seconds'.format( + len(has_long_metadata), THRESHOLD_METADATA_TIME_RANK + ) + + recommendation = [ + { + 'message': 'Attempt to combine files, reduce, or cache metadata operations' + } + ] + + if 'H5F' in modules: + recommendation.extend([ + { + 'message': 'Since your application uses HDF5, try enabling collective metadata calls with H5Pset_coll_metadata_write() and H5Pset_all_coll_metadata_ops()', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-collective-metadata.c'), line_numbers=True, background_color='default') + }, + { + 'message': 'Since your application uses HDF5, try using metadata cache to defer metadata operations', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-cache.c'), line_numbers=True, background_color='default') + } + ]) + + insights_metadata.append( + message(args, INSIGHTS_POSIX_HIGH_METADATA_TIME, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + + # We already have a single line for each shared-file access + # To check for stragglers, we can check the difference between the + + # POSIX_FASTEST_RANK_BYTES + # POSIX_SLOWEST_RANK_BYTES + # POSIX_VARIANCE_RANK_BYTES + + stragglers = False + + if df_posix['rank'].nunique() > 1: + total_transfer_size = df_posix['size'].sum() + + df_detected = df_posix.groupby('rank').agg({'size': 'sum', 'duration': 'sum'}).reset_index() + slowest_rank_bytes = df_detected.loc[df_detected['duration'].idxmax(), 'size'] + fastest_rank_bytes = df_detected.loc[df_detected['duration'].idxmin(), 'size'] + + if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > THRESHOLD_STRAGGLERS: + stragglers = True + + if stragglers: + issue = 'Load imbalance of {:.2f}% detected'.format( + abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size * 100 + ) + + recommendation = [ + { + 'message': 'Consider better balancing the data transfer between the application ranks' + }, + { + 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(args, INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation) + ) + + # POSIX_F_FASTEST_RANK_TIME + # POSIX_F_SLOWEST_RANK_TIME + # POSIX_F_VARIANCE_RANK_TIME + + stragglers = False + + if df_posix['rank'].nunique() > 1: + total_transfer_time = df_posix['duration'].sum() + + df_detected = df_posix.groupby('rank')['duration'].sum().reset_index() + + slowest_rank_time = df_detected['duration'].max() + fastest_rank_time = df_detected['duration'].min() + + if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > THRESHOLD_STRAGGLERS: + stragglers = True + + if stragglers: + issue = 'Load imbalance of {:.2f}% detected'.format( + abs(slowest_rank_time - fastest_rank_time) / total_transfer_time * 100 + ) + + recommendation = [ + { + 'message': 'Consider better distributing the data in the parallel file system' # needs to review what suggestion to give + }, + { + 'message': 'Consider tuning how your data 
is distributed in the file system by changing the stripe size and count', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(args, INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation) + ) + + # Get the individual files responsible for imbalance + imbalance = False + + if df_posix['rank'].nunique() == 1: + df_detected = df_posix[~(df_posix['function'].str.contains('read'))] + + max_bytes_written = df_detected['size'].max() + min_bytes_written = df_detected['size'].min() + + if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > THRESHOLD_IMBALANCE: + imbalance = True + + if imbalance: + issue = 'Load imbalance of {:.2f}% detected'.format( + abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 + ) + + recommendation = [ + { + 'message': 'Consider better balancing the data transfer between the application ranks' + }, + { + 'message': 'Consider tuning the stripe size and count to better distribute the data', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + }, + { + 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + }, + { + 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' + } + ] + + insights_operation.append( + message(args, INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + + imbalance = False + + if df_posix['rank'].nunique() == 1: + df_detected = df_posix[(df_posix['function'].str.contains('read'))] + + max_bytes_read = df_detected['size'].max() + min_bytes_read = df_detected['size'].min() + + if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > THRESHOLD_IMBALANCE: + imbalance = True + + if imbalance: + issue = 'Load imbalance of {:.2f}% detected'.format( + abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 + ) + + recommendation = [ + { + 'message': 'Consider better balancing the data transfer between the application ranks' + }, + { + 'message': 'Consider tuning the stripe size and count to better distribute the data', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + }, + { + 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + }, + { + 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' + } + ] + + insights_operation.append( + message(args, INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + + ######################################################################################################################################################################### + + if df_intervals_temp['api'].eq('MPIIO').any(): + df_mpiio = df_intervals_temp[(df_intervals_temp['api'] == 'MPIIO')] + + df_mpiio_reads = df_mpiio[(df_mpiio['function'].str.contains('read'))] + mpiio_indp_reads = len(df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all'))]) + mpiio_coll_reads = 
len(df_mpiio_reads[(df_mpiio_reads['function'].str.contains('_all'))]) + total_mpiio_read_operations = mpiio_indp_reads + mpiio_coll_reads + + df_mpiio_writes = df_mpiio[~(df_mpiio['function'].str.contains('read'))] + mpiio_indp_writes = len(df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all'))]) + mpiio_coll_writes = len(df_mpiio_writes[(df_mpiio_writes['function'].str.contains('_all'))]) + total_mpiio_write_operations = mpiio_indp_writes + mpiio_coll_writes + + if mpiio_coll_reads == 0: + if total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls'.format( + mpiio_indp_reads, + mpiio_indp_reads / (total_mpiio_read_operations) * 100 + ) + + recommendation = [ + { + 'message': 'Use collective read operations (e.g. MPI_File_read_all() or MPI_File_read_at_all()) and set one aggregator per compute node', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(args, INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + else: + issue = 'Application uses MPI-IO and read data using {} ({:.2f}%) collective operations'.format( + mpiio_coll_reads, + mpiio_coll_reads / total_mpiio_read_operations * 100 + ) + + insights_operation.append( + message(args, INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, OK, issue) + ) + + if mpiio_coll_writes == 0: + if total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls'.format( + mpiio_indp_writes, + mpiio_indp_writes / (total_mpiio_write_operations) * 100 + ) + + recommendation = [ + { + 'message': 'Use collective write operations (e.g. 
MPI_File_write_all() or MPI_File_write_at_all()) and set one aggregator per compute node', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(args, INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + + else: + issue = 'Application uses MPI-IO and write data using {} ({:.2f}%) collective operations'.format( + mpiio_coll_writes, + mpiio_coll_writes / total_mpiio_write_operations * 100 + ) + + insights_operation.append( + message(args, INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, OK, issue) + ) + + ######################################################################################################################################################################### + + # Look for usage of non-block operations + + # Look for HDF5 file extension + + has_hdf5_extension = False + + if fname.endswith('.h5') or fname.endswith('.hdf5'): + has_hdf5_extension = True + + if len(df_mpiio_reads[(df_mpiio_reads['function'].str.contains('iread|begin|end'))]) == 0: + issue = 'Application could benefit from non-blocking (asynchronous) reads' + + recommendation = [] + + if 'H5F' in modules or has_hdf5_extension: + recommendation.append( + { + 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-read.c'), line_numbers=True, background_color='default') + } + ) + + if 'MPIIO' in modules: + recommendation.append( + { + 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iread(), MPI_File_read_all_begin/end(), or MPI_File_read_at_all_begin/end())', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iread.c'), line_numbers=True, background_color='default') + } + ) + + insights_operation.append( + message(args, INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + ) + + if len(df_mpiio_writes[(df_mpiio_writes['function'].str.contains('iwrite|begin|end'))]) == 0: + issue = 'Application could benefit from non-blocking (asynchronous) writes' + + recommendation = [] + + if 'H5F' in modules or has_hdf5_extension: + recommendation.append( + { + 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-write.c'), line_numbers=True, background_color='default') + } + ) + + if 'MPIIO' in modules: + recommendation.append( + { + 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iwrite(), MPI_File_write_all_begin/end(), or MPI_File_write_at_all_begin/end())', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iwrite.c'), line_numbers=True, background_color='default') + } + ) + + insights_operation.append( + message(args, INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + ) + + ######################################################################################################################################################################### + + # Nodes and MPI-IO aggregators + # If the application uses collective reads or collective writes, look for the number of aggregators + # TODO: + + 
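The non-blocking check above hinges on pandas treating the filter pattern as a regular expression: any call name containing 'iwrite' (or 'iread' for the read case), 'begin', or 'end' counts as non-blocking, and the warning only fires when no such call is found. A small standalone illustration on a toy DataFrame (not part of the patch):

import pandas as pd

calls = pd.DataFrame({'function': [
    'MPI_File_write_at',         # blocking, independent
    'MPI_File_write_at_all',     # blocking, collective
    'MPI_File_iwrite_at',        # non-blocking
    'MPI_File_write_all_begin',  # split collective, begin
    'MPI_File_write_all_end',    # split collective, end
]})

non_blocking = calls[calls['function'].str.contains('iwrite|begin|end')]
print(len(non_blocking))  # 3, so the blocking-write warning above would not be triggered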
######################################################################################################################################################################### + + NUMBER_OF_COMPUTE_NODES = 0 + + ######################################################################################################################################################################### + + codes = [] + if args.json: + f = open(args.json) + data = json.load(f) + + for key, values in data.items(): + for value in values: + code = value['code'] + codes.append(code) + + level = value['level'] + issue = value['issue'] + recommendation = [] + for rec in value['recommendations']: + new_message = {'message': rec} + recommendation.append(new_message) + + insights_dxt.append( + message(args, code, TARGET_DEVELOPER, level, issue, recommendation) + ) + + ######################################################################################################################################################################### + + insights_end_time = time.time() + + console.print() + + console.print( + Panel( + '\n'.join([ + ' [b]RECORDER[/b]: [white]{}[/white]'.format( + os.path.basename(args.log_path) + ), + ' [b]FILE[/b]: [white]{} ({})[/white]'.format( + fname, + fid, + ), + # ' [b]COMPUTE NODES[/b] [white]{}[/white]'.format( + # NUMBER_OF_COMPUTE_NODES + # ), + ' [b]PROCESSES[/b] [white]{}[/white]'.format( + reader.GM.total_ranks + ), + ]), + title='[b][slate_blue3]DRISHTI[/slate_blue3] v.0.5[/b]', + title_align='left', + subtitle='[red][b]{} critical issues[/b][/red], [orange1][b]{} warnings[/b][/orange1], and [white][b]{} recommendations[/b][/white]'.format( + insights_total[HIGH], + insights_total[WARN], + insights_total[RECOMMENDATIONS], + ), + subtitle_align='left', + padding=1 + ) + ) + + console.print() + + if insights_metadata: + console.print( + Panel( + Padding( + Group( + *insights_metadata + ), + (1, 1) + ), + title='METADATA', + title_align='left' + ) + ) + + if insights_operation: + console.print( + Panel( + Padding( + Group( + *insights_operation + ), + (1, 1) + ), + title='OPERATIONS', + title_align='left' + ) + ) + + if insights_dxt: + console.print( + Panel( + Padding( + Group( + *insights_dxt + ), + (1, 1) + ), + title='DXT', + title_align='left' + ) + ) + + console.print( + Panel( + ' {} | [white]LBNL[/white] | [white]Drishti report generated at {} in[/white] {:.3f} seconds'.format( + datetime.datetime.now().year, + datetime.datetime.now(), + insights_end_time - insights_start_time + ), + box=box.SIMPLE + ) + ) + + if args.export_theme_light: + export_theme = TerminalTheme( + (255, 255, 255), + (0, 0, 0), + [ + (26, 26, 26), + (244, 0, 95), + (152, 224, 36), + (253, 151, 31), + (157, 101, 255), + (244, 0, 95), + (88, 209, 235), + (120, 120, 120), + (98, 94, 76), + ], + [ + (244, 0, 95), + (152, 224, 36), + (224, 213, 97), + (157, 101, 255), + (244, 0, 95), + (88, 209, 235), + (246, 246, 239), + ], + ) + else: + export_theme = MONOKAI + + if args.export_html: + console.save_html( + '{}.{}.html'.format(args.log_path, fid), + theme=export_theme, + clear=False + ) + + if args.export_svg: + console.save_svg( + '{}.{}.svg'.format(args.log_path, fid), + title='Drishti', + theme=export_theme, + clear=False + ) + + if args.export_csv: + issues = [ + 'JOB', + INSIGHTS_STDIO_HIGH_USAGE, + INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, + INSIGHTS_POSIX_READ_COUNT_INTENSIVE, + INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, + INSIGHTS_POSIX_READ_SIZE_INTENSIVE, + INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, + 
INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, + INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, + INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, + INSIGHTS_POSIX_REDUNDANT_READ_USAGE, + INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, + INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, + INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, + INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, + INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, + INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, + INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, + INSIGHTS_POSIX_HIGH_METADATA_TIME, + INSIGHTS_POSIX_SIZE_IMBALANCE, + INSIGHTS_POSIX_TIME_IMBALANCE, + INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, + INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, + INSIGHTS_MPI_IO_NO_USAGE, + INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, + INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, + INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE, + INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE, + INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, + INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, + INSIGHTS_MPI_IO_AGGREGATORS_INTRA, + INSIGHTS_MPI_IO_AGGREGATORS_INTER, + INSIGHTS_MPI_IO_AGGREGATORS_OK + ] + if codes: + issues.extend(codes) + + detected_issues = dict.fromkeys(issues, False) + detected_issues['JOB'] = None + + for report in csv_report: + detected_issues[report] = True + + filename = '{}.{}.summary.csv'.format( + args.log_path, + fid + ) + + with open(filename, 'w') as f: + w = csv.writer(f) + w.writerow(detected_issues.keys()) + w.writerow(detected_issues.values()) + + + diff --git a/drishti/reporter.py b/drishti/reporter.py index f1ab847..4a274fe 100644 --- a/drishti/reporter.py +++ b/drishti/reporter.py @@ -94,7 +94,16 @@ '--json', default=False, dest='json', - help=argparse.SUPPRESS) + help=argparse.SUPPRESS +) + +parser.add_argument( + '--split', + default=False, + action='store_true', + dest='split_files', + help='Split the files and generate report for each file' +) args = parser.parse_args() @@ -127,6 +136,10 @@ def main(): handle_darshan.handler(args) elif log_type == LOG_TYPE_RECORDER: - from . import handle_recorder - handle_recorder.handler(args) + if args.split_files: + from . import handle_recorder_split + handle_recorder_split.handler(args) + else: + from . 
import handle_recorder + handle_recorder.handler(args) From c05bf2d449878ef2dacaa7a71c1a5ee7906d555c Mon Sep 17 00:00:00 2001 From: onewbiek Date: Wed, 29 Nov 2023 16:29:56 -0800 Subject: [PATCH 05/19] Update configuration file --- drishti/config.py | 272 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 272 insertions(+) create mode 100644 drishti/config.py diff --git a/drishti/config.py b/drishti/config.py new file mode 100644 index 0000000..aaf25b1 --- /dev/null +++ b/drishti/config.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 + +import os +import json + +from rich.console import Console, Group +from rich.padding import Padding +from rich.panel import Panel +from rich.terminal_theme import TerminalTheme +from rich.terminal_theme import MONOKAI + +from .parser import * + + +RECOMMENDATIONS = 0 +HIGH = 1 +WARN = 2 +INFO = 3 +OK = 4 + +ROOT = os.path.abspath(os.path.dirname(__file__)) + +TARGET_USER = 1 +TARGET_DEVELOPER = 2 +TARGET_SYSTEM = 3 + +insights_operation = [] +insights_metadata = [] +insights_dxt = [] + +insights_total = dict() + +insights_total[HIGH] = 0 +insights_total[WARN] = 0 +insights_total[RECOMMENDATIONS] = 0 + +THRESHOLD_OPERATION_IMBALANCE = 0.1 +THRESHOLD_SMALL_REQUESTS = 0.1 +THRESHOLD_SMALL_REQUESTS_ABSOLUTE = 1000 +THRESHOLD_MISALIGNED_REQUESTS = 0.1 +THRESHOLD_METADATA = 0.1 +THRESHOLD_METADATA_TIME_RANK = 30 # seconds +THRESHOLD_RANDOM_OPERATIONS = 0.2 +THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE = 1000 +THRESHOLD_STRAGGLERS = 0.15 +THRESHOLD_IMBALANCE = 0.30 +THRESHOLD_INTERFACE_STDIO = 0.1 +THRESHOLD_COLLECTIVE_OPERATIONS = 0.5 +THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE = 1000 +THRESHOLD_SMALL_BYTES = 1048576 # 1 MB + +INSIGHTS_STDIO_HIGH_USAGE = 'S01' +INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE = 'P01' +INSIGHTS_POSIX_READ_COUNT_INTENSIVE = 'P02' +INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE = 'P03' +INSIGHTS_POSIX_READ_SIZE_INTENSIVE = 'P04' +INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE = 'P05' +INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE = 'P06' +INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE = 'P07' +INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE = 'P08' +INSIGHTS_POSIX_REDUNDANT_READ_USAGE = 'P09' +INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE = 'P10' +INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE = 'P11' +INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE = 'P12' +INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE = 'P13' +INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE = 'P14' +INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE = 'P15' +INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE = 'P16' +INSIGHTS_POSIX_HIGH_METADATA_TIME = 'P17' +INSIGHTS_POSIX_SIZE_IMBALANCE = 'P18' +INSIGHTS_POSIX_TIME_IMBALANCE = 'P19' +INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE = 'P21' +INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE = 'P22' +INSIGHTS_MPI_IO_NO_USAGE = 'M01' +INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE = 'M02' +INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE = 'M03' +INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE = 'M04' +INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE = 'M05' +INSIGHTS_MPI_IO_BLOCKING_READ_USAGE = 'M06' +INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE = 'M07' +INSIGHTS_MPI_IO_AGGREGATORS_INTRA = 'M08' +INSIGHTS_MPI_IO_AGGREGATORS_INTER = 'M09' +INSIGHTS_MPI_IO_AGGREGATORS_OK = 'M10' + +DETAILS_MAX_SIZE = 10 + +# TODO: need to verify the threashold to be between 0 and 1 +# TODO: read thresholds from file + + +console = Console(record=True) +csv_report = [] +codes = [] +export_theme = MONOKAI + + +def init_console(): + set_export_size() + set_export_theme() + + insights_operation.clear() + insights_metadata.clear() + + 
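# Zero the issue counters so each new report starts from scratch +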
insights_total[HIGH] = 0 + insights_total[WARN] = 0 + insights_total[RECOMMENDATIONS] = 0 + + +def set_export_theme(): + global export_theme + if args.export_theme_light: + export_theme = TerminalTheme( + (255, 255, 255), + (0, 0, 0), + [ + (26, 26, 26), + (244, 0, 95), + (152, 224, 36), + (253, 151, 31), + (157, 101, 255), + (244, 0, 95), + (88, 209, 235), + (120, 120, 120), + (98, 94, 76), + ], + [ + (244, 0, 95), + (152, 224, 36), + (224, 213, 97), + (157, 101, 255), + (244, 0, 95), + (88, 209, 235), + (246, 246, 239), + ], + ) + + +def set_export_size(): + if args.export_size: console.width = int(args.export_size) + + +def load_json(): + codes = [] + if args.json: + f = open(args.json) + data = json.load(f) + + for key, values in data.items(): + for value in values: + code = value['code'] + codes.append(code) + + level = value['level'] + issue = value['issue'] + recommendation = [] + for rec in value['recommendations']: + new_message = {'message': rec} + recommendation.append(new_message) + + insights_dxt.append( + message(code, TARGET_DEVELOPER, level, issue, recommendation) + ) + + +def validate_thresholds(): + """ + Validate thresholds defined by the user. + """ + assert(THRESHOLD_OPERATION_IMBALANCE >= 0.0 and THRESHOLD_OPERATION_IMBALANCE <= 1.0) + assert(THRESHOLD_SMALL_REQUESTS >= 0.0 and THRESHOLD_SMALL_REQUESTS <= 1.0) + assert(THRESHOLD_MISALIGNED_REQUESTS >= 0.0 and THRESHOLD_MISALIGNED_REQUESTS <= 1.0) + assert(THRESHOLD_METADATA >= 0.0 and THRESHOLD_METADATA <= 1.0) + assert(THRESHOLD_RANDOM_OPERATIONS >= 0.0 and THRESHOLD_RANDOM_OPERATIONS <= 1.0) + + assert(THRESHOLD_METADATA_TIME_RANK >= 0.0) + + +def convert_bytes(bytes_number): + """ + Convert bytes into formatted string. + """ + tags = [ + 'bytes', + 'KB', + 'MB', + 'GB', + 'TB', + 'PB', + 'EB' + ] + + i = 0 + double_bytes = bytes_number + + while (i < len(tags) and bytes_number >= 1024): + double_bytes = bytes_number / 1024.0 + i = i + 1 + bytes_number = bytes_number / 1024 + + return str(round(double_bytes, 2)) + ' ' + tags[i] + + +def message(code, target, level, issue, recommendations=None, details=None): + """ + Display the message on the screen with level, issue, and recommendation. 
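+
+    code: insight identifier (e.g. 'P01'); shown when --code is set and recorded for --csv export
+    target: intended audience (TARGET_USER, TARGET_DEVELOPER, or TARGET_SYSTEM)
+    level: HIGH, WARN, INFO, or OK; HIGH and WARN increment the totals shown in the report header
+    issue: text describing the detected behavior
+    recommendations: optional list of dicts with a 'message' and, optionally, a 'sample' snippet shown with --verbose
+    details: optional list of dicts with a 'message'; only the first DETAILS_MAX_SIZE entries are displayed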
+ """ + icon = ':arrow_forward:' + + if level in (HIGH, WARN): + insights_total[level] += 1 + + if level == HIGH: + color = '[red]' + elif level == WARN: + color = '[orange1]' + elif level == OK: + color = '[green]' + else: + color = '' + + messages = [ + '{}{}{} {}'.format( + color, + icon, + ' [' + code + ']' if args.code else '', + issue + ) + ] + + if args.export_csv: + csv_report.append(code) + + if details: + for detail in details[:DETAILS_MAX_SIZE]: + messages.append(' {}:left_arrow_curving_right: {}'.format( + color, + detail['message'] + ) + ) + + if recommendations: + if not args.only_issues: + messages.append(' [white]:left_arrow_curving_right: [b]Recommendations:[/b]') + + for recommendation in recommendations: + messages.append(' :left_arrow_curving_right: {}'.format(recommendation['message'])) + + if args.verbose and 'sample' in recommendation: + messages.append( + Padding( + Panel( + recommendation['sample'], + title='Solution Example Snippet', + title_align='left', + padding=(1, 2) + ), + (1, 0, 1, 7) + ) + ) + + insights_total[RECOMMENDATIONS] += len(recommendations) + + return Group( + *messages + ) + + +''' +Pre-load +''' +load_json() + From 9509e90fc75fbd87a7eecacc42379db1623d4aab Mon Sep 17 00:00:00 2001 From: onewbiek Date: Wed, 29 Nov 2023 16:30:42 -0800 Subject: [PATCH 06/19] Seperate argument parser --- drishti/parser.py | 98 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 drishti/parser.py diff --git a/drishti/parser.py b/drishti/parser.py new file mode 100644 index 0000000..0261312 --- /dev/null +++ b/drishti/parser.py @@ -0,0 +1,98 @@ +import argparse + +parser = argparse.ArgumentParser( + description='Drishti: ' +) + +parser.add_argument( + 'log_path', + help='Input .darshan file or recorder folder' +) + +parser.add_argument( + '--issues', + default=False, + action='store_true', + dest='only_issues', + help='Only displays the detected issues and hides the recommendations' +) + +parser.add_argument( + '--html', + default=False, + action='store_true', + dest='export_html', + help='Export the report as an HTML page' +) + +parser.add_argument( + '--svg', + default=False, + action='store_true', + dest='export_svg', + help='Export the report as an SVG image' +) + +parser.add_argument( + '--light', + default=False, + action='store_true', + dest='export_theme_light', + help='Use a light theme for the report when generating files' +) + +parser.add_argument( + '--size', + default=False, + dest='export_size', + help='Console width used for the report and generated files' +) + +parser.add_argument( + '--verbose', + default=False, + action='store_true', + dest='verbose', + help='Display extended details for the recommendations' +) + +parser.add_argument( + '--code', + default=False, + action='store_true', + dest='code', + help='Display insights identification code' +) + +parser.add_argument( + '--path', + default=False, + action='store_true', + dest='full_path', + help='Display the full file path for the files that triggered the issue' +) + +parser.add_argument( + '--csv', + default=False, + action='store_true', + dest='export_csv', + help='Export a CSV with the code of all issues that were triggered' +) + +parser.add_argument( + '--json', + default=False, + dest='json', + help=argparse.SUPPRESS +) + +parser.add_argument( + '--split', + default=False, + action='store_true', + dest='split_files', + help='Split the files and generate report for each file' +) + +args = parser.parse_args() From 
db437085bbdbbbb4e3c10614409232e331014df0 Mon Sep 17 00:00:00 2001 From: onewbiek Date: Wed, 29 Nov 2023 16:31:57 -0800 Subject: [PATCH 07/19] Main entry --- drishti/reporter.py | 111 +++----------------------------------------- 1 file changed, 6 insertions(+), 105 deletions(-) diff --git a/drishti/reporter.py b/drishti/reporter.py index 4a274fe..ef92d11 100644 --- a/drishti/reporter.py +++ b/drishti/reporter.py @@ -2,111 +2,13 @@ import os import sys -import argparse - from subprocess import call +from .parser import * LOG_TYPE_DARSHAN = 0 LOG_TYPE_RECORDER = 1 -parser = argparse.ArgumentParser( - description='Drishti: ' -) - -parser.add_argument( - 'log_path', - help='Input .darshan file or recorder folder' -) - -parser.add_argument( - '--issues', - default=False, - action='store_true', - dest='only_issues', - help='Only displays the detected issues and hides the recommendations' -) - -parser.add_argument( - '--html', - default=False, - action='store_true', - dest='export_html', - help='Export the report as an HTML page' -) - -parser.add_argument( - '--svg', - default=False, - action='store_true', - dest='export_svg', - help='Export the report as an SVG image' -) - -parser.add_argument( - '--light', - default=False, - action='store_true', - dest='export_theme_light', - help='Use a light theme for the report when generating files' -) - -parser.add_argument( - '--size', - default=False, - dest='export_size', - help='Console width used for the report and generated files' -) - -parser.add_argument( - '--verbose', - default=False, - action='store_true', - dest='verbose', - help='Display extended details for the recommendations' -) - -parser.add_argument( - '--code', - default=False, - action='store_true', - dest='code', - help='Display insights identification code' -) - -parser.add_argument( - '--path', - default=False, - action='store_true', - dest='full_path', - help='Display the full file path for the files that triggered the issue' -) - -parser.add_argument( - '--csv', - default=False, - action='store_true', - dest='export_csv', - help='Export a CSV with the code of all issues that were triggered' -) - -parser.add_argument( - '--json', - default=False, - dest='json', - help=argparse.SUPPRESS -) - -parser.add_argument( - '--split', - default=False, - action='store_true', - dest='split_files', - help='Split the files and generate report for each file' -) - -args = parser.parse_args() - def clear(): """ @@ -132,14 +34,13 @@ def main(): log_type = check_log_type(args.log_path) if log_type == LOG_TYPE_DARSHAN: - from . import handle_darshan - handle_darshan.handler(args) + from .handle_darshan import handler elif log_type == LOG_TYPE_RECORDER: if args.split_files: - from . import handle_recorder_split - handle_recorder_split.handler(args) + from .handle_recorder_split import handler else: - from . 
import handle_recorder - handle_recorder.handler(args) + from .handle_recorder import handler + + handler() From d178dfd77cbb1ade7bef59aec9b37eedb108ad3d Mon Sep 17 00:00:00 2001 From: onewbiek Date: Wed, 29 Nov 2023 19:10:53 -0800 Subject: [PATCH 08/19] Modules to be called by handlers --- drishti/module.py | 864 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 864 insertions(+) create mode 100644 drishti/module.py diff --git a/drishti/module.py b/drishti/module.py new file mode 100644 index 0000000..fe21a18 --- /dev/null +++ b/drishti/module.py @@ -0,0 +1,864 @@ +#!/usr/bin/env python3 + +import datetime +import csv +from rich import box +from rich.syntax import Syntax +from .config import * + +''' +Before calling the functions below +Make sure the variables passed are in the given structure: +file_map: a dict of (id, path) pair +modules: a set or a dict should be ok +detected_files: A pandas dataframe +''' + +# Basic usage check + +def check_stdio(total_size, total_size_stdio): + if total_size and total_size_stdio / total_size > THRESHOLD_INTERFACE_STDIO: + issue = 'Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})'.format( + total_size_stdio / total_size * 100.0, + convert_bytes(total_size_stdio) + ) + + recommendation = [ + { + 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO' + } + ] + + insights_operation.append( + message(INSIGHTS_STDIO_HIGH_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + + +def check_mpiio(modules): + if 'MPI-IO' not in modules: + issue = 'Application is using low-performance interface' + + recommendation = [ + { + 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO' + } + ] + + insights_operation.append( + message(INSIGHTS_MPI_IO_NO_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + ) + + + +# POSIX level check + + +def check_operation_intensive(total_operations, total_reads, total_writes): + if total_writes > total_reads and total_operations and abs(total_writes - total_reads) / total_operations > THRESHOLD_OPERATION_IMBALANCE: + issue = 'Application is write operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( + total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 + ) + + insights_metadata.append( + message(INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + ) + + if total_reads > total_writes and total_operations and abs(total_writes - total_reads) / total_operations > THRESHOLD_OPERATION_IMBALANCE: + issue = 'Application is read operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( + total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 + ) + + insights_metadata.append( + message(INSIGHTS_POSIX_READ_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + ) + + +def check_size_intensive(total_size, total_read_size, total_written_size): + if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > THRESHOLD_OPERATION_IMBALANCE: + issue = 'Application is write size intensive ({:.2f}% write vs. 
{:.2f}% read)'.format( + total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 + ) + + insights_metadata.append( + message(INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + ) + + if total_read_size > total_written_size and abs(total_written_size - total_read_size) / total_size > THRESHOLD_OPERATION_IMBALANCE: + issue = 'Application is read size intensive ({:.2f}% write vs. {:.2f}% read)'.format( + total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 + ) + + insights_metadata.append( + message(INSIGHTS_POSIX_READ_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + ) + + +''' +detected_files required columns: +['id', 'total_reads', 'total_writes'] +detected_files.loc[:, 'id'] = detected_files.loc[:, 'id'].astype(str) +''' +def check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map): + if total_reads_small and total_reads_small / total_reads > THRESHOLD_SMALL_REQUESTS and total_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: + issue = 'Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests'.format( + total_reads_small, total_reads_small / total_reads * 100.0 + ) + + detail = [] + recommendation = [] + + for index, row in detected_files.iterrows(): + if row['total_reads'] > (total_reads * THRESHOLD_SMALL_REQUESTS / 2): + detail.append( + { + 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( + row['total_reads'], + row['total_reads'] / total_reads * 100.0, + file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) + ) + } + ) + + recommendation.append( + { + 'message': 'Consider buffering read operations into larger more contiguous ones' + } + ) + + if 'MPI-IO' in modules: + recommendation.append( + { + 'message': 'Since the appplication already uses MPI-IO, consider using collective I/O calls (e.g. 
MPI_File_read_all() or MPI_File_read_at_all()) to aggregate requests into larger ones', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + } + ) + else: + recommendation.append( + { + 'message': 'Application does not use MPI-IO for operations, consider using this interface instead to harness collective operations' + } + ) + + insights_operation.append( + message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + ) + + if total_writes_small and total_writes_small / total_writes > THRESHOLD_SMALL_REQUESTS and total_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: + issue = 'Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all write requests'.format( + total_writes_small, total_writes_small / total_writes * 100.0 + ) + + detail = [] + recommendation = [] + + for index, row in detected_files.iterrows(): + if row['total_writes'] > (total_writes * THRESHOLD_SMALL_REQUESTS / 2): + detail.append( + { + 'message': '{} ({:.2f}%) small write requests are to "{}"'.format( + row['total_writes'], + row['total_writes'] / total_writes * 100.0, + file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) + ) + } + ) + + recommendation.append( + { + 'message': 'Consider buffering write operations into larger more contiguous ones' + } + ) + + if 'MPI-IO' in modules: + recommendation.append( + { + 'message': 'Since the application already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_write_all() or MPI_File_write_at_all()) to aggregate requests into larger ones', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') + } + ) + else: + recommendation.append( + { + 'message': 'Application does not use MPI-IO for operations, consider using this interface instead to harness collective operations' + } + ) + + insights_operation.append( + message(INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + ) + + +def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules): + if total_operations and total_mem_not_aligned / total_operations > THRESHOLD_MISALIGNED_REQUESTS: + issue = 'Application has a high number ({:.2f}%) of misaligned memory requests'.format( + total_mem_not_aligned / total_operations * 100.0 + ) + + insights_metadata.append( + message(INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, TARGET_DEVELOPER, HIGH, issue, None) + ) + + if total_operations and total_file_not_aligned / total_operations > THRESHOLD_MISALIGNED_REQUESTS: + issue = 'Application issues a high number ({:.2f}%) of misaligned file requests'.format( + total_file_not_aligned / total_operations * 100.0 + ) + + recommendation = [ + { + 'message': 'Consider aligning the requests to the file system block boundaries' + } + ] + + if 'H5F' in modules: + recommendation.extend([ + { + 'message': 'Since the application uses HDF5, consider using H5Pset_alignment() in a file access property list', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-alignment.c'), line_numbers=True, background_color='default') + }, + { + 'message': 'Any file object greater than or equal in size to threshold bytes will be aligned on an address which is a multiple of alignment' + } + ]) + + if 'LUSTRE' in modules: + recommendation.append( + { + 'message': 'Consider using 
a Lustre alignment that matches the file system stripe configuration', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + } + ) + + insights_metadata.append( + message(INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + + +def check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size): + if max_read_offset > total_read_size: + issue = 'Application might have redundant read traffic (more data read than the highest offset)' + + insights_metadata.append( + message(INSIGHTS_POSIX_REDUNDANT_READ_USAGE, TARGET_DEVELOPER, WARN, issue, None) + ) + + if max_write_offset > total_written_size: + issue = 'Application might have redundant write traffic (more data written than the highest offset)' + + insights_metadata.append( + message(INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, None) + ) + + +def check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes): + if total_reads: + if read_random and read_random / total_reads > THRESHOLD_RANDOM_OPERATIONS and read_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE: + issue = 'Application is issuing a high number ({}) of random read operations ({:.2f}%)'.format( + read_random, read_random / total_reads * 100.0 + ) + + recommendation = [ + { + 'message': 'Consider changing your data model to have consecutive or sequential reads' + } + ] + + insights_operation.append( + message(INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + else: + issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) read requests'.format( + read_consecutive / total_reads * 100.0, + read_sequential / total_reads * 100.0 + ) + + insights_operation.append( + message(INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, TARGET_DEVELOPER, OK, issue, None) + ) + + if total_writes: + if write_random and write_random / total_writes > THRESHOLD_RANDOM_OPERATIONS and write_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE: + issue = 'Application is issuing a high number ({}) of random write operations ({:.2f}%)'.format( + write_random, write_random / total_writes * 100.0 + ) + + recommendation = [ + { + 'message': 'Consider changing your data model to have consecutive or sequential writes' + } + ] + + insights_operation.append( + message(INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + else: + issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) write requests'.format( + write_consecutive / total_writes * 100.0, + write_sequential / total_writes * 100.0 + ) + + insights_operation.append( + message(INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, TARGET_DEVELOPER, OK, issue, None) + ) + + +'''' +The shared_file required columns: +['id', 'INSIGHTS_POSIX_SMALL_READS', 'INSIGHTS_POSIX_SMALL_WRITES'] +''' +def check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map): + if total_shared_reads and total_shared_reads_small / total_shared_reads > THRESHOLD_SMALL_REQUESTS and total_shared_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: + issue = 'Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests'.format( + total_shared_reads_small, 
total_shared_reads_small / total_shared_reads * 100.0 + ) + + detail = [] + + for index, row in shared_files.iterrows(): + if row['INSIGHTS_POSIX_SMALL_READS'] > (total_shared_reads * THRESHOLD_SMALL_REQUESTS / 2): + detail.append( + { + 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( + row['INSIGHTS_POSIX_SMALL_READS'], + row['INSIGHTS_POSIX_SMALL_READS'] / total_shared_reads * 100.0, + file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) + ) + } + ) + + recommendation = [ + { + 'message': 'Consider coalescing read requests into larger more contiguous ones using MPI-IO collective operations', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + ) + + if total_shared_writes and total_shared_writes_small / total_shared_writes > THRESHOLD_SMALL_REQUESTS and total_shared_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: + issue = 'Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file write requests'.format( + total_shared_writes_small, total_shared_writes_small / total_shared_writes * 100.0 + ) + + detail = [] + + for index, row in shared_files.iterrows(): + if row['INSIGHTS_POSIX_SMALL_WRITES'] > (total_shared_writes * THRESHOLD_SMALL_REQUESTS / 2): + detail.append( + { + 'message': '{} ({:.2f}%) small write requests are to "{}"'.format( + row['INSIGHTS_POSIX_SMALL_WRITES'], + row['INSIGHTS_POSIX_SMALL_WRITES'] / total_shared_writes * 100.0, + file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) + ) + } + ) + + recommendation = [ + { + 'message': 'Consider coalescing write requests into larger more contiguous ones using MPI-IO collective operations', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + ) + + +def check_long_metadata(count_long_metadata, modules): + if count_long_metadata > 0: + issue = 'There are {} ranks where metadata operations take over {} seconds'.format( + count_long_metadata, THRESHOLD_METADATA_TIME_RANK + ) + + recommendation = [ + { + 'message': 'Attempt to combine files, reduce, or cache metadata operations' + } + ] + + if 'H5F' in modules: + recommendation.extend([ + { + 'message': 'Since your application uses HDF5, try enabling collective metadata calls with H5Pset_coll_metadata_write() and H5Pset_all_coll_metadata_ops()', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-collective-metadata.c'), line_numbers=True, background_color='default') + }, + { + 'message': 'Since your application uses HDF5, try using metadata cache to defer metadata operations', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-cache.c'), line_numbers=True, background_color='default') + } + ]) + + insights_metadata.append( + message(INSIGHTS_POSIX_HIGH_METADATA_TIME, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + + +''' +detected_files required columns: +['id', 'data_imbalance'] +''' +def check_shared_data_imblance(stragglers_count, detected_files, file_map): + if stragglers_count: + issue = 
'Detected data transfer imbalance caused by stragglers when accessing {} shared file.'.format( + stragglers_count + ) + + detail = [] + + for index, row in detected_files.iterrows(): + detail.append( + { + 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + row['data_imbalance'], + file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) + ) + } + ) + + recommendation = [ + { + 'message': 'Consider better balancing the data transfer between the application ranks' + }, + { + 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) + ) + + +''' +detected_files required columns: +['id', 'time_imbalance'] +''' +def check_shared_time_imbalance(stragglers_count, detected_files, file_map): + if stragglers_count: + issue = 'Detected time imbalance caused by stragglers when accessing {} shared file.'.format( + stragglers_count + ) + + detail = [] + + for index, row in detected_files.iterrows(): + detail.append( + { + 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + row['time_imbalance'], + file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) + ) + } + ) + + recommendation = [ + { + 'message': 'Consider better distributing the data in the parallel file system' # needs to review what suggestion to give + }, + { + 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) + ) + + +''' +detected_files required columns: +['id', 'write_imbalance'] +''' +def check_individual_write_imbalance(imbalance_count, detected_files, file_map): + if imbalance_count: + issue = 'Detected write imbalance when accessing {} individual files'.format( + imbalance_count + ) + + detail = [] + + for index, row in detected_files.iterrows(): + detail.append( + { + 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + row['write_imbalance'], + file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) + ) + } + ) + + recommendation = [ + { + 'message': 'Consider better balancing the data transfer between the application ranks' + }, + { + 'message': 'Consider tuning the stripe size and count to better distribute the data', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + }, + { + 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + }, + { + 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' + } + ] + + insights_operation.append( + message(INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + ) + + +''' +detected_files required columns: +['id', 'read_imbalance'] +''' +def 
check_individual_read_imbalance(imbalance_count, detected_files, file_map): + if imbalance_count: + issue = 'Detected read imbalance when accessing {} individual files.'.format( + imbalance_count + ) + + detail = [] + + for index, row in detected_files.iterrows(): + detail.append( + { + 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + row['read_imbalance'], + file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) + ) + } + ) + + recommendation = [ + { + 'message': 'Consider better balancing the data transfer between the application ranks' + }, + { + 'message': 'Consider tuning the stripe size and count to better distribute the data', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + }, + { + 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + }, + { + 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' + } + ] + + insights_operation.append( + message(INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + ) + + +# MPIIO level check + +''' +detected_files required columns: +['id', 'absolute_indep_reads', 'percent_indep_reads'] +''' +def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map): + if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls'.format( + mpiio_indep_reads, + mpiio_indep_reads / total_mpiio_read_operations * 100 + ) + + detail = [] + + for index, row in detected_files.iterrows(): + detail.append( + { + 'message': '{} ({}%) of independent reads to "{}"'.format( + row['absolute_indep_reads'], + row['percent_indep_reads'], + file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) + ) + } + ) + + recommendation = [ + { + 'message': 'Use collective read operations (e.g. 
MPI_File_read_all() or MPI_File_read_at_all()) and set one aggregator per compute node', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + ) + else: + issue = 'Application uses MPI-IO and read data using {} ({:.2f}%) collective operations'.format( + mpiio_coll_reads, + mpiio_coll_reads / total_mpiio_read_operations * 100 + ) + + insights_operation.append( + message(INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, OK, issue) + ) + + +''' +detected_files required columns: +['id', 'absolute_indep_writes', 'percent_indep_writes'] +''' +def check_mpi_collective_write_operation(mpi_coll_writes, mpi_indep_writes, total_mpiio_write_operations, detected_files, file_map): + if mpi_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls'.format( + mpi_indep_writes, + mpi_indep_writes / total_mpiio_write_operations * 100 + ) + + detail = [] + + for index, row in detected_files.iterrows(): + detail.append( + { + 'message': '{} ({}%) independent writes to "{}"'.format( + row['absolute_indep_writes'], + row['percent_indep_writes'], + file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) + ) + } + ) + + recommendation = [ + { + 'message': 'Use collective write operations (e.g. MPI_File_write_all() or MPI_File_write_at_all()) and set one aggregator per compute node', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + ) + else: + issue = 'Application uses MPI-IO and write data using {} ({:.2f}%) collective operations'.format( + mpi_coll_writes, + mpi_coll_writes / total_mpiio_write_operations * 100 + ) + + insights_operation.append( + message(INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, OK, issue) + ) + + +def check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules): + if mpiio_nb_reads == 0: + issue = 'Application could benefit from non-blocking (asynchronous) reads' + + recommendation = [] + + if 'H5F' in modules or has_hdf5_extension: + recommendation.append( + { + 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-read.c'), line_numbers=True, background_color='default') + } + ) + + if 'MPI-IO' in modules: + recommendation.append( + { + 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iread(), MPI_File_read_all_begin/end(), or MPI_File_read_at_all_begin/end())', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iread.c'), line_numbers=True, background_color='default') + } + ) + + insights_operation.append( + message(INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + ) + + if mpiio_nb_writes == 0: + issue = 'Application could benefit from non-blocking (asynchronous) writes' + + recommendation = [] + + if 'H5F' in 
modules or has_hdf5_extension: + recommendation.append( + { + 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-write.c'), line_numbers=True, background_color='default') + } + ) + + if 'MPI-IO' in modules: + recommendation.append( + { + 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iwrite(), MPI_File_write_all_begin/end(), or MPI_File_write_at_all_begin/end())', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iwrite.c'), line_numbers=True, background_color='default') + } + ) + + insights_operation.append( + message(INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + ) + + +def check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES): + if cb_nodes > NUMBER_OF_COMPUTE_NODES: + issue = 'Application is using inter-node aggregators (which require network communication)' + + recommendation = [ + { + 'message': 'Set the MPI hints for the number of aggregators as one per compute node (e.g., cb_nodes={})'.format( + NUMBER_OF_COMPUTE_NODES + ), + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-hints.bash'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(INSIGHTS_MPI_IO_AGGREGATORS_INTER, TARGET_USER, HIGH, issue, recommendation) + ) + + if cb_nodes < NUMBER_OF_COMPUTE_NODES: + issue = 'Application is using intra-node aggregators' + + insights_operation.append( + message(INSIGHTS_MPI_IO_AGGREGATORS_INTRA, TARGET_USER, OK, issue) + ) + + if cb_nodes == NUMBER_OF_COMPUTE_NODES: + issue = 'Application is using one aggregator per compute node' + + insights_operation.append( + message(INSIGHTS_MPI_IO_AGGREGATORS_OK, TARGET_USER, OK, issue) + ) + + +# Layout and export + +def display_content(): + if insights_metadata: + console.print( + Panel( + Padding( + Group( + *insights_metadata + ), + (1, 1) + ), + title='METADATA', + title_align='left' + ) + ) + + if insights_operation: + console.print( + Panel( + Padding( + Group( + *insights_operation + ), + (1, 1) + ), + title='OPERATIONS', + title_align='left' + ) + ) + + if insights_dxt: + console.print( + Panel( + Padding( + Group( + *insights_dxt + ), + (1, 1) + ), + title='DXT', + title_align='left' + ) + ) + + +def display_footer(insights_start_time, insights_end_time): + console.print( + Panel( + ' {} | [white]LBNL[/white] | [white]Drishti report generated at {} in[/white] {:.3f} seconds'.format( + datetime.datetime.now().year, + datetime.datetime.now(), + insights_end_time - insights_start_time + ), + box=box.SIMPLE + ) + ) + +def export_html(): + if args.export_html: + console.save_html( + '{}.html'.format(args.log_path), + theme=export_theme, + clear=False + ) + + +def export_svg(): + if args.export_svg: + console.save_svg( + '{}.svg'.format(args.log_path), + title='Drishti', + theme=export_theme, + clear=False + ) + + +def export_csv(filename, jobid=None): + if args.export_csv: + issues = [ + 'JOB', + INSIGHTS_STDIO_HIGH_USAGE, + INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, + INSIGHTS_POSIX_READ_COUNT_INTENSIVE, + INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, + INSIGHTS_POSIX_READ_SIZE_INTENSIVE, + INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, + INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, + INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, + INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, + INSIGHTS_POSIX_REDUNDANT_READ_USAGE, + INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, 
+ INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, + INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, + INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, + INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, + INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, + INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, + INSIGHTS_POSIX_HIGH_METADATA_TIME, + INSIGHTS_POSIX_SIZE_IMBALANCE, + INSIGHTS_POSIX_TIME_IMBALANCE, + INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, + INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, + INSIGHTS_MPI_IO_NO_USAGE, + INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, + INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, + INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE, + INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE, + INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, + INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, + INSIGHTS_MPI_IO_AGGREGATORS_INTRA, + INSIGHTS_MPI_IO_AGGREGATORS_INTER, + INSIGHTS_MPI_IO_AGGREGATORS_OK + ] + if codes: + issues.extend(codes) + + detected_issues = dict.fromkeys(issues, False) + detected_issues['JOB'] = jobid + + for report in csv_report: + detected_issues[report] = True + + with open(filename, 'w') as f: + w = csv.writer(f) + w.writerow(detected_issues.keys()) + w.writerow(detected_issues.values()) + From a20d74f7fc54d150d7f0c9b61345e33cb0b6f818 Mon Sep 17 00:00:00 2001 From: onewbiek Date: Wed, 29 Nov 2023 19:13:13 -0800 Subject: [PATCH 09/19] Darshan handler cleanup --- drishti/handle_darshan.py | 912 ++++---------------------------------- 1 file changed, 82 insertions(+), 830 deletions(-) diff --git a/drishti/handle_darshan.py b/drishti/handle_darshan.py index 6d4e70f..a5d8fbc 100644 --- a/drishti/handle_darshan.py +++ b/drishti/handle_darshan.py @@ -1,32 +1,18 @@ #!/usr/bin/env python3 -import os import io import sys -import csv import time -import json import shlex import shutil -import datetime import subprocess - import pandas as pd - import darshan import darshan.backend.cffi_backend as darshanll -from rich import print, box -from rich.console import Group -from rich.padding import Padding -from rich.syntax import Syntax -from rich.panel import Panel -from rich.terminal_theme import TerminalTheme -from rich.terminal_theme import MONOKAI - +from rich import print from packaging import version - -from .includes import * +from .module import * def is_available(name): @@ -84,8 +70,8 @@ def check_log_version(file, log_version, library_version): return use_file -def handler(args): - init_console(args) +def handler(): + init_console() validate_thresholds() insights_start_time = time.time() @@ -207,34 +193,8 @@ def handler(args): 'mpiio': uses_mpiio } - if total_size and total_size_stdio / total_size > THRESHOLD_INTERFACE_STDIO: - issue = 'Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})'.format( - total_size_stdio / total_size * 100.0, - convert_bytes(total_size_stdio) - ) - - recommendation = [ - { - 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO' - } - ] - - insights_operation.append( - message(args, INSIGHTS_STDIO_HIGH_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - - if 'MPI-IO' not in modules: - issue = 'Application is using low-performance interface' - - recommendation = [ - { - 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO' - } - ] - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_NO_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) - ) + check_stdio(total_size, total_size_stdio) + check_mpiio(modules) 
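+    # Interface-level checks (STDIO usage, missing MPI-IO) are delegated to the shared helpers in module.py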
######################################################################################################################################################################### @@ -251,46 +211,14 @@ def handler(args): total_operations = total_writes + total_reads # To check whether the application is write-intersive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance - if total_writes > total_reads and total_operations and abs(total_writes - total_reads) / total_operations > THRESHOLD_OPERATION_IMBALANCE: - issue = 'Application is write operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( - total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 - ) - - insights_metadata.append( - message(args, INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) - ) - - if total_reads > total_writes and total_operations and abs(total_writes - total_reads) / total_operations > THRESHOLD_OPERATION_IMBALANCE: - issue = 'Application is read operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( - total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 - ) - - insights_metadata.append( - message(args, INSIGHTS_POSIX_READ_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) - ) + check_operation_intensive(total_operations, total_reads, total_writes) total_read_size = df['counters']['POSIX_BYTES_READ'].sum() total_written_size = df['counters']['POSIX_BYTES_WRITTEN'].sum() total_size = total_written_size + total_read_size - if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > THRESHOLD_OPERATION_IMBALANCE: - issue = 'Application is write size intensive ({:.2f}% write vs. {:.2f}% read)'.format( - total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 - ) - - insights_metadata.append( - message(args, INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) - ) - - if total_read_size > total_written_size and abs(total_written_size - total_read_size) / total_size > THRESHOLD_OPERATION_IMBALANCE: - issue = 'Application is read size intensive ({:.2f}% write vs. 
{:.2f}% read)'.format( - total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 - ) - - insights_metadata.append( - message(args, INSIGHTS_POSIX_READ_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) - ) + check_size_intensive(total_size, total_read_size, total_written_size) ######################################################################################################################################################################### @@ -303,6 +231,14 @@ def handler(args): df['counters']['POSIX_SIZE_READ_100K_1M'].sum() ) + total_writes_small = ( + df['counters']['POSIX_SIZE_WRITE_0_100'].sum() + + df['counters']['POSIX_SIZE_WRITE_100_1K'].sum() + + df['counters']['POSIX_SIZE_WRITE_1K_10K'].sum() + + df['counters']['POSIX_SIZE_WRITE_10K_100K'].sum() + + df['counters']['POSIX_SIZE_WRITE_100K_1M'].sum() + ) + # Get the files responsible for more than half of these accesses files = [] @@ -326,102 +262,7 @@ def handler(args): detected_files.columns = ['id', 'total_reads', 'total_writes'] detected_files.loc[:, 'id'] = detected_files.loc[:, 'id'].astype(str) - if total_reads_small and total_reads_small / total_reads > THRESHOLD_SMALL_REQUESTS and total_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: - issue = 'Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests'.format( - total_reads_small, total_reads_small / total_reads * 100.0 - ) - - detail = [] - recommendation = [] - - for index, row in detected_files.iterrows(): - if row['total_reads'] > (total_reads * THRESHOLD_SMALL_REQUESTS / 2): - detail.append( - { - 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( - row['total_reads'], - row['total_reads'] / total_reads * 100.0, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) - } - ) - - recommendation.append( - { - 'message': 'Consider buffering read operations into larger more contiguous ones' - } - ) - - if 'MPI-IO' in modules: - recommendation.append( - { - 'message': 'Since the appplication already uses MPI-IO, consider using collective I/O calls (e.g. 
MPI_File_read_all() or MPI_File_read_at_all()) to aggregate requests into larger ones', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') - } - ) - else: - recommendation.append( - { - 'message': 'Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations' - } - ) - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) - - # Get the number of small I/O operations (less than the stripe size) - total_writes_small = ( - df['counters']['POSIX_SIZE_WRITE_0_100'].sum() + - df['counters']['POSIX_SIZE_WRITE_100_1K'].sum() + - df['counters']['POSIX_SIZE_WRITE_1K_10K'].sum() + - df['counters']['POSIX_SIZE_WRITE_10K_100K'].sum() + - df['counters']['POSIX_SIZE_WRITE_100K_1M'].sum() - ) - - if total_writes_small and total_writes_small / total_writes > THRESHOLD_SMALL_REQUESTS and total_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: - issue = 'Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all write requests'.format( - total_writes_small, total_writes_small / total_writes * 100.0 - ) - - detail = [] - recommendation = [] - - for index, row in detected_files.iterrows(): - if row['total_writes'] > (total_writes * THRESHOLD_SMALL_REQUESTS / 2): - detail.append( - { - 'message': '{} ({:.2f}%) small write requests are to "{}"'.format( - row['total_writes'], - row['total_writes'] / total_writes * 100.0, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) - } - ) - - recommendation.append( - { - 'message': 'Consider buffering write operations into larger more contiguous ones' - } - ) - - if 'MPI-IO' in modules: - recommendation.append( - { - 'message': 'Since the application already uses MPI-IO, consider using collective I/O calls (e.g. 
MPI_File_write_all() or MPI_File_write_at_all()) to aggregate requests into larger ones', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') - } - ) - else: - recommendation.append( - { - 'message': 'Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations' - } - ) - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) + check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map) ######################################################################################################################################################################### @@ -430,70 +271,16 @@ def handler(args): total_mem_not_aligned = df['counters']['POSIX_MEM_NOT_ALIGNED'].sum() total_file_not_aligned = df['counters']['POSIX_FILE_NOT_ALIGNED'].sum() - if total_operations and total_mem_not_aligned / total_operations > THRESHOLD_MISALIGNED_REQUESTS: - issue = 'Application has a high number ({:.2f}%) of misaligned memory requests'.format( - total_mem_not_aligned / total_operations * 100.0 - ) - - insights_metadata.append( - message(args, INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, TARGET_DEVELOPER, HIGH, issue, None) - ) - - if total_operations and total_file_not_aligned / total_operations > THRESHOLD_MISALIGNED_REQUESTS: - issue = 'Application issues a high number ({:.2f}%) of misaligned file requests'.format( - total_file_not_aligned / total_operations * 100.0 - ) - - recommendation = [ - { - 'message': 'Consider aligning the requests to the file system block boundaries' - } - ] - - if 'HF5' in modules: - recommendation.append( - { - 'message': 'Since the appplication uses HDF5, consider using H5Pset_alignment() in a file access property list', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-alignment.c'), line_numbers=True, background_color='default') - }, - { - 'message': 'Any file object greater than or equal in size to threshold bytes will be aligned on an address which is a multiple of alignment' - } - ) - - if 'LUSTRE' in modules: - recommendation.append( - { - 'message': 'Consider using a Lustre alignment that matches the file system stripe configuration', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } - ) - - insights_metadata.append( - message(args, INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) + check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules) ######################################################################################################################################################################### # Redundant read-traffic (based on Phill) # POSIX_MAX_BYTE_READ (Highest offset in the file that was read) max_read_offset = df['counters']['POSIX_MAX_BYTE_READ'].max() - - if max_read_offset > total_read_size: - issue = 'Application might have redundant read traffic (more data read than the highest offset)' - - insights_metadata.append( - message(args, INSIGHTS_POSIX_REDUNDANT_READ_USAGE, TARGET_DEVELOPER, WARN, issue, None) - ) - max_write_offset = df['counters']['POSIX_MAX_BYTE_WRITTEN'].max() - if max_write_offset > total_written_size: - issue = 'Application might have redundant write traffic (more data 
written than the highest offset)' - - insights_metadata.append( - message(args, INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, None) - ) + check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size) ######################################################################################################################################################################### @@ -502,7 +289,6 @@ def handler(args): read_consecutive = df['counters']['POSIX_CONSEC_READS'].sum() #print('READ Consecutive: {} ({:.2f}%)'.format(read_consecutive, read_consecutive / total_reads * 100)) - read_sequential = df['counters']['POSIX_SEQ_READS'].sum() read_sequential -= read_consecutive #print('READ Sequential: {} ({:.2f}%)'.format(read_sequential, read_sequential / total_reads * 100)) @@ -510,30 +296,6 @@ def handler(args): read_random = total_reads - read_consecutive - read_sequential #print('READ Random: {} ({:.2f}%)'.format(read_random, read_random / total_reads * 100)) - if total_reads: - if read_random and read_random / total_reads > THRESHOLD_RANDOM_OPERATIONS and read_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE: - issue = 'Application is issuing a high number ({}) of random read operations ({:.2f}%)'.format( - read_random, read_random / total_reads * 100.0 - ) - - recommendation = [ - { - 'message': 'Consider changing your data model to have consecutive or sequential reads' - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - else: - issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) read requests'.format( - read_consecutive / total_reads * 100.0, - read_sequential / total_reads * 100.0 - ) - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, TARGET_DEVELOPER, OK, issue, None) - ) write_consecutive = df['counters']['POSIX_CONSEC_WRITES'].sum() @@ -543,30 +305,7 @@ def handler(args): write_random = total_writes - write_consecutive - write_sequential #print('WRITE Random: {} ({:.2f}%)'.format(write_random, write_random / total_writes * 100)) - if total_writes: - if write_random and write_random / total_writes > THRESHOLD_RANDOM_OPERATIONS and write_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE: - issue = 'Application is issuing a high number ({}) of random write operations ({:.2f}%)'.format( - write_random, write_random / total_writes * 100.0 - ) - - recommendation = [ - { - 'message': 'Consider changing your data model to have consecutive or sequential writes' - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - else: - issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) write requests'.format( - write_consecutive / total_writes * 100.0, - write_sequential / total_writes * 100.0 - ) - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, TARGET_DEVELOPER, OK, issue, None) - ) + check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes) ######################################################################################################################################################################### @@ -594,35 +333,6 @@ def handler(args): shared_files['POSIX_SIZE_READ_100K_1M'] ) - if total_shared_reads and total_shared_reads_small / total_shared_reads > 
THRESHOLD_SMALL_REQUESTS and total_shared_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: - issue = 'Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests'.format( - total_shared_reads_small, total_shared_reads_small / total_shared_reads * 100.0 - ) - - detail = [] - - for index, row in shared_files.iterrows(): - if row['INSIGHTS_POSIX_SMALL_READS'] > (total_shared_reads * THRESHOLD_SMALL_REQUESTS / 2): - detail.append( - { - 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( - row['INSIGHTS_POSIX_SMALL_READS'], - row['INSIGHTS_POSIX_SMALL_READS'] / total_shared_reads * 100.0, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) - } - ) - - recommendation = [ - { - 'message': 'Consider coalesceing read requests into larger more contiguous ones using MPI-IO collective operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) total_shared_writes = shared_files['POSIX_WRITES'].sum() total_shared_writes_small = ( @@ -641,66 +351,13 @@ def handler(args): shared_files['POSIX_SIZE_WRITE_100K_1M'] ) - if total_shared_writes and total_shared_writes_small / total_shared_writes > THRESHOLD_SMALL_REQUESTS and total_shared_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: - issue = 'Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file write requests'.format( - total_shared_writes_small, total_shared_writes_small / total_shared_writes * 100.0 - ) - - detail = [] - - for index, row in shared_files.iterrows(): - if row['INSIGHTS_POSIX_SMALL_WRITES'] > (total_shared_writes * THRESHOLD_SMALL_REQUESTS / 2): - detail.append( - { - 'message': '{} ({:.2f}%) small writes requests are to "{}"'.format( - row['INSIGHTS_POSIX_SMALL_WRITES'], - row['INSIGHTS_POSIX_SMALL_WRITES'] / total_shared_writes * 100.0, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) - } - ) - - recommendation = [ - { - 'message': 'Consider coalescing write requests into larger more contiguous ones using MPI-IO collective operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) + check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map) ######################################################################################################################################################################### - has_long_metadata = df['fcounters'][(df['fcounters']['POSIX_F_META_TIME'] > THRESHOLD_METADATA_TIME_RANK)] + count_long_metadata = len(df['fcounters'][(df['fcounters']['POSIX_F_META_TIME'] > THRESHOLD_METADATA_TIME_RANK)]) - if not has_long_metadata.empty: - issue = 'There are {} ranks where metadata operations take over {} seconds'.format( - len(has_long_metadata), THRESHOLD_METADATA_TIME_RANK - ) - - recommendation = [ - { - 'message': 'Attempt to combine 
files, reduce, or cache metadata operations' - } - ] - - if 'HF5' in modules: - recommendation.append( - { - 'message': 'Since your appplication uses HDF5, try enabling collective metadata calls with H5Pset_coll_metadata_write() and H5Pset_all_coll_metadata_ops()', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-collective-metadata.c'), line_numbers=True, background_color='default') - }, - { - 'message': 'Since your appplication uses HDF5, try using metadata cache to defer metadata operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-cache.c'), line_numbers=True, background_color='default') - } - ) - - insights_metadata.append( - message(args, INSIGHTS_POSIX_HIGH_METADATA_TIME, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) + check_long_metadata(count_long_metadata, modules) # We already have a single line for each shared-file access # To check for stragglers, we can check the difference between the @@ -726,36 +383,9 @@ def handler(args): row['id'], abs(row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size * 100 ]) - if stragglers_count: - issue = 'Detected data transfer imbalance caused by stragglers when accessing {} shared file.'.format( - stragglers_count - ) - - detail = [] - - for file in detected_files: - detail.append( - { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - file[1], - file_map[int(file[0])] if args.full_path else os.path.basename(file_map[int(file[0])]) - ) - } - ) - - recommendation = [ - { - 'message': 'Consider better balancing the data transfer between the application ranks' - }, - { - 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) - ) + column_names = ['id', 'data_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + check_shared_data_imblance(stragglers_count, detected_files, file_map) # POSIX_F_FASTEST_RANK_TIME # POSIX_F_SLOWEST_RANK_TIME @@ -781,36 +411,9 @@ def handler(args): row['id'], abs(row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time * 100 ]) - if stragglers_count: - issue = 'Detected time imbalance caused by stragglers when accessing {} shared file.'.format( - stragglers_count - ) - - detail = [] - - for file in detected_files: - detail.append( - { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - file[1], - file_map[int(file[0])] if args.full_path else os.path.basename(file_map[int(file[0])]) - ) - } - ) - - recommendation = [ - { - 'message': 'Consider better distributing the data in the parallel file system' # needs to review what suggestion to give - }, - { - 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) - ) + column_names = ['id', 'time_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + check_shared_time_imbalance(stragglers_count, detected_files, 
file_map) aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][ ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ'] @@ -837,43 +440,9 @@ def handler(args): row['id'], abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / row['POSIX_BYTES_WRITTEN_max'] * 100 ]) - if imbalance_count: - issue = 'Detected write imbalance when accessing {} individual files'.format( - imbalance_count - ) - - detail = [] - - for file in detected_files: - detail.append( - { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - file[1], - file_map[int(file[0])] if args.full_path else os.path.basename(file_map[int(file[0])]) - ) - } - ) - - recommendation = [ - { - 'message': 'Consider better balancing the data transfer between the application ranks' - }, - { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - }, - { - 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') - }, - { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) + column_names = ['id', 'write_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + check_individual_write_imbalance(imbalance_count, detected_files, file_map) imbalance_count = 0 @@ -887,43 +456,9 @@ def handler(args): row['id'], abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row['POSIX_BYTES_READ_max'] * 100 ]) - if imbalance_count: - issue = 'Detected read imbalance when accessing {} individual files.'.format( - imbalance_count - ) - - detail = [] - - for file in detected_files: - detail.append( - { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - file[1], - file_map[int(file[0])] if args.full_path else os.path.basename(file_map[int(file[0])]) - ) - } - ) - - recommendation = [ - { - 'message': 'Consider better balancing the data transfer between the application ranks' - }, - { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - }, - { - 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') - }, - { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) + column_names = ['id', 'read_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + check_individual_read_imbalance(imbalance_count, detected_files, file_map) ######################################################################################################################################################################### @@ -940,95 +475,50 @@ def handler(args): total_mpiio_read_operations = 
df_mpiio['counters']['MPIIO_INDEP_READS'].sum() + df_mpiio['counters']['MPIIO_COLL_READS'].sum() - if df_mpiio['counters']['MPIIO_COLL_READS'].sum() == 0: - if total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: - issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls'.format( - df_mpiio['counters']['MPIIO_INDEP_READS'].sum(), - df_mpiio['counters']['MPIIO_INDEP_READS'].sum() / (total_mpiio_read_operations) * 100 - ) + mpiio_coll_reads = df_mpiio['counters']['MPIIO_COLL_READS'].sum() + mpiio_indep_reads = df_mpiio['counters']['MPIIO_INDEP_READS'].sum() - detail = [] - - files = pd.DataFrame(df_mpiio_collective_reads.groupby('id').sum()).reset_index() - - for index, row in df_mpiio_collective_reads.iterrows(): - if (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS and (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: - detail.append( - { - 'message': '{} ({}%) of independent reads to "{}"'.format( - row['MPIIO_INDEP_READS'], - row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) - } - ) - - recommendation = [ - { - 'message': 'Use collective read operations (e.g. MPI_File_read_all() or MPI_File_read_at_all()) and set one aggregator per compute node', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) - else: - issue = 'Application uses MPI-IO and read data using {} ({:.2f}%) collective operations'.format( - df_mpiio['counters']['MPIIO_COLL_READS'].sum(), - df_mpiio['counters']['MPIIO_COLL_READS'].sum() / (df_mpiio['counters']['MPIIO_INDEP_READS'].sum() + df_mpiio['counters']['MPIIO_COLL_READS'].sum()) * 100 - ) + detected_files = [] + if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + files = pd.DataFrame(df_mpiio_collective_reads.groupby('id').sum()).reset_index() + for index, row in df_mpiio_collective_reads.iterrows(): + if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and + row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS and + (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE): + + detected_files.append([ + row['id'], row['MPIIO_INDEP_READS'], row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 + ]) + + column_names = ['id', 'absolute_indep_reads', 'percent_indep_reads'] + detected_files = pd.DataFrame(detected_files, columns=column_names) - insights_operation.append( - message(args, INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, OK, issue) - ) + check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map) df_mpiio_collective_writes = df_mpiio['counters'] #.loc[(df_mpiio['counters']['MPIIO_COLL_WRITES'] > 0)] total_mpiio_write_operations = df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum() + 
df_mpiio['counters']['MPIIO_COLL_WRITES'].sum() - if df_mpiio['counters']['MPIIO_COLL_WRITES'].sum() == 0: - if total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: - issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls'.format( - df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum(), - df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum() / (total_mpiio_write_operations) * 100 - ) + mpi_coll_writes = df_mpiio['counters']['MPIIO_COLL_WRITES'].sum() + mpi_indep_writes = df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum() - detail = [] - - files = pd.DataFrame(df_mpiio_collective_writes.groupby('id').sum()).reset_index() - - for index, row in df_mpiio_collective_writes.iterrows(): - if (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS and (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: - detail.append( - { - 'message': '{} ({}%) independent writes to "{}"'.format( - row['MPIIO_INDEP_WRITES'], - row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) - } - ) - - recommendation = [ - { - 'message': 'Use collective write operations (e.g. MPI_File_write_all() or MPI_File_write_at_all()) and set one aggregator per compute node', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) - else: - issue = 'Application uses MPI-IO and write data using {} ({:.2f}%) collective operations'.format( - df_mpiio['counters']['MPIIO_COLL_WRITES'].sum(), - df_mpiio['counters']['MPIIO_COLL_WRITES'].sum() / (df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum() + df_mpiio['counters']['MPIIO_COLL_WRITES'].sum()) * 100 - ) + detected_files = [] + if mpi_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + files = pd.DataFrame(df_mpiio_collective_writes.groupby('id').sum()).reset_index() - insights_operation.append( - message(args, INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, OK, issue) - ) + for index, row in df_mpiio_collective_writes.iterrows(): + if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and + row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS and + (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE): + + detected_files.append([ + row['id'], row['MPIIO_INDEP_WRITES'], row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 + ]) + + column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + check_mpi_collective_write_operation(mpi_coll_writes, mpi_indep_writes, total_mpiio_write_operations, detected_files, file_map) ######################################################################################################################################################################### @@ -1042,55 +532,10 @@ def handler(args): if 
file_map[int(row['id'])].endswith('.h5') or file_map[int(row['id'])].endswith('.hdf5'): has_hdf5_extension = True - if df_mpiio['counters']['MPIIO_NB_READS'].sum() == 0: - issue = 'Application could benefit from non-blocking (asynchronous) reads' - - recommendation = [] - - if 'H5F' in modules or has_hdf5_extension: - recommendation.append( - { - 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-read.c'), line_numbers=True, background_color='default') - } - ) - - if 'MPI-IO' in modules: - recommendation.append( - { - 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iread(), MPI_File_read_all_begin/end(), or MPI_File_read_at_all_begin/end())', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iread.c'), line_numbers=True, background_color='default') - } - ) - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) - ) - - if df_mpiio['counters']['MPIIO_NB_WRITES'].sum() == 0: - issue = 'Application could benefit from non-blocking (asynchronous) writes' - - recommendation = [] + mpiio_nb_reads = df_mpiio['counters']['MPIIO_NB_READS'].sum() + mpiio_nb_writes = df_mpiio['counters']['MPIIO_NB_WRITES'].sum() - if 'H5F' in modules or has_hdf5_extension: - recommendation.append( - { - 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-write.c'), line_numbers=True, background_color='default') - } - ) - - if 'MPI-IO' in modules: - recommendation.append( - { - 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iwrite(), MPI_File_write_all_begin/end(), or MPI_File_write_at_all_begin/end())', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iwrite.c'), line_numbers=True, background_color='default') - } - ) - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) - ) + check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules) ######################################################################################################################################################################### @@ -1106,8 +551,6 @@ def handler(args): # print('Hints: ', hints) - ######################################################################################################################################################################### - NUMBER_OF_COMPUTE_NODES = 0 if 'MPI-IO' in modules: @@ -1141,66 +584,13 @@ def handler(args): NUMBER_OF_COMPUTE_NODES = first['NNodes'] # Do we have one MPI-IO aggregator per node? 
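                    # Illustrative sketch (assumption, not part of this patch): the cb_nodes
                    # value compared against NUMBER_OF_COMPUTE_NODES here is the same MPI-IO
                    # hint an application could set itself to request one collective-buffering
                    # aggregator per compute node, e.g. with mpi4py. The file name and
                    # n_compute_nodes below are placeholders for this example only.
                    #
                    #     from mpi4py import MPI
                    #
                    #     n_compute_nodes = 4  # placeholder: nodes allocated to the job
                    #     info = MPI.Info.Create()
                    #     info.Set('cb_nodes', str(n_compute_nodes))  # one aggregator per node
                    #     fh = MPI.File.Open(MPI.COMM_WORLD, 'output.dat',
                    #                        MPI.MODE_CREATE | MPI.MODE_WRONLY, info)
                    #     fh.Close()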
- if cb_nodes > NUMBER_OF_COMPUTE_NODES: - issue = 'Application is using inter-node aggregators (which require network communication)' - - recommendation = [ - { - 'message': 'Set the MPI hints for the number of aggregators as one per compute node (e.g., cb_nodes={})'.format( - NUMBER_OF_COMPUTE_NODES - ), - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-hints.bash'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_AGGREGATORS_INTER, TARGET_USER, HIGH, issue, recommendation) - ) - - if cb_nodes < NUMBER_OF_COMPUTE_NODES: - issue = 'Application is using intra-node aggregators' - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_AGGREGATORS_INTRA, TARGET_USER, OK, issue) - ) - - if cb_nodes == NUMBER_OF_COMPUTE_NODES: - issue = 'Application is using one aggregator per compute node' - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_AGGREGATORS_OK, TARGET_USER, OK, issue) - ) - - + check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES) except StopIteration: pass except FileNotFoundError: pass ######################################################################################################################################################################### - - codes = [] - if args.json: - f = open(args.json) - data = json.load(f) - - for key, values in data.items(): - for value in values: - code = value['code'] - codes.append(code) - - level = value['level'] - issue = value['issue'] - recommendation = [] - for rec in value['recommendations']: - new_message = {'message': rec} - recommendation.append(new_message) - - insights_dxt.append( - message(args, code, TARGET_DEVELOPER, level, issue, recommendation) - ) - - ######################################################################################################################################################################### insights_end_time = time.time() @@ -1261,153 +651,15 @@ def handler(args): console.print() - if insights_metadata: - console.print( - Panel( - Padding( - Group( - *insights_metadata - ), - (1, 1) - ), - title='METADATA', - title_align='left' - ) - ) + display_content() + display_footer(insights_start_time, insights_end_time) - if insights_operation: - console.print( - Panel( - Padding( - Group( - *insights_operation - ), - (1, 1) - ), - title='OPERATIONS', - title_align='left' - ) - ) + export_html() + export_svg() - if insights_dxt: - console.print( - Panel( - Padding( - Group( - *insights_dxt - ), - (1, 1) - ), - title='DXT', - title_align='left' - ) - ) - - console.print( - Panel( - ' {} | [white]LBNL[/white] | [white]Drishti report generated at {} in[/white] {:.3f} seconds'.format( - datetime.datetime.now().year, - datetime.datetime.now(), - insights_end_time - insights_start_time - ), - box=box.SIMPLE - ) + filename = '{}-summary.csv'.format( + args.log_path.replace('.darshan', '') ) - - if args.export_theme_light: - export_theme = TerminalTheme( - (255, 255, 255), - (0, 0, 0), - [ - (26, 26, 26), - (244, 0, 95), - (152, 224, 36), - (253, 151, 31), - (157, 101, 255), - (244, 0, 95), - (88, 209, 235), - (120, 120, 120), - (98, 94, 76), - ], - [ - (244, 0, 95), - (152, 224, 36), - (224, 213, 97), - (157, 101, 255), - (244, 0, 95), - (88, 209, 235), - (246, 246, 239), - ], - ) - else: - export_theme = MONOKAI - - if args.export_html: - console.save_html( - '{}.html'.format(args.log_path), - theme=export_theme, - clear=False - ) - - if args.export_svg: - console.save_svg( - '{}.svg'.format(args.log_path), 
- title='Drishti', - theme=export_theme, - clear=False - ) - - if args.export_csv: - issues = [ - 'JOB', - INSIGHTS_STDIO_HIGH_USAGE, - INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, - INSIGHTS_POSIX_READ_COUNT_INTENSIVE, - INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, - INSIGHTS_POSIX_READ_SIZE_INTENSIVE, - INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, - INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, - INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, - INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, - INSIGHTS_POSIX_REDUNDANT_READ_USAGE, - INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, - INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, - INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, - INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, - INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, - INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, - INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, - INSIGHTS_POSIX_HIGH_METADATA_TIME, - INSIGHTS_POSIX_SIZE_IMBALANCE, - INSIGHTS_POSIX_TIME_IMBALANCE, - INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, - INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, - INSIGHTS_MPI_IO_NO_USAGE, - INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, - INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, - INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE, - INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE, - INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, - INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, - INSIGHTS_MPI_IO_AGGREGATORS_INTRA, - INSIGHTS_MPI_IO_AGGREGATORS_INTER, - INSIGHTS_MPI_IO_AGGREGATORS_OK - ] - if codes: - issues.extend(codes) - - detected_issues = dict.fromkeys(issues, False) - detected_issues['JOB'] = job['job']['jobid'] - - for report in csv_report: - detected_issues[report] = True - - filename = '{}-summary.csv'.format( - args.log_path.replace('.darshan', '') - ) - - with open(filename, 'w') as f: - w = csv.writer(f) - w.writerow(detected_issues.keys()) - w.writerow(detected_issues.values()) + + export_csv(filename, job['job']['jobid']) From cee980c006f118e2d6d469c1a92522aee58a6ab2 Mon Sep 17 00:00:00 2001 From: onewbiek Date: Wed, 29 Nov 2023 21:27:42 -0800 Subject: [PATCH 10/19] Fix bug --- drishti/handle_darshan.py | 8 +-- drishti/module.py | 108 +++++++++++++++++++------------------- 2 files changed, 59 insertions(+), 57 deletions(-) diff --git a/drishti/handle_darshan.py b/drishti/handle_darshan.py index a5d8fbc..6daa7b5 100644 --- a/drishti/handle_darshan.py +++ b/drishti/handle_darshan.py @@ -499,11 +499,11 @@ def handler(): total_mpiio_write_operations = df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum() + df_mpiio['counters']['MPIIO_COLL_WRITES'].sum() - mpi_coll_writes = df_mpiio['counters']['MPIIO_COLL_WRITES'].sum() - mpi_indep_writes = df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum() + mpiio_coll_writes = df_mpiio['counters']['MPIIO_COLL_WRITES'].sum() + mpiio_indep_writes = df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum() detected_files = [] - if mpi_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: files = pd.DataFrame(df_mpiio_collective_writes.groupby('id').sum()).reset_index() for index, row in df_mpiio_collective_writes.iterrows(): @@ -518,7 +518,7 @@ def handler(): column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_mpi_collective_write_operation(mpi_coll_writes, mpi_indep_writes, total_mpiio_write_operations, detected_files, 
file_map) + check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map) ######################################################################################################################################################################### diff --git a/drishti/module.py b/drishti/module.py index fe21a18..3a58a06 100644 --- a/drishti/module.py +++ b/drishti/module.py @@ -565,35 +565,36 @@ def check_individual_read_imbalance(imbalance_count, detected_files, file_map): ['id', 'absolute_indep_reads', 'percent_indep_reads'] ''' def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map): - if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: - issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls'.format( - mpiio_indep_reads, - mpiio_indep_reads / total_mpiio_read_operations * 100 - ) + if mpiio_coll_reads == 0: + if total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls'.format( + mpiio_indep_reads, + mpiio_indep_reads / total_mpiio_read_operations * 100 + ) - detail = [] + detail = [] - for index, row in detected_files.iterrows(): - detail.append( - { - 'message': '{} ({}%) of independent reads to "{}"'.format( - row['absolute_indep_reads'], - row['percent_indep_reads'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) - } - ) + for index, row in detected_files.iterrows(): + detail.append( + { + 'message': '{} ({}%) of independent reads to "{}"'.format( + row['absolute_indep_reads'], + row['percent_indep_reads'], + file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) + ) + } + ) - recommendation = [ - { - 'message': 'Use collective read operations (e.g. MPI_File_read_all() or MPI_File_read_at_all()) and set one aggregator per compute node', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') - } - ] + recommendation = [ + { + 'message': 'Use collective read operations (e.g. 
MPI_File_read_all() or MPI_File_read_at_all()) and set one aggregator per compute node', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + } + ] - insights_operation.append( - message(INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) + insights_operation.append( + message(INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + ) else: issue = 'Application uses MPI-IO and read data using {} ({:.2f}%) collective operations'.format( mpiio_coll_reads, @@ -609,40 +610,41 @@ def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, tot detected_files required columns: ['id', 'absolute_indep_writes', 'percent_indep_writes'] ''' -def check_mpi_collective_write_operation(mpi_coll_writes, mpi_indep_writes, total_mpiio_write_operations, detected_files, file_map): - if mpi_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: - issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls'.format( - mpi_indep_writes, - mpi_indep_writes / total_mpiio_write_operations * 100 - ) +def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map): + if mpiio_coll_writes == 0: + if total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls'.format( + mpiio_indep_writes, + mpiio_indep_writes / total_mpiio_write_operations * 100 + ) - detail = [] + detail = [] - for index, row in detected_files.iterrows(): - detail.append( - { - 'message': '{} ({}%) independent writes to "{}"'.format( - row['absolute_indep_writes'], - row['percent_indep_writes'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) - } - ) + for index, row in detected_files.iterrows(): + detail.append( + { + 'message': '{} ({}%) independent writes to "{}"'.format( + row['absolute_indep_writes'], + row['percent_indep_writes'], + file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) + ) + } + ) - recommendation = [ - { - 'message': 'Use collective write operations (e.g. MPI_File_write_all() or MPI_File_write_at_all()) and set one aggregator per compute node', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') - } - ] + recommendation = [ + { + 'message': 'Use collective write operations (e.g. 
MPI_File_write_all() or MPI_File_write_at_all()) and set one aggregator per compute node', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') + } + ] - insights_operation.append( - message(INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) + insights_operation.append( + message(INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + ) else: issue = 'Application uses MPI-IO and write data using {} ({:.2f}%) collective operations'.format( - mpi_coll_writes, - mpi_coll_writes / total_mpiio_write_operations * 100 + mpiio_coll_writes, + mpiio_coll_writes / total_mpiio_write_operations * 100 ) insights_operation.append( From d30a3247e58922e969c902a77a5573ad1f503782 Mon Sep 17 00:00:00 2001 From: onewbiek Date: Wed, 29 Nov 2023 21:28:20 -0800 Subject: [PATCH 11/19] Recorder cleanup --- drishti/handle_recorder.py | 944 +++++-------------------------------- 1 file changed, 125 insertions(+), 819 deletions(-) diff --git a/drishti/handle_recorder.py b/drishti/handle_recorder.py index 59462af..b864e0a 100644 --- a/drishti/handle_recorder.py +++ b/drishti/handle_recorder.py @@ -1,35 +1,19 @@ #!/usr/bin/env python3 import os -import csv import time -import json - -import datetime - import pandas as pd - -from rich import print, box -from rich.console import Group -from rich.padding import Padding -from rich.syntax import Syntax -from rich.panel import Panel -from rich.terminal_theme import TerminalTheme -from rich.terminal_theme import MONOKAI - from recorder_utils import RecorderReader from recorder_utils.build_offset_intervals import build_offset_intervals - -from .includes import * +from .module import * def get_accessed_files(reader): ranks = reader.GM.total_ranks - filemap = {} + file_map = {} for rank in range(ranks): - filemap.update(reader.LMs[rank].filemap) - - return filemap + file_map.update(reader.LMs[rank].filemap) + return file_map def init_df_posix_recordes(reader): @@ -49,8 +33,8 @@ def init_df_posix_recordes(reader): return df_posix_records -def handler(args): - init_console(args) +def handler(): + init_console() validate_thresholds() insights_start_time = time.time() @@ -59,21 +43,20 @@ def handler(args): df_intervals = build_offset_intervals(reader) df_posix_records = init_df_posix_recordes(reader) - unique_files = get_accessed_files(reader) + file_map = get_accessed_files(reader) def add_api(row): if 'MPI' in row['function']: - return 'MPIIO' + return 'MPI-IO' elif 'H5' in row['function']: return 'H5F' else: return 'POSIX' - df_intervals['api'] = df_intervals.apply(add_api, axis=1) - def add_duration(row): return row['end'] - row['start'] + df_intervals['api'] = df_intervals.apply(add_api, axis=1) df_intervals['duration'] = df_intervals.apply(add_duration, axis=1) df_posix_records['duration'] = df_posix_records.apply(add_duration, axis=1) @@ -87,16 +70,16 @@ def add_duration(row): total_size_mpiio = 0 total_size = 0 - total_files = len(unique_files) + total_files = len(file_map) total_files_stdio = 0 total_files_posix = 0 total_files_mpiio = 0 - for fid in unique_files.keys(): - df_intervals_in_one_file = df_intervals[(df_intervals['file_id'] == fid)] + for id in file_map.keys(): + df_intervals_in_one_file = df_intervals[(df_intervals['file_id'] == id)] df_stdio_intervals_in_one_file = df_intervals_in_one_file[(df_intervals_in_one_file['api'] == 'STDIO')] df_posix_intervals_in_one_file = 
df_intervals_in_one_file[(df_intervals_in_one_file['api'] == 'POSIX')] - df_mpiio_intervals_in_one_file = df_intervals_in_one_file[(df_intervals_in_one_file['api'] == 'MPIIO')] + df_mpiio_intervals_in_one_file = df_intervals_in_one_file[(df_intervals_in_one_file['api'] == 'MPI-IO')] if len(df_stdio_intervals_in_one_file): total_files_stdio += 1 @@ -121,34 +104,8 @@ def add_duration(row): assert(total_size_posix >= 0) assert(total_size_mpiio >= 0) - if total_size and total_size_stdio / total_size > THRESHOLD_INTERFACE_STDIO: - issue = 'Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})'.format( - total_size_stdio / total_size * 100.0, - convert_bytes(total_size_stdio) - ) - - recommendation = [ - { - 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO' - } - ] - - insights_operation.append( - message(args, INSIGHTS_STDIO_HIGH_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - - if 'MPIIO' not in modules: - issue = 'Application is using low-performance interface' - - recommendation = [ - { - 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO' - } - ] - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_NO_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) - ) + check_stdio(total_size, total_size_stdio) + check_mpiio(modules) ######################################################################################################################################################################### @@ -165,46 +122,14 @@ def add_duration(row): total_operations = total_writes + total_reads # To check whether the application is write-intersive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance - if total_writes > total_reads and total_operations and abs(total_writes - total_reads) / total_operations > THRESHOLD_OPERATION_IMBALANCE: - issue = 'Application is write operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( - total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 - ) - - insights_metadata.append( - message(args, INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) - ) - - if total_reads > total_writes and total_operations and abs(total_writes - total_reads) / total_operations > THRESHOLD_OPERATION_IMBALANCE: - issue = 'Application is read operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( - total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 - ) - - insights_metadata.append( - message(args, INSIGHTS_POSIX_READ_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) - ) + check_operation_intensive(total_operations, total_reads, total_writes) total_read_size = df_posix[(df_posix['function'].str.contains('read'))]['size'].sum() total_written_size = df_posix[~(df_posix['function'].str.contains('read'))]['size'].sum() total_size = total_written_size + total_read_size - if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > THRESHOLD_OPERATION_IMBALANCE: - issue = 'Application is write size intensive ({:.2f}% write vs. 
{:.2f}% read)'.format( - total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 - ) - - insights_metadata.append( - message(args, INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) - ) - - if total_read_size > total_written_size and abs(total_written_size - total_read_size) / total_size > THRESHOLD_OPERATION_IMBALANCE: - issue = 'Application is read size intensive ({:.2f}% write vs. {:.2f}% read)'.format( - total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 - ) - - insights_metadata.append( - message(args, INSIGHTS_POSIX_READ_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) - ) + check_size_intensive(total_size, total_read_size, total_written_size) ######################################################################################################################################################################### @@ -213,99 +138,16 @@ def add_duration(row): total_reads_small = len(df_posix[(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) total_writes_small = len(df_posix[~(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) - detected_files = [] # [fname, num of read, num of write] - for fid in unique_files.keys(): - read_cnt = len(df_posix[(df_posix['file_id'] == fid) & (df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) - write_cnt = len(df_posix[(df_posix['file_id'] == fid) & ~(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) - detected_files.append([unique_files[fid], read_cnt, write_cnt]) + detected_files = [] + for id in file_map.keys(): + read_cnt = len(df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + write_cnt = len(df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + detected_files.append([id, read_cnt, write_cnt]) - if total_reads_small and total_reads_small / total_reads > THRESHOLD_SMALL_REQUESTS and total_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: - issue = 'Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests'.format( - total_reads_small, total_reads_small / total_reads * 100.0 - ) - - detail = [] - recommendation = [] - - for file in detected_files: - if file[1] > (total_reads * THRESHOLD_SMALL_REQUESTS / 2): - detail.append( - { - 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( - file[1], - file[1] / total_reads * 100.0, - file[0] if args.full_path else os.path.basename(file[0]) - ) - } - ) - - recommendation.append( - { - 'message': 'Consider buffering read operations into larger more contiguous ones' - } - ) - - if 'MPIIO' in modules: - recommendation.append( - { - 'message': 'Since the appplication already uses MPI-IO, consider using collective I/O calls (e.g. 
MPI_File_read_all() or MPI_File_read_at_all()) to aggregate requests into larger ones', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') - } - ) - else: - recommendation.append( - { - 'message': 'Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations' - } - ) - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) - - if total_writes_small and total_writes_small / total_writes > THRESHOLD_SMALL_REQUESTS and total_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: - issue = 'Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all write requests'.format( - total_writes_small, total_writes_small / total_writes * 100.0 - ) - - detail = [] - recommendation = [] - - for file in detected_files: - if file[2] > (total_writes * THRESHOLD_SMALL_REQUESTS / 2): - detail.append( - { - 'message': '{} ({:.2f}%) small write requests are to "{}"'.format( - file[2], - file[2] / total_writes * 100.0, - file[0] if args.full_path else os.path.basename(file[0]) - ) - } - ) - - recommendation.append( - { - 'message': 'Consider buffering write operations into larger more contiguous ones' - } - ) - - if 'MPIIO' in modules: - recommendation.append( - { - 'message': 'Since the application already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_write_all() or MPI_File_write_at_all()) to aggregate requests into larger ones', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') - } - ) - else: - recommendation.append( - { - 'message': 'Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations' - } - ) - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) + column_names = ['id', 'total_reads', 'total_writes'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map) ######################################################################################################################################################################### @@ -317,34 +159,21 @@ def add_duration(row): # Redundant read-traffic (based on Phill) # POSIX_MAX_BYTE_READ (Highest offset in the file that was read) max_read_offset = df_posix[(df_posix['function'].str.contains('read'))]['offset'].max() - - if max_read_offset > total_read_size: - issue = 'Application might have redundant read traffic (more data read than the highest offset)' - - insights_metadata.append( - message(args, INSIGHTS_POSIX_REDUNDANT_READ_USAGE, TARGET_DEVELOPER, WARN, issue, None) - ) - max_write_offset = df_posix[~(df_posix['function'].str.contains('read'))]['offset'].max() - - if max_write_offset > total_written_size: - issue = 'Application might have redundant write traffic (more data written than the highest offset)' - - insights_metadata.append( - message(args, INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, None) - ) + + check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size) 
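    # Illustrative sketch (assumption, not part of this patch): check_traffic() is
    # expected to apply the same heuristic as the inlined Darshan logic removed
    # earlier in this series, flagging possible redundant traffic when the highest
    # byte offset touched exceeds the total volume transferred in that direction.
    # The helper name below is hypothetical, for illustration only.
    #
    #     def _flags_redundant_traffic(max_offset, total_size):
    #         # mirror of the inlined condition this refactor replaced
    #         return max_offset > total_size
    #
    #     _flags_redundant_traffic(max_read_offset, total_read_size)      # read side
    #     _flags_redundant_traffic(max_write_offset, total_written_size)  # write side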
######################################################################################################################################################################### # Check for a lot of random operations - grp_posix_by_fid = df_posix.groupby('file_id') + grp_posix_by_id = df_posix.groupby('file_id') read_consecutive = 0 read_sequential = 0 read_random = 0 - for fid, df_filtered in grp_posix_by_fid: + for id, df_filtered in grp_posix_by_id: df_filtered = df_filtered[(df_filtered['function'].str.contains('read'))].sort_values('start') for i in range(len(df_filtered) - 1): @@ -357,36 +186,11 @@ def add_duration(row): else: read_random += 1 - if total_reads: - if read_random and read_random / total_reads > THRESHOLD_RANDOM_OPERATIONS and read_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE: - issue = 'Application is issuing a high number ({}) of random read operations ({:.2f}%)'.format( - read_random, read_random / total_reads * 100.0 - ) - - recommendation = [ - { - 'message': 'Consider changing your data model to have consecutive or sequential reads' - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - else: - issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) read requests'.format( - read_consecutive / total_reads * 100.0, - read_sequential / total_reads * 100.0 - ) - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, TARGET_DEVELOPER, OK, issue, None) - ) - write_consecutive = 0 write_sequential = 0 write_random = 0 - for fid, df_filtered in grp_posix_by_fid: + for id, df_filtered in grp_posix_by_id: df_filtered = df_filtered[~(df_filtered['function'].str.contains('read'))].sort_values('start') for i in range(len(df_filtered) - 1): @@ -399,37 +203,14 @@ def add_duration(row): else: write_random += 1 - if total_writes: - if write_random and write_random / total_writes > THRESHOLD_RANDOM_OPERATIONS and write_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE: - issue = 'Application is issuing a high number ({}) of random write operations ({:.2f}%)'.format( - write_random, write_random / total_writes * 100.0 - ) - - recommendation = [ - { - 'message': 'Consider changing your data model to have consecutive or sequential writes' - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - else: - issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) write requests'.format( - write_consecutive / total_writes * 100.0, - write_sequential / total_writes * 100.0 - ) - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, TARGET_DEVELOPER, OK, issue, None) - ) + check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes) ######################################################################################################################################################################### # Shared file with small operations # A file is shared if it's been read/written by more than 1 rank - detected_files = grp_posix_by_fid['rank'].nunique() + detected_files = grp_posix_by_id['rank'].nunique() shared_files = set(detected_files[detected_files > 1].index) total_shared_reads = 0 @@ -437,116 +218,36 @@ def add_duration(row): total_shared_writes = 0 total_shared_writes_small = 0 - detected_files = [] # [fname, num of 
read, num of write] - for fid in shared_files: - total_shared_reads += len(df_posix[(df_posix['file_id'] == fid) & (df_posix['function'].str.contains('read'))]) - total_shared_writes += len(df_posix[(df_posix['file_id'] == fid) & ~(df_posix['function'].str.contains('read'))]) + detected_files = [] + for id in shared_files: + total_shared_reads += len(df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read'))]) + total_shared_writes += len(df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read'))]) - read_cnt = len(df_posix[(df_posix['file_id'] == fid) + read_cnt = len(df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) - write_cnt = len(df_posix[(df_posix['file_id'] == fid) + write_cnt = len(df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) - detected_files.append([unique_files[fid], read_cnt, write_cnt]) + detected_files.append([id, read_cnt, write_cnt]) total_shared_reads_small += read_cnt total_shared_writes_small += write_cnt - - if total_shared_reads and total_shared_reads_small / total_shared_reads > THRESHOLD_SMALL_REQUESTS and total_shared_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: - issue = 'Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests'.format( - total_shared_reads_small, total_shared_reads_small / total_shared_reads * 100.0 - ) - - detail = [] - - for file in detected_files: - if file[1] > (total_reads * THRESHOLD_SMALL_REQUESTS / 2): - detail.append( - { - 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( - file[1], - file[1] / total_reads * 100.0, - file[0] if args.full_path else os.path.basename(file[0]) - ) - } - ) - - recommendation = [ - { - 'message': 'Consider coalesceing read requests into larger more contiguous ones using MPI-IO collective operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') - } - ] - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) - - if total_shared_writes and total_shared_writes_small / total_shared_writes > THRESHOLD_SMALL_REQUESTS and total_shared_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: - issue = 'Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file write requests'.format( - total_shared_writes_small, total_shared_writes_small / total_shared_writes * 100.0 - ) - - detail = [] - - for file in detected_files: - if file[2] > (total_writes * THRESHOLD_SMALL_REQUESTS / 2): - detail.append( - { - 'message': '{} ({:.2f}%) small write requests are to "{}"'.format( - file[2], - file[2] / total_writes * 100.0, - file[0] if args.full_path else os.path.basename(file[0]) - ) - } - ) - - recommendation = [ - { - 'message': 'Consider coalescing write requests into larger more contiguous ones using MPI-IO collective operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) + 
column_names = ['id', 'INSIGHTS_POSIX_SMALL_READS', 'INSIGHTS_POSIX_SMALL_WRITES'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, detected_files, file_map) ######################################################################################################################################################################### # TODO: Here I assume all operations other than write/read are metadata operations df_posix_metadata = df_posix_records[~(df_posix_records['function'].str.contains('read|write|print', na=False))] df_detected = df_posix_metadata.groupby('rank')['duration'].sum().reset_index() - has_long_metadata = df_detected[(df_detected['duration'] > THRESHOLD_METADATA_TIME_RANK)] - - if not has_long_metadata.empty: - issue = 'There are {} ranks where metadata operations take over {} seconds'.format( - len(has_long_metadata), THRESHOLD_METADATA_TIME_RANK - ) - - recommendation = [ - { - 'message': 'Attempt to combine files, reduce, or cache metadata operations' - } - ] - - if 'H5F' in modules: - recommendation.append( - { - 'message': 'Since your appplication uses HDF5, try enabling collective metadata calls with H5Pset_coll_metadata_write() and H5Pset_all_coll_metadata_ops()', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-collective-metadata.c'), line_numbers=True, background_color='default') - }, - { - 'message': 'Since your appplication uses HDF5, try using metadata cache to defer metadata operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-cache.c'), line_numbers=True, background_color='default') - } - ) - - insights_metadata.append( - message(args, INSIGHTS_POSIX_HIGH_METADATA_TIME, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) + count_long_metadata = len(df_detected[(df_detected['duration'] > THRESHOLD_METADATA_TIME_RANK)]) + check_long_metadata(count_long_metadata, modules) + # We already have a single line for each shared-file access # To check for stragglers, we can check the difference between the @@ -557,9 +258,8 @@ def add_duration(row): stragglers_count = 0 detected_files = [] - - for fid in shared_files: - df_posix_in_one_file = df_posix[(df_posix['file_id'] == fid)] + for id in shared_files: + df_posix_in_one_file = df_posix[(df_posix['file_id'] == id)] total_transfer_size = df_posix_in_one_file['size'].sum() df_detected = df_posix_in_one_file.groupby('rank').agg({'size': 'sum', 'duration': 'sum'}).reset_index() @@ -570,39 +270,13 @@ def add_duration(row): stragglers_count += 1 detected_files.append([ - unique_files[fid], abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size * 100 + id, abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size * 100 ]) + + column_names = ['id', 'data_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) - if stragglers_count: - issue = 'Detected data transfer imbalance caused by stragglers when accessing {} shared file.'.format( - stragglers_count - ) - - detail = [] - - for file in detected_files: - detail.append( - { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - file[1], - file[0] if args.full_path else os.path.basename(file[0]) - ) - } - ) - - recommendation = [ - { - 'message': 'Consider better balancing the data transfer between the application ranks' - }, - { - 'message': 'Consider tuning how your data is distributed in the file system by changing the 
stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) - ) + check_shared_data_imblance(stragglers_count, detected_files, file_map) # POSIX_F_FASTEST_RANK_TIME # POSIX_F_SLOWEST_RANK_TIME @@ -611,9 +285,8 @@ def add_duration(row): stragglers_count = 0 detected_files = [] - - for fid in shared_files: - df_posix_in_one_file = df_posix[(df_posix['file_id'] == fid)] + for id in shared_files: + df_posix_in_one_file = df_posix[(df_posix['file_id'] == id)] total_transfer_time = df_posix_in_one_file['duration'].sum() df_detected = df_posix_in_one_file.groupby('rank')['duration'].sum().reset_index() @@ -625,48 +298,21 @@ def add_duration(row): stragglers_count += 1 detected_files.append([ - unique_files[fid], abs(slowest_rank_time - fastest_rank_time) / total_transfer_time * 100 + id, abs(slowest_rank_time - fastest_rank_time) / total_transfer_time * 100 ]) - if stragglers_count: - issue = 'Detected time imbalance caused by stragglers when accessing {} shared file.'.format( - stragglers_count - ) + column_names = ['id', 'time_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) - detail = [] - - for file in detected_files: - detail.append( - { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - file[1], - file[0] if args.full_path else os.path.basename(file[0]) - ) - } - ) - - recommendation = [ - { - 'message': 'Consider better distributing the data in the parallel file system' # needs to review what suggestion to give - }, - { - 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) - ) + check_shared_time_imbalance(stragglers_count, detected_files, file_map) # Get the individual files responsible for imbalance imbalance_count = 0 detected_files = [] - - for fid in unique_files.keys(): - if fid in shared_files: continue - df_detected = df_posix[(df_posix['file_id'] == fid) & ~(df_posix['function'].str.contains('read'))] + for id in file_map.keys(): + if id in shared_files: continue + df_detected = df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read'))] max_bytes_written = df_detected['size'].max() min_bytes_written = df_detected['size'].min() @@ -675,53 +321,19 @@ def add_duration(row): imbalance_count += 1 detected_files.append([ - unique_files[fid], abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 + id, abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 ]) - if imbalance_count: - issue = 'Detected write imbalance when accessing {} individual files'.format( - imbalance_count - ) + column_names = ['id', 'write_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) - detail = [] - - for file in detected_files: - detail.append( - { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - file[1], - file[0] if args.full_path else os.path.basename(file[0]) - ) - } - ) - - recommendation = [ - { - 'message': 'Consider better balancing the data transfer between the application 
ranks' - }, - { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - }, - { - 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') - }, - { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) + check_individual_write_imbalance(imbalance_count, detected_files, file_map) imbalance_count = 0 detected_files = [] - - for fid in shared_files: - df_detected = df_posix[(df_posix['file_id'] == fid) & (df_posix['function'].str.contains('read'))] + for id in shared_files: + df_detected = df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read'))] max_bytes_read = df_detected['size'].max() min_bytes_read = df_detected['size'].min() @@ -730,152 +342,62 @@ def add_duration(row): imbalance_count += 1 detected_files.append([ - unique_files[fid], abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 - ]) + id, abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 + ]) - if imbalance_count: - issue = 'Detected read imbalance when accessing {} individual files.'.format( - imbalance_count - ) + column_names = ['id', 'read_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) - detail = [] - - for file in detected_files: - detail.append( - { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - file[1], - file[0] if args.full_path else os.path.basename(file[0]) - ) - } - ) - - recommendation = [ - { - 'message': 'Consider better balancing the data transfer between the application ranks' - }, - { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - }, - { - 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') - }, - { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) + check_individual_read_imbalance(imbalance_count, detected_files, file_map) ######################################################################################################################################################################### - if df_intervals['api'].eq('MPIIO').any(): - df_mpiio = df_intervals[(df_intervals['api'] == 'MPIIO')] + if df_intervals['api'].eq('MPI-IO').any(): + df_mpiio = df_intervals[(df_intervals['api'] == 'MPI-IO')] df_mpiio_reads = df_mpiio[(df_mpiio['function'].str.contains('read'))] - mpiio_indp_reads = len(df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all'))]) + mpiio_indep_reads = len(df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all'))]) mpiio_coll_reads = 
len(df_mpiio_reads[(df_mpiio_reads['function'].str.contains('_all'))]) - total_mpiio_read_operations = mpiio_indp_reads + mpiio_coll_reads + total_mpiio_read_operations = mpiio_indep_reads + mpiio_coll_reads df_mpiio_writes = df_mpiio[~(df_mpiio['function'].str.contains('read'))] - mpiio_indp_writes = len(df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all'))]) + mpiio_indep_writes = len(df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all'))]) mpiio_coll_writes = len(df_mpiio_writes[(df_mpiio_writes['function'].str.contains('_all'))]) - total_mpiio_write_operations = mpiio_indp_writes + mpiio_coll_writes - - detected_files = [] # [fname, total_read, total_write] - for fid in unique_files.keys(): - read_cnt = len(df_mpiio_reads[(df_mpiio_reads['file_id'] == fid) & (df_mpiio_reads['function'].str.contains('read'))]) - write_cnt = len(df_mpiio_reads[(df_mpiio_reads['file_id'] == fid) & ~(df_mpiio_reads['function'].str.contains('read'))]) - detected_files.append([unique_files[fid], read_cnt, write_cnt]) - - if mpiio_coll_reads == 0: - if total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: - issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls'.format( - mpiio_indp_reads, - mpiio_indp_reads / (total_mpiio_read_operations) * 100 - ) - - detail = [] - - for file in detected_files: - total_cnt = file[1] + file[2] - if total_cnt and file[1] / total_cnt > THRESHOLD_COLLECTIVE_OPERATIONS and total_cnt > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: - detail.append( - { - 'message': '{} ({}%) of independent reads to "{}"'.format( - file[1], - file[1] / total_cnt * 100, - file[0] if args.full_path else os.path.basename(file[0]) - ) - } - ) - - recommendation = [ - { - 'message': 'Use collective read operations (e.g. MPI_File_read_all() or MPI_File_read_at_all()) and set one aggregator per compute node', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) - else: - issue = 'Application uses MPI-IO and read data using {} ({:.2f}%) collective operations'.format( - mpiio_coll_reads, - mpiio_coll_reads / total_mpiio_read_operations * 100 - ) - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, OK, issue) - ) - - if mpiio_coll_writes == 0: - if total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: - issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls'.format( - mpiio_indp_writes, - mpiio_indp_writes / (total_mpiio_write_operations) * 100 - ) - - detail = [] - - for file in detected_files: - total_cnt = file[1] + file[2] - if total_cnt and file[2] / total_cnt > THRESHOLD_COLLECTIVE_OPERATIONS and total_cnt > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: - detail.append( - { - 'message': '{} ({}%) of independent writes to "{}"'.format( - file[2], - file[2] / total_cnt * 100, - file[0] if args.full_path else os.path.basename(file[0]) - ) - } - ) - - recommendation = [ - { - 'message': 'Use collective write operations (e.g. 
MPI_File_write_all() or MPI_File_write_at_all()) and set one aggregator per compute node', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) - ) + total_mpiio_write_operations = mpiio_indep_writes + mpiio_coll_writes - else: - issue = 'Application uses MPI-IO and write data using {} ({:.2f}%) collective operations'.format( - mpiio_coll_writes, - mpiio_coll_writes / total_mpiio_write_operations * 100 - ) + detected_files = [] + if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + for id in file_map.keys(): + indep_read_count = df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all')) & (df_mpiio_reads['file_id'] == id)] + indep_write_count = df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all')) & (df_mpiio_writes['file_id'] == id)] + indep_total_count = indep_read_count + indep_write_count; + + if (indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE and indep_read_count / indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS): + detected_files.append([ + id, indep_read_count, indep_read_count / indep_total_count * 100 + ]) + + column_names = ['id', 'absolute_indep_reads', 'percent_indep_reads'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map) + + detected_files = [] + if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + for id in file_map.keys(): + indep_read_count = df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all')) & (df_mpiio_reads['file_id'] == id)] + indep_write_count = df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all')) & (df_mpiio_writes['file_id'] == id)] + indep_total_count = indep_read_count + indep_write_count; + + if (indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE and indep_write_count / indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS): + detected_files.append([ + id, indep_write_count, indep_write_count / indep_total_count * 100 + ]) - insights_operation.append( - message(args, INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, OK, issue) - ) + column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map) ######################################################################################################################################################################### @@ -885,60 +407,15 @@ def add_duration(row): has_hdf5_extension = False - for fid in unique_files.keys(): - fname = unique_files[fid] + for id in file_map.keys(): + fname = file_map[id] if fname.endswith('.h5') or fname.endswith('.hdf5'): has_hdf5_extension = True - if len(df_mpiio_reads[(df_mpiio_reads['function'].str.contains('iread|begin|end'))]) == 0: - issue = 'Application could benefit from non-blocking (asynchronous) reads' - - recommendation = [] - - if 'H5F' in modules or has_hdf5_extension: - recommendation.append( - { - 'message': 'Since you use HDF5, 
consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-read.c'), line_numbers=True, background_color='default') - } - ) - - if 'MPIIO' in modules: - recommendation.append( - { - 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iread(), MPI_File_read_all_begin/end(), or MPI_File_read_at_all_begin/end())', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iread.c'), line_numbers=True, background_color='default') - } - ) - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) - ) - - if len(df_mpiio_writes[(df_mpiio_writes['function'].str.contains('iwrite|begin|end'))]) == 0: - issue = 'Application could benefit from non-blocking (asynchronous) writes' - - recommendation = [] - - if 'H5F' in modules or has_hdf5_extension: - recommendation.append( - { - 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-write.c'), line_numbers=True, background_color='default') - } - ) - - if 'MPIIO' in modules: - recommendation.append( - { - 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iwrite(), MPI_File_write_all_begin/end(), or MPI_File_write_at_all_begin/end())', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iwrite.c'), line_numbers=True, background_color='default') - } - ) - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) - ) + mpiio_nb_reads = len(df_mpiio_reads[(df_mpiio_reads['function'].str.contains('iread|begin|end'))]) + mpiio_nb_writes = len(df_mpiio_writes[(df_mpiio_writes['function'].str.contains('iwrite|begin|end'))]) + + check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules) ######################################################################################################################################################################### @@ -947,34 +424,6 @@ def add_duration(row): # TODO: ######################################################################################################################################################################### - - NUMBER_OF_COMPUTE_NODES = 0 - - ######################################################################################################################################################################### - - codes = [] - if args.json: - f = open(args.json) - data = json.load(f) - - for key, values in data.items(): - for value in values: - code = value['code'] - codes.append(code) - - level = value['level'] - issue = value['issue'] - recommendation = [] - for rec in value['recommendations']: - new_message = {'message': rec} - recommendation.append(new_message) - - insights_dxt.append( - message(args, code, TARGET_DEVELOPER, level, issue, recommendation) - ) - - ######################################################################################################################################################################### - insights_end_time = time.time() @@ -992,9 +441,6 @@ def add_duration(row): total_files_posix - total_files_mpiio, # Since MPI-IO files will always use POSIX, we can decrement to get a unique count total_files_mpiio ), - ' [b]COMPUTE NODES[/b] 
[white]{}[/white]'.format( - NUMBER_OF_COMPUTE_NODES - ), ' [b]PROCESSES[/b] [white]{}[/white]'.format( reader.GM.total_ranks ), @@ -1013,154 +459,14 @@ def add_duration(row): console.print() - if insights_metadata: - console.print( - Panel( - Padding( - Group( - *insights_metadata - ), - (1, 1) - ), - title='METADATA', - title_align='left' - ) - ) + display_content() + display_footer(insights_start_time, insights_end_time) - if insights_operation: - console.print( - Panel( - Padding( - Group( - *insights_operation - ), - (1, 1) - ), - title='OPERATIONS', - title_align='left' - ) - ) + export_html() + export_svg() - if insights_dxt: - console.print( - Panel( - Padding( - Group( - *insights_dxt - ), - (1, 1) - ), - title='DXT', - title_align='left' - ) - ) - - console.print( - Panel( - ' {} | [white]LBNL[/white] | [white]Drishti report generated at {} in[/white] {:.3f} seconds'.format( - datetime.datetime.now().year, - datetime.datetime.now(), - insights_end_time - insights_start_time - ), - box=box.SIMPLE - ) + filename = '{}-summary.csv'.format( + args.log_path ) - - if args.export_theme_light: - export_theme = TerminalTheme( - (255, 255, 255), - (0, 0, 0), - [ - (26, 26, 26), - (244, 0, 95), - (152, 224, 36), - (253, 151, 31), - (157, 101, 255), - (244, 0, 95), - (88, 209, 235), - (120, 120, 120), - (98, 94, 76), - ], - [ - (244, 0, 95), - (152, 224, 36), - (224, 213, 97), - (157, 101, 255), - (244, 0, 95), - (88, 209, 235), - (246, 246, 239), - ], - ) - else: - export_theme = MONOKAI - - if args.export_html: - console.save_html( - '{}.html'.format(args.log_path), - theme=export_theme, - clear=False - ) - - if args.export_svg: - console.save_svg( - '{}.svg'.format(args.log_path), - title='Drishti', - theme=export_theme, - clear=False - ) - - if args.export_csv: - issues = [ - 'JOB', - INSIGHTS_STDIO_HIGH_USAGE, - INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, - INSIGHTS_POSIX_READ_COUNT_INTENSIVE, - INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, - INSIGHTS_POSIX_READ_SIZE_INTENSIVE, - INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, - INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, - INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, - INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, - INSIGHTS_POSIX_REDUNDANT_READ_USAGE, - INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, - INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, - INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, - INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, - INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, - INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, - INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, - INSIGHTS_POSIX_HIGH_METADATA_TIME, - INSIGHTS_POSIX_SIZE_IMBALANCE, - INSIGHTS_POSIX_TIME_IMBALANCE, - INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, - INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, - INSIGHTS_MPI_IO_NO_USAGE, - INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, - INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, - INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE, - INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE, - INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, - INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, - INSIGHTS_MPI_IO_AGGREGATORS_INTRA, - INSIGHTS_MPI_IO_AGGREGATORS_INTER, - INSIGHTS_MPI_IO_AGGREGATORS_OK - ] - if codes: - issues.extend(codes) - - detected_issues = dict.fromkeys(issues, False) - detected_issues['JOB'] = None - - for report in csv_report: - detected_issues[report] = True - - filename = '{}-summary.csv'.format( - args.log_path - ) - - with open(filename, 'w') as f: - w = csv.writer(f) - w.writerow(detected_issues.keys()) - w.writerow(detected_issues.values()) - + 
export_csv(filename) From d7fff3db6057068fb39c6f8a477d231725035f6f Mon Sep 17 00:00:00 2001 From: onewbiek Date: Thu, 30 Nov 2023 12:22:55 -0800 Subject: [PATCH 12/19] Accommodate split mode --- drishti/config.py | 24 ++++---- drishti/handle_darshan.py | 17 +++--- drishti/module.py | 114 +++++++++++++++++++++++++++++++++++--- 3 files changed, 127 insertions(+), 28 deletions(-) diff --git a/drishti/config.py b/drishti/config.py index aaf25b1..e11a824 100644 --- a/drishti/config.py +++ b/drishti/config.py @@ -84,19 +84,17 @@ DETAILS_MAX_SIZE = 10 -# TODO: need to verify the threashold to be between 0 and 1 -# TODO: read thresholds from file - -console = Console(record=True) csv_report = [] codes = [] -export_theme = MONOKAI + +# TODO: need to verify the threashold to be between 0 and 1 +# TODO: read thresholds from file def init_console(): - set_export_size() - set_export_theme() + console = Console(record=True) + if args.export_size: console.width = int(args.export_size) insights_operation.clear() insights_metadata.clear() @@ -104,10 +102,10 @@ def init_console(): insights_total[HIGH] = 0 insights_total[WARN] = 0 insights_total[RECOMMENDATIONS] = 0 + return console def set_export_theme(): - global export_theme if args.export_theme_light: export_theme = TerminalTheme( (255, 255, 255), @@ -133,10 +131,9 @@ def set_export_theme(): (246, 246, 239), ], ) - - -def set_export_size(): - if args.export_size: console.width = int(args.export_size) + else: + export_theme = MONOKAI + return export_theme def load_json(): @@ -268,5 +265,6 @@ def message(code, target, level, issue, recommendations=None, details=None): ''' Pre-load ''' -load_json() +if not args.split_files: + load_json() diff --git a/drishti/handle_darshan.py b/drishti/handle_darshan.py index 6daa7b5..98cc63b 100644 --- a/drishti/handle_darshan.py +++ b/drishti/handle_darshan.py @@ -21,7 +21,7 @@ def is_available(name): return shutil.which(name) is not None -def check_log_version(file, log_version, library_version): +def check_log_version(console, file, log_version, library_version): use_file = file if version.parse(log_version) < version.parse('3.4.0'): @@ -71,7 +71,7 @@ def check_log_version(file, log_version, library_version): def handler(): - init_console() + console = init_console() validate_thresholds() insights_start_time = time.time() @@ -86,7 +86,7 @@ def handler(): library_version = darshanll.darshan.backend.cffi_backend.get_lib_version() # Make sure log format is of the same version - filename = check_log_version(args.log_path, log_version, library_version) + filename = check_log_version(console, args.log_path, log_version, library_version) darshanll.log_close(log) @@ -651,11 +651,14 @@ def handler(): console.print() - display_content() - display_footer(insights_start_time, insights_end_time) + display_content(console) + display_footer(console, insights_start_time, insights_end_time) - export_html() - export_svg() + filename = '{}.html'.format(args.log_path) + export_html(console, filename) + + filename = '{}.svg'.format(args.log_path) + export_svg(console, filename) filename = '{}-summary.csv'.format( args.log_path.replace('.darshan', '') diff --git a/drishti/module.py b/drishti/module.py index 3a58a06..a75d574 100644 --- a/drishti/module.py +++ b/drishti/module.py @@ -433,6 +433,27 @@ def check_shared_data_imblance(stragglers_count, detected_files, file_map): ) +def check_shared_data_imblance_split(slowest_rank_bytes, fastest_rank_bytes, total_transfer_size): + if total_transfer_size and abs(slowest_rank_bytes - 
fastest_rank_bytes) / total_transfer_size > THRESHOLD_STRAGGLERS: + issue = 'Load imbalance of {:.2f}% detected'.format( + abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size * 100 + ) + + recommendation = [ + { + 'message': 'Consider better balancing the data transfer between the application ranks' + }, + { + 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation) + ) + + ''' detected_files required columns: ['id', 'time_imbalance'] @@ -470,6 +491,27 @@ def check_shared_time_imbalance(stragglers_count, detected_files, file_map): ) +def check_shared_time_imbalance_split(slowest_rank_time, fastest_rank_time, total_transfer_time): + if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > THRESHOLD_STRAGGLERS: + issue = 'Load imbalance of {:.2f}% detected'.format( + abs(slowest_rank_time - fastest_rank_time) / total_transfer_time * 100 + ) + + recommendation = [ + { + 'message': 'Consider better distributing the data in the parallel file system' # needs to review what suggestion to give + }, + { + 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + } + ] + + insights_operation.append( + message(INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation) + ) + + ''' detected_files required columns: ['id', 'write_imbalance'] @@ -514,6 +556,34 @@ def check_individual_write_imbalance(imbalance_count, detected_files, file_map): ) +def check_individual_write_imbalance_split(max_bytes_written, min_bytes_written): + if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > THRESHOLD_IMBALANCE: + issue = 'Load imbalance of {:.2f}% detected'.format( + abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 + ) + + recommendation = [ + { + 'message': 'Consider better balancing the data transfer between the application ranks' + }, + { + 'message': 'Consider tuning the stripe size and count to better distribute the data', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + }, + { + 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + }, + { + 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' + } + ] + + insights_operation.append( + message(INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + + ''' detected_files required columns: ['id', 'read_imbalance'] @@ -558,6 +628,34 @@ def check_individual_read_imbalance(imbalance_count, detected_files, file_map): ) +def check_individual_read_imbalance_split(max_bytes_read, min_bytes_read): + if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > THRESHOLD_IMBALANCE: + issue = 'Load imbalance of {:.2f}% detected'.format( + abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 + ) + + 
recommendation = [ + { + 'message': 'Consider better balancing the data transfer between the application ranks' + }, + { + 'message': 'Consider tuning the stripe size and count to better distribute the data', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + }, + { + 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', + 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + }, + { + 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' + } + ] + + insights_operation.append( + message(INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation) + ) + + # MPIIO level check ''' @@ -738,7 +836,7 @@ def check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES): # Layout and export -def display_content(): +def display_content(console): if insights_metadata: console.print( Panel( @@ -782,7 +880,7 @@ def display_content(): ) -def display_footer(insights_start_time, insights_end_time): +def display_footer(console, insights_start_time, insights_end_time): console.print( Panel( ' {} | [white]LBNL[/white] | [white]Drishti report generated at {} in[/white] {:.3f} seconds'.format( @@ -794,21 +892,21 @@ def display_footer(insights_start_time, insights_end_time): ) ) -def export_html(): +def export_html(console, filename): if args.export_html: console.save_html( - '{}.html'.format(args.log_path), - theme=export_theme, + filename, + theme=set_export_theme(), clear=False ) -def export_svg(): +def export_svg(console, filename): if args.export_svg: console.save_svg( - '{}.svg'.format(args.log_path), + filename, title='Drishti', - theme=export_theme, + theme=set_export_theme(), clear=False ) From 5aac80c0d326aeb4ebcad6fa4f714abbe86f905e Mon Sep 17 00:00:00 2001 From: onewbiek Date: Thu, 30 Nov 2023 12:23:43 -0800 Subject: [PATCH 13/19] Embedded split mode --- drishti/handle_recorder.py | 449 ++++++++------ drishti/handle_recorder_split.py | 982 ------------------------------- drishti/includes.py | 203 ------- drishti/reporter.py | 17 +- 4 files changed, 290 insertions(+), 1361 deletions(-) delete mode 100644 drishti/handle_recorder_split.py delete mode 100644 drishti/includes.py diff --git a/drishti/handle_recorder.py b/drishti/handle_recorder.py index b864e0a..a9af622 100644 --- a/drishti/handle_recorder.py +++ b/drishti/handle_recorder.py @@ -26,19 +26,21 @@ def init_df_posix_recordes(reader): func_name = func_list[record.func_id] if 'MPI' not in func_name and 'H5' not in func_name: - records.append( [rank, func_name, record.tstart, record.tend] ) + filename = None + if "open" in func_name or "close" in func_name or "creat" in func_name \ + or "seek" in func_name or "sync" in func_name: + fstr = record.args[0] + filename = fstr if type(fstr)==str else fstr.decode('utf-8') + filename = filename.replace('./', '') - head = ['rank', 'function', 'start', 'end'] + records.append( [filename, rank, func_name, record.tstart, record.tend] ) + + head = ['fname', 'rank', 'function', 'start', 'end'] df_posix_records = pd.DataFrame(records, columns=head) return df_posix_records def handler(): - init_console() - validate_thresholds() - - insights_start_time = time.time() - reader = RecorderReader(args.log_path) df_intervals = build_offset_intervals(reader) df_posix_records = init_df_posix_recordes(reader) @@ -60,10 +62,23 @@ def add_duration(row): 
df_intervals['duration'] = df_intervals.apply(add_duration, axis=1) df_posix_records['duration'] = df_posix_records.apply(add_duration, axis=1) - modules = set(df_intervals['api'].unique()) + if args.split_files: + for fid in file_map: + process_helper(file_map, df_intervals[(df_intervals['file_id'] == fid)], + df_posix_records[(df_posix_records['fname'] == file_map[fid])], fid) + else: + process_helper(file_map, df_intervals, df_posix_records) - ######################################################################################################################################################################### +def process_helper(file_map, df_intervals, df_posix_records, fid=None): + if not len(df_intervals): return + + insights_start_time = time.time() + + console = init_console() + validate_thresholds() + + modules = set(df_intervals['api'].unique()) # Check usage of POSIX, and MPI-IO per file total_size_stdio = 0 total_size_posix = 0 @@ -75,23 +90,28 @@ def add_duration(row): total_files_posix = 0 total_files_mpiio = 0 - for id in file_map.keys(): - df_intervals_in_one_file = df_intervals[(df_intervals['file_id'] == id)] - df_stdio_intervals_in_one_file = df_intervals_in_one_file[(df_intervals_in_one_file['api'] == 'STDIO')] - df_posix_intervals_in_one_file = df_intervals_in_one_file[(df_intervals_in_one_file['api'] == 'POSIX')] - df_mpiio_intervals_in_one_file = df_intervals_in_one_file[(df_intervals_in_one_file['api'] == 'MPI-IO')] + if args.split_files: + total_size_stdio = df_intervals[(df_intervals['api'] == 'STDIO')]['size'].sum() + total_size_posix = df_intervals[(df_intervals['api'] == 'POSIX')]['size'].sum() + total_size_mpiio = df_intervals[(df_intervals['api'] == 'MPI-IO')]['size'].sum() + else: + for id in file_map.keys(): + df_intervals_in_one_file = df_intervals[(df_intervals['file_id'] == id)] + df_stdio_intervals_in_one_file = df_intervals_in_one_file[(df_intervals_in_one_file['api'] == 'STDIO')] + df_posix_intervals_in_one_file = df_intervals_in_one_file[(df_intervals_in_one_file['api'] == 'POSIX')] + df_mpiio_intervals_in_one_file = df_intervals_in_one_file[(df_intervals_in_one_file['api'] == 'MPI-IO')] - if len(df_stdio_intervals_in_one_file): - total_files_stdio += 1 - total_size_stdio += df_stdio_intervals_in_one_file['size'].sum() + if len(df_stdio_intervals_in_one_file): + total_files_stdio += 1 + total_size_stdio += df_stdio_intervals_in_one_file['size'].sum() - if len(df_posix_intervals_in_one_file): - total_files_posix += 1 - total_size_posix += df_posix_intervals_in_one_file['size'].sum() + if len(df_posix_intervals_in_one_file): + total_files_posix += 1 + total_size_posix += df_posix_intervals_in_one_file['size'].sum() - if len(df_mpiio_intervals_in_one_file): - total_files_mpiio += 1 - total_size_mpiio += df_mpiio_intervals_in_one_file['size'].sum() + if len(df_mpiio_intervals_in_one_file): + total_files_mpiio += 1 + total_size_mpiio += df_mpiio_intervals_in_one_file['size'].sum() # Since POSIX will capture both POSIX-only accesses and those comming from MPI-IO, we can subtract those @@ -138,14 +158,17 @@ def add_duration(row): total_reads_small = len(df_posix[(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) total_writes_small = len(df_posix[~(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) - detected_files = [] - for id in file_map.keys(): - read_cnt = len(df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read')) & (df_posix['size'] < 
THRESHOLD_SMALL_BYTES)]) - write_cnt = len(df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) - detected_files.append([id, read_cnt, write_cnt]) + if args.split_files: + detected_files = pd.DataFrame() + else: + detected_files = [] + for id in file_map.keys(): + read_cnt = len(df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + write_cnt = len(df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + detected_files.append([id, read_cnt, write_cnt]) - column_names = ['id', 'total_reads', 'total_writes'] - detected_files = pd.DataFrame(detected_files, columns=column_names) + column_names = ['id', 'total_reads', 'total_writes'] + detected_files = pd.DataFrame(detected_files, columns=column_names) check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map) @@ -213,37 +236,38 @@ def add_duration(row): detected_files = grp_posix_by_id['rank'].nunique() shared_files = set(detected_files[detected_files > 1].index) - total_shared_reads = 0 - total_shared_reads_small = 0 - total_shared_writes = 0 - total_shared_writes_small = 0 - - detected_files = [] - for id in shared_files: - total_shared_reads += len(df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read'))]) - total_shared_writes += len(df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read'))]) - - read_cnt = len(df_posix[(df_posix['file_id'] == id) + total_shared_reads = len(df_posix[(df_posix['file_id'].isin(shared_files)) & (df_posix['function'].str.contains('read'))]) + total_shared_reads_small = len(df_posix[(df_posix['file_id'].isin(shared_files)) & (df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) - write_cnt = len(df_posix[(df_posix['file_id'] == id) + + total_shared_writes = len(df_posix[(df_posix['file_id'].isin(shared_files)) & ~(df_posix['function'].str.contains('read'))]) + total_shared_writes_small = len(df_posix[(df_posix['file_id'].isin(shared_files)) & ~(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) - detected_files.append([id, read_cnt, write_cnt]) - total_shared_reads_small += read_cnt - total_shared_writes_small += write_cnt - - column_names = ['id', 'INSIGHTS_POSIX_SMALL_READS', 'INSIGHTS_POSIX_SMALL_WRITES'] - detected_files = pd.DataFrame(detected_files, columns=column_names) + if args.split_files: + detected_files = pd.DataFrame() + else: + detected_files = [] + for id in shared_files: + read_cnt = len(df_posix[(df_posix['file_id'] == id) + & (df_posix['function'].str.contains('read')) + & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + write_cnt = len(df_posix[(df_posix['file_id'] == id) + & ~(df_posix['function'].str.contains('read')) + & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + detected_files.append([id, read_cnt, write_cnt]) + + column_names = ['id', 'INSIGHTS_POSIX_SMALL_READS', 'INSIGHTS_POSIX_SMALL_WRITES'] + detected_files = pd.DataFrame(detected_files, columns=column_names) check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, detected_files, file_map) 
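# The refactor funnels the per-file counters into a small DataFrame plus a
# file_map (file id -> path) before handing them to the check helper defined
# in module.py. A minimal sketch of that data shape, with made-up ids and
# paths purely for illustration:
import pandas as pd

file_map = {1: '/scratch/run/output.h5', 2: '/scratch/run/checkpoint.dat'}

detected_files = pd.DataFrame(
    [[1, 1200, 300], [2, 10, 0]],
    columns=['id', 'INSIGHTS_POSIX_SMALL_READS', 'INSIGHTS_POSIX_SMALL_WRITES']
)

# Inside the helpers, ids are expected to be resolved back to paths via the
# map, e.g. when building per-file detail messages:
for _, row in detected_files.iterrows():
    print('{} small reads / {} small writes to "{}"'.format(
        row['INSIGHTS_POSIX_SMALL_READS'],
        row['INSIGHTS_POSIX_SMALL_WRITES'],
        file_map[row['id']]
    ))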
######################################################################################################################################################################### - # TODO: Here I assume all operations other than write/read are metadata operations - df_posix_metadata = df_posix_records[~(df_posix_records['function'].str.contains('read|write|print', na=False))] - df_detected = df_posix_metadata.groupby('rank')['duration'].sum().reset_index() + # TODO: Assumed metadata operations: open, close, sync, create, seek + df_detected = df_posix_records.groupby('rank')['duration'].sum().reset_index() count_long_metadata = len(df_detected[(df_detected['duration'] > THRESHOLD_METADATA_TIME_RANK)]) check_long_metadata(count_long_metadata, modules) @@ -254,101 +278,137 @@ def add_duration(row): # POSIX_FASTEST_RANK_BYTES # POSIX_SLOWEST_RANK_BYTES # POSIX_VARIANCE_RANK_BYTES + if args.split_files: + if df_posix['rank'].nunique() > 1: + total_transfer_size = df_posix['size'].sum() - stragglers_count = 0 - - detected_files = [] - for id in shared_files: - df_posix_in_one_file = df_posix[(df_posix['file_id'] == id)] - total_transfer_size = df_posix_in_one_file['size'].sum() + df_detected = df_posix.groupby('rank').agg({'size': 'sum', 'duration': 'sum'}).reset_index() + slowest_rank_bytes = df_detected.loc[df_detected['duration'].idxmax(), 'size'] + fastest_rank_bytes = df_detected.loc[df_detected['duration'].idxmin(), 'size'] + + check_shared_data_imblance_split(slowest_rank_bytes, fastest_rank_bytes, total_transfer_size) + else: + stragglers_count = 0 + + detected_files = [] + for id in shared_files: + df_posix_in_one_file = df_posix[(df_posix['file_id'] == id)] + total_transfer_size = df_posix_in_one_file['size'].sum() - df_detected = df_posix_in_one_file.groupby('rank').agg({'size': 'sum', 'duration': 'sum'}).reset_index() - slowest_rank_bytes = df_detected.loc[df_detected['duration'].idxmax(), 'size'] - fastest_rank_bytes = df_detected.loc[df_detected['duration'].idxmin(), 'size'] + df_detected = df_posix_in_one_file.groupby('rank').agg({'size': 'sum', 'duration': 'sum'}).reset_index() + slowest_rank_bytes = df_detected.loc[df_detected['duration'].idxmax(), 'size'] + fastest_rank_bytes = df_detected.loc[df_detected['duration'].idxmin(), 'size'] - if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > THRESHOLD_STRAGGLERS: - stragglers_count += 1 + if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > THRESHOLD_STRAGGLERS: + stragglers_count += 1 - detected_files.append([ - id, abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size * 100 - ]) - - column_names = ['id', 'data_imbalance'] - detected_files = pd.DataFrame(detected_files, columns=column_names) + detected_files.append([ + id, abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size * 100 + ]) + + column_names = ['id', 'data_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) - check_shared_data_imblance(stragglers_count, detected_files, file_map) + check_shared_data_imblance(stragglers_count, detected_files, file_map) # POSIX_F_FASTEST_RANK_TIME # POSIX_F_SLOWEST_RANK_TIME # POSIX_F_VARIANCE_RANK_TIME + if args.split_files: + if df_posix['rank'].nunique() > 1: + total_transfer_time = df_posix['duration'].sum() - stragglers_count = 0 - - detected_files = [] - for id in shared_files: - df_posix_in_one_file = df_posix[(df_posix['file_id'] == id)] - total_transfer_time = df_posix_in_one_file['duration'].sum() 
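# Both the split-file and per-file branches above apply the same straggler
# test: aggregate bytes per rank, take the bytes moved by the slowest- and
# fastest-duration ranks, and compare their gap against the total transfer.
# A compact, self-contained sketch of that check, with THRESHOLD_STRAGGLERS
# assumed at its default of 0.15:
import pandas as pd

THRESHOLD_STRAGGLERS = 0.15  # fraction of the total transferred bytes

def detect_straggler(df_posix_in_one_file):
    total = df_posix_in_one_file['size'].sum()
    per_rank = df_posix_in_one_file.groupby('rank').agg({'size': 'sum', 'duration': 'sum'}).reset_index()
    slowest = per_rank.loc[per_rank['duration'].idxmax(), 'size']
    fastest = per_rank.loc[per_rank['duration'].idxmin(), 'size']
    return bool(total and abs(slowest - fastest) / total > THRESHOLD_STRAGGLERS)

# Example: rank 0 moves 1 MB slowly, rank 1 moves 9 MB quickly -> ~80% imbalance
df = pd.DataFrame({'rank': [0, 1], 'size': [1_000_000, 9_000_000], 'duration': [5.0, 1.0]})
print(detect_straggler(df))  # True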
+ df_detected = df_posix.groupby('rank')['duration'].sum().reset_index() + + slowest_rank_time = df_detected['duration'].max() + fastest_rank_time = df_detected['duration'].min() + + check_shared_time_imbalance_split(slowest_rank_time, fastest_rank_time, total_transfer_time) + else: + stragglers_count = 0 + + detected_files = [] + for id in shared_files: + df_posix_in_one_file = df_posix[(df_posix['file_id'] == id)] + total_transfer_time = df_posix_in_one_file['duration'].sum() - df_detected = df_posix_in_one_file.groupby('rank')['duration'].sum().reset_index() + df_detected = df_posix_in_one_file.groupby('rank')['duration'].sum().reset_index() - slowest_rank_time = df_detected['duration'].max() - fastest_rank_time = df_detected['duration'].min() + slowest_rank_time = df_detected['duration'].max() + fastest_rank_time = df_detected['duration'].min() - if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > THRESHOLD_STRAGGLERS: - stragglers_count += 1 + if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > THRESHOLD_STRAGGLERS: + stragglers_count += 1 - detected_files.append([ - id, abs(slowest_rank_time - fastest_rank_time) / total_transfer_time * 100 - ]) + detected_files.append([ + id, abs(slowest_rank_time - fastest_rank_time) / total_transfer_time * 100 + ]) - column_names = ['id', 'time_imbalance'] - detected_files = pd.DataFrame(detected_files, columns=column_names) + column_names = ['id', 'time_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) - check_shared_time_imbalance(stragglers_count, detected_files, file_map) + check_shared_time_imbalance(stragglers_count, detected_files, file_map) # Get the individual files responsible for imbalance - imbalance_count = 0 + if args.split_files: + if df_posix['rank'].nunique() == 1: + df_detected = df_posix[~(df_posix['function'].str.contains('read'))] + + max_bytes_written = df_detected['size'].max() + min_bytes_written = df_detected['size'].min() + + check_individual_write_imbalance_split(max_bytes_written, min_bytes_written) + + if df_posix['rank'].nunique() == 1: + df_detected = df_posix[(df_posix['function'].str.contains('read'))] + + max_bytes_read = df_detected['size'].max() + min_bytes_read = df_detected['size'].min() + + check_individual_read_imbalance_split(max_bytes_read, min_bytes_read) + else: + imbalance_count = 0 - detected_files = [] - for id in file_map.keys(): - if id in shared_files: continue - df_detected = df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read'))] - - max_bytes_written = df_detected['size'].max() - min_bytes_written = df_detected['size'].min() + detected_files = [] + for id in file_map.keys(): + if id in shared_files: continue + df_detected = df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read'))] + + max_bytes_written = df_detected['size'].max() + min_bytes_written = df_detected['size'].min() - if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > THRESHOLD_IMBALANCE: - imbalance_count += 1 + if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > THRESHOLD_IMBALANCE: + imbalance_count += 1 - detected_files.append([ - id, abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 - ]) + detected_files.append([ + id, abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 + ]) - column_names = ['id', 'write_imbalance'] - detected_files = 
pd.DataFrame(detected_files, columns=column_names) + column_names = ['id', 'write_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) - check_individual_write_imbalance(imbalance_count, detected_files, file_map) + check_individual_write_imbalance(imbalance_count, detected_files, file_map) - imbalance_count = 0 + imbalance_count = 0 - detected_files = [] - for id in shared_files: - df_detected = df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read'))] - - max_bytes_read = df_detected['size'].max() - min_bytes_read = df_detected['size'].min() + detected_files = [] + for id in shared_files: + df_detected = df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read'))] + + max_bytes_read = df_detected['size'].max() + min_bytes_read = df_detected['size'].min() - if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > THRESHOLD_IMBALANCE: - imbalance_count += 1 + if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > THRESHOLD_IMBALANCE: + imbalance_count += 1 - detected_files.append([ - id, abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 - ]) + detected_files.append([ + id, abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 + ]) - column_names = ['id', 'read_imbalance'] - detected_files = pd.DataFrame(detected_files, columns=column_names) + column_names = ['id', 'read_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) - check_individual_read_imbalance(imbalance_count, detected_files, file_map) + check_individual_read_imbalance(imbalance_count, detected_files, file_map) ######################################################################################################################################################################### @@ -365,37 +425,43 @@ def add_duration(row): mpiio_coll_writes = len(df_mpiio_writes[(df_mpiio_writes['function'].str.contains('_all'))]) total_mpiio_write_operations = mpiio_indep_writes + mpiio_coll_writes - detected_files = [] - if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: - for id in file_map.keys(): - indep_read_count = df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all')) & (df_mpiio_reads['file_id'] == id)] - indep_write_count = df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all')) & (df_mpiio_writes['file_id'] == id)] - indep_total_count = indep_read_count + indep_write_count; + if args.split_files: + detected_files = pd.DataFrame() + else: + detected_files = [] + if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + for id in file_map.keys(): + indep_read_count = df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all')) & (df_mpiio_reads['file_id'] == id)] + indep_write_count = df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all')) & (df_mpiio_writes['file_id'] == id)] + indep_total_count = indep_read_count + indep_write_count; - if (indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE and indep_read_count / indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS): - detected_files.append([ - id, indep_read_count, indep_read_count / indep_total_count * 100 - ]) + if (indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE and indep_read_count / indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS): + detected_files.append([ + id, indep_read_count, 
indep_read_count / indep_total_count * 100 + ]) - column_names = ['id', 'absolute_indep_reads', 'percent_indep_reads'] - detected_files = pd.DataFrame(detected_files, columns=column_names) + column_names = ['id', 'absolute_indep_reads', 'percent_indep_reads'] + detected_files = pd.DataFrame(detected_files, columns=column_names) check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map) - detected_files = [] - if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: - for id in file_map.keys(): - indep_read_count = df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all')) & (df_mpiio_reads['file_id'] == id)] - indep_write_count = df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all')) & (df_mpiio_writes['file_id'] == id)] - indep_total_count = indep_read_count + indep_write_count; + if args.split_files: + detected_files = pd.DataFrame() + else: + detected_files = [] + if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + for id in file_map.keys(): + indep_read_count = df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all')) & (df_mpiio_reads['file_id'] == id)] + indep_write_count = df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all')) & (df_mpiio_writes['file_id'] == id)] + indep_total_count = indep_read_count + indep_write_count; - if (indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE and indep_write_count / indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS): - detected_files.append([ - id, indep_write_count, indep_write_count / indep_total_count * 100 - ]) + if (indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE and indep_write_count / indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS): + detected_files.append([ + id, indep_write_count, indep_write_count / indep_total_count * 100 + ]) - column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes'] - detected_files = pd.DataFrame(detected_files, columns=column_names) + column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes'] + detected_files = pd.DataFrame(detected_files, columns=column_names) check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map) @@ -429,44 +495,83 @@ def add_duration(row): console.print() - console.print( - Panel( - '\n'.join([ - ' [b]RECORDER[/b]: [white]{}[/white]'.format( - os.path.basename(args.log_path) + if args.split_files: + console.print( + Panel( + '\n'.join([ + ' [b]RECORDER[/b]: [white]{}[/white]'.format( + os.path.basename(args.log_path) + ), + ' [b]FILE[/b]: [white]{} ({})[/white]'.format( + file_map[fid], + fid, + ), + ' [b]PROCESSES[/b] [white]{}[/white]'.format( + df_intervals['rank'].nunique() + ), + ]), + title='[b][slate_blue3]DRISHTI[/slate_blue3] v.0.5[/b]', + title_align='left', + subtitle='[red][b]{} critical issues[/b][/red], [orange1][b]{} warnings[/b][/orange1], and [white][b]{} recommendations[/b][/white]'.format( + insights_total[HIGH], + insights_total[WARN], + insights_total[RECOMMENDATIONS], ), - ' [b]FILES[/b]: [white]{} files ({} use STDIO, {} use POSIX, {} use MPI-IO)[/white]'.format( - total_files, - total_files_stdio, - total_files_posix - total_files_mpiio, # Since MPI-IO files will always use POSIX, we can decrement to get a unique count - total_files_mpiio - ), - ' [b]PROCESSES[/b] 
[white]{}[/white]'.format( - reader.GM.total_ranks + subtitle_align='left', + padding=1 + ) + ) + else: + console.print( + Panel( + '\n'.join([ + ' [b]RECORDER[/b]: [white]{}[/white]'.format( + os.path.basename(args.log_path) + ), + ' [b]FILES[/b]: [white]{} files ({} use STDIO, {} use POSIX, {} use MPI-IO)[/white]'.format( + total_files, + total_files_stdio, + total_files_posix - total_files_mpiio, # Since MPI-IO files will always use POSIX, we can decrement to get a unique count + total_files_mpiio + ), + ' [b]PROCESSES[/b] [white]{}[/white]'.format( + df_intervals['rank'].nunique() + ), + ]), + title='[b][slate_blue3]DRISHTI[/slate_blue3] v.0.5[/b]', + title_align='left', + subtitle='[red][b]{} critical issues[/b][/red], [orange1][b]{} warnings[/b][/orange1], and [white][b]{} recommendations[/b][/white]'.format( + insights_total[HIGH], + insights_total[WARN], + insights_total[RECOMMENDATIONS], ), - ]), - title='[b][slate_blue3]DRISHTI[/slate_blue3] v.0.5[/b]', - title_align='left', - subtitle='[red][b]{} critical issues[/b][/red], [orange1][b]{} warnings[/b][/orange1], and [white][b]{} recommendations[/b][/white]'.format( - insights_total[HIGH], - insights_total[WARN], - insights_total[RECOMMENDATIONS], - ), - subtitle_align='left', - padding=1 + subtitle_align='left', + padding=1 + ) ) - ) console.print() - display_content() - display_footer(insights_start_time, insights_end_time) + display_content(console) + display_footer(console, insights_start_time, insights_end_time) + + if args.split_files: + filename = '{}.{}.html'.format(args.log_path, fid) + else: + filename = '{}.html'.format(args.log_path) + + export_html(console, filename) + + if args.split_files: + filename = '{}.{}.svg'.format(args.log_path, fid) + else: + filename = '{}.svg'.format(args.log_path) - export_html() - export_svg() + export_svg(console, filename) - filename = '{}-summary.csv'.format( - args.log_path - ) + if args.split_files: + filename = '{}.{}.summary.csv'.format(args.log_path, fid) + else: + filename = '{}-summary.csv'.format(args.log_path) export_csv(filename) diff --git a/drishti/handle_recorder_split.py b/drishti/handle_recorder_split.py deleted file mode 100644 index 74bc899..0000000 --- a/drishti/handle_recorder_split.py +++ /dev/null @@ -1,982 +0,0 @@ -#!/usr/bin/env python3 - -import os -import csv -import time -import json - -import datetime - -import pandas as pd - -from rich import print, box -from rich.console import Group -from rich.padding import Padding -from rich.syntax import Syntax -from rich.panel import Panel -from rich.terminal_theme import TerminalTheme -from rich.terminal_theme import MONOKAI - -from recorder_utils import RecorderReader -from recorder_utils.build_offset_intervals import build_offset_intervals - -from .includes import * - - -def get_accessed_files(reader): - ranks = reader.GM.total_ranks - filemap = {} - for rank in range(ranks): - filemap.update(reader.LMs[rank].filemap) - - return filemap - - -def init_df_posix_recordes(reader): - func_list = reader.funcs - ranks = reader.GM.total_ranks - records = [] - for rank in range(ranks): - for i in range(reader.LMs[rank].total_records): - record = reader.records[rank][i] - func_name = func_list[record.func_id] - - if 'MPI' not in func_name and 'H5' not in func_name: - filename = None - if "open" in func_name or "close" in func_name or "creat" in func_name \ - or "seek" in func_name or "sync" in func_name: - fstr = record.args[0] - filename = fstr if type(fstr)==str else fstr.decode('utf-8') - filename = filename.replace('./', 
'') - - records.append( [filename, rank, func_name, record.tstart, record.tend] ) - - head = ['fname', 'rank', 'function', 'start', 'end'] - df_posix_records = pd.DataFrame(records, columns=head) - return df_posix_records - - -def handler(args): - reader = RecorderReader(args.log_path) - df_intervals = build_offset_intervals(reader) - df_posix_records = init_df_posix_recordes(reader) - - unique_files = get_accessed_files(reader) - - def add_api(row): - if 'MPI' in row['function']: - return 'MPIIO' - elif 'H5' in row['function']: - return 'H5F' - else: - return 'POSIX' - - df_intervals['api'] = df_intervals.apply(add_api, axis=1) - - def add_duration(row): - return row['end'] - row['start'] - - df_intervals['duration'] = df_intervals.apply(add_duration, axis=1) - df_posix_records['duration'] = df_posix_records.apply(add_duration, axis=1) - - ######################################################################################################################################################################### - for fid, fname in unique_files.items(): - console = Console(record=True) - init_console(args) - validate_thresholds() - insights_start_time = time.time() - - df_intervals_temp = df_intervals[(df_intervals['file_id'] == fid)] - if not len(df_intervals_temp): continue - - df_posix_records = df_posix_records[(df_posix_records['fname'] == fname)] - modules = set(df_intervals_temp['api'].unique()) - - # Check usage of POSIX, and MPI-IO per file - total_size_stdio = 0 - total_size_posix = 0 - total_size_mpiio = 0 - total_size = 0 - - df_stdio_intervals = df_intervals_temp[(df_intervals_temp['api'] == 'STDIO')] - df_posix_intervals = df_intervals_temp[(df_intervals_temp['api'] == 'POSIX')] - df_mpiio_intervals = df_intervals_temp[(df_intervals_temp['api'] == 'MPIIO')] - - if len(df_stdio_intervals): - total_size_stdio += df_stdio_intervals['size'].sum() - - if len(df_posix_intervals): - total_size_posix += df_posix_intervals['size'].sum() - - if len(df_mpiio_intervals): - total_size_mpiio += df_mpiio_intervals['size'].sum() - - - # Since POSIX will capture both POSIX-only accesses and those comming from MPI-IO, we can subtract those - if total_size_posix > 0 and total_size_posix >= total_size_mpiio: - total_size_posix -= total_size_mpiio - - total_size = total_size_stdio + total_size_posix + total_size_mpiio - - assert(total_size_stdio >= 0) - assert(total_size_posix >= 0) - assert(total_size_mpiio >= 0) - - if total_size and total_size_stdio / total_size > THRESHOLD_INTERFACE_STDIO: - issue = 'Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})'.format( - total_size_stdio / total_size * 100.0, - convert_bytes(total_size_stdio) - ) - - recommendation = [ - { - 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO' - } - ] - - insights_operation.append( - message(args, INSIGHTS_STDIO_HIGH_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - - if 'MPIIO' not in modules: - issue = 'Application is using low-performance interface' - - recommendation = [ - { - 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO' - } - ] - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_NO_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) - ) - - ######################################################################################################################################################################### - - if df_intervals_temp['api'].eq('POSIX').any(): - df_posix = 
df_intervals_temp[(df_intervals_temp['api'] == 'POSIX')] - - ######################################################################################################################################################################### - - # Get number of write/read operations - total_reads = len(df_posix[(df_posix['function'].str.contains('read'))]) - total_writes = len(df_posix[~(df_posix['function'].str.contains('read'))]) - - # Get total number of I/O operations - total_operations = total_writes + total_reads - - # To check whether the application is write-intersive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance - if total_writes > total_reads and total_operations and abs(total_writes - total_reads) / total_operations > THRESHOLD_OPERATION_IMBALANCE: - issue = 'Application is write operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( - total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 - ) - - insights_metadata.append( - message(args, INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) - ) - - if total_reads > total_writes and total_operations and abs(total_writes - total_reads) / total_operations > THRESHOLD_OPERATION_IMBALANCE: - issue = 'Application is read operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( - total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 - ) - - insights_metadata.append( - message(args, INSIGHTS_POSIX_READ_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) - ) - - total_read_size = df_posix[(df_posix['function'].str.contains('read'))]['size'].sum() - total_written_size = df_posix[~(df_posix['function'].str.contains('read'))]['size'].sum() - - total_size = total_written_size + total_read_size - - if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > THRESHOLD_OPERATION_IMBALANCE: - issue = 'Application is write size intensive ({:.2f}% write vs. {:.2f}% read)'.format( - total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 - ) - - insights_metadata.append( - message(args, INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) - ) - - if total_read_size > total_written_size and abs(total_written_size - total_read_size) / total_size > THRESHOLD_OPERATION_IMBALANCE: - issue = 'Application is read size intensive ({:.2f}% write vs. 
{:.2f}% read)'.format( - total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 - ) - - insights_metadata.append( - message(args, INSIGHTS_POSIX_READ_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) - ) - - ######################################################################################################################################################################### - - # Get the number of small I/O operations (less than 1 MB) - - total_reads_small = len(df_posix[(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) - total_writes_small = len(df_posix[~(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) - - if total_reads_small and total_reads_small / total_reads > THRESHOLD_SMALL_REQUESTS and total_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: - issue = 'Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests'.format( - total_reads_small, total_reads_small / total_reads * 100.0 - ) - - recommendation = [] - - recommendation.append( - { - 'message': 'Consider buffering read operations into larger more contiguous ones' - } - ) - - if 'MPIIO' in modules: - recommendation.append( - { - 'message': 'Since the application already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_read_all() or MPI_File_read_at_all()) to aggregate requests into larger ones', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') - } - ) - else: - recommendation.append( - { - 'message': 'Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations' - } - ) - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - - if total_writes_small and total_writes_small / total_writes > THRESHOLD_SMALL_REQUESTS and total_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: - issue = 'Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all write requests'.format( - total_writes_small, total_writes_small / total_writes * 100.0 - ) - - recommendation = [] - - recommendation.append( - { - 'message': 'Consider buffering write operations into larger more contiguous ones' - } - ) - - if 'MPIIO' in modules: - recommendation.append( - { - 'message': 'Since the application already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_write_all() or MPI_File_write_at_all()) to aggregate requests into larger ones', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') - } - ) - else: - recommendation.append( - { - 'message': 'Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations' - } - ) - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - - ######################################################################################################################################################################### - - # How many requests are misaligned? 
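# A minimal standalone sketch of the small-request ratio test above, assuming the
# 'function' and 'size' columns of the interval dataframe built by
# build_offset_intervals() and the thresholds defined in includes.py
# (1 MB request size, 10% of all requests, 1000 requests in absolute terms).
# The sample rows below are hypothetical.
import pandas as pd

df_sample = pd.DataFrame({
    'function': ['read', 'read', 'write', 'pwrite', 'read'],  # hypothetical trace
    'size':     [512, 2048, 4096, 1048576, 100],              # bytes per request
})

THRESHOLD_SMALL_BYTES = 1048576           # 1 MB
THRESHOLD_SMALL_REQUESTS = 0.1            # small requests relative to all requests
THRESHOLD_SMALL_REQUESTS_ABSOLUTE = 1000  # minimum absolute count

is_read = df_sample['function'].str.contains('read')
total_reads = int(is_read.sum())
total_reads_small = int((is_read & (df_sample['size'] < THRESHOLD_SMALL_BYTES)).sum())

# Both the relative and the absolute condition must hold before the insight is raised.
triggers_small_read_insight = (
    total_reads > 0
    and total_reads_small / total_reads > THRESHOLD_SMALL_REQUESTS
    and total_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE
)
print(total_reads, total_reads_small, triggers_small_read_insight)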
- # TODO: - - ######################################################################################################################################################################### - - # Redundant read-traffic (based on Phill) - # POSIX_MAX_BYTE_READ (Highest offset in the file that was read) - max_read_offset = df_posix[(df_posix['function'].str.contains('read'))]['offset'].max() - - if max_read_offset > total_read_size: - issue = 'Application might have redundant read traffic (more data read than the highest offset)' - - insights_metadata.append( - message(args, INSIGHTS_POSIX_REDUNDANT_READ_USAGE, TARGET_DEVELOPER, WARN, issue, None) - ) - - max_write_offset = df_posix[~(df_posix['function'].str.contains('read'))]['offset'].max() - - if max_write_offset > total_written_size: - issue = 'Application might have redundant write traffic (more data written than the highest offset)' - - insights_metadata.append( - message(args, INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, None) - ) - - ######################################################################################################################################################################### - - # Check for a lot of random operations - - read_consecutive = 0 - read_sequential = 0 - read_random = 0 - - df_filtered = df_posix[(df_posix['function'].str.contains('read'))].sort_values('start') - - for i in range(len(df_filtered) - 1): - curr_interval = df_filtered.iloc[i] - next_interval = df_filtered.iloc[i + 1] - if curr_interval['offset'] + curr_interval['size'] == next_interval['offset']: - read_consecutive += 1 - elif curr_interval['offset'] + curr_interval['size'] < next_interval['offset']: - read_sequential += 1 - else: - read_random += 1 - - if total_reads: - if read_random and read_random / total_reads > THRESHOLD_RANDOM_OPERATIONS and read_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE: - issue = 'Application is issuing a high number ({}) of random read operations ({:.2f}%)'.format( - read_random, read_random / total_reads * 100.0 - ) - - recommendation = [ - { - 'message': 'Consider changing your data model to have consecutive or sequential reads' - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - else: - issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) read requests'.format( - read_consecutive / total_reads * 100.0, - read_sequential / total_reads * 100.0 - ) - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, TARGET_DEVELOPER, OK, issue, None) - ) - - write_consecutive = 0 - write_sequential = 0 - write_random = 0 - - - df_filtered = df_posix[~(df_posix['function'].str.contains('read'))].sort_values('start') - - for i in range(len(df_filtered) - 1): - curr_interval = df_filtered.iloc[i] - next_interval = df_filtered.iloc[i + 1] - if curr_interval['offset'] + curr_interval['size'] == next_interval['offset']: - write_consecutive += 1 - elif curr_interval['offset'] + curr_interval['size'] < next_interval['offset']: - write_sequential += 1 - else: - write_random += 1 - - if total_writes: - if write_random and write_random / total_writes > THRESHOLD_RANDOM_OPERATIONS and write_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE: - issue = 'Application is issuing a high number ({}) of random write operations ({:.2f}%)'.format( - write_random, write_random / total_writes * 100.0 - ) - - recommendation = [ - { - 'message': 'Consider changing your data 
model to have consecutive or sequential writes' - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - else: - issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) write requests'.format( - write_consecutive / total_writes * 100.0, - write_sequential / total_writes * 100.0 - ) - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, TARGET_DEVELOPER, OK, issue, None) - ) - - ######################################################################################################################################################################### - - # Shared file with small operations - - # A file is shared if it's been read/written by more than 1 rank - detected_files = df_posix['rank'].nunique() - - total_shared_reads = 0 - total_shared_reads_small = 0 - total_shared_writes = 0 - total_shared_writes_small = 0 - - if df_posix['rank'].nunique() > 1: - total_shared_reads += len(df_posix[(df_posix['function'].str.contains('read'))]) - total_shared_writes += len(df_posix[~(df_posix['function'].str.contains('read'))]) - - total_shared_reads_small = len(df_posix[(df_posix['function'].str.contains('read')) - & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) - total_shared_writes_small = len(df_posix[~(df_posix['function'].str.contains('read')) - & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) - - if total_shared_reads and total_shared_reads_small / total_shared_reads > THRESHOLD_SMALL_REQUESTS and total_shared_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: - issue = 'Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests'.format( - total_shared_reads_small, total_shared_reads_small / total_shared_reads * 100.0 - ) - - recommendation = [ - { - 'message': 'Consider coalesceing read requests into larger more contiguous ones using MPI-IO collective operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - - if total_shared_writes and total_shared_writes_small / total_shared_writes > THRESHOLD_SMALL_REQUESTS and total_shared_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: - issue = 'Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file write requests'.format( - total_shared_writes_small, total_shared_writes_small / total_shared_writes * 100.0 - ) - - recommendation = [ - { - 'message': 'Consider coalescing write requests into larger more contiguous ones using MPI-IO collective operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - - ######################################################################################################################################################################### - - # TODO: Here I assume all operations other than write/read are metadata operations - df_posix_metadata = 
df_posix_records[~(df_posix_records['function'].str.contains('read|write|print', na=False))] - df_detected = df_posix_metadata.groupby('rank')['duration'].sum().reset_index() - has_long_metadata = df_detected[(df_detected['duration'] > THRESHOLD_METADATA_TIME_RANK)] - - if not has_long_metadata.empty: - issue = 'There are {} ranks where metadata operations take over {} seconds'.format( - len(has_long_metadata), THRESHOLD_METADATA_TIME_RANK - ) - - recommendation = [ - { - 'message': 'Attempt to combine files, reduce, or cache metadata operations' - } - ] - - if 'H5F' in modules: - recommendation.append( - { - 'message': 'Since your appplication uses HDF5, try enabling collective metadata calls with H5Pset_coll_metadata_write() and H5Pset_all_coll_metadata_ops()', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-collective-metadata.c'), line_numbers=True, background_color='default') - }, - { - 'message': 'Since your appplication uses HDF5, try using metadata cache to defer metadata operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-cache.c'), line_numbers=True, background_color='default') - } - ) - - insights_metadata.append( - message(args, INSIGHTS_POSIX_HIGH_METADATA_TIME, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - - # We already have a single line for each shared-file access - # To check for stragglers, we can check the difference between the - - # POSIX_FASTEST_RANK_BYTES - # POSIX_SLOWEST_RANK_BYTES - # POSIX_VARIANCE_RANK_BYTES - - stragglers = False - - if df_posix['rank'].nunique() > 1: - total_transfer_size = df_posix['size'].sum() - - df_detected = df_posix.groupby('rank').agg({'size': 'sum', 'duration': 'sum'}).reset_index() - slowest_rank_bytes = df_detected.loc[df_detected['duration'].idxmax(), 'size'] - fastest_rank_bytes = df_detected.loc[df_detected['duration'].idxmin(), 'size'] - - if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > THRESHOLD_STRAGGLERS: - stragglers = True - - if stragglers: - issue = 'Load imbalance of {:.2f}% detected'.format( - abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size * 100 - ) - - recommendation = [ - { - 'message': 'Consider better balancing the data transfer between the application ranks' - }, - { - 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation) - ) - - # POSIX_F_FASTEST_RANK_TIME - # POSIX_F_SLOWEST_RANK_TIME - # POSIX_F_VARIANCE_RANK_TIME - - stragglers = False - - if df_posix['rank'].nunique() > 1: - total_transfer_time = df_posix['duration'].sum() - - df_detected = df_posix.groupby('rank')['duration'].sum().reset_index() - - slowest_rank_time = df_detected['duration'].max() - fastest_rank_time = df_detected['duration'].min() - - if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > THRESHOLD_STRAGGLERS: - stragglers = True - - if stragglers: - issue = 'Load imbalance of {:.2f}% detected'.format( - abs(slowest_rank_time - fastest_rank_time) / total_transfer_time * 100 - ) - - recommendation = [ - { - 'message': 'Consider better distributing the data in the parallel file system' # needs to review what suggestion to give - }, - { - 'message': 'Consider tuning how your data 
is distributed in the file system by changing the stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation) - ) - - # Get the individual files responsible for imbalance - imbalance = False - - if df_posix['rank'].nunique() == 1: - df_detected = df_posix[~(df_posix['function'].str.contains('read'))] - - max_bytes_written = df_detected['size'].max() - min_bytes_written = df_detected['size'].min() - - if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > THRESHOLD_IMBALANCE: - imbalance = True - - if imbalance: - issue = 'Load imbalance of {:.2f}% detected'.format( - abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 - ) - - recommendation = [ - { - 'message': 'Consider better balancing the data transfer between the application ranks' - }, - { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - }, - { - 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') - }, - { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - - imbalance = False - - if df_posix['rank'].nunique() == 1: - df_detected = df_posix[(df_posix['function'].str.contains('read'))] - - max_bytes_read = df_detected['size'].max() - min_bytes_read = df_detected['size'].min() - - if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > THRESHOLD_IMBALANCE: - imbalance = True - - if imbalance: - issue = 'Load imbalance of {:.2f}% detected'.format( - abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 - ) - - recommendation = [ - { - 'message': 'Consider better balancing the data transfer between the application ranks' - }, - { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - }, - { - 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') - }, - { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } - ] - - insights_operation.append( - message(args, INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - - ######################################################################################################################################################################### - - if df_intervals_temp['api'].eq('MPIIO').any(): - df_mpiio = df_intervals_temp[(df_intervals_temp['api'] == 'MPIIO')] - - df_mpiio_reads = df_mpiio[(df_mpiio['function'].str.contains('read'))] - mpiio_indp_reads = len(df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all'))]) - mpiio_coll_reads = 
len(df_mpiio_reads[(df_mpiio_reads['function'].str.contains('_all'))]) - total_mpiio_read_operations = mpiio_indp_reads + mpiio_coll_reads - - df_mpiio_writes = df_mpiio[~(df_mpiio['function'].str.contains('read'))] - mpiio_indp_writes = len(df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all'))]) - mpiio_coll_writes = len(df_mpiio_writes[(df_mpiio_writes['function'].str.contains('_all'))]) - total_mpiio_write_operations = mpiio_indp_writes + mpiio_coll_writes - - if mpiio_coll_reads == 0: - if total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: - issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls'.format( - mpiio_indp_reads, - mpiio_indp_reads / (total_mpiio_read_operations) * 100 - ) - - recommendation = [ - { - 'message': 'Use collective read operations (e.g. MPI_File_read_all() or MPI_File_read_at_all()) and set one aggregator per compute node', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - else: - issue = 'Application uses MPI-IO and read data using {} ({:.2f}%) collective operations'.format( - mpiio_coll_reads, - mpiio_coll_reads / total_mpiio_read_operations * 100 - ) - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, OK, issue) - ) - - if mpiio_coll_writes == 0: - if total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: - issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls'.format( - mpiio_indp_writes, - mpiio_indp_writes / (total_mpiio_write_operations) * 100 - ) - - recommendation = [ - { - 'message': 'Use collective write operations (e.g. 
MPI_File_write_all() or MPI_File_write_at_all()) and set one aggregator per compute node', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') - } - ] - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) - ) - - else: - issue = 'Application uses MPI-IO and write data using {} ({:.2f}%) collective operations'.format( - mpiio_coll_writes, - mpiio_coll_writes / total_mpiio_write_operations * 100 - ) - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, OK, issue) - ) - - ######################################################################################################################################################################### - - # Look for usage of non-block operations - - # Look for HDF5 file extension - - has_hdf5_extension = False - - if fname.endswith('.h5') or fname.endswith('.hdf5'): - has_hdf5_extension = True - - if len(df_mpiio_reads[(df_mpiio_reads['function'].str.contains('iread|begin|end'))]) == 0: - issue = 'Application could benefit from non-blocking (asynchronous) reads' - - recommendation = [] - - if 'H5F' in modules or has_hdf5_extension: - recommendation.append( - { - 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-read.c'), line_numbers=True, background_color='default') - } - ) - - if 'MPIIO' in modules: - recommendation.append( - { - 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iread(), MPI_File_read_all_begin/end(), or MPI_File_read_at_all_begin/end())', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iread.c'), line_numbers=True, background_color='default') - } - ) - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) - ) - - if len(df_mpiio_writes[(df_mpiio_writes['function'].str.contains('iwrite|begin|end'))]) == 0: - issue = 'Application could benefit from non-blocking (asynchronous) writes' - - recommendation = [] - - if 'H5F' in modules or has_hdf5_extension: - recommendation.append( - { - 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-write.c'), line_numbers=True, background_color='default') - } - ) - - if 'MPIIO' in modules: - recommendation.append( - { - 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iwrite(), MPI_File_write_all_begin/end(), or MPI_File_write_at_all_begin/end())', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iwrite.c'), line_numbers=True, background_color='default') - } - ) - - insights_operation.append( - message(args, INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) - ) - - ######################################################################################################################################################################### - - # Nodes and MPI-IO aggregators - # If the application uses collective reads or collective writes, look for the number of aggregators - # TODO: - - 
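# A minimal sketch of the non-blocking detection used above, assuming MPI-IO call
# names as recorded in the trace: a call is treated as non-blocking when its name
# matches 'iread'/'iwrite' or a split-collective begin/end pair. The call list below
# is hypothetical.
import pandas as pd

calls = pd.Series([
    'MPI_File_read_at_all',      # blocking collective read
    'MPI_File_iread',            # non-blocking read
    'MPI_File_write_at',         # blocking independent write
    'MPI_File_write_all_begin',  # split-collective write (begin)
    'MPI_File_write_all_end',    # split-collective write (end)
])

is_read = calls.str.contains('read')
nb_reads = int(calls[is_read].str.contains('iread|begin|end').sum())
nb_writes = int(calls[~is_read].str.contains('iwrite|begin|end').sum())

print(nb_reads, nb_writes)  # -> 1 2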
######################################################################################################################################################################### - - NUMBER_OF_COMPUTE_NODES = 0 - - ######################################################################################################################################################################### - - codes = [] - if args.json: - f = open(args.json) - data = json.load(f) - - for key, values in data.items(): - for value in values: - code = value['code'] - codes.append(code) - - level = value['level'] - issue = value['issue'] - recommendation = [] - for rec in value['recommendations']: - new_message = {'message': rec} - recommendation.append(new_message) - - insights_dxt.append( - message(args, code, TARGET_DEVELOPER, level, issue, recommendation) - ) - - ######################################################################################################################################################################### - - insights_end_time = time.time() - - console.print() - - console.print( - Panel( - '\n'.join([ - ' [b]RECORDER[/b]: [white]{}[/white]'.format( - os.path.basename(args.log_path) - ), - ' [b]FILE[/b]: [white]{} ({})[/white]'.format( - fname, - fid, - ), - # ' [b]COMPUTE NODES[/b] [white]{}[/white]'.format( - # NUMBER_OF_COMPUTE_NODES - # ), - ' [b]PROCESSES[/b] [white]{}[/white]'.format( - reader.GM.total_ranks - ), - ]), - title='[b][slate_blue3]DRISHTI[/slate_blue3] v.0.5[/b]', - title_align='left', - subtitle='[red][b]{} critical issues[/b][/red], [orange1][b]{} warnings[/b][/orange1], and [white][b]{} recommendations[/b][/white]'.format( - insights_total[HIGH], - insights_total[WARN], - insights_total[RECOMMENDATIONS], - ), - subtitle_align='left', - padding=1 - ) - ) - - console.print() - - if insights_metadata: - console.print( - Panel( - Padding( - Group( - *insights_metadata - ), - (1, 1) - ), - title='METADATA', - title_align='left' - ) - ) - - if insights_operation: - console.print( - Panel( - Padding( - Group( - *insights_operation - ), - (1, 1) - ), - title='OPERATIONS', - title_align='left' - ) - ) - - if insights_dxt: - console.print( - Panel( - Padding( - Group( - *insights_dxt - ), - (1, 1) - ), - title='DXT', - title_align='left' - ) - ) - - console.print( - Panel( - ' {} | [white]LBNL[/white] | [white]Drishti report generated at {} in[/white] {:.3f} seconds'.format( - datetime.datetime.now().year, - datetime.datetime.now(), - insights_end_time - insights_start_time - ), - box=box.SIMPLE - ) - ) - - if args.export_theme_light: - export_theme = TerminalTheme( - (255, 255, 255), - (0, 0, 0), - [ - (26, 26, 26), - (244, 0, 95), - (152, 224, 36), - (253, 151, 31), - (157, 101, 255), - (244, 0, 95), - (88, 209, 235), - (120, 120, 120), - (98, 94, 76), - ], - [ - (244, 0, 95), - (152, 224, 36), - (224, 213, 97), - (157, 101, 255), - (244, 0, 95), - (88, 209, 235), - (246, 246, 239), - ], - ) - else: - export_theme = MONOKAI - - if args.export_html: - console.save_html( - '{}.{}.html'.format(args.log_path, fid), - theme=export_theme, - clear=False - ) - - if args.export_svg: - console.save_svg( - '{}.{}.svg'.format(args.log_path, fid), - title='Drishti', - theme=export_theme, - clear=False - ) - - if args.export_csv: - issues = [ - 'JOB', - INSIGHTS_STDIO_HIGH_USAGE, - INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, - INSIGHTS_POSIX_READ_COUNT_INTENSIVE, - INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, - INSIGHTS_POSIX_READ_SIZE_INTENSIVE, - INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, - 
INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, - INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, - INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, - INSIGHTS_POSIX_REDUNDANT_READ_USAGE, - INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, - INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, - INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, - INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, - INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, - INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, - INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, - INSIGHTS_POSIX_HIGH_METADATA_TIME, - INSIGHTS_POSIX_SIZE_IMBALANCE, - INSIGHTS_POSIX_TIME_IMBALANCE, - INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, - INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, - INSIGHTS_MPI_IO_NO_USAGE, - INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, - INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, - INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE, - INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE, - INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, - INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, - INSIGHTS_MPI_IO_AGGREGATORS_INTRA, - INSIGHTS_MPI_IO_AGGREGATORS_INTER, - INSIGHTS_MPI_IO_AGGREGATORS_OK - ] - if codes: - issues.extend(codes) - - detected_issues = dict.fromkeys(issues, False) - detected_issues['JOB'] = None - - for report in csv_report: - detected_issues[report] = True - - filename = '{}.{}.summary.csv'.format( - args.log_path, - fid - ) - - with open(filename, 'w') as f: - w = csv.writer(f) - w.writerow(detected_issues.keys()) - w.writerow(detected_issues.values()) - - - diff --git a/drishti/includes.py b/drishti/includes.py deleted file mode 100644 index 3b921aa..0000000 --- a/drishti/includes.py +++ /dev/null @@ -1,203 +0,0 @@ -#!/usr/bin/env python3 - -import os - -from rich.console import Console, Group -from rich.padding import Padding -from rich.panel import Panel - - -RECOMMENDATIONS = 0 -HIGH = 1 -WARN = 2 -INFO = 3 -OK = 4 - -ROOT = os.path.abspath(os.path.dirname(__file__)) - -TARGET_USER = 1 -TARGET_DEVELOPER = 2 -TARGET_SYSTEM = 3 - -insights_operation = [] -insights_metadata = [] -insights_dxt = [] - -insights_total = dict() - -insights_total[HIGH] = 0 -insights_total[WARN] = 0 -insights_total[RECOMMENDATIONS] = 0 - -THRESHOLD_OPERATION_IMBALANCE = 0.1 -THRESHOLD_SMALL_REQUESTS = 0.1 -THRESHOLD_SMALL_REQUESTS_ABSOLUTE = 1000 -THRESHOLD_MISALIGNED_REQUESTS = 0.1 -THRESHOLD_METADATA = 0.1 -THRESHOLD_METADATA_TIME_RANK = 30 # seconds -THRESHOLD_RANDOM_OPERATIONS = 0.2 -THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE = 1000 -THRESHOLD_STRAGGLERS = 0.15 -THRESHOLD_IMBALANCE = 0.30 -THRESHOLD_INTERFACE_STDIO = 0.1 -THRESHOLD_COLLECTIVE_OPERATIONS = 0.5 -THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE = 1000 -THRESHOLD_SMALL_BYTES = 1048576 # 1 MB - -INSIGHTS_STDIO_HIGH_USAGE = 'S01' -INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE = 'P01' -INSIGHTS_POSIX_READ_COUNT_INTENSIVE = 'P02' -INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE = 'P03' -INSIGHTS_POSIX_READ_SIZE_INTENSIVE = 'P04' -INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE = 'P05' -INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE = 'P06' -INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE = 'P07' -INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE = 'P08' -INSIGHTS_POSIX_REDUNDANT_READ_USAGE = 'P09' -INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE = 'P10' -INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE = 'P11' -INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE = 'P12' -INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE = 'P13' -INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE = 'P14' -INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE = 'P15' -INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE = 'P16' 
-INSIGHTS_POSIX_HIGH_METADATA_TIME = 'P17' -INSIGHTS_POSIX_SIZE_IMBALANCE = 'P18' -INSIGHTS_POSIX_TIME_IMBALANCE = 'P19' -INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE = 'P21' -INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE = 'P22' -INSIGHTS_MPI_IO_NO_USAGE = 'M01' -INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE = 'M02' -INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE = 'M03' -INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE = 'M04' -INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE = 'M05' -INSIGHTS_MPI_IO_BLOCKING_READ_USAGE = 'M06' -INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE = 'M07' -INSIGHTS_MPI_IO_AGGREGATORS_INTRA = 'M08' -INSIGHTS_MPI_IO_AGGREGATORS_INTER = 'M09' -INSIGHTS_MPI_IO_AGGREGATORS_OK = 'M10' - -DETAILS_MAX_SIZE = 10 - -# TODO: need to verify the threashold to be between 0 and 1 -# TODO: read thresholds from file - - -console = Console(record=True) -csv_report = [] - - -def init_console(args): - if args.export_size: console.width = int(args.export_size) - - insights_operation.clear() - insights_metadata.clear() - insights_dxt.clear() - - insights_total[HIGH] = 0 - insights_total[WARN] = 0 - insights_total[RECOMMENDATIONS] = 0 - - csv_report.clear() - -def validate_thresholds(): - """ - Validate thresholds defined by the user. - """ - assert(THRESHOLD_OPERATION_IMBALANCE >= 0.0 and THRESHOLD_OPERATION_IMBALANCE <= 1.0) - assert(THRESHOLD_SMALL_REQUESTS >= 0.0 and THRESHOLD_SMALL_REQUESTS <= 1.0) - assert(THRESHOLD_MISALIGNED_REQUESTS >= 0.0 and THRESHOLD_MISALIGNED_REQUESTS <= 1.0) - assert(THRESHOLD_METADATA >= 0.0 and THRESHOLD_METADATA <= 1.0) - assert(THRESHOLD_RANDOM_OPERATIONS >= 0.0 and THRESHOLD_RANDOM_OPERATIONS <= 1.0) - - assert(THRESHOLD_METADATA_TIME_RANK >= 0.0) - - -def convert_bytes(bytes_number): - """ - Convert bytes into formatted string. - """ - tags = [ - 'bytes', - 'KB', - 'MB', - 'GB', - 'TB', - 'PB', - 'EB' - ] - - i = 0 - double_bytes = bytes_number - - while (i < len(tags) and bytes_number >= 1024): - double_bytes = bytes_number / 1024.0 - i = i + 1 - bytes_number = bytes_number / 1024 - - return str(round(double_bytes, 2)) + ' ' + tags[i] - - -def message(args, code, target, level, issue, recommendations=None, details=None): - """ - Display the message on the screen with level, issue, and recommendation. 
- """ - icon = ':arrow_forward:' - - if level in (HIGH, WARN): - insights_total[level] += 1 - - if level == HIGH: - color = '[red]' - elif level == WARN: - color = '[orange1]' - elif level == OK: - color = '[green]' - else: - color = '' - - messages = [ - '{}{}{} {}'.format( - color, - icon, - ' [' + code + ']' if args.code else '', - issue - ) - ] - - if args.export_csv: - csv_report.append(code) - - if details: - for detail in details[:DETAILS_MAX_SIZE]: - messages.append(' {}:left_arrow_curving_right: {}'.format( - color, - detail['message'] - ) - ) - - if recommendations: - if not args.only_issues: - messages.append(' [white]:left_arrow_curving_right: [b]Recommendations:[/b]') - - for recommendation in recommendations: - messages.append(' :left_arrow_curving_right: {}'.format(recommendation['message'])) - - if args.verbose and 'sample' in recommendation: - messages.append( - Padding( - Panel( - recommendation['sample'], - title='Solution Example Snippet', - title_align='left', - padding=(1, 2) - ), - (1, 0, 1, 7) - ) - ) - - insights_total[RECOMMENDATIONS] += len(recommendations) - - return Group( - *messages - ) diff --git a/drishti/reporter.py b/drishti/reporter.py index ef92d11..54c7b17 100644 --- a/drishti/reporter.py +++ b/drishti/reporter.py @@ -6,6 +6,18 @@ from .parser import * +''' + |- handler_darshan -| + | | +reporter -> /handlers -> |- handler_recorder -| -| + | | | + |- handler_xxx ... -| | + ________________________________________________| + | + |-----> /includes -> module -> config -> parser +''' + + LOG_TYPE_DARSHAN = 0 LOG_TYPE_RECORDER = 1 @@ -37,10 +49,7 @@ def main(): from .handle_darshan import handler elif log_type == LOG_TYPE_RECORDER: - if args.split_files: - from .handle_recorder_split import handler - else: - from .handle_recorder import handler + from .handle_recorder import handler handler() From 440cd5c57b92c707633a8725f419f0ab8856e110 Mon Sep 17 00:00:00 2001 From: onewbiek Date: Sat, 16 Dec 2023 08:26:55 -0800 Subject: [PATCH 14/19] Reorg the file structure --- MANIFEST.in | 2 +- drishti/{snippets => handlers}/__init__.py | 0 drishti/{ => handlers}/handle_darshan.py | 2 +- drishti/{ => handlers}/handle_recorder.py | 2 +- drishti/includes/__init__.py | 0 drishti/{ => includes}/config.py | 2 +- drishti/{ => includes}/module.py | 2 +- drishti/{ => includes}/parser.py | 0 drishti/includes/snippets/__init__.py | 0 drishti/{ => includes}/snippets/hdf5-alignment.c | 0 drishti/{ => includes}/snippets/hdf5-cache.c | 0 .../snippets/hdf5-collective-metadata.c | 0 .../{ => includes}/snippets/hdf5-vol-async-read.c | 0 .../{ => includes}/snippets/hdf5-vol-async-write.c | 0 drishti/{ => includes}/snippets/lustre-striping.bash | 0 .../{ => includes}/snippets/mpi-io-collective-read.c | 0 .../snippets/mpi-io-collective-write.c | 0 drishti/{ => includes}/snippets/mpi-io-hints.bash | 0 drishti/{ => includes}/snippets/mpi-io-iread.c | 0 drishti/{ => includes}/snippets/mpi-io-iwrite.c | 0 .../{ => includes}/snippets/pnetcdf-hdf5-no-fill.c | 0 drishti/reporter.py | 6 +++--- setup.py | 12 +++++------- 23 files changed, 13 insertions(+), 15 deletions(-) rename drishti/{snippets => handlers}/__init__.py (100%) rename drishti/{ => handlers}/handle_darshan.py (99%) rename drishti/{ => handlers}/handle_recorder.py (99%) create mode 100644 drishti/includes/__init__.py rename drishti/{ => includes}/config.py (99%) rename drishti/{ => includes}/module.py (99%) rename drishti/{ => includes}/parser.py (100%) create mode 100644 drishti/includes/snippets/__init__.py rename drishti/{ => 
includes}/snippets/hdf5-alignment.c (100%) rename drishti/{ => includes}/snippets/hdf5-cache.c (100%) rename drishti/{ => includes}/snippets/hdf5-collective-metadata.c (100%) rename drishti/{ => includes}/snippets/hdf5-vol-async-read.c (100%) rename drishti/{ => includes}/snippets/hdf5-vol-async-write.c (100%) rename drishti/{ => includes}/snippets/lustre-striping.bash (100%) rename drishti/{ => includes}/snippets/mpi-io-collective-read.c (100%) rename drishti/{ => includes}/snippets/mpi-io-collective-write.c (100%) rename drishti/{ => includes}/snippets/mpi-io-hints.bash (100%) rename drishti/{ => includes}/snippets/mpi-io-iread.c (100%) rename drishti/{ => includes}/snippets/mpi-io-iwrite.c (100%) rename drishti/{ => includes}/snippets/pnetcdf-hdf5-no-fill.c (100%) diff --git a/MANIFEST.in b/MANIFEST.in index f354c46..5517bcc 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,2 @@ include requirements.txt -include drishti/snippets/* +include drishti/includes/snippets/* diff --git a/drishti/snippets/__init__.py b/drishti/handlers/__init__.py similarity index 100% rename from drishti/snippets/__init__.py rename to drishti/handlers/__init__.py diff --git a/drishti/handle_darshan.py b/drishti/handlers/handle_darshan.py similarity index 99% rename from drishti/handle_darshan.py rename to drishti/handlers/handle_darshan.py index 98cc63b..e533358 100644 --- a/drishti/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -12,7 +12,7 @@ from rich import print from packaging import version -from .module import * +from drishti.includes.module import * def is_available(name): diff --git a/drishti/handle_recorder.py b/drishti/handlers/handle_recorder.py similarity index 99% rename from drishti/handle_recorder.py rename to drishti/handlers/handle_recorder.py index a9af622..83b132d 100644 --- a/drishti/handle_recorder.py +++ b/drishti/handlers/handle_recorder.py @@ -5,7 +5,7 @@ import pandas as pd from recorder_utils import RecorderReader from recorder_utils.build_offset_intervals import build_offset_intervals -from .module import * +from drishti.includes.module import * def get_accessed_files(reader): diff --git a/drishti/includes/__init__.py b/drishti/includes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/drishti/config.py b/drishti/includes/config.py similarity index 99% rename from drishti/config.py rename to drishti/includes/config.py index e11a824..0041980 100644 --- a/drishti/config.py +++ b/drishti/includes/config.py @@ -9,7 +9,7 @@ from rich.terminal_theme import TerminalTheme from rich.terminal_theme import MONOKAI -from .parser import * +from drishti.includes.parser import * RECOMMENDATIONS = 0 diff --git a/drishti/module.py b/drishti/includes/module.py similarity index 99% rename from drishti/module.py rename to drishti/includes/module.py index a75d574..ae02c2e 100644 --- a/drishti/module.py +++ b/drishti/includes/module.py @@ -4,7 +4,7 @@ import csv from rich import box from rich.syntax import Syntax -from .config import * +from drishti.includes.config import * ''' Before calling the functions below diff --git a/drishti/parser.py b/drishti/includes/parser.py similarity index 100% rename from drishti/parser.py rename to drishti/includes/parser.py diff --git a/drishti/includes/snippets/__init__.py b/drishti/includes/snippets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/drishti/snippets/hdf5-alignment.c b/drishti/includes/snippets/hdf5-alignment.c similarity index 100% rename from drishti/snippets/hdf5-alignment.c rename to 
drishti/includes/snippets/hdf5-alignment.c diff --git a/drishti/snippets/hdf5-cache.c b/drishti/includes/snippets/hdf5-cache.c similarity index 100% rename from drishti/snippets/hdf5-cache.c rename to drishti/includes/snippets/hdf5-cache.c diff --git a/drishti/snippets/hdf5-collective-metadata.c b/drishti/includes/snippets/hdf5-collective-metadata.c similarity index 100% rename from drishti/snippets/hdf5-collective-metadata.c rename to drishti/includes/snippets/hdf5-collective-metadata.c diff --git a/drishti/snippets/hdf5-vol-async-read.c b/drishti/includes/snippets/hdf5-vol-async-read.c similarity index 100% rename from drishti/snippets/hdf5-vol-async-read.c rename to drishti/includes/snippets/hdf5-vol-async-read.c diff --git a/drishti/snippets/hdf5-vol-async-write.c b/drishti/includes/snippets/hdf5-vol-async-write.c similarity index 100% rename from drishti/snippets/hdf5-vol-async-write.c rename to drishti/includes/snippets/hdf5-vol-async-write.c diff --git a/drishti/snippets/lustre-striping.bash b/drishti/includes/snippets/lustre-striping.bash similarity index 100% rename from drishti/snippets/lustre-striping.bash rename to drishti/includes/snippets/lustre-striping.bash diff --git a/drishti/snippets/mpi-io-collective-read.c b/drishti/includes/snippets/mpi-io-collective-read.c similarity index 100% rename from drishti/snippets/mpi-io-collective-read.c rename to drishti/includes/snippets/mpi-io-collective-read.c diff --git a/drishti/snippets/mpi-io-collective-write.c b/drishti/includes/snippets/mpi-io-collective-write.c similarity index 100% rename from drishti/snippets/mpi-io-collective-write.c rename to drishti/includes/snippets/mpi-io-collective-write.c diff --git a/drishti/snippets/mpi-io-hints.bash b/drishti/includes/snippets/mpi-io-hints.bash similarity index 100% rename from drishti/snippets/mpi-io-hints.bash rename to drishti/includes/snippets/mpi-io-hints.bash diff --git a/drishti/snippets/mpi-io-iread.c b/drishti/includes/snippets/mpi-io-iread.c similarity index 100% rename from drishti/snippets/mpi-io-iread.c rename to drishti/includes/snippets/mpi-io-iread.c diff --git a/drishti/snippets/mpi-io-iwrite.c b/drishti/includes/snippets/mpi-io-iwrite.c similarity index 100% rename from drishti/snippets/mpi-io-iwrite.c rename to drishti/includes/snippets/mpi-io-iwrite.c diff --git a/drishti/snippets/pnetcdf-hdf5-no-fill.c b/drishti/includes/snippets/pnetcdf-hdf5-no-fill.c similarity index 100% rename from drishti/snippets/pnetcdf-hdf5-no-fill.c rename to drishti/includes/snippets/pnetcdf-hdf5-no-fill.c diff --git a/drishti/reporter.py b/drishti/reporter.py index 54c7b17..8455040 100644 --- a/drishti/reporter.py +++ b/drishti/reporter.py @@ -3,7 +3,7 @@ import os import sys from subprocess import call -from .parser import * +from drishti.includes.parser import * ''' @@ -46,10 +46,10 @@ def main(): log_type = check_log_type(args.log_path) if log_type == LOG_TYPE_DARSHAN: - from .handle_darshan import handler + from drishti.handlers.handle_darshan import handler elif log_type == LOG_TYPE_RECORDER: - from .handle_recorder import handler + from drishti.handlers.handle_recorder import handler handler() diff --git a/setup.py b/setup.py index dd18cb6..e8c33d6 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -import setuptools +from setuptools import setup, find_packages with open("README.md", "r") as f: long_description = f.read() @@ -6,7 +6,7 @@ with open("requirements.txt") as f: requirements = f.readlines() -setuptools.setup( +setup( name="drishti-io", keywords="drishti", 
version="0.5", @@ -23,12 +23,10 @@ 'rich ==12.5.1', 'recorder-utils', ], - packages=[ - 'drishti' - ], + packages=find_packages(), package_data={ - 'drishti': [ - 'drishti/snippets/*' + 'drishti.includes': [ + 'drishti/includes/snippets/*' ], }, include_package_data=True, From c304100c9a5006252cd6a15ca6d8313a590a31a3 Mon Sep 17 00:00:00 2001 From: onewbiek Date: Thu, 18 Jan 2024 19:53:24 -0800 Subject: [PATCH 15/19] Add documentations --- drishti/includes/module.py | 263 ++++++++++++++++++++++++++++++++----- 1 file changed, 230 insertions(+), 33 deletions(-) diff --git a/drishti/includes/module.py b/drishti/includes/module.py index ae02c2e..68e68dc 100644 --- a/drishti/includes/module.py +++ b/drishti/includes/module.py @@ -17,6 +17,15 @@ # Basic usage check def check_stdio(total_size, total_size_stdio): + ''' + Check whether the application has excessively utilized standard input/output operations + + Parameters: + total_size: total I/O size + total_size_stdio: total STDIO size + + ''' + if total_size and total_size_stdio / total_size > THRESHOLD_INTERFACE_STDIO: issue = 'Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})'.format( total_size_stdio / total_size * 100.0, @@ -35,6 +44,13 @@ def check_stdio(total_size, total_size_stdio): def check_mpiio(modules): + ''' + Check whether the application has used MPI-IO or not + + Parameter: + modules: all different mudules been used in the application + ''' + if 'MPI-IO' not in modules: issue = 'Application is using low-performance interface' @@ -54,6 +70,15 @@ def check_mpiio(modules): def check_operation_intensive(total_operations, total_reads, total_writes): + ''' + Check whether the application is read or write intensive + + Parameters: + total_operations: number of I/O operations been executed by the application + total_reads: number of read operations been executed by the application + total_writes: number of write operations been executed by the application + ''' + if total_writes > total_reads and total_operations and abs(total_writes - total_reads) / total_operations > THRESHOLD_OPERATION_IMBALANCE: issue = 'Application is write operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 @@ -74,6 +99,15 @@ def check_operation_intensive(total_operations, total_reads, total_writes): def check_size_intensive(total_size, total_read_size, total_written_size): + ''' + Check whether the application is read size intensive or written size intensive + + Parameters: + total_size: Total I/O size measured in byte + total_read_size: Input I/O size measured in byte + total_written_size: Output I/O size measured in byte + ''' + if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > THRESHOLD_OPERATION_IMBALANCE: issue = 'Application is write size intensive ({:.2f}% write vs. 
{:.2f}% read)'.format( total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 @@ -93,12 +127,22 @@ def check_size_intensive(total_size, total_read_size, total_written_size): ) -''' -detected_files required columns: -['id', 'total_reads', 'total_writes'] -detected_files.loc[:, 'id'] = detected_files.loc[:, 'id'].astype(str) -''' def check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map): + ''' + Check whether application has performed an excessive number of small operations + + Parameters: + total_reads: number of read operations been executed by the application + total_reads_small: number of read operations that has small size + total_writes: number of write operations been executed by the application + total_writes_small: number of write operations that has small size + detected_files: + total_reads and total_writes in each file + required columns: ['id', 'total_reads', 'total_writes'] + modules: all different mudules been used in the application + file_map: file id and file name pairing + ''' + if total_reads_small and total_reads_small / total_reads > THRESHOLD_SMALL_REQUESTS and total_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: issue = 'Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests'.format( total_reads_small, total_reads_small / total_reads * 100.0 @@ -189,6 +233,16 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules): + ''' + Check whether application has excessive misaligned operations + + Parameters: + total_operations: number of I/O operations been executed by the application + total_mem_not_aligned: number of memory requests not aligned + total_file_not_aligned: number of file requests not aligned + modules: all different mudules been used in the application + ''' + if total_operations and total_mem_not_aligned / total_operations > THRESHOLD_MISALIGNED_REQUESTS: issue = 'Application has a high number ({:.2f}%) of misaligned memory requests'.format( total_mem_not_aligned / total_operations * 100.0 @@ -234,6 +288,16 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali def check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size): + ''' + Check whether application has redundant read or write traffic + + Parameters: + max_read_offset: max offset application is reading from + total_read_size: total size application has been read + max_write_offset: max offset application is writing to + total_written_size: total size application has been written + ''' + if max_read_offset > total_read_size: issue = 'Application might have redundant read traffic (more data read than the highest offset)' @@ -250,6 +314,21 @@ def check_traffic(max_read_offset, total_read_size, max_write_offset, total_writ def check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes): + ''' + Check whether application has performed excessive random operations + + Parameters: + read_consecutive: number of consecutive read operations + read_sequential: number of sequential read operations + read_random: number of random read operations + total_read: number of read operations been executed by the application + write_consecutive: number of consecutive write operations + write_sequential: 
number of sequential write operations + write_random: number of random write operations + total_write: number of write operations been executed by the application + ''' + + if total_reads: if read_random and read_random / total_reads > THRESHOLD_RANDOM_OPERATIONS and read_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE: issue = 'Application is issuing a high number ({}) of random read operations ({:.2f}%)'.format( @@ -301,11 +380,21 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total ) -'''' -The shared_file required columns: -['id', 'INSIGHTS_POSIX_SMALL_READS', 'INSIGHTS_POSIX_SMALL_WRITES'] -''' def check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map): + ''' + Check whether there are excessive small requests in shared files + + Parameters: + total_shared_reads: total read operations in shared files + total_shared_reads_small: small read operations in shared files + total_shared_writes: total write operations in shared files + total_shared_writes_small: small write operations in shared files + shared_files: + small reads an small writes in each shared file + required columns: ['id', 'INSIGHTS_POSIX_SMALL_READS', 'INSIGHTS_POSIX_SMALL_WRITES'] + file_map: file id and file name pairing + ''' + if total_shared_reads and total_shared_reads_small / total_shared_reads > THRESHOLD_SMALL_REQUESTS and total_shared_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: issue = 'Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests'.format( total_shared_reads_small, total_shared_reads_small / total_shared_reads * 100.0 @@ -368,6 +457,14 @@ def check_shared_small_operation(total_shared_reads, total_shared_reads_small, t def check_long_metadata(count_long_metadata, modules): + ''' + Check how many ranks have metadata operations taking too long + + Parameters: + count_long_metadata: number of ranks that have metadata operations taking too long + modules: all different mudules been used in the application + ''' + if count_long_metadata > 0: issue = 'There are {} ranks where metadata operations take over {} seconds'.format( count_long_metadata, THRESHOLD_METADATA_TIME_RANK @@ -396,11 +493,18 @@ def check_long_metadata(count_long_metadata, modules): ) -''' -detected_files required columns: -['id', 'data_imbalance'] -''' def check_shared_data_imblance(stragglers_count, detected_files, file_map): + ''' + Check how many shared files containing data transfer imbalance + + Parameters: + stragglers_count: number of shared files that contain data transfer imbalane + detected_files: + data imbalance per file + required columns: ['id', 'data_imbalance'] + file_map: file id and file name pairing + ''' + if stragglers_count: issue = 'Detected data transfer imbalance caused by stragglers when accessing {} shared file.'.format( stragglers_count @@ -434,6 +538,15 @@ def check_shared_data_imblance(stragglers_count, detected_files, file_map): def check_shared_data_imblance_split(slowest_rank_bytes, fastest_rank_bytes, total_transfer_size): + ''' + Check whether the specific shared file contains data imbalance + + Parameters: + slowest_rank_bytes: the total request size of the rank that takes the longest data operation time + fastest_rank_bytes: the total request size of the rank that takes the shortest data operation time + total_transfer_size: total request size of that specific shared file + ''' + if 
total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > THRESHOLD_STRAGGLERS: issue = 'Load imbalance of {:.2f}% detected'.format( abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size * 100 @@ -454,11 +567,18 @@ def check_shared_data_imblance_split(slowest_rank_bytes, fastest_rank_bytes, tot ) -''' -detected_files required columns: -['id', 'time_imbalance'] -''' def check_shared_time_imbalance(stragglers_count, detected_files, file_map): + ''' + Check how many shared files containing time transfer imbalance + + Parameters: + stragglers_count: number of shared files that contain time transfer imbalane + detected_files: + data imbalance per file + required columns: ['id', 'time_imbalance'] + file_map: file id and file name pairing + ''' + if stragglers_count: issue = 'Detected time imbalance caused by stragglers when accessing {} shared file.'.format( stragglers_count @@ -492,6 +612,15 @@ def check_shared_time_imbalance(stragglers_count, detected_files, file_map): def check_shared_time_imbalance_split(slowest_rank_time, fastest_rank_time, total_transfer_time): + ''' + Check whether the specific shared file contains time imbalance + + Parameters: + slowest_rank_bytes: the total request time of the rank that takes the longest data operation time + fastest_rank_bytes: the total request time of the rank that takes the shortest data operation time + total_transfer_size: total request time of that specific shared file + ''' + if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > THRESHOLD_STRAGGLERS: issue = 'Load imbalance of {:.2f}% detected'.format( abs(slowest_rank_time - fastest_rank_time) / total_transfer_time * 100 @@ -512,11 +641,17 @@ def check_shared_time_imbalance_split(slowest_rank_time, fastest_rank_time, tota ) -''' -detected_files required columns: -['id', 'write_imbalance'] -''' def check_individual_write_imbalance(imbalance_count, detected_files, file_map): + ''' + Check how many write imbalance when accessing individual files + + Parameters: + imbalance_count: number of individual files that have write imbalance + detected_files: + write imbalance per file + required columns: ['id', 'write_imbalance'] + ''' + if imbalance_count: issue = 'Detected write imbalance when accessing {} individual files'.format( imbalance_count @@ -557,6 +692,14 @@ def check_individual_write_imbalance(imbalance_count, detected_files, file_map): def check_individual_write_imbalance_split(max_bytes_written, min_bytes_written): + ''' + Check whether there is write imbalance in the specific individual file + + Parameters: + max_bytes_written: max byte written in the file + min_bytes_written: minimum byte written in the file + ''' + if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > THRESHOLD_IMBALANCE: issue = 'Load imbalance of {:.2f}% detected'.format( abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 @@ -584,11 +727,17 @@ def check_individual_write_imbalance_split(max_bytes_written, min_bytes_written) ) -''' -detected_files required columns: -['id', 'read_imbalance'] -''' def check_individual_read_imbalance(imbalance_count, detected_files, file_map): + ''' + Check how many read imbalance when accessing individual files + + Parameters: + imbalance_count: number of individual files that have read imbalance + detected_files: + read imbalance per file + required columns: ['id', 'read_imbalance'] + ''' + if imbalance_count: issue = 'Detected read imbalance when 
accessing {} individual files.'.format( imbalance_count ) @@ -629,6 +778,14 @@ def check_individual_read_imbalance(imbalance_count, detected_files, file_map): def check_individual_read_imbalance_split(max_bytes_read, min_bytes_read): + ''' + Check whether there is read imbalance in the specific individual file + + Parameters: + max_bytes_read: maximum bytes read from the file + min_bytes_read: minimum bytes read from the file + ''' + if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > THRESHOLD_IMBALANCE: issue = 'Load imbalance of {:.2f}% detected'.format( abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 ) @@ -658,11 +815,21 @@ def check_individual_read_imbalance_split(max_bytes_read, min_bytes_read): # MPIIO level check -''' -detected_files required columns: -['id', 'absolute_indep_reads', 'percent_indep_reads'] -''' + def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map): + ''' + Check whether the application uses collective MPI-IO read calls + + Parameters: + mpiio_coll_reads: number of mpiio read operations that are collective + mpiio_indep_reads: number of mpiio read operations that are independent + total_mpiio_read_operations: total mpiio read operations + detected_files: + independent read operations and percentage per file + required columns: ['id', 'absolute_indep_reads', 'percent_indep_reads'] + file_map: file id and file name pairing + ''' + if mpiio_coll_reads == 0: if total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls'.format( @@ -704,11 +871,20 @@ def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, tot ) -''' -detected_files required columns: -['id', 'absolute_indep_writes', 'percent_indep_writes'] -''' def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map): + ''' + Check whether the application uses collective MPI-IO write calls + + Parameters: + mpiio_coll_writes: number of mpiio write operations that are collective + mpiio_indep_writes: number of mpiio write operations that are independent + total_mpiio_write_operations: total mpiio write operations + detected_files: + independent write operations and percentage per file + required columns: ['id', 'absolute_indep_writes', 'percent_indep_writes'] + file_map: file id and file name pairing + ''' + if mpiio_coll_writes == 0: if total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls'.format( @@ -751,6 +927,16 @@ def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, def check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules): + ''' + Check whether the application can benefit from non-blocking requests + + Parameters: + mpiio_nb_reads: number of non-blocking mpi read operations + mpiio_nb_writes: number of non-blocking mpi write operations + has_hdf5_extension: whether the file has an HDF5 extension + modules: all different modules used in the application + ''' + if mpiio_nb_reads == 0: issue = 'Application could benefit from non-blocking (asynchronous) reads' @@ -803,6 +989,14 @@ def 
check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_ext def check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES): + ''' + Check whether application has used inter-node aggregators + + Parameters: + cb_nodes: + NUMBER_OF_COMPUTE_NODES: + ''' + if cb_nodes > NUMBER_OF_COMPUTE_NODES: issue = 'Application is using inter-node aggregators (which require network communication)' @@ -893,6 +1087,9 @@ def display_footer(console, insights_start_time, insights_end_time): ) def export_html(console, filename): + ''' + ''' + if args.export_html: console.save_html( filename, From e3cf3f52d15e770b34ba3bcd34df5bb2e3cc8782 Mon Sep 17 00:00:00 2001 From: onewbiek Date: Fri, 19 Jan 2024 05:01:10 -0800 Subject: [PATCH 16/19] Enable thresholds configurations --- drishti/handlers/handle_darshan.py | 1 - drishti/handlers/handle_recorder.py | 5 +-- drishti/includes/config.py | 67 ++++++++++++++++------------- drishti/includes/parser.py | 7 +++ 4 files changed, 45 insertions(+), 35 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index e533358..c735731 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -72,7 +72,6 @@ def check_log_version(console, file, log_version, library_version): def handler(): console = init_console() - validate_thresholds() insights_start_time = time.time() diff --git a/drishti/handlers/handle_recorder.py b/drishti/handlers/handle_recorder.py index 83b132d..8b0f7b7 100644 --- a/drishti/handlers/handle_recorder.py +++ b/drishti/handlers/handle_recorder.py @@ -76,7 +76,6 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): insights_start_time = time.time() console = init_console() - validate_thresholds() modules = set(df_intervals['api'].unique()) # Check usage of POSIX, and MPI-IO per file @@ -433,7 +432,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): for id in file_map.keys(): indep_read_count = df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all')) & (df_mpiio_reads['file_id'] == id)] indep_write_count = df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all')) & (df_mpiio_writes['file_id'] == id)] - indep_total_count = indep_read_count + indep_write_count; + indep_total_count = indep_read_count + indep_write_count if (indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE and indep_read_count / indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS): detected_files.append([ @@ -453,7 +452,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): for id in file_map.keys(): indep_read_count = df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all')) & (df_mpiio_reads['file_id'] == id)] indep_write_count = df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all')) & (df_mpiio_writes['file_id'] == id)] - indep_total_count = indep_read_count + indep_write_count; + indep_total_count = indep_read_count + indep_write_count if (indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE and indep_write_count / indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS): detected_files.append([ diff --git a/drishti/includes/config.py b/drishti/includes/config.py index 0041980..17f81b2 100644 --- a/drishti/includes/config.py +++ b/drishti/includes/config.py @@ -88,9 +88,6 @@ csv_report = [] codes = [] -# TODO: need to verify the threashold to be between 0 and 1 -# TODO: read thresholds from file - def init_console(): console = Console(record=True) @@ -138,38 +135,47 @@ def 
set_export_theme(): def load_json(): codes = [] - if args.json: - f = open(args.json) - data = json.load(f) - - for key, values in data.items(): - for value in values: - code = value['code'] - codes.append(code) - - level = value['level'] - issue = value['issue'] - recommendation = [] - for rec in value['recommendations']: - new_message = {'message': rec} - recommendation.append(new_message) - - insights_dxt.append( - message(code, TARGET_DEVELOPER, level, issue, recommendation) - ) + if not args.split_files: + if args.json: + f = open(args.json) + data = json.load(f) + + for key, values in data.items(): + for value in values: + code = value['code'] + codes.append(code) + + level = value['level'] + issue = value['issue'] + recommendation = [] + for rec in value['recommendations']: + new_message = {'message': rec} + recommendation.append(new_message) + + insights_dxt.append( + message(code, TARGET_DEVELOPER, level, issue, recommendation) + ) def validate_thresholds(): """ Validate thresholds defined by the user. """ - assert(THRESHOLD_OPERATION_IMBALANCE >= 0.0 and THRESHOLD_OPERATION_IMBALANCE <= 1.0) - assert(THRESHOLD_SMALL_REQUESTS >= 0.0 and THRESHOLD_SMALL_REQUESTS <= 1.0) - assert(THRESHOLD_MISALIGNED_REQUESTS >= 0.0 and THRESHOLD_MISALIGNED_REQUESTS <= 1.0) - assert(THRESHOLD_METADATA >= 0.0 and THRESHOLD_METADATA <= 1.0) - assert(THRESHOLD_RANDOM_OPERATIONS >= 0.0 and THRESHOLD_RANDOM_OPERATIONS <= 1.0) + if args.config: + f = open(args.config) + data = json.load(f) + + for category, thresholds_spec in data.items(): + for threshold_name, threshold_value in thresholds_spec.items(): + globals()[threshold_name] = threshold_value - assert(THRESHOLD_METADATA_TIME_RANK >= 0.0) + assert(THRESHOLD_OPERATION_IMBALANCE >= 0.0 and THRESHOLD_OPERATION_IMBALANCE <= 1.0) + assert(THRESHOLD_SMALL_REQUESTS >= 0.0 and THRESHOLD_SMALL_REQUESTS <= 1.0) + assert(THRESHOLD_MISALIGNED_REQUESTS >= 0.0 and THRESHOLD_MISALIGNED_REQUESTS <= 1.0) + assert(THRESHOLD_METADATA >= 0.0 and THRESHOLD_METADATA <= 1.0) + assert(THRESHOLD_RANDOM_OPERATIONS >= 0.0 and THRESHOLD_RANDOM_OPERATIONS <= 1.0) + + assert(THRESHOLD_METADATA_TIME_RANK >= 0.0) def convert_bytes(bytes_number): @@ -265,6 +271,5 @@ def message(code, target, level, issue, recommendations=None, details=None): ''' Pre-load ''' -if not args.split_files: - load_json() - +load_json() +validate_thresholds() diff --git a/drishti/includes/parser.py b/drishti/includes/parser.py index 0261312..7ddfdd6 100644 --- a/drishti/includes/parser.py +++ b/drishti/includes/parser.py @@ -95,4 +95,11 @@ help='Split the files and generate report for each file' ) +parser.add_argument( + '--config', + default=False, + dest='config', + help='Enable thresholds read from json file' +) + args = parser.parse_args() From ba39b12a19d5540ebd38bc8351368b8895bc0ad1 Mon Sep 17 00:00:00 2001 From: onewbiek Date: Fri, 19 Jan 2024 10:20:36 -0800 Subject: [PATCH 17/19] Use cache to speed up Recorder log parsing process --- drishti/handlers/handle_recorder.py | 56 +++++++++++++++++++---------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/drishti/handlers/handle_recorder.py b/drishti/handlers/handle_recorder.py index 8b0f7b7..c4ffd2d 100644 --- a/drishti/handlers/handle_recorder.py +++ b/drishti/handlers/handle_recorder.py @@ -41,26 +41,46 @@ def init_df_posix_recordes(reader): def handler(): - reader = RecorderReader(args.log_path) - df_intervals = build_offset_intervals(reader) - df_posix_records = init_df_posix_recordes(reader) + df_intervals = None + df_posix_records 
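As a rough illustration of the new --config option, a thresholds file for this revision of validate_thresholds() might look like the JSON below; the function only iterates over the inner names and overwrites the module-level variables of the same name, so the top-level grouping key and the concrete values here are illustrative assumptions (later patches in this series rename the thresholds and change how the keys are composed):

    {
        "thresholds": {
            "THRESHOLD_SMALL_REQUESTS": 0.2,
            "THRESHOLD_SMALL_REQUESTS_ABSOLUTE": 5000,
            "THRESHOLD_METADATA_TIME_RANK": 60
        }
    }

Such a file would be passed on the command line as, for example, --config thresholds.json alongside the usual trace argument.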
= None + df_file_map = None + file_map = None + + if os.path.exists(args.log_path + '.intervals.csv') and os.path.exists(args.log_path + '.records.csv') and os.path.exists(args.log_path + '.filemap.csv'): + print('using existing parsed log file') + df_intervals = pd.read_csv(args.log_path + '.intervals.csv') + df_posix_records = pd.read_csv(args.log_path + '.records.csv') + df_file_map = pd.read_csv(args.log_path + '.filemap.csv') + file_map = {} + for index, row in df_file_map.iterrows(): + file_map[row['file_id']] = row['file_name'] + else: + reader = RecorderReader(args.log_path) + df_intervals = build_offset_intervals(reader) + df_posix_records = init_df_posix_recordes(reader) + + file_map = get_accessed_files(reader) + + def add_api(row): + if 'MPI' in row['function']: + return 'MPI-IO' + elif 'H5' in row['function']: + return 'H5F' + else: + return 'POSIX' + + def add_duration(row): + return row['end'] - row['start'] + + df_intervals['api'] = df_intervals.apply(add_api, axis=1) + df_intervals['duration'] = df_intervals.apply(add_duration, axis=1) + df_posix_records['duration'] = df_posix_records.apply(add_duration, axis=1) - file_map = get_accessed_files(reader) + df_intervals.to_csv(args.log_path + '.intervals.csv', mode='w', index=False, header=True) + df_posix_records.to_csv(args.log_path + '.records.csv', mode='w', index=False, header=True) - def add_api(row): - if 'MPI' in row['function']: - return 'MPI-IO' - elif 'H5' in row['function']: - return 'H5F' - else: - return 'POSIX' - - def add_duration(row): - return row['end'] - row['start'] - - df_intervals['api'] = df_intervals.apply(add_api, axis=1) - df_intervals['duration'] = df_intervals.apply(add_duration, axis=1) - df_posix_records['duration'] = df_posix_records.apply(add_duration, axis=1) + df_file_map = pd.DataFrame(list(file_map.items()), columns=['file_id', 'file_name']) + df_file_map.to_csv(args.log_path + '.filemap.csv', mode='w', index=False, header=True) if args.split_files: for fid in file_map: From 872d801421b6287268ab62573cc8ceb52da13674 Mon Sep 17 00:00:00 2001 From: onewbiek Date: Fri, 26 Jan 2024 04:25:06 -0800 Subject: [PATCH 18/19] Rename thresholds --- drishti/handlers/handle_darshan.py | 22 ++++++------- drishti/handlers/handle_recorder.py | 36 +++++++++++----------- drishti/includes/config.py | 38 +++++++++++------------ drishti/includes/module.py | 48 ++++++++++++++--------------- 4 files changed, 71 insertions(+), 73 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index c735731..775a838 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -354,7 +354,7 @@ def handler(): ######################################################################################################################################################################### - count_long_metadata = len(df['fcounters'][(df['fcounters']['POSIX_F_META_TIME'] > THRESHOLD_METADATA_TIME_RANK)]) + count_long_metadata = len(df['fcounters'][(df['fcounters']['POSIX_F_META_TIME'] > metadata_time_rank)]) check_long_metadata(count_long_metadata, modules) @@ -375,7 +375,7 @@ def handler(): for index, row in shared_files.iterrows(): total_transfer_size = row['POSIX_BYTES_WRITTEN'] + row['POSIX_BYTES_READ'] - if total_transfer_size and abs(row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size > THRESHOLD_STRAGGLERS: + if total_transfer_size and abs(row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size 
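Because the handler above reuses <log_path>.intervals.csv, <log_path>.records.csv, and <log_path>.filemap.csv whenever all three exist next to the trace, a stale cache has to be removed by hand before the Recorder log is re-parsed; a minimal sketch, assuming log_path holds the same trace path passed to the tool:

    import os

    log_path = '/path/to/recorder/trace'  # hypothetical trace location
    for suffix in ('.intervals.csv', '.records.csv', '.filemap.csv'):
        cached = log_path + suffix
        if os.path.exists(cached):
            os.remove(cached)  # force the next run to rebuild the parsed CSVs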
> imbalance_stragglers: stragglers_count += 1 detected_files.append([ @@ -403,7 +403,7 @@ def handler(): for index, row in shared_files_times.iterrows(): total_transfer_time = row['POSIX_F_WRITE_TIME'] + row['POSIX_F_READ_TIME'] + row['POSIX_F_META_TIME'] - if total_transfer_time and abs(row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time > THRESHOLD_STRAGGLERS: + if total_transfer_time and abs(row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time > imbalance_stragglers: stragglers_count += 1 detected_files.append([ @@ -432,7 +432,7 @@ def handler(): detected_files = [] for index, row in aggregated.iterrows(): - if row['POSIX_BYTES_WRITTEN_max'] and abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / row['POSIX_BYTES_WRITTEN_max'] > THRESHOLD_IMBALANCE: + if row['POSIX_BYTES_WRITTEN_max'] and abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / row['POSIX_BYTES_WRITTEN_max'] > imbalance_size: imbalance_count += 1 detected_files.append([ @@ -448,7 +448,7 @@ def handler(): detected_files = [] for index, row in aggregated.iterrows(): - if row['POSIX_BYTES_READ_max'] and abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row['POSIX_BYTES_READ_max'] > THRESHOLD_IMBALANCE: + if row['POSIX_BYTES_READ_max'] and abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row['POSIX_BYTES_READ_max'] > imbalance_size: imbalance_count += 1 detected_files.append([ @@ -478,12 +478,12 @@ def handler(): mpiio_indep_reads = df_mpiio['counters']['MPIIO_INDEP_READS'].sum() detected_files = [] - if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > collective_operations_absolute: files = pd.DataFrame(df_mpiio_collective_reads.groupby('id').sum()).reset_index() for index, row in df_mpiio_collective_reads.iterrows(): if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and - row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS and - (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE): + row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > collective_operations and + (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > collective_operations_absolute): detected_files.append([ row['id'], row['MPIIO_INDEP_READS'], row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 @@ -502,13 +502,13 @@ def handler(): mpiio_indep_writes = df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum() detected_files = [] - if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > collective_operations_absolute: files = pd.DataFrame(df_mpiio_collective_writes.groupby('id').sum()).reset_index() for index, row in df_mpiio_collective_writes.iterrows(): if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and - row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS and - (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE): + row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + 
row['MPIIO_INDEP_WRITES']) > collective_operations and + (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > collective_operations_absolute): detected_files.append([ row['id'], row['MPIIO_INDEP_WRITES'], row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 diff --git a/drishti/handlers/handle_recorder.py b/drishti/handlers/handle_recorder.py index c4ffd2d..0007d11 100644 --- a/drishti/handlers/handle_recorder.py +++ b/drishti/handlers/handle_recorder.py @@ -47,7 +47,7 @@ def handler(): file_map = None if os.path.exists(args.log_path + '.intervals.csv') and os.path.exists(args.log_path + '.records.csv') and os.path.exists(args.log_path + '.filemap.csv'): - print('using existing parsed log file') + print('Using existing parsed log file') df_intervals = pd.read_csv(args.log_path + '.intervals.csv') df_posix_records = pd.read_csv(args.log_path + '.records.csv') df_file_map = pd.read_csv(args.log_path + '.filemap.csv') @@ -174,16 +174,16 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): # Get the number of small I/O operations (less than 1 MB) - total_reads_small = len(df_posix[(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) - total_writes_small = len(df_posix[~(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + total_reads_small = len(df_posix[(df_posix['function'].str.contains('read')) & (df_posix['size'] < small_bytes)]) + total_writes_small = len(df_posix[~(df_posix['function'].str.contains('read')) & (df_posix['size'] < small_bytes)]) if args.split_files: detected_files = pd.DataFrame() else: detected_files = [] for id in file_map.keys(): - read_cnt = len(df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) - write_cnt = len(df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read')) & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + read_cnt = len(df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read')) & (df_posix['size'] < small_bytes)]) + write_cnt = len(df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read')) & (df_posix['size'] < small_bytes)]) detected_files.append([id, read_cnt, write_cnt]) column_names = ['id', 'total_reads', 'total_writes'] @@ -258,12 +258,12 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): total_shared_reads = len(df_posix[(df_posix['file_id'].isin(shared_files)) & (df_posix['function'].str.contains('read'))]) total_shared_reads_small = len(df_posix[(df_posix['file_id'].isin(shared_files)) & (df_posix['function'].str.contains('read')) - & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + & (df_posix['size'] < small_bytes)]) total_shared_writes = len(df_posix[(df_posix['file_id'].isin(shared_files)) & ~(df_posix['function'].str.contains('read'))]) total_shared_writes_small = len(df_posix[(df_posix['file_id'].isin(shared_files)) & ~(df_posix['function'].str.contains('read')) - & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + & (df_posix['size'] < small_bytes)]) if args.split_files: detected_files = pd.DataFrame() @@ -272,10 +272,10 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): for id in shared_files: read_cnt = len(df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read')) - & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + & (df_posix['size'] < small_bytes)]) write_cnt = 
len(df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read')) - & (df_posix['size'] < THRESHOLD_SMALL_BYTES)]) + & (df_posix['size'] < small_bytes)]) detected_files.append([id, read_cnt, write_cnt]) column_names = ['id', 'INSIGHTS_POSIX_SMALL_READS', 'INSIGHTS_POSIX_SMALL_WRITES'] @@ -287,7 +287,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): # TODO: Assumed metadata operations: open, close, sync, create, seek df_detected = df_posix_records.groupby('rank')['duration'].sum().reset_index() - count_long_metadata = len(df_detected[(df_detected['duration'] > THRESHOLD_METADATA_TIME_RANK)]) + count_long_metadata = len(df_detected[(df_detected['duration'] > metadata_time_rank)]) check_long_metadata(count_long_metadata, modules) @@ -318,7 +318,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): slowest_rank_bytes = df_detected.loc[df_detected['duration'].idxmax(), 'size'] fastest_rank_bytes = df_detected.loc[df_detected['duration'].idxmin(), 'size'] - if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > THRESHOLD_STRAGGLERS: + if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > imbalance_stragglers: stragglers_count += 1 detected_files.append([ @@ -356,7 +356,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): slowest_rank_time = df_detected['duration'].max() fastest_rank_time = df_detected['duration'].min() - if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > THRESHOLD_STRAGGLERS: + if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > imbalance_stragglers: stragglers_count += 1 detected_files.append([ @@ -396,7 +396,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): max_bytes_written = df_detected['size'].max() min_bytes_written = df_detected['size'].min() - if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > THRESHOLD_IMBALANCE: + if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > imbalance_size: imbalance_count += 1 detected_files.append([ @@ -417,7 +417,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): max_bytes_read = df_detected['size'].max() min_bytes_read = df_detected['size'].min() - if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > THRESHOLD_IMBALANCE: + if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > imbalance_size: imbalance_count += 1 detected_files.append([ @@ -448,13 +448,13 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): detected_files = pd.DataFrame() else: detected_files = [] - if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > collective_operations_absolute: for id in file_map.keys(): indep_read_count = df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all')) & (df_mpiio_reads['file_id'] == id)] indep_write_count = df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all')) & (df_mpiio_writes['file_id'] == id)] indep_total_count = indep_read_count + indep_write_count - if (indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE and indep_read_count / indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS): 
+ if (indep_total_count > collective_operations_absolute and indep_read_count / indep_total_count > collective_operations): detected_files.append([ id, indep_read_count, indep_read_count / indep_total_count * 100 ]) @@ -468,13 +468,13 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): detected_files = pd.DataFrame() else: detected_files = [] - if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > collective_operations_absolute: for id in file_map.keys(): indep_read_count = df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all')) & (df_mpiio_reads['file_id'] == id)] indep_write_count = df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all')) & (df_mpiio_writes['file_id'] == id)] indep_total_count = indep_read_count + indep_write_count - if (indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE and indep_write_count / indep_total_count > THRESHOLD_COLLECTIVE_OPERATIONS): + if (indep_total_count > collective_operations_absolute and indep_write_count / indep_total_count > collective_operations): detected_files.append([ id, indep_write_count, indep_write_count / indep_total_count * 100 ]) diff --git a/drishti/includes/config.py b/drishti/includes/config.py index 17f81b2..f362dc2 100644 --- a/drishti/includes/config.py +++ b/drishti/includes/config.py @@ -34,20 +34,19 @@ insights_total[WARN] = 0 insights_total[RECOMMENDATIONS] = 0 -THRESHOLD_OPERATION_IMBALANCE = 0.1 -THRESHOLD_SMALL_REQUESTS = 0.1 -THRESHOLD_SMALL_REQUESTS_ABSOLUTE = 1000 -THRESHOLD_MISALIGNED_REQUESTS = 0.1 -THRESHOLD_METADATA = 0.1 -THRESHOLD_METADATA_TIME_RANK = 30 # seconds -THRESHOLD_RANDOM_OPERATIONS = 0.2 -THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE = 1000 -THRESHOLD_STRAGGLERS = 0.15 -THRESHOLD_IMBALANCE = 0.30 -THRESHOLD_INTERFACE_STDIO = 0.1 -THRESHOLD_COLLECTIVE_OPERATIONS = 0.5 -THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE = 1000 -THRESHOLD_SMALL_BYTES = 1048576 # 1 MB +imbalance_operations = 0.1 +small_bytes = 1048576 # 1MB +small_requests = 0.1 +small_requests_absolute = 1000 +misaligned_requests = 0.1 +metadata_time_rank = 30 # seconds +random_operations = 0.2 +random_operations_absolute = 1000 +imbalance_stragglers = 0.15 +imbalance_size = 0.30 +interface_stdio = 0.1 +collective_operations = 0.5 +collective_operations_absolute = 1000 INSIGHTS_STDIO_HIGH_USAGE = 'S01' INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE = 'P01' @@ -169,13 +168,12 @@ def validate_thresholds(): for threshold_name, threshold_value in thresholds_spec.items(): globals()[threshold_name] = threshold_value - assert(THRESHOLD_OPERATION_IMBALANCE >= 0.0 and THRESHOLD_OPERATION_IMBALANCE <= 1.0) - assert(THRESHOLD_SMALL_REQUESTS >= 0.0 and THRESHOLD_SMALL_REQUESTS <= 1.0) - assert(THRESHOLD_MISALIGNED_REQUESTS >= 0.0 and THRESHOLD_MISALIGNED_REQUESTS <= 1.0) - assert(THRESHOLD_METADATA >= 0.0 and THRESHOLD_METADATA <= 1.0) - assert(THRESHOLD_RANDOM_OPERATIONS >= 0.0 and THRESHOLD_RANDOM_OPERATIONS <= 1.0) + assert(imbalance_operations >= 0.0 and imbalance_operations <= 1.0) + assert(small_requests >= 0.0 and small_requests <= 1.0) + assert(misaligned_requests >= 0.0 and misaligned_requests <= 1.0) + assert(random_operations >= 0.0 and random_operations <= 1.0) - assert(THRESHOLD_METADATA_TIME_RANK >= 0.0) + assert(metadata_time_rank >= 0.0) def convert_bytes(bytes_number): diff --git a/drishti/includes/module.py b/drishti/includes/module.py 
index 68e68dc..cf90530 100644 --- a/drishti/includes/module.py +++ b/drishti/includes/module.py @@ -26,7 +26,7 @@ def check_stdio(total_size, total_size_stdio): ''' - if total_size and total_size_stdio / total_size > THRESHOLD_INTERFACE_STDIO: + if total_size and total_size_stdio / total_size > interface_stdio: issue = 'Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})'.format( total_size_stdio / total_size * 100.0, convert_bytes(total_size_stdio) @@ -79,7 +79,7 @@ def check_operation_intensive(total_operations, total_reads, total_writes): total_writes: number of write operations been executed by the application ''' - if total_writes > total_reads and total_operations and abs(total_writes - total_reads) / total_operations > THRESHOLD_OPERATION_IMBALANCE: + if total_writes > total_reads and total_operations and abs(total_writes - total_reads) / total_operations > imbalance_operations: issue = 'Application is write operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 ) @@ -88,7 +88,7 @@ def check_operation_intensive(total_operations, total_reads, total_writes): message(INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) ) - if total_reads > total_writes and total_operations and abs(total_writes - total_reads) / total_operations > THRESHOLD_OPERATION_IMBALANCE: + if total_reads > total_writes and total_operations and abs(total_writes - total_reads) / total_operations > imbalance_operations: issue = 'Application is read operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 ) @@ -108,7 +108,7 @@ def check_size_intensive(total_size, total_read_size, total_written_size): total_written_size: Output I/O size measured in byte ''' - if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > THRESHOLD_OPERATION_IMBALANCE: + if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > imbalance_operations: issue = 'Application is write size intensive ({:.2f}% write vs. {:.2f}% read)'.format( total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 ) @@ -117,7 +117,7 @@ def check_size_intensive(total_size, total_read_size, total_written_size): message(INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) ) - if total_read_size > total_written_size and abs(total_written_size - total_read_size) / total_size > THRESHOLD_OPERATION_IMBALANCE: + if total_read_size > total_written_size and abs(total_written_size - total_read_size) / total_size > imbalance_operations: issue = 'Application is read size intensive ({:.2f}% write vs. 
{:.2f}% read)'.format( total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 ) @@ -143,7 +143,7 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr file_map: file id and file name pairing ''' - if total_reads_small and total_reads_small / total_reads > THRESHOLD_SMALL_REQUESTS and total_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: + if total_reads_small and total_reads_small / total_reads > small_requests and total_reads_small > small_requests_absolute: issue = 'Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests'.format( total_reads_small, total_reads_small / total_reads * 100.0 ) @@ -152,7 +152,7 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr recommendation = [] for index, row in detected_files.iterrows(): - if row['total_reads'] > (total_reads * THRESHOLD_SMALL_REQUESTS / 2): + if row['total_reads'] > (total_reads * small_requests / 2): detail.append( { 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( @@ -187,7 +187,7 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) ) - if total_writes_small and total_writes_small / total_writes > THRESHOLD_SMALL_REQUESTS and total_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: + if total_writes_small and total_writes_small / total_writes > small_requests and total_writes_small > small_requests_absolute: issue = 'Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all write requests'.format( total_writes_small, total_writes_small / total_writes * 100.0 ) @@ -196,7 +196,7 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr recommendation = [] for index, row in detected_files.iterrows(): - if row['total_writes'] > (total_writes * THRESHOLD_SMALL_REQUESTS / 2): + if row['total_writes'] > (total_writes * small_requests / 2): detail.append( { 'message': '{} ({:.2f}%) small write requests are to "{}"'.format( @@ -243,7 +243,7 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali modules: all different mudules been used in the application ''' - if total_operations and total_mem_not_aligned / total_operations > THRESHOLD_MISALIGNED_REQUESTS: + if total_operations and total_mem_not_aligned / total_operations > misaligned_requests: issue = 'Application has a high number ({:.2f}%) of misaligned memory requests'.format( total_mem_not_aligned / total_operations * 100.0 ) @@ -252,7 +252,7 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali message(INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, TARGET_DEVELOPER, HIGH, issue, None) ) - if total_operations and total_file_not_aligned / total_operations > THRESHOLD_MISALIGNED_REQUESTS: + if total_operations and total_file_not_aligned / total_operations > misaligned_requests: issue = 'Application issues a high number ({:.2f}%) of misaligned file requests'.format( total_file_not_aligned / total_operations * 100.0 ) @@ -330,7 +330,7 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total if total_reads: - if read_random and read_random / total_reads > THRESHOLD_RANDOM_OPERATIONS and read_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE: + if read_random and read_random / total_reads > random_operations and read_random > 
random_operations_absolute: issue = 'Application is issuing a high number ({}) of random read operations ({:.2f}%)'.format( read_random, read_random / total_reads * 100.0 ) @@ -355,7 +355,7 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total ) if total_writes: - if write_random and write_random / total_writes > THRESHOLD_RANDOM_OPERATIONS and write_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE: + if write_random and write_random / total_writes > random_operations and write_random > random_operations_absolute: issue = 'Application is issuing a high number ({}) of random write operations ({:.2f}%)'.format( write_random, write_random / total_writes * 100.0 ) @@ -395,7 +395,7 @@ def check_shared_small_operation(total_shared_reads, total_shared_reads_small, t file_map: file id and file name pairing ''' - if total_shared_reads and total_shared_reads_small / total_shared_reads > THRESHOLD_SMALL_REQUESTS and total_shared_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: + if total_shared_reads and total_shared_reads_small / total_shared_reads > small_requests and total_shared_reads_small > small_requests_absolute: issue = 'Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests'.format( total_shared_reads_small, total_shared_reads_small / total_shared_reads * 100.0 ) @@ -403,7 +403,7 @@ def check_shared_small_operation(total_shared_reads, total_shared_reads_small, t detail = [] for index, row in shared_files.iterrows(): - if row['INSIGHTS_POSIX_SMALL_READS'] > (total_shared_reads * THRESHOLD_SMALL_REQUESTS / 2): + if row['INSIGHTS_POSIX_SMALL_READS'] > (total_shared_reads * small_requests / 2): detail.append( { 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( @@ -425,7 +425,7 @@ def check_shared_small_operation(total_shared_reads, total_shared_reads_small, t message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) ) - if total_shared_writes and total_shared_writes_small / total_shared_writes > THRESHOLD_SMALL_REQUESTS and total_shared_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE: + if total_shared_writes and total_shared_writes_small / total_shared_writes > small_requests and total_shared_writes_small > small_requests_absolute: issue = 'Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file write requests'.format( total_shared_writes_small, total_shared_writes_small / total_shared_writes * 100.0 ) @@ -433,7 +433,7 @@ def check_shared_small_operation(total_shared_reads, total_shared_reads_small, t detail = [] for index, row in shared_files.iterrows(): - if row['INSIGHTS_POSIX_SMALL_WRITES'] > (total_shared_writes * THRESHOLD_SMALL_REQUESTS / 2): + if row['INSIGHTS_POSIX_SMALL_WRITES'] > (total_shared_writes * small_requests / 2): detail.append( { 'message': '{} ({:.2f}%) small writes requests are to "{}"'.format( @@ -467,7 +467,7 @@ def check_long_metadata(count_long_metadata, modules): if count_long_metadata > 0: issue = 'There are {} ranks where metadata operations take over {} seconds'.format( - count_long_metadata, THRESHOLD_METADATA_TIME_RANK + count_long_metadata, metadata_time_rank ) recommendation = [ @@ -547,7 +547,7 @@ def check_shared_data_imblance_split(slowest_rank_bytes, fastest_rank_bytes, tot total_transfer_size: total request size of that specific shared file ''' - if total_transfer_size and 
abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > THRESHOLD_STRAGGLERS: + if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > imbalance_stragglers: issue = 'Load imbalance of {:.2f}% detected'.format( abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size * 100 ) @@ -621,7 +621,7 @@ def check_shared_time_imbalance_split(slowest_rank_time, fastest_rank_time, tota total_transfer_size: total request time of that specific shared file ''' - if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > THRESHOLD_STRAGGLERS: + if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > imbalance_stragglers: issue = 'Load imbalance of {:.2f}% detected'.format( abs(slowest_rank_time - fastest_rank_time) / total_transfer_time * 100 ) @@ -700,7 +700,7 @@ def check_individual_write_imbalance_split(max_bytes_written, min_bytes_written) min_bytes_written: minimum byte written in the file ''' - if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > THRESHOLD_IMBALANCE: + if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > imbalance_size: issue = 'Load imbalance of {:.2f}% detected'.format( abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 ) @@ -786,7 +786,7 @@ def check_individual_read_imbalance_split(max_bytes_read, min_bytes_read): min_bytes_written: minimum byte read in the file ''' - if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > THRESHOLD_IMBALANCE: + if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > imbalance_size: issue = 'Load imbalance of {:.2f}% detected'.format( abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 ) @@ -831,7 +831,7 @@ def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, tot ''' if mpiio_coll_reads == 0: - if total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + if total_mpiio_read_operations and total_mpiio_read_operations > collective_operations_absolute: issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls'.format( mpiio_indep_reads, mpiio_indep_reads / total_mpiio_read_operations * 100 @@ -886,7 +886,7 @@ def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, ''' if mpiio_coll_writes == 0: - if total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE: + if total_mpiio_write_operations and total_mpiio_write_operations > collective_operations_absolute: issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls'.format( mpiio_indep_writes, mpiio_indep_writes / total_mpiio_write_operations * 100 From 68ebab837716cff966d841ec519edeb5ad063387 Mon Sep 17 00:00:00 2001 From: onewbiek Date: Fri, 2 Feb 2024 10:48:13 -0800 Subject: [PATCH 19/19] Enable thresholds display --- drishti/handlers/handle_darshan.py | 23 +++--- drishti/handlers/handle_recorder.py | 39 +++++----- drishti/includes/config.py | 48 +++++++------ drishti/includes/module.py | 108 +++++++++++++++++++++------- drishti/includes/parser.py | 8 +++ 5 files changed, 152 insertions(+), 74 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 775a838..b4c59bf 100644 --- 
a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -354,7 +354,7 @@ def handler(): ######################################################################################################################################################################### - count_long_metadata = len(df['fcounters'][(df['fcounters']['POSIX_F_META_TIME'] > metadata_time_rank)]) + count_long_metadata = len(df['fcounters'][(df['fcounters']['POSIX_F_META_TIME'] > thresholds['metadata_time_rank'][0])]) check_long_metadata(count_long_metadata, modules) @@ -375,7 +375,7 @@ def handler(): for index, row in shared_files.iterrows(): total_transfer_size = row['POSIX_BYTES_WRITTEN'] + row['POSIX_BYTES_READ'] - if total_transfer_size and abs(row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size > imbalance_stragglers: + if total_transfer_size and abs(row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size > thresholds['imbalance_stragglers'][0]: stragglers_count += 1 detected_files.append([ @@ -403,7 +403,7 @@ def handler(): for index, row in shared_files_times.iterrows(): total_transfer_time = row['POSIX_F_WRITE_TIME'] + row['POSIX_F_READ_TIME'] + row['POSIX_F_META_TIME'] - if total_transfer_time and abs(row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time > imbalance_stragglers: + if total_transfer_time and abs(row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time > thresholds['imbalance_stragglers'][0]: stragglers_count += 1 detected_files.append([ @@ -432,7 +432,7 @@ def handler(): detected_files = [] for index, row in aggregated.iterrows(): - if row['POSIX_BYTES_WRITTEN_max'] and abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / row['POSIX_BYTES_WRITTEN_max'] > imbalance_size: + if row['POSIX_BYTES_WRITTEN_max'] and abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / row['POSIX_BYTES_WRITTEN_max'] > thresholds['imbalance_size'][0]: imbalance_count += 1 detected_files.append([ @@ -448,7 +448,7 @@ def handler(): detected_files = [] for index, row in aggregated.iterrows(): - if row['POSIX_BYTES_READ_max'] and abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row['POSIX_BYTES_READ_max'] > imbalance_size: + if row['POSIX_BYTES_READ_max'] and abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row['POSIX_BYTES_READ_max'] > thresholds['imbalance_size'][0]: imbalance_count += 1 detected_files.append([ @@ -478,12 +478,12 @@ def handler(): mpiio_indep_reads = df_mpiio['counters']['MPIIO_INDEP_READS'].sum() detected_files = [] - if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > collective_operations_absolute: + if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > thresholds['collective_operations_absolute'][0]: files = pd.DataFrame(df_mpiio_collective_reads.groupby('id').sum()).reset_index() for index, row in df_mpiio_collective_reads.iterrows(): if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and - row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > collective_operations and - (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > collective_operations_absolute): + row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > thresholds['collective_operations'][0] and + (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > 
thresholds['collective_operations_absolute'][0]): detected_files.append([ row['id'], row['MPIIO_INDEP_READS'], row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 @@ -502,13 +502,13 @@ def handler(): mpiio_indep_writes = df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum() detected_files = [] - if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > collective_operations_absolute: + if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > thresholds['collective_operations_absolute'][0]: files = pd.DataFrame(df_mpiio_collective_writes.groupby('id').sum()).reset_index() for index, row in df_mpiio_collective_writes.iterrows(): if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and - row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > collective_operations and - (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > collective_operations_absolute): + row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > thresholds['collective_operations'][0] and + (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > thresholds['collective_operations_absolute'][0]): detected_files.append([ row['id'], row['MPIIO_INDEP_WRITES'], row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 @@ -651,6 +651,7 @@ def handler(): console.print() display_content(console) + display_thresholds(console) display_footer(console, insights_start_time, insights_end_time) filename = '{}.html'.format(args.log_path) diff --git a/drishti/handlers/handle_recorder.py b/drishti/handlers/handle_recorder.py index 0007d11..34c4790 100644 --- a/drishti/handlers/handle_recorder.py +++ b/drishti/handlers/handle_recorder.py @@ -47,7 +47,9 @@ def handler(): file_map = None if os.path.exists(args.log_path + '.intervals.csv') and os.path.exists(args.log_path + '.records.csv') and os.path.exists(args.log_path + '.filemap.csv'): - print('Using existing parsed log file') + print('Using parsed file: {}'.format(os.path.abspath(args.log_path + '.intervals.csv'))) + print('Using parsed file: {}'.format(os.path.abspath(args.log_path + '.records.csv'))) + print('Using parsed file: {}'.format(os.path.abspath(args.log_path + '.filemap.csv'))) df_intervals = pd.read_csv(args.log_path + '.intervals.csv') df_posix_records = pd.read_csv(args.log_path + '.records.csv') df_file_map = pd.read_csv(args.log_path + '.filemap.csv') @@ -174,16 +176,16 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): # Get the number of small I/O operations (less than 1 MB) - total_reads_small = len(df_posix[(df_posix['function'].str.contains('read')) & (df_posix['size'] < small_bytes)]) - total_writes_small = len(df_posix[~(df_posix['function'].str.contains('read')) & (df_posix['size'] < small_bytes)]) + total_reads_small = len(df_posix[(df_posix['function'].str.contains('read')) & (df_posix['size'] < thresholds['small_bytes'][0])]) + total_writes_small = len(df_posix[~(df_posix['function'].str.contains('read')) & (df_posix['size'] < thresholds['small_bytes'][0])]) if args.split_files: detected_files = pd.DataFrame() else: detected_files = [] for id in file_map.keys(): - read_cnt = len(df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read')) & (df_posix['size'] < small_bytes)]) - write_cnt = len(df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read')) & (df_posix['size'] < small_bytes)]) + read_cnt 
= len(df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read')) & (df_posix['size'] < thresholds['small_bytes'][0])]) + write_cnt = len(df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read')) & (df_posix['size'] < thresholds['small_bytes'][0])]) detected_files.append([id, read_cnt, write_cnt]) column_names = ['id', 'total_reads', 'total_writes'] @@ -258,12 +260,12 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): total_shared_reads = len(df_posix[(df_posix['file_id'].isin(shared_files)) & (df_posix['function'].str.contains('read'))]) total_shared_reads_small = len(df_posix[(df_posix['file_id'].isin(shared_files)) & (df_posix['function'].str.contains('read')) - & (df_posix['size'] < small_bytes)]) + & (df_posix['size'] < thresholds['small_bytes'][0])]) total_shared_writes = len(df_posix[(df_posix['file_id'].isin(shared_files)) & ~(df_posix['function'].str.contains('read'))]) total_shared_writes_small = len(df_posix[(df_posix['file_id'].isin(shared_files)) & ~(df_posix['function'].str.contains('read')) - & (df_posix['size'] < small_bytes)]) + & (df_posix['size'] < thresholds['small_bytes'][0])]) if args.split_files: detected_files = pd.DataFrame() @@ -272,10 +274,10 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): for id in shared_files: read_cnt = len(df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read')) - & (df_posix['size'] < small_bytes)]) + & (df_posix['size'] < thresholds['small_bytes'][0])]) write_cnt = len(df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read')) - & (df_posix['size'] < small_bytes)]) + & (df_posix['size'] < thresholds['small_bytes'][0])]) detected_files.append([id, read_cnt, write_cnt]) column_names = ['id', 'INSIGHTS_POSIX_SMALL_READS', 'INSIGHTS_POSIX_SMALL_WRITES'] @@ -287,7 +289,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): # TODO: Assumed metadata operations: open, close, sync, create, seek df_detected = df_posix_records.groupby('rank')['duration'].sum().reset_index() - count_long_metadata = len(df_detected[(df_detected['duration'] > metadata_time_rank)]) + count_long_metadata = len(df_detected[(df_detected['duration'] > thresholds['metadata_time_rank'][0])]) check_long_metadata(count_long_metadata, modules) @@ -318,7 +320,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): slowest_rank_bytes = df_detected.loc[df_detected['duration'].idxmax(), 'size'] fastest_rank_bytes = df_detected.loc[df_detected['duration'].idxmin(), 'size'] - if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > imbalance_stragglers: + if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > thresholds['imbalance_stragglers'][0]: stragglers_count += 1 detected_files.append([ @@ -356,7 +358,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): slowest_rank_time = df_detected['duration'].max() fastest_rank_time = df_detected['duration'].min() - if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > imbalance_stragglers: + if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > thresholds['imbalance_stragglers'][0]: stragglers_count += 1 detected_files.append([ @@ -396,7 +398,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): max_bytes_written = df_detected['size'].max() 
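With the thresholds now kept in a dict of [value, triggered] pairs (defined in config.py further below in this patch), the lookups above read thresholds['small_bytes'][0] for the value, while the second element appears to record whether a check actually crossed the threshold so that the new display_thresholds() call can report it. validate_thresholds() composes each configuration key as category + '_' + threshold_name, so a --config file for this revision would plausibly group the name fragments by prefix; the grouping and values below are an assumption consistent with that lookup, not a schema taken from the repository:

    {
        "imbalance": { "stragglers": 0.2, "size": 0.4 },
        "small": { "requests": 0.2, "requests_absolute": 5000 },
        "metadata": { "time_rank": 60 }
    }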
min_bytes_written = df_detected['size'].min() - if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > imbalance_size: + if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > thresholds['imbalance_size'][0]: imbalance_count += 1 detected_files.append([ @@ -417,7 +419,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): max_bytes_read = df_detected['size'].max() min_bytes_read = df_detected['size'].min() - if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > imbalance_size: + if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > thresholds['imbalance_size'][0]: imbalance_count += 1 detected_files.append([ @@ -448,13 +450,13 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): detected_files = pd.DataFrame() else: detected_files = [] - if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > collective_operations_absolute: + if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > thresholds['collective_operations_absolute'][0]: for id in file_map.keys(): indep_read_count = df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all')) & (df_mpiio_reads['file_id'] == id)] indep_write_count = df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all')) & (df_mpiio_writes['file_id'] == id)] indep_total_count = indep_read_count + indep_write_count - if (indep_total_count > collective_operations_absolute and indep_read_count / indep_total_count > collective_operations): + if (indep_total_count > thresholds['collective_operations_absolute'][0] and indep_read_count / indep_total_count > thresholds['collective_operations'][0]): detected_files.append([ id, indep_read_count, indep_read_count / indep_total_count * 100 ]) @@ -468,13 +470,13 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): detected_files = pd.DataFrame() else: detected_files = [] - if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > collective_operations_absolute: + if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > thresholds['collective_operations_absolute'][0]: for id in file_map.keys(): indep_read_count = df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all')) & (df_mpiio_reads['file_id'] == id)] indep_write_count = df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all')) & (df_mpiio_writes['file_id'] == id)] indep_total_count = indep_read_count + indep_write_count - if (indep_total_count > collective_operations_absolute and indep_write_count / indep_total_count > collective_operations): + if (indep_total_count > thresholds['collective_operations_absolute'][0] and indep_write_count / indep_total_count > thresholds['collective_operations'][0]): detected_files.append([ id, indep_write_count, indep_write_count / indep_total_count * 100 ]) @@ -572,6 +574,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): console.print() display_content(console) + display_thresholds(console) display_footer(console, insights_start_time, insights_end_time) if args.split_files: diff --git a/drishti/includes/config.py b/drishti/includes/config.py index f362dc2..82bd872 100644 --- a/drishti/includes/config.py +++ b/drishti/includes/config.py @@ -34,19 +34,21 @@ insights_total[WARN] = 0 insights_total[RECOMMENDATIONS] = 0 -imbalance_operations = 0.1 -small_bytes = 
1048576 # 1MB -small_requests = 0.1 -small_requests_absolute = 1000 -misaligned_requests = 0.1 -metadata_time_rank = 30 # seconds -random_operations = 0.2 -random_operations_absolute = 1000 -imbalance_stragglers = 0.15 -imbalance_size = 0.30 -interface_stdio = 0.1 -collective_operations = 0.5 -collective_operations_absolute = 1000 +thresholds = { + 'imbalance_operations': [0.1, False], + 'small_bytes': [1048576, False], + 'small_requests': [0.1, False], + 'small_requests_absolute': [1000, False], + 'misaligned_requests': [0.1, False], + 'metadata_time_rank': [30, False], + 'random_operations': [0.2, False], + 'random_operations_absolute': [1000, False], + 'imbalance_stragglers': [0.15, False], + 'imbalance_size': [0.3, False], + 'interface_stdio': [0.1, False], + 'collective_operations': [0.5, False], + 'collective_operations_absolute': [1000, False], +} INSIGHTS_STDIO_HIGH_USAGE = 'S01' INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE = 'P01' @@ -98,6 +100,10 @@ def init_console(): insights_total[HIGH] = 0 insights_total[WARN] = 0 insights_total[RECOMMENDATIONS] = 0 + + for name in thresholds: + thresholds[name][1] = False + return console @@ -166,14 +172,14 @@ def validate_thresholds(): for category, thresholds_spec in data.items(): for threshold_name, threshold_value in thresholds_spec.items(): - globals()[threshold_name] = threshold_value - - assert(imbalance_operations >= 0.0 and imbalance_operations <= 1.0) - assert(small_requests >= 0.0 and small_requests <= 1.0) - assert(misaligned_requests >= 0.0 and misaligned_requests <= 1.0) - assert(random_operations >= 0.0 and random_operations <= 1.0) - - assert(metadata_time_rank >= 0.0) + thresholds[category + '_' + threshold_name][0] = threshold_value + + assert(thresholds['imbalance_operations'][0] >= 0.0 and thresholds['imbalance_operations'][0] <= 1.0) + assert(thresholds['small_requests'][0] >= 0.0 and thresholds['small_requests'][0] <= 1.0) + assert(thresholds['misaligned_requests'][0] >= 0.0 and thresholds['misaligned_requests'][0] <= 1.0) + assert(thresholds['random_operations'][0] >= 0.0 and thresholds['random_operations'][0] <= 1.0) + + assert(thresholds['metadata_time_rank'][0] >= 0.0) def convert_bytes(bytes_number): diff --git a/drishti/includes/module.py b/drishti/includes/module.py index cf90530..2731e69 100644 --- a/drishti/includes/module.py +++ b/drishti/includes/module.py @@ -26,7 +26,8 @@ def check_stdio(total_size, total_size_stdio): ''' - if total_size and total_size_stdio / total_size > interface_stdio: + if total_size and total_size_stdio / total_size > thresholds['interface_stdio'][0]: + thresholds['interface_stdio'][1] = True issue = 'Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})'.format( total_size_stdio / total_size * 100.0, convert_bytes(total_size_stdio) @@ -79,7 +80,7 @@ def check_operation_intensive(total_operations, total_reads, total_writes): total_writes: number of write operations been executed by the application ''' - if total_writes > total_reads and total_operations and abs(total_writes - total_reads) / total_operations > imbalance_operations: + if total_writes > total_reads and total_operations and abs(total_writes - total_reads) / total_operations > thresholds['imbalance_operations'][0]: issue = 'Application is write operation intensive ({:.2f}% writes vs. 
{:.2f}% reads)'.format( total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 ) @@ -88,7 +89,7 @@ def check_operation_intensive(total_operations, total_reads, total_writes): message(INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) ) - if total_reads > total_writes and total_operations and abs(total_writes - total_reads) / total_operations > imbalance_operations: + if total_reads > total_writes and total_operations and abs(total_writes - total_reads) / total_operations > thresholds['imbalance_operations'][0]: issue = 'Application is read operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 ) @@ -108,7 +109,7 @@ def check_size_intensive(total_size, total_read_size, total_written_size): total_written_size: Output I/O size measured in byte ''' - if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > imbalance_operations: + if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > thresholds['imbalance_operations'][0]: issue = 'Application is write size intensive ({:.2f}% write vs. {:.2f}% read)'.format( total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 ) @@ -117,7 +118,7 @@ def check_size_intensive(total_size, total_read_size, total_written_size): message(INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) ) - if total_read_size > total_written_size and abs(total_written_size - total_read_size) / total_size > imbalance_operations: + if total_read_size > total_written_size and abs(total_written_size - total_read_size) / total_size > thresholds['imbalance_operations'][0]: issue = 'Application is read size intensive ({:.2f}% write vs. 
{:.2f}% read)'.format( total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 ) @@ -143,7 +144,8 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr file_map: file id and file name pairing ''' - if total_reads_small and total_reads_small / total_reads > small_requests and total_reads_small > small_requests_absolute: + if total_reads_small and total_reads_small / total_reads > thresholds['small_requests'][0] and total_reads_small > thresholds['small_requests_absolute'][0]: + thresholds['small_requests_absolute'][1] = True issue = 'Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests'.format( total_reads_small, total_reads_small / total_reads * 100.0 ) @@ -152,7 +154,7 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr recommendation = [] for index, row in detected_files.iterrows(): - if row['total_reads'] > (total_reads * small_requests / 2): + if row['total_reads'] > (total_reads * thresholds['small_requests'][0] / 2): detail.append( { 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( @@ -187,7 +189,8 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) ) - if total_writes_small and total_writes_small / total_writes > small_requests and total_writes_small > small_requests_absolute: + if total_writes_small and total_writes_small / total_writes > thresholds['small_requests'][0] and total_writes_small > thresholds['small_requests_absolute'][0]: + thresholds['small_requests_absolute'][1] = True issue = 'Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all write requests'.format( total_writes_small, total_writes_small / total_writes * 100.0 ) @@ -196,7 +199,7 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr recommendation = [] for index, row in detected_files.iterrows(): - if row['total_writes'] > (total_writes * small_requests / 2): + if row['total_writes'] > (total_writes * thresholds['small_requests'][0] / 2): detail.append( { 'message': '{} ({:.2f}%) small write requests are to "{}"'.format( @@ -243,7 +246,8 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali modules: all different mudules been used in the application ''' - if total_operations and total_mem_not_aligned / total_operations > misaligned_requests: + if total_operations and total_mem_not_aligned / total_operations > thresholds['misaligned_requests'][0]: + thresholds['misaligned_requests'][1] = True issue = 'Application has a high number ({:.2f}%) of misaligned memory requests'.format( total_mem_not_aligned / total_operations * 100.0 ) @@ -252,7 +256,8 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali message(INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, TARGET_DEVELOPER, HIGH, issue, None) ) - if total_operations and total_file_not_aligned / total_operations > misaligned_requests: + if total_operations and total_file_not_aligned / total_operations > thresholds['misaligned_requests'][0]: + thresholds['misaligned_requests'][1] = True issue = 'Application issues a high number ({:.2f}%) of misaligned file requests'.format( total_file_not_aligned / total_operations * 100.0 ) @@ -330,7 +335,9 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total 
if total_reads: - if read_random and read_random / total_reads > random_operations and read_random > random_operations_absolute: + if read_random and read_random / total_reads > thresholds['random_operations'][0] and read_random > thresholds['random_operations_absolute'][0]: + thresholds['random_operations'][1] = True + thresholds['random_operations_absolute'][1] = True issue = 'Application is issuing a high number ({}) of random read operations ({:.2f}%)'.format( read_random, read_random / total_reads * 100.0 ) @@ -355,7 +362,9 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total ) if total_writes: - if write_random and write_random / total_writes > random_operations and write_random > random_operations_absolute: + if write_random and write_random / total_writes > thresholds['random_operations'][0] and write_random > thresholds['random_operations_absolute'][0]: + thresholds['random_operations'][1] = True + thresholds['random_operations_absolute'][1] = True issue = 'Application is issuing a high number ({}) of random write operations ({:.2f}%)'.format( write_random, write_random / total_writes * 100.0 ) @@ -395,7 +404,9 @@ def check_shared_small_operation(total_shared_reads, total_shared_reads_small, t file_map: file id and file name pairing ''' - if total_shared_reads and total_shared_reads_small / total_shared_reads > small_requests and total_shared_reads_small > small_requests_absolute: + if total_shared_reads and total_shared_reads_small / total_shared_reads > thresholds['small_requests'][0] and total_shared_reads_small > thresholds['small_requests_absolute'][0]: + thresholds['small_requests'][1] = True + thresholds['small_requests_absolute'][1] = True issue = 'Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests'.format( total_shared_reads_small, total_shared_reads_small / total_shared_reads * 100.0 ) @@ -403,7 +414,7 @@ def check_shared_small_operation(total_shared_reads, total_shared_reads_small, t detail = [] for index, row in shared_files.iterrows(): - if row['INSIGHTS_POSIX_SMALL_READS'] > (total_shared_reads * small_requests / 2): + if row['INSIGHTS_POSIX_SMALL_READS'] > (total_shared_reads * thresholds['small_requests'][0] / 2): detail.append( { 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( @@ -425,7 +436,9 @@ def check_shared_small_operation(total_shared_reads, total_shared_reads_small, t message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) ) - if total_shared_writes and total_shared_writes_small / total_shared_writes > small_requests and total_shared_writes_small > small_requests_absolute: + if total_shared_writes and total_shared_writes_small / total_shared_writes > thresholds['small_requests'][0] and total_shared_writes_small > thresholds['small_requests_absolute'][0]: + thresholds['small_requests'][1] = True + thresholds['small_requests_absolute'][1] = True issue = 'Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file write requests'.format( total_shared_writes_small, total_shared_writes_small / total_shared_writes * 100.0 ) @@ -433,7 +446,7 @@ def check_shared_small_operation(total_shared_reads, total_shared_reads_small, t detail = [] for index, row in shared_files.iterrows(): - if row['INSIGHTS_POSIX_SMALL_WRITES'] > (total_shared_writes * small_requests / 2): + if 
row['INSIGHTS_POSIX_SMALL_WRITES'] > (total_shared_writes * thresholds['small_requests'][0] / 2): detail.append( { 'message': '{} ({:.2f}%) small writes requests are to "{}"'.format( @@ -466,8 +479,9 @@ def check_long_metadata(count_long_metadata, modules): ''' if count_long_metadata > 0: + thresholds['metadata_time_rank'][1] = True issue = 'There are {} ranks where metadata operations take over {} seconds'.format( - count_long_metadata, metadata_time_rank + count_long_metadata, thresholds['metadata_time_rank'][0] ) recommendation = [ @@ -506,6 +520,7 @@ def check_shared_data_imblance(stragglers_count, detected_files, file_map): ''' if stragglers_count: + thresholds['imbalance_stragglers'][1] = True issue = 'Detected data transfer imbalance caused by stragglers when accessing {} shared file.'.format( stragglers_count ) @@ -547,7 +562,8 @@ def check_shared_data_imblance_split(slowest_rank_bytes, fastest_rank_bytes, tot total_transfer_size: total request size of that specific shared file ''' - if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > imbalance_stragglers: + if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > thresholds['imbalance_stragglers'][0]: + thresholds['imbalance_stragglers'][1] = True issue = 'Load imbalance of {:.2f}% detected'.format( abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size * 100 ) @@ -580,6 +596,7 @@ def check_shared_time_imbalance(stragglers_count, detected_files, file_map): ''' if stragglers_count: + thresholds['imbalance_stragglers'][1] = True issue = 'Detected time imbalance caused by stragglers when accessing {} shared file.'.format( stragglers_count ) @@ -621,7 +638,8 @@ def check_shared_time_imbalance_split(slowest_rank_time, fastest_rank_time, tota total_transfer_size: total request time of that specific shared file ''' - if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > imbalance_stragglers: + if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > thresholds['imbalance_stragglers'][0]: + thresholds['imbalance_stragglers'][1] = True issue = 'Load imbalance of {:.2f}% detected'.format( abs(slowest_rank_time - fastest_rank_time) / total_transfer_time * 100 ) @@ -653,6 +671,7 @@ def check_individual_write_imbalance(imbalance_count, detected_files, file_map): ''' if imbalance_count: + thresholds['imbalance_size'][1] = True issue = 'Detected write imbalance when accessing {} individual files'.format( imbalance_count ) @@ -700,7 +719,8 @@ def check_individual_write_imbalance_split(max_bytes_written, min_bytes_written) min_bytes_written: minimum byte written in the file ''' - if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > imbalance_size: + if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > thresholds['imbalance_size'][0]: + thresholds['imbalance_size'][1] = True issue = 'Load imbalance of {:.2f}% detected'.format( abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 ) @@ -739,6 +759,7 @@ def check_individual_read_imbalance(imbalance_count, detected_files, file_map): ''' if imbalance_count: + thresholds['imbalance_size'][1] = True issue = 'Detected read imbalance when accessing {} individual files.'.format( imbalance_count ) @@ -786,7 +807,8 @@ def check_individual_read_imbalance_split(max_bytes_read, min_bytes_read): min_bytes_written: minimum byte read in the file ''' - if 
max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > imbalance_size: + if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > thresholds['imbalance_size'][0]: + thresholds['imbalance_size'][1] = True issue = 'Load imbalance of {:.2f}% detected'.format( abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 ) @@ -831,7 +853,8 @@ def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, tot ''' if mpiio_coll_reads == 0: - if total_mpiio_read_operations and total_mpiio_read_operations > collective_operations_absolute: + if total_mpiio_read_operations and total_mpiio_read_operations > thresholds['collective_operations_absolute'][0]: + thresholds['collective_operations_absolute'][1] = True issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls'.format( mpiio_indep_reads, mpiio_indep_reads / total_mpiio_read_operations * 100 @@ -886,7 +909,8 @@ def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, ''' if mpiio_coll_writes == 0: - if total_mpiio_write_operations and total_mpiio_write_operations > collective_operations_absolute: + if total_mpiio_write_operations and total_mpiio_write_operations > thresholds['collective_operations_absolute'][0]: + thresholds['collective_operations_absolute'][1] = True issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls'.format( mpiio_indep_writes, mpiio_indep_writes / total_mpiio_write_operations * 100 @@ -1074,6 +1098,42 @@ def display_content(console): ) +def display_thresholds(console): + tholdMessage = { + 'imbalance_operations': 'Minimum imbalance requests ratio: [white]{}%[/white]'.format(thresholds['imbalance_operations'][0] * 100), + 'small_bytes': 'Minimum size of a small request: [white]{} bytes[/white]'.format(thresholds['small_bytes'][0]), + 'small_requests': 'Maximum small requests ratio: [white]{}%[/white]'.format(thresholds['small_requests'][0] * 100), + 'small_requests_absolute': 'Maximum small requests: [white]{}[/white]'.format(thresholds['small_requests_absolute'][0]), + 'misaligned_requests': 'Maximum misaligned requests ratio: [white]{}%[/white]'.format(thresholds['misaligned_requests'][0] * 100), + 'random_operations': 'Maximum random request ratio: [white]{}%[/white]'.format(thresholds['random_operations'][0] * 100), + 'random_operations_absolute': 'Maximum random requests: [white]{}[/white]'.format(thresholds['random_operations_absolute'][0]), + 'metadata_time_rank': 'Maximum metadata process time per rank: [white]{} seconds[/white]'.format(thresholds['metadata_time_rank'][0]), + 'imbalance_size': 'Maximum read/write size difference ratio: [white]{}%[/white]'.format(thresholds['imbalance_size'][0] * 100), + 'imbalance_stragglers': 'Maximum ratio difference among ranks: [white]{}%[/white]'.format(thresholds['imbalance_stragglers'][0] * 100), + 'interface_stdio': 'Maximum STDIO usage ratio: [white]{}%[/white]'.format(thresholds['interface_stdio'][0] * 100), + 'collective_operations': 'Minimum MPI collective operation usage ratio: [white]{}%[/white]'.format(thresholds['collective_operations'][0] * 100), + 'collective_operations_absolute': 'Minimum MPI collective operations: [white]{}[/white]'.format(thresholds['collective_operations_absolute'][0]), + } + + toBeAppend = [] + if args.thold: + for name, message in tholdMessage.items(): + toBeAppend.append(message) + else: + for name, message 
in tholdMessage.items(): + if thresholds[name][1]: + toBeAppend.append(message) + + console.print( + Panel( + '\n'.join(toBeAppend), + title='THRESHOLDS', + title_align='left', + padding=1 + ) + ) + + def display_footer(console, insights_start_time, insights_end_time): console.print( Panel( diff --git a/drishti/includes/parser.py b/drishti/includes/parser.py index 7ddfdd6..fbc759c 100644 --- a/drishti/includes/parser.py +++ b/drishti/includes/parser.py @@ -56,6 +56,14 @@ help='Display extended details for the recommendations' ) +parser.add_argument( + '--threshold', + default=False, + action='store_true', + dest='thold', + help='Display all thresholds used for the report' +) + parser.add_argument( '--code', default=False,
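
[Editor's note: the following is an illustrative sketch, not part of the patch. It condenses the pattern the hunks above introduce: each threshold in config.py becomes a [value, triggered] pair, the check_* functions in module.py compare against element [0] and set element [1] when the threshold is exceeded, and display_thresholds() prints either every threshold (when --threshold is passed) or only the triggered ones. All names and numbers below are simplified stand-ins for the real config.py/module.py symbols.]

    # Hypothetical, self-contained sketch of the [value, triggered] threshold pattern.
    thresholds = {
        'small_requests': [0.1, False],            # ratio limit, triggered flag
        'small_requests_absolute': [1000, False],  # absolute-count limit, triggered flag
    }

    def check_small_operation(total_reads, total_reads_small):
        # Mirrors the module.py pattern: compare against [0], record usage in [1].
        if (total_reads
                and total_reads_small / total_reads > thresholds['small_requests'][0]
                and total_reads_small > thresholds['small_requests_absolute'][0]):
            thresholds['small_requests'][1] = True
            thresholds['small_requests_absolute'][1] = True
            return True
        return False

    def display_thresholds(show_all=False):
        # show_all=True corresponds to running with --threshold: list everything.
        # Otherwise only thresholds actually exceeded during the report are shown.
        for name, (value, triggered) in thresholds.items():
            if show_all or triggered:
                print('{}: {}'.format(name, value))

    check_small_operation(total_reads=5000, total_reads_small=2000)
    display_thresholds()              # prints only the triggered thresholds
    display_thresholds(show_all=True) # equivalent to passing --threshold

[End of editor's note. One consequence of this design, visible in init_console() above, is that the triggered flags must be reset to False at the start of each report so a previous run does not leak into the THRESHOLDS panel.]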