From 68ebab837716cff966d841ec519edeb5ad063387 Mon Sep 17 00:00:00 2001 From: onewbiek Date: Fri, 2 Feb 2024 10:48:13 -0800 Subject: [PATCH] Enable thresholds display --- drishti/handlers/handle_darshan.py | 23 +++--- drishti/handlers/handle_recorder.py | 39 +++++----- drishti/includes/config.py | 48 +++++++------ drishti/includes/module.py | 108 +++++++++++++++++++++------- drishti/includes/parser.py | 8 +++ 5 files changed, 152 insertions(+), 74 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 775a838..b4c59bf 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -354,7 +354,7 @@ def handler(): ######################################################################################################################################################################### - count_long_metadata = len(df['fcounters'][(df['fcounters']['POSIX_F_META_TIME'] > metadata_time_rank)]) + count_long_metadata = len(df['fcounters'][(df['fcounters']['POSIX_F_META_TIME'] > thresholds['metadata_time_rank'][0])]) check_long_metadata(count_long_metadata, modules) @@ -375,7 +375,7 @@ def handler(): for index, row in shared_files.iterrows(): total_transfer_size = row['POSIX_BYTES_WRITTEN'] + row['POSIX_BYTES_READ'] - if total_transfer_size and abs(row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size > imbalance_stragglers: + if total_transfer_size and abs(row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size > thresholds['imbalance_stragglers'][0]: stragglers_count += 1 detected_files.append([ @@ -403,7 +403,7 @@ def handler(): for index, row in shared_files_times.iterrows(): total_transfer_time = row['POSIX_F_WRITE_TIME'] + row['POSIX_F_READ_TIME'] + row['POSIX_F_META_TIME'] - if total_transfer_time and abs(row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time > imbalance_stragglers: + if 
total_transfer_time and abs(row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time > thresholds['imbalance_stragglers'][0]: stragglers_count += 1 detected_files.append([ @@ -432,7 +432,7 @@ def handler(): detected_files = [] for index, row in aggregated.iterrows(): - if row['POSIX_BYTES_WRITTEN_max'] and abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / row['POSIX_BYTES_WRITTEN_max'] > imbalance_size: + if row['POSIX_BYTES_WRITTEN_max'] and abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / row['POSIX_BYTES_WRITTEN_max'] > thresholds['imbalance_size'][0]: imbalance_count += 1 detected_files.append([ @@ -448,7 +448,7 @@ def handler(): detected_files = [] for index, row in aggregated.iterrows(): - if row['POSIX_BYTES_READ_max'] and abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row['POSIX_BYTES_READ_max'] > imbalance_size: + if row['POSIX_BYTES_READ_max'] and abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row['POSIX_BYTES_READ_max'] > thresholds['imbalance_size'][0]: imbalance_count += 1 detected_files.append([ @@ -478,12 +478,12 @@ def handler(): mpiio_indep_reads = df_mpiio['counters']['MPIIO_INDEP_READS'].sum() detected_files = [] - if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > collective_operations_absolute: + if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > thresholds['collective_operations_absolute'][0]: files = pd.DataFrame(df_mpiio_collective_reads.groupby('id').sum()).reset_index() for index, row in df_mpiio_collective_reads.iterrows(): if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and - row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > collective_operations and - (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > collective_operations_absolute): + row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + 
row['MPIIO_INDEP_WRITES']) > thresholds['collective_operations'][0] and + (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > thresholds['collective_operations_absolute'][0]): detected_files.append([ row['id'], row['MPIIO_INDEP_READS'], row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 @@ -502,13 +502,13 @@ def handler(): mpiio_indep_writes = df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum() detected_files = [] - if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > collective_operations_absolute: + if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > thresholds['collective_operations_absolute'][0]: files = pd.DataFrame(df_mpiio_collective_writes.groupby('id').sum()).reset_index() for index, row in df_mpiio_collective_writes.iterrows(): if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and - row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > collective_operations and - (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > collective_operations_absolute): + row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > thresholds['collective_operations'][0] and + (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > thresholds['collective_operations_absolute'][0]): detected_files.append([ row['id'], row['MPIIO_INDEP_WRITES'], row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 @@ -651,6 +651,7 @@ def handler(): console.print() display_content(console) + display_thresholds(console) display_footer(console, insights_start_time, insights_end_time) filename = '{}.html'.format(args.log_path) diff --git a/drishti/handlers/handle_recorder.py b/drishti/handlers/handle_recorder.py index 0007d11..34c4790 100644 --- a/drishti/handlers/handle_recorder.py +++ b/drishti/handlers/handle_recorder.py @@ -47,7 +47,9 @@ def handler(): file_map = None if 
os.path.exists(args.log_path + '.intervals.csv') and os.path.exists(args.log_path + '.records.csv') and os.path.exists(args.log_path + '.filemap.csv'): - print('Using existing parsed log file') + print('Using parsed file: {}'.format(os.path.abspath(args.log_path + '.intervals.csv'))) + print('Using parsed file: {}'.format(os.path.abspath(args.log_path + '.records.csv'))) + print('Using parsed file: {}'.format(os.path.abspath(args.log_path + '.filemap.csv'))) df_intervals = pd.read_csv(args.log_path + '.intervals.csv') df_posix_records = pd.read_csv(args.log_path + '.records.csv') df_file_map = pd.read_csv(args.log_path + '.filemap.csv') @@ -174,16 +176,16 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): # Get the number of small I/O operations (less than 1 MB) - total_reads_small = len(df_posix[(df_posix['function'].str.contains('read')) & (df_posix['size'] < small_bytes)]) - total_writes_small = len(df_posix[~(df_posix['function'].str.contains('read')) & (df_posix['size'] < small_bytes)]) + total_reads_small = len(df_posix[(df_posix['function'].str.contains('read')) & (df_posix['size'] < thresholds['small_bytes'][0])]) + total_writes_small = len(df_posix[~(df_posix['function'].str.contains('read')) & (df_posix['size'] < thresholds['small_bytes'][0])]) if args.split_files: detected_files = pd.DataFrame() else: detected_files = [] for id in file_map.keys(): - read_cnt = len(df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read')) & (df_posix['size'] < small_bytes)]) - write_cnt = len(df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read')) & (df_posix['size'] < small_bytes)]) + read_cnt = len(df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read')) & (df_posix['size'] < thresholds['small_bytes'][0])]) + write_cnt = len(df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read')) & (df_posix['size'] < thresholds['small_bytes'][0])]) 
detected_files.append([id, read_cnt, write_cnt]) column_names = ['id', 'total_reads', 'total_writes'] @@ -258,12 +260,12 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): total_shared_reads = len(df_posix[(df_posix['file_id'].isin(shared_files)) & (df_posix['function'].str.contains('read'))]) total_shared_reads_small = len(df_posix[(df_posix['file_id'].isin(shared_files)) & (df_posix['function'].str.contains('read')) - & (df_posix['size'] < small_bytes)]) + & (df_posix['size'] < thresholds['small_bytes'][0])]) total_shared_writes = len(df_posix[(df_posix['file_id'].isin(shared_files)) & ~(df_posix['function'].str.contains('read'))]) total_shared_writes_small = len(df_posix[(df_posix['file_id'].isin(shared_files)) & ~(df_posix['function'].str.contains('read')) - & (df_posix['size'] < small_bytes)]) + & (df_posix['size'] < thresholds['small_bytes'][0])]) if args.split_files: detected_files = pd.DataFrame() @@ -272,10 +274,10 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): for id in shared_files: read_cnt = len(df_posix[(df_posix['file_id'] == id) & (df_posix['function'].str.contains('read')) - & (df_posix['size'] < small_bytes)]) + & (df_posix['size'] < thresholds['small_bytes'][0])]) write_cnt = len(df_posix[(df_posix['file_id'] == id) & ~(df_posix['function'].str.contains('read')) - & (df_posix['size'] < small_bytes)]) + & (df_posix['size'] < thresholds['small_bytes'][0])]) detected_files.append([id, read_cnt, write_cnt]) column_names = ['id', 'INSIGHTS_POSIX_SMALL_READS', 'INSIGHTS_POSIX_SMALL_WRITES'] @@ -287,7 +289,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): # TODO: Assumed metadata operations: open, close, sync, create, seek df_detected = df_posix_records.groupby('rank')['duration'].sum().reset_index() - count_long_metadata = len(df_detected[(df_detected['duration'] > metadata_time_rank)]) + count_long_metadata = len(df_detected[(df_detected['duration'] > 
thresholds['metadata_time_rank'][0])]) check_long_metadata(count_long_metadata, modules) @@ -318,7 +320,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): slowest_rank_bytes = df_detected.loc[df_detected['duration'].idxmax(), 'size'] fastest_rank_bytes = df_detected.loc[df_detected['duration'].idxmin(), 'size'] - if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > imbalance_stragglers: + if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > thresholds['imbalance_stragglers'][0]: stragglers_count += 1 detected_files.append([ @@ -356,7 +358,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): slowest_rank_time = df_detected['duration'].max() fastest_rank_time = df_detected['duration'].min() - if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > imbalance_stragglers: + if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > thresholds['imbalance_stragglers'][0]: stragglers_count += 1 detected_files.append([ @@ -396,7 +398,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): max_bytes_written = df_detected['size'].max() min_bytes_written = df_detected['size'].min() - if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > imbalance_size: + if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > thresholds['imbalance_size'][0]: imbalance_count += 1 detected_files.append([ @@ -417,7 +419,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): max_bytes_read = df_detected['size'].max() min_bytes_read = df_detected['size'].min() - if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > imbalance_size: + if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > thresholds['imbalance_size'][0]: imbalance_count 
+= 1 detected_files.append([ @@ -448,13 +450,13 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): detected_files = pd.DataFrame() else: detected_files = [] - if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > collective_operations_absolute: + if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > thresholds['collective_operations_absolute'][0]: for id in file_map.keys(): indep_read_count = df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all')) & (df_mpiio_reads['file_id'] == id)] indep_write_count = df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all')) & (df_mpiio_writes['file_id'] == id)] indep_total_count = indep_read_count + indep_write_count - if (indep_total_count > collective_operations_absolute and indep_read_count / indep_total_count > collective_operations): + if (indep_total_count > thresholds['collective_operations_absolute'][0] and indep_read_count / indep_total_count > thresholds['collective_operations'][0]): detected_files.append([ id, indep_read_count, indep_read_count / indep_total_count * 100 ]) @@ -468,13 +470,13 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): detected_files = pd.DataFrame() else: detected_files = [] - if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > collective_operations_absolute: + if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > thresholds['collective_operations_absolute'][0]: for id in file_map.keys(): indep_read_count = df_mpiio_reads[~(df_mpiio_reads['function'].str.contains('_all')) & (df_mpiio_reads['file_id'] == id)] indep_write_count = df_mpiio_writes[~(df_mpiio_writes['function'].str.contains('_all')) & (df_mpiio_writes['file_id'] == id)] indep_total_count = indep_read_count + indep_write_count - if (indep_total_count > collective_operations_absolute and indep_write_count / 
indep_total_count > collective_operations): + if (indep_total_count > thresholds['collective_operations_absolute'][0] and indep_write_count / indep_total_count > thresholds['collective_operations'][0]): detected_files.append([ id, indep_write_count, indep_write_count / indep_total_count * 100 ]) @@ -572,6 +574,7 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): console.print() display_content(console) + display_thresholds(console) display_footer(console, insights_start_time, insights_end_time) if args.split_files: diff --git a/drishti/includes/config.py b/drishti/includes/config.py index f362dc2..82bd872 100644 --- a/drishti/includes/config.py +++ b/drishti/includes/config.py @@ -34,19 +34,21 @@ insights_total[WARN] = 0 insights_total[RECOMMENDATIONS] = 0 -imbalance_operations = 0.1 -small_bytes = 1048576 # 1MB -small_requests = 0.1 -small_requests_absolute = 1000 -misaligned_requests = 0.1 -metadata_time_rank = 30 # seconds -random_operations = 0.2 -random_operations_absolute = 1000 -imbalance_stragglers = 0.15 -imbalance_size = 0.30 -interface_stdio = 0.1 -collective_operations = 0.5 -collective_operations_absolute = 1000 +thresholds = { + 'imbalance_operations': [0.1, False], + 'small_bytes': [1048576, False], + 'small_requests': [0.1, False], + 'small_requests_absolute': [1000, False], + 'misaligned_requests': [0.1, False], + 'metadata_time_rank': [30, False], + 'random_operations': [0.2, False], + 'random_operations_absolute': [1000, False], + 'imbalance_stragglers': [0.15, False], + 'imbalance_size': [0.3, False], + 'interface_stdio': [0.1, False], + 'collective_operations': [0.5, False], + 'collective_operations_absolute': [1000, False], +} INSIGHTS_STDIO_HIGH_USAGE = 'S01' INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE = 'P01' @@ -98,6 +100,10 @@ def init_console(): insights_total[HIGH] = 0 insights_total[WARN] = 0 insights_total[RECOMMENDATIONS] = 0 + + for name in thresholds: + thresholds[name][1] = False + return console @@ -166,14 +172,14 
@@ def validate_thresholds(): for category, thresholds_spec in data.items(): for threshold_name, threshold_value in thresholds_spec.items(): - globals()[threshold_name] = threshold_value - - assert(imbalance_operations >= 0.0 and imbalance_operations <= 1.0) - assert(small_requests >= 0.0 and small_requests <= 1.0) - assert(misaligned_requests >= 0.0 and misaligned_requests <= 1.0) - assert(random_operations >= 0.0 and random_operations <= 1.0) - - assert(metadata_time_rank >= 0.0) + thresholds[category + '_' + threshold_name][0] = threshold_value + + assert(thresholds['imbalance_operations'][0] >= 0.0 and thresholds['imbalance_operations'][0] <= 1.0) + assert(thresholds['small_requests'][0] >= 0.0 and thresholds['small_requests'][0] <= 1.0) + assert(thresholds['misaligned_requests'][0] >= 0.0 and thresholds['misaligned_requests'][0] <= 1.0) + assert(thresholds['random_operations'][0] >= 0.0 and thresholds['random_operations'][0] <= 1.0) + + assert(thresholds['metadata_time_rank'][0] >= 0.0) def convert_bytes(bytes_number): diff --git a/drishti/includes/module.py b/drishti/includes/module.py index cf90530..2731e69 100644 --- a/drishti/includes/module.py +++ b/drishti/includes/module.py @@ -26,7 +26,8 @@ def check_stdio(total_size, total_size_stdio): ''' - if total_size and total_size_stdio / total_size > interface_stdio: + if total_size and total_size_stdio / total_size > thresholds['interface_stdio'][0]: + thresholds['interface_stdio'][1] = True issue = 'Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})'.format( total_size_stdio / total_size * 100.0, convert_bytes(total_size_stdio) @@ -79,7 +80,7 @@ def check_operation_intensive(total_operations, total_reads, total_writes): total_writes: number of write operations been executed by the application ''' - if total_writes > total_reads and total_operations and abs(total_writes - total_reads) / total_operations > imbalance_operations: + if total_writes > total_reads and 
total_operations and abs(total_writes - total_reads) / total_operations > thresholds['imbalance_operations'][0]: issue = 'Application is write operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 ) @@ -88,7 +89,7 @@ def check_operation_intensive(total_operations, total_reads, total_writes): message(INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) ) - if total_reads > total_writes and total_operations and abs(total_writes - total_reads) / total_operations > imbalance_operations: + if total_reads > total_writes and total_operations and abs(total_writes - total_reads) / total_operations > thresholds['imbalance_operations'][0]: issue = 'Application is read operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 ) @@ -108,7 +109,7 @@ def check_size_intensive(total_size, total_read_size, total_written_size): total_written_size: Output I/O size measured in byte ''' - if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > imbalance_operations: + if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > thresholds['imbalance_operations'][0]: issue = 'Application is write size intensive ({:.2f}% write vs. 
{:.2f}% read)'.format( total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 ) @@ -117,7 +118,7 @@ def check_size_intensive(total_size, total_read_size, total_written_size): message(INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) ) - if total_read_size > total_written_size and abs(total_written_size - total_read_size) / total_size > imbalance_operations: + if total_read_size > total_written_size and abs(total_written_size - total_read_size) / total_size > thresholds['imbalance_operations'][0]: issue = 'Application is read size intensive ({:.2f}% write vs. {:.2f}% read)'.format( total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 ) @@ -143,7 +144,8 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr file_map: file id and file name pairing ''' - if total_reads_small and total_reads_small / total_reads > small_requests and total_reads_small > small_requests_absolute: + if total_reads_small and total_reads_small / total_reads > thresholds['small_requests'][0] and total_reads_small > thresholds['small_requests_absolute'][0]: + thresholds['small_requests'][1] = thresholds['small_requests_absolute'][1] = True issue = 'Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests'.format( total_reads_small, total_reads_small / total_reads * 100.0 ) @@ -152,7 +154,7 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr recommendation = [] for index, row in detected_files.iterrows(): - if row['total_reads'] > (total_reads * small_requests / 2): + if row['total_reads'] > (total_reads * thresholds['small_requests'][0] / 2): detail.append( { 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( @@ -187,7 +189,8 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) ) -
if total_writes_small and total_writes_small / total_writes > small_requests and total_writes_small > small_requests_absolute: + if total_writes_small and total_writes_small / total_writes > thresholds['small_requests'][0] and total_writes_small > thresholds['small_requests_absolute'][0]: + thresholds['small_requests'][1] = thresholds['small_requests_absolute'][1] = True issue = 'Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all write requests'.format( total_writes_small, total_writes_small / total_writes * 100.0 ) @@ -196,7 +199,7 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr recommendation = [] for index, row in detected_files.iterrows(): - if row['total_writes'] > (total_writes * small_requests / 2): + if row['total_writes'] > (total_writes * thresholds['small_requests'][0] / 2): detail.append( { 'message': '{} ({:.2f}%) small write requests are to "{}"'.format( @@ -243,7 +246,8 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali modules: all different mudules been used in the application ''' - if total_operations and total_mem_not_aligned / total_operations > misaligned_requests: + if total_operations and total_mem_not_aligned / total_operations > thresholds['misaligned_requests'][0]: + thresholds['misaligned_requests'][1] = True issue = 'Application has a high number ({:.2f}%) of misaligned memory requests'.format( total_mem_not_aligned / total_operations * 100.0 ) @@ -252,7 +256,8 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali message(INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, TARGET_DEVELOPER, HIGH, issue, None) ) - if total_operations and total_file_not_aligned / total_operations > misaligned_requests: + if total_operations and total_file_not_aligned / total_operations > thresholds['misaligned_requests'][0]: + thresholds['misaligned_requests'][1] = True issue = 'Application issues a high number ({:.2f}%) of misaligned file
requests'.format( total_file_not_aligned / total_operations * 100.0 ) @@ -330,7 +335,9 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total if total_reads: - if read_random and read_random / total_reads > random_operations and read_random > random_operations_absolute: + if read_random and read_random / total_reads > thresholds['random_operations'][0] and read_random > thresholds['random_operations_absolute'][0]: + thresholds['random_operations'][1] = True + thresholds['random_operations_absolute'][1] = True issue = 'Application is issuing a high number ({}) of random read operations ({:.2f}%)'.format( read_random, read_random / total_reads * 100.0 ) @@ -355,7 +362,9 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total ) if total_writes: - if write_random and write_random / total_writes > random_operations and write_random > random_operations_absolute: + if write_random and write_random / total_writes > thresholds['random_operations'][0] and write_random > thresholds['random_operations_absolute'][0]: + thresholds['random_operations'][1] = True + thresholds['random_operations_absolute'][1] = True issue = 'Application is issuing a high number ({}) of random write operations ({:.2f}%)'.format( write_random, write_random / total_writes * 100.0 ) @@ -395,7 +404,9 @@ def check_shared_small_operation(total_shared_reads, total_shared_reads_small, t file_map: file id and file name pairing ''' - if total_shared_reads and total_shared_reads_small / total_shared_reads > small_requests and total_shared_reads_small > small_requests_absolute: + if total_shared_reads and total_shared_reads_small / total_shared_reads > thresholds['small_requests'][0] and total_shared_reads_small > thresholds['small_requests_absolute'][0]: + thresholds['small_requests'][1] = True + thresholds['small_requests_absolute'][1] = True issue = 'Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which 
represents {:.2f}% of all shared file read requests'.format( total_shared_reads_small, total_shared_reads_small / total_shared_reads * 100.0 ) @@ -403,7 +414,7 @@ def check_shared_small_operation(total_shared_reads, total_shared_reads_small, t detail = [] for index, row in shared_files.iterrows(): - if row['INSIGHTS_POSIX_SMALL_READS'] > (total_shared_reads * small_requests / 2): + if row['INSIGHTS_POSIX_SMALL_READS'] > (total_shared_reads * thresholds['small_requests'][0] / 2): detail.append( { 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( @@ -425,7 +436,9 @@ def check_shared_small_operation(total_shared_reads, total_shared_reads_small, t message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) ) - if total_shared_writes and total_shared_writes_small / total_shared_writes > small_requests and total_shared_writes_small > small_requests_absolute: + if total_shared_writes and total_shared_writes_small / total_shared_writes > thresholds['small_requests'][0] and total_shared_writes_small > thresholds['small_requests_absolute'][0]: + thresholds['small_requests'][1] = True + thresholds['small_requests_absolute'][1] = True issue = 'Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file write requests'.format( total_shared_writes_small, total_shared_writes_small / total_shared_writes * 100.0 ) @@ -433,7 +446,7 @@ def check_shared_small_operation(total_shared_reads, total_shared_reads_small, t detail = [] for index, row in shared_files.iterrows(): - if row['INSIGHTS_POSIX_SMALL_WRITES'] > (total_shared_writes * small_requests / 2): + if row['INSIGHTS_POSIX_SMALL_WRITES'] > (total_shared_writes * thresholds['small_requests'][0] / 2): detail.append( { 'message': '{} ({:.2f}%) small writes requests are to "{}"'.format( @@ -466,8 +479,9 @@ def check_long_metadata(count_long_metadata, modules): ''' if 
count_long_metadata > 0: + thresholds['metadata_time_rank'][1] = True issue = 'There are {} ranks where metadata operations take over {} seconds'.format( - count_long_metadata, metadata_time_rank + count_long_metadata, thresholds['metadata_time_rank'][0] ) recommendation = [ @@ -506,6 +520,7 @@ def check_shared_data_imblance(stragglers_count, detected_files, file_map): ''' if stragglers_count: + thresholds['imbalance_stragglers'][1] = True issue = 'Detected data transfer imbalance caused by stragglers when accessing {} shared file.'.format( stragglers_count ) @@ -547,7 +562,8 @@ def check_shared_data_imblance_split(slowest_rank_bytes, fastest_rank_bytes, tot total_transfer_size: total request size of that specific shared file ''' - if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > imbalance_stragglers: + if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > thresholds['imbalance_stragglers'][0]: + thresholds['imbalance_stragglers'][1] = True issue = 'Load imbalance of {:.2f}% detected'.format( abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size * 100 ) @@ -580,6 +596,7 @@ def check_shared_time_imbalance(stragglers_count, detected_files, file_map): ''' if stragglers_count: + thresholds['imbalance_stragglers'][1] = True issue = 'Detected time imbalance caused by stragglers when accessing {} shared file.'.format( stragglers_count ) @@ -621,7 +638,8 @@ def check_shared_time_imbalance_split(slowest_rank_time, fastest_rank_time, tota total_transfer_size: total request time of that specific shared file ''' - if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > imbalance_stragglers: + if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > thresholds['imbalance_stragglers'][0]: + thresholds['imbalance_stragglers'][1] = True issue = 'Load imbalance of {:.2f}% detected'.format( 
abs(slowest_rank_time - fastest_rank_time) / total_transfer_time * 100 ) @@ -653,6 +671,7 @@ def check_individual_write_imbalance(imbalance_count, detected_files, file_map): ''' if imbalance_count: + thresholds['imbalance_size'][1] = True issue = 'Detected write imbalance when accessing {} individual files'.format( imbalance_count ) @@ -700,7 +719,8 @@ def check_individual_write_imbalance_split(max_bytes_written, min_bytes_written) min_bytes_written: minimum byte written in the file ''' - if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > imbalance_size: + if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > thresholds['imbalance_size'][0]: + thresholds['imbalance_size'][1] = True issue = 'Load imbalance of {:.2f}% detected'.format( abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 ) @@ -739,6 +759,7 @@ def check_individual_read_imbalance(imbalance_count, detected_files, file_map): ''' if imbalance_count: + thresholds['imbalance_size'][1] = True issue = 'Detected read imbalance when accessing {} individual files.'.format( imbalance_count ) @@ -786,7 +807,8 @@ def check_individual_read_imbalance_split(max_bytes_read, min_bytes_read): min_bytes_written: minimum byte read in the file ''' - if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > imbalance_size: + if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > thresholds['imbalance_size'][0]: + thresholds['imbalance_size'][1] = True issue = 'Load imbalance of {:.2f}% detected'.format( abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 ) @@ -831,7 +853,8 @@ def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, tot ''' if mpiio_coll_reads == 0: - if total_mpiio_read_operations and total_mpiio_read_operations > collective_operations_absolute: + if total_mpiio_read_operations and total_mpiio_read_operations > 
thresholds['collective_operations_absolute'][0]: + thresholds['collective_operations_absolute'][1] = True issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls'.format( mpiio_indep_reads, mpiio_indep_reads / total_mpiio_read_operations * 100 @@ -886,7 +909,8 @@ def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, ''' if mpiio_coll_writes == 0: - if total_mpiio_write_operations and total_mpiio_write_operations > collective_operations_absolute: + if total_mpiio_write_operations and total_mpiio_write_operations > thresholds['collective_operations_absolute'][0]: + thresholds['collective_operations_absolute'][1] = True issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls'.format( mpiio_indep_writes, mpiio_indep_writes / total_mpiio_write_operations * 100 @@ -1074,6 +1098,42 @@ def display_content(console): ) +def display_thresholds(console): + tholdMessage = { + 'imbalance_operations': 'Minimum imbalance requests ratio: [white]{:.2f}%[/white]'.format(thresholds['imbalance_operations'][0] * 100), + 'small_bytes': 'Minimum size of a small request: [white]{} bytes[/white]'.format(thresholds['small_bytes'][0]), + 'small_requests': 'Maximum small requests ratio: [white]{:.2f}%[/white]'.format(thresholds['small_requests'][0] * 100), + 'small_requests_absolute': 'Maximum small requests: [white]{}[/white]'.format(thresholds['small_requests_absolute'][0]), + 'misaligned_requests': 'Maximum misaligned requests ratio: [white]{:.2f}%[/white]'.format(thresholds['misaligned_requests'][0] * 100), + 'random_operations': 'Maximum random request ratio: [white]{:.2f}%[/white]'.format(thresholds['random_operations'][0] * 100), + 'random_operations_absolute': 'Maximum random requests: [white]{}[/white]'.format(thresholds['random_operations_absolute'][0]), + 'metadata_time_rank': 'Maximum metadata process time per
rank: [white]{} seconds[/white]'.format(thresholds['metadata_time_rank'][0]), + 'imbalance_size': 'Maximum read/write size difference ratio: [white]{:.2f}%[/white]'.format(thresholds['imbalance_size'][0] * 100), + 'imbalance_stragglers': 'Maximum ratio difference among ranks: [white]{:.2f}%[/white]'.format(thresholds['imbalance_stragglers'][0] * 100), + 'interface_stdio': 'Maximum STDIO usage ratio: [white]{:.2f}%[/white]'.format(thresholds['interface_stdio'][0] * 100), + 'collective_operations': 'Minimum MPI collective operation usage ratio: [white]{:.2f}%[/white]'.format(thresholds['collective_operations'][0] * 100), + 'collective_operations_absolute': 'Minimum MPI collective operations: [white]{}[/white]'.format(thresholds['collective_operations_absolute'][0]), + } + + toBeAppend = [] + if args.thold: + for name, message in tholdMessage.items(): + toBeAppend.append(message) + else: + for name, message in tholdMessage.items(): + if thresholds[name][1]: + toBeAppend.append(message) + + console.print( + Panel( + '\n'.join(toBeAppend), + title='THRESHOLDS', + title_align='left', + padding=1 + ) + ) + + def display_footer(console, insights_start_time, insights_end_time): console.print( Panel( diff --git a/drishti/includes/parser.py b/drishti/includes/parser.py index 7ddfdd6..fbc759c 100644 --- a/drishti/includes/parser.py +++ b/drishti/includes/parser.py @@ -56,6 +56,14 @@ help='Display extended details for the recommendations' ) +parser.add_argument( + '--threshold', + default=False, + action='store_true', + dest='thold', + help='Display all thresholds used for the report' +) + parser.add_argument( '--code', default=False,