diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py index c987979996..1dea6b20ed 100644 --- a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py +++ b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py @@ -84,6 +84,50 @@ def __init__(self, subtask: _Subtask): self._subtask = subtask self._labels = None self.utask_main_failure = None + self._utask_success_conditions = [ + uworker_msg_pb2.ErrorType.NO_ERROR, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.ANALYZE_NO_CRASH, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.PROGRESSION_BAD_STATE_MIN_MAX, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.REGRESSION_NO_CRASH, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.REGRESSION_LOW_CONFIDENCE_IN_REGRESSION_RANGE, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.MINIMIZE_UNREPRODUCIBLE_CRASH, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.MINIMIZE_CRASH_TOO_FLAKY, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_UNREPRODUCIBLE, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.ANALYZE_CLOSE_INVALID_UPLOADED, # pylint: disable=no-member + ] + self._utask_maybe_retry_conditions = [ + uworker_msg_pb2.ErrorType.ANALYZE_BUILD_SETUP, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.ANALYZE_NO_REVISIONS_LIST, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.TESTCASE_SETUP, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.MINIMIZE_SETUP, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.FUZZ_DATA_BUNDLE_SETUP_FAILURE, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.FUZZ_NO_FUZZ_TARGET_SELECTED, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.PROGRESSION_NO_CRASH, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.PROGRESSION_TIMEOUT, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.PROGRESSION_BUILD_SETUP_ERROR, # pylint: disable=no-member + 
uworker_msg_pb2.ErrorType.REGRESSION_BUILD_SETUP_ERROR, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.REGRESSION_TIMEOUT_ERROR, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.SYMBOLIZE_BUILD_SETUP_ERROR, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.MINIMIZE_DEADLINE_EXCEEDED, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.MINIMIZE_DEADLINE_EXCEEDED_IN_MAIN_FILE_PHASE, # pylint: disable=no-member + ] + self._utask_failure_conditions = [ + uworker_msg_pb2.ErrorType.ANALYZE_NO_REVISION_INDEX, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.UNHANDLED, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.VARIANT_BUILD_SETUP, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.FUZZ_BUILD_SETUP_FAILURE, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.FUZZ_NO_FUZZER, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.PROGRESSION_REVISION_LIST_ERROR, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.PROGRESSION_BUILD_NOT_FOUND, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.PROGRESSION_BAD_BUILD, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.REGRESSION_REVISION_LIST_ERROR, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.REGRESSION_BUILD_NOT_FOUND, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.REGRESSION_BAD_BUILD_ERROR, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_FAILED, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.CORPUS_PRUNING_FUZZER_SETUP_FAILED, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.CORPUS_PRUNING_ERROR, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.FUZZ_BAD_BUILD, # pylint: disable=no-member + ] if subtask == _Subtask.PREPROCESS: self._preprocess_start_time_ns = self.start_time_ns @@ -125,6 +169,18 @@ def set_task_details(self, # Ensure we always have a value after this method returns. 
assert self._preprocess_start_time_ns is not None + def _infer_uworker_main_outcome(self, exc_type, uworker_error): + '''Infers, on a best effort basis, whether an uworker output implies + success or failure. If an unequivocal response is not possible, + classifies as maybe_retry.''' + if exc_type or uworker_error in self._utask_failure_conditions: + outcome = 'error' + elif uworker_error in self._utask_maybe_retry_conditions: + outcome = 'maybe_retry' + else: + outcome = 'success' + return outcome + def __exit__(self, _exc_type, _exc_value, _traceback): # Ignore exception details, let Python continue unwinding the stack. @@ -145,7 +201,8 @@ def __exit__(self, _exc_type, _exc_value, _traceback): # The only case where a task might fail without throwing, is in # utask_main, by returning an ErrorType proto which indicates # failure. - outcome = 'error' if _exc_type or self.utask_main_failure else 'success' + outcome = self._infer_uworker_main_outcome(_exc_type, + self.utask_main_failure) monitoring_metrics.TASK_OUTCOME_COUNT.increment({ **self._labels, 'outcome': outcome }) @@ -166,11 +223,6 @@ def __exit__(self, _exc_type, _exc_value, _traceback): monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment( trimmed_labels) - if error_condition != 'UNHANDLED_EXCEPTION': - task = self._labels['task'] - subtask = self._labels['subtask'] - logs.info(f'Task {task}, at subtask {subtask}, finished successfully.') - def ensure_uworker_env_type_safety(uworker_env): """Converts all values in |uworker_env| to str types. 
diff --git a/src/clusterfuzz/_internal/cron/triage.py b/src/clusterfuzz/_internal/cron/triage.py
index 377673bd64..039b023acf 100644
--- a/src/clusterfuzz/_internal/cron/triage.py
+++ b/src/clusterfuzz/_internal/cron/triage.py
@@ -316,6 +316,25 @@ def _set_testcase_stuck_state(testcase: data_types.Testcase, state: bool):
   testcase.put()
 
 
+untriaged_testcases = {}
+
+
+def _increment_untriaged_testcase_count(job, status):
+  """Adds one to the in-memory untriaged tally kept for (job, status)."""
+  key = (job, status)
+  untriaged_testcases[key] = untriaged_testcases.get(key, 0) + 1
+
+
+def _emit_untriaged_testcase_count_metric():
+  """Sets UNTRIAGED_TESTCASE_COUNT once per (job, status) pair tallied."""
+  for (job, status), count in untriaged_testcases.items():
+    monitoring_metrics.UNTRIAGED_TESTCASE_COUNT.set(
+        count, labels={
+            'job': job,
+            'status': status,
+        })
+
+
 def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase):
   """Emmits a metric to track age of untriaged testcases."""
   if not testcase.timestamp:
@@ -331,6 +350,12 @@ def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase):
       })
 
 
+PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
+PENDING_PROGRESSION = 'pending_progression'
+PENDING_GROUPING = 'pending_grouping'
+PENDING_FILING = 'pending_filing'
+
+
 def main():
   """Files bugs."""
   try:
@@ -353,8 +378,6 @@ def main():
 
   throttler = Throttler()
 
-  untriaged_testcases = 0
-
   for testcase_id in data_handler.get_open_testcase_id_iterator():
     logs.info(f'Triaging {testcase_id}')
     try:
@@ -386,7 +409,8 @@ def main():
       _set_testcase_stuck_state(testcase, True)
       logs.info(f'Skipping testcase {testcase_id}, progression pending')
       _emit_untriaged_testcase_age_metric(testcase)
-      untriaged_testcases += 1
+      _increment_untriaged_testcase_count(testcase.job_type,
+                                          PENDING_PROGRESSION)
       continue
 
     # If the testcase has a bug filed already, no triage is needed.
@@ -410,6 +434,7 @@ def main():
       _emit_untriaged_testcase_age_metric(testcase)
-      untriaged_testcases += 1
       _set_testcase_stuck_state(testcase, True)
+      _increment_untriaged_testcase_count(testcase.job_type,
+                                          PENDING_CRITICAL_TASKS)
       logs.info(
           f'Skipping testcase {testcase_id}, critical tasks still pending.')
       continue
@@ -429,6 +454,6 @@ def main():
       _emit_untriaged_testcase_age_metric(testcase)
-      untriaged_testcases += 1
       _set_testcase_stuck_state(testcase, True)
+      _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
       logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
       continue
 
@@ -437,6 +462,6 @@ def main():
       _emit_untriaged_testcase_age_metric(testcase)
-      untriaged_testcases += 1
       _set_testcase_stuck_state(testcase, True)
+      _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
       logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
       continue
 
@@ -468,6 +493,6 @@ def main():
     _emit_untriaged_testcase_age_metric(testcase)
-    untriaged_testcases += 1
     _set_testcase_stuck_state(testcase, False)
+    _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
 
     # File the bug first and then create filed bug metadata.
     if not _file_issue(testcase, issue_tracker, throttler):
@@ -480,9 +505,7 @@ def main():
     logs.info('Filed new issue %s for testcase %d.'
% (testcase.bug_information, testcase_id)) - monitoring_metrics.UNTRIAGED_TESTCASE_COUNT.set( - untriaged_testcases, labels={}) - + _emit_untriaged_testcase_count_metric() logs.info('Triage testcases succeeded.') return True diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py index 5bd1b5b81f..cac65fd4ec 100644 --- a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py +++ b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py @@ -372,7 +372,10 @@ description='Number of testcases that were not yet triaged ' '(have not yet completed analyze, regression,' ' minimization, impact task), in hours.', - field_spec=[], + field_spec=[ + monitor.StringField('job'), + monitor.StringField('status'), + ], ) ANALYZE_TASK_REPRODUCIBILITY = monitor.CounterMetric(