From 3923c6ea49479aed2f61411fe12a86e3cba10d9f Mon Sep 17 00:00:00 2001 From: Vitor Guidi Date: Mon, 16 Dec 2024 10:59:19 -0300 Subject: [PATCH] [Monitoring] Extend TESTCASE_UPLOAD_TRIAGE_DURATION to account for fuzzer generated test cases (#4481) [#4364](https://github.com/google/clusterfuzz/pull/4364) implemented the tracking for the time it takes clusterfuzz to complete several steps of the manually uploaded testcase lifecycle. As per Chrome's request, the metric will now contain an 'origin' label, which indicates if the testcase was 'manually_uploaded' or generated by a 'fuzzer'. The code was also simplified, by reusing the get_age_in_seconds method from the TestCase entity. Also, it adds the 'stuck_in_triage' boolean field to the testcase entity, to facilitate figuring out what testcases are in a stuck state, so follow up actions can be taken. Part of #4271 --- .../_internal/common/testcase_utils.py | 24 +++++++++++---- src/clusterfuzz/_internal/cron/triage.py | 30 ++++++++++++++++++- .../_internal/datastore/data_types.py | 5 ++++ .../_internal/metrics/monitoring_metrics.py | 6 ++-- 4 files changed, 57 insertions(+), 8 deletions(-) diff --git a/src/clusterfuzz/_internal/common/testcase_utils.py b/src/clusterfuzz/_internal/common/testcase_utils.py index 324cf0f54e..73fe9edfd4 100644 --- a/src/clusterfuzz/_internal/common/testcase_utils.py +++ b/src/clusterfuzz/_internal/common/testcase_utils.py @@ -31,6 +31,8 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str): + '''Finds out if a testcase is fuzzer generated or manually uploaded, + and emits the TESTCASE_UPLOAD_TRIAGE_DURATION metric.''' testcase_upload_metadata = get_testcase_upload_metadata(testcase_id) if not testcase_upload_metadata: logs.warning(f'No upload metadata found for testcase {testcase_id},' @@ -45,9 +47,6 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str): 'analyze_launched', 'analyze_completed', 'minimize_completed', 'regression_completed', 'impact_completed', 'issue_updated' ] - elapsed_time_since_upload = datetime.datetime.utcnow() - elapsed_time_since_upload -= testcase_upload_metadata.timestamp - elapsed_time_since_upload = elapsed_time_since_upload.total_seconds() / 3600 testcase = data_handler.get_testcase_by_id(testcase_id) @@ -61,15 +60,30 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str): ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') return + from_fuzzer = not get_testcase_upload_metadata(testcase_id) + + assert step in [ + 'analyze_launched', 'analyze_completed', 'minimize_completed', + 'regression_completed', 'impact_completed', 'issue_updated' + ] + + if not testcase.get_age_in_seconds(): + logs.warning(f'No timestamp associated to testcase {testcase_id},' + ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') + return + + testcase_age_in_hours = testcase.get_age_in_seconds() / 3600 + logs.info('Emiting TESTCASE_UPLOAD_TRIAGE_DURATION metric for testcase ' - f'{testcase_id} (age = {elapsed_time_since_upload}) ' + f'{testcase_id} (age = {testcase_age_in_hours} hours.) ' 'in step {step}.') monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add( - elapsed_time_since_upload, + testcase_age_in_hours, labels={ 'job': testcase.job_type, 'step': step, + 'origin': 'fuzzer' if from_fuzzer else 'manually_uploaded' }) diff --git a/src/clusterfuzz/_internal/cron/triage.py b/src/clusterfuzz/_internal/cron/triage.py index d5759be553..7a82807c56 100644 --- a/src/clusterfuzz/_internal/cron/triage.py +++ b/src/clusterfuzz/_internal/cron/triage.py @@ -309,6 +309,13 @@ def _file_issue(testcase, issue_tracker, throttler): return filed +def _set_testcase_stuck_state(testcase: data_types.Testcase, state: bool): + if testcase.stuck_in_triage == state: + return + testcase.stuck_in_triage = state + testcase.put() + + untriaged_testcases = {} @@ -322,7 +329,8 @@ def _increment_untriaged_testcase_count(job, status): def _emit_untriaged_testcase_count_metric(): for (job, status) in untriaged_testcases: monitoring_metrics.UNTRIAGED_TESTCASE_COUNT.set( - untriaged_testcases[(job, status)], labels={ + untriaged_testcases[(job, status)], + labels={ 'job': job, 'status': status, }) @@ -383,14 +391,21 @@ def main(): # Skip if testcase's job is removed. if testcase.job_type not in all_jobs: + _set_testcase_stuck_state(testcase, False) + logs.info(f'Skipping testcase {testcase_id}, since its job was removed ' + f' ({testcase.job_type})') continue # Skip if testcase's job is in exclusions list. if testcase.job_type in excluded_jobs: + _set_testcase_stuck_state(testcase, False) + logs.info(f'Skipping testcase {testcase_id}, since its job is in the' + f' exclusion list ({testcase.job_type})') continue # Skip if we are running progression task at this time. if testcase.get_metadata('progression_pending'): + _set_testcase_stuck_state(testcase, True) logs.info(f'Skipping testcase {testcase_id}, progression pending') _emit_untriaged_testcase_age_metric(testcase) _increment_untriaged_testcase_count(testcase.job_type, @@ -399,17 +414,24 @@ def main(): # If the testcase has a bug filed already, no triage is needed. if _is_bug_filed(testcase): + _set_testcase_stuck_state(testcase, False) + logs.info( + f'Skipping testcase {testcase_id}, since a bug was already filed.') continue # Check if the crash is important, i.e. it is either a reproducible crash # or an unreproducible crash happening frequently. if not _is_crash_important(testcase): + _set_testcase_stuck_state(testcase, False) + logs.info( + f'Skipping testcase {testcase_id}, since the crash is not important.') continue # Require that all tasks like minimizaton, regression testing, etc have # finished. if not critical_tasks_completed: _emit_untriaged_testcase_age_metric(testcase) + _set_testcase_stuck_state(testcase, True) _increment_untriaged_testcase_count(testcase.job_type, PENDING_CRITICAL_TASKS) logs.info( @@ -429,6 +451,7 @@ def main(): if not testcase.group_id and not dates.time_has_expired( testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT): _emit_untriaged_testcase_age_metric(testcase) + _set_testcase_stuck_state(testcase, True) _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING) logs.info(f'Skipping testcase {testcase_id}, pending grouping.') continue @@ -436,6 +459,7 @@ def main(): if not testcase.get_metadata('ran_grouper'): # Testcase should be considered by the grouper first before filing. _emit_untriaged_testcase_age_metric(testcase) + _set_testcase_stuck_state(testcase, True) _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING) logs.info(f'Skipping testcase {testcase_id}, pending grouping.') continue @@ -450,6 +474,9 @@ def main(): # If there are similar issues to this test case already filed or recently # closed, skip filing a duplicate bug. if _check_and_update_similar_bug(testcase, issue_tracker): + _set_testcase_stuck_state(testcase, False) + logs.info(f'Skipping testcase {testcase_id}, since a similar bug' + ' was already filed.') continue # Clean up old triage messages that would be not applicable now. @@ -457,6 +484,7 @@ def main(): # A testcase is untriaged, until immediately before a bug is opened _emit_untriaged_testcase_age_metric(testcase) + _set_testcase_stuck_state(testcase, False) _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING) # File the bug first and then create filed bug metadata. diff --git a/src/clusterfuzz/_internal/datastore/data_types.py b/src/clusterfuzz/_internal/datastore/data_types.py index cfd16f93fe..bd4c3e2e44 100644 --- a/src/clusterfuzz/_internal/datastore/data_types.py +++ b/src/clusterfuzz/_internal/datastore/data_types.py @@ -580,6 +580,9 @@ class Testcase(Model): # corpus. trusted = ndb.BooleanProperty(default=False) + # Tracks if a testcase is stuck during triage. + stuck_in_triage = ndb.BooleanProperty(default=False) + def is_chromium(self): return self.project_name in ('chromium', 'chromium-testing') @@ -686,6 +689,8 @@ def get_created_time(self) -> ndb.DateTimeProperty: def get_age_in_seconds(self): current_time = datetime.datetime.utcnow() + if not self.get_created_time(): + return None testcase_age = current_time - self.get_created_time() return testcase_age.total_seconds() diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py index 9744967733..c30787836a 100644 --- a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py +++ b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py @@ -223,12 +223,14 @@ TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric( 'uploaded_testcase_analysis/triage_duration_secs', description=('Time elapsed between testcase upload and completion' - ' of relevant tasks in the testcase upload lifecycle, ' - 'in hours.'), + ' of relevant tasks in the testcase upload lifecycle.' + ' Origin can be either from a fuzzer, or a manual' + ' upload. Measured in hours.'), bucketer=monitor.GeometricBucketer(), field_spec=[ monitor.StringField('step'), monitor.StringField('job'), + monitor.StringField('origin'), ], )