From 79ca59285e03adf10ea1ee3b3ce957b4fb887297 Mon Sep 17 00:00:00 2001
From: Vitor Guidi <vitorguidi@gmail.com>
Date: Mon, 23 Dec 2024 15:11:06 -0300
Subject: [PATCH] [Monitoring] Enrich UNTRIAGED_TESTCASE_AGE metric to track
 testcases stuck in analyze (#4547)

### Motivation

We currently have no way to tell if analyze task was successfully
executed. The TESTCASE_UPLOAD_TRIAGE_DURATION metric from #4364 would
only track duration for tasks that did finish.

An analyze_pending field is added to the Testcase entity in datastore,
which is set to False by default, to True for manually uploaded
testcases, and to False once analyze task postprocess runs.

It also increments the UNTRIAGED_TESTCASE_AGE metric from #4381 with a
status label, so we can know at what step the testcase is stuck, thus
allowing us to alert if analyze is taking longer to finish than
expected.

The alert itself could be, for instance, P50 age of untriaged testcase
(status=analyze_pending) > 3h.

Also, this retroactively addresses comments from #4481:

* Fixes docstring for emit_testcase_triage_duration_metric
* Removes assertions
* Renames TESTCASE_UPLOAD_TRIAGE_DURATION to TESTCASE_TRIAGE_DURATION,
since it now accounts for fuzzer generated testcases
* Use a boolean "from_fuzzer" field, instead of "origin" string, in
TESTCASE_TRIAGE_DURATION
---
 .../bot/tasks/utasks/analyze_task.py          |  2 +
 .../_internal/common/testcase_utils.py        | 40 ++++------------
 src/clusterfuzz/_internal/cron/triage.py      | 46 +++++++++++--------
 .../_internal/datastore/data_handler.py       |  3 +-
 .../_internal/datastore/data_types.py         |  4 ++
 .../_internal/metrics/monitoring_metrics.py   |  9 ++--
 6 files changed, 49 insertions(+), 55 deletions(-)

diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py b/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py
index 785974f735..1f0b7e8f58 100644
--- a/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py
+++ b/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py
@@ -552,6 +552,8 @@ def _update_testcase(output):
   if analyze_task_output.platform_id:
     testcase.platform_id = analyze_task_output.platform_id
 
+  testcase.analyze_pending = False
+
   testcase.put()
 
 
diff --git a/src/clusterfuzz/_internal/common/testcase_utils.py b/src/clusterfuzz/_internal/common/testcase_utils.py
index 73fe9edfd4..4314dcd55f 100644
--- a/src/clusterfuzz/_internal/common/testcase_utils.py
+++ b/src/clusterfuzz/_internal/common/testcase_utils.py
@@ -31,59 +31,39 @@
 
 
 def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
-  '''Finds out if a testcase is fuzzer generated or manually uploaded,
-      and emits the TESTCASE_UPLOAD_TRIAGE_DURATION metric.'''
-  testcase_upload_metadata = get_testcase_upload_metadata(testcase_id)
-  if not testcase_upload_metadata:
-    logs.warning(f'No upload metadata found for testcase {testcase_id},'
-                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
-    return
-  if not testcase_upload_metadata.timestamp:
-    logs.warning(
-        f'No timestamp for testcase {testcase_upload_metadata.testcase_id},'
-        ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
-    return
-  assert step in [
-      'analyze_launched', 'analyze_completed', 'minimize_completed',
-      'regression_completed', 'impact_completed', 'issue_updated'
-  ]
-
+  """Finds out if a testcase is fuzzer generated or manually uploaded,
+      and emits the TESTCASE_TRIAGE_DURATION metric."""
   testcase = data_handler.get_testcase_by_id(testcase_id)
 
   if not testcase:
     logs.warning(f'No testcase found with id {testcase_id},'
-                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
+                 ' failed to emit TESTCASE_TRIAGE_DURATION metric.')
     return
 
   if not testcase.job_type:
     logs.warning(f'No job_type associated to testcase {testcase_id},'
-                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
+                 ' failed to emit TESTCASE_TRIAGE_DURATION metric.')
     return
 
   from_fuzzer = not get_testcase_upload_metadata(testcase_id)
 
-  assert step in [
-      'analyze_launched', 'analyze_completed', 'minimize_completed',
-      'regression_completed', 'impact_completed', 'issue_updated'
-  ]
-
   if not testcase.get_age_in_seconds():
     logs.warning(f'No timestamp associated to testcase {testcase_id},'
-                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
+                 ' failed to emit TESTCASE_TRIAGE_DURATION metric.')
     return
 
-  testcase_age_in_hours = testcase.get_age_in_seconds() / 3600
+  testcase_age_in_hours = testcase.get_age_in_seconds() / (60 * 60)
 
-  logs.info('Emiting TESTCASE_UPLOAD_TRIAGE_DURATION metric for testcase '
+  logs.info('Emiting TESTCASE_TRIAGE_DURATION metric for testcase '
             f'{testcase_id} (age = {testcase_age_in_hours} hours.) '
-            'in step {step}.')
+            f'in step {step}, from_fuzzer: {from_fuzzer}.')
 
-  monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add(
+  monitoring_metrics.TESTCASE_TRIAGE_DURATION.add(
       testcase_age_in_hours,
       labels={
           'job': testcase.job_type,
           'step': step,
-          'origin': 'fuzzer' if from_fuzzer else 'manually_uploaded'
+          'from_fuzzer': from_fuzzer
       })
 
 
diff --git a/src/clusterfuzz/_internal/cron/triage.py b/src/clusterfuzz/_internal/cron/triage.py
index 0b96e45d94..f3c4887fe3 100644
--- a/src/clusterfuzz/_internal/cron/triage.py
+++ b/src/clusterfuzz/_internal/cron/triage.py
@@ -256,6 +256,8 @@ def _check_and_update_similar_bug(testcase, issue_tracker):
 
 def _emit_bug_filing_from_testcase_elapsed_time_metric(testcase):
   testcase_age = testcase.get_age_in_seconds()
+  if not testcase_age:
+    return
   monitoring_metrics.BUG_FILING_FROM_TESTCASE_ELAPSED_TIME.add(
       testcase_age,
       labels={
@@ -336,27 +338,30 @@ def _emit_untriaged_testcase_count_metric():
         })
 
 
-def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase):
+PENDING_ANALYZE = 'pending_analyze'
+PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
+PENDING_PROGRESSION = 'pending_progression'
+PENDING_GROUPING = 'pending_grouping'
+PENDING_FILING = 'pending_filing'
+
+
+def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase,
+                                        step: str):
   """Emmits a metric to track age of untriaged testcases."""
-  if not testcase.timestamp:
+  if not testcase.get_age_in_seconds():
     return
 
   logs.info(f'Emiting UNTRIAGED_TESTCASE_AGE for testcase {testcase.key.id()} '
-            f'(age = {testcase.get_age_in_seconds()})')
+            f'(age = {testcase.get_age_in_seconds()}), step = {step}')
   monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add(
       testcase.get_age_in_seconds() / 3600,
       labels={
           'job': testcase.job_type,
           'platform': testcase.platform,
+          'step': step,
       })
 
 
-PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
-PENDING_PROGRESSION = 'pending_progression'
-PENDING_GROUPING = 'pending_grouping'
-PENDING_FILING = 'pending_filing'
-
-
 def main():
   """Files bugs."""
   try:
@@ -409,7 +414,7 @@ def main():
     if testcase.get_metadata('progression_pending'):
       _set_testcase_stuck_state(testcase, True)
       logs.info(f'Skipping testcase {testcase_id}, progression pending')
-      _emit_untriaged_testcase_age_metric(testcase)
+      _emit_untriaged_testcase_age_metric(testcase, PENDING_PROGRESSION)
       _increment_untriaged_testcase_count(testcase.job_type,
                                           PENDING_PROGRESSION)
       logs.info(f'Skipping testcase {testcase_id}, progression pending')
@@ -433,10 +438,12 @@ def main():
     # Require that all tasks like minimizaton, regression testing, etc have
     # finished.
     if not critical_tasks_completed:
-      _emit_untriaged_testcase_age_metric(testcase)
+      status = PENDING_CRITICAL_TASKS
+      if testcase.analyze_pending:
+        status = PENDING_ANALYZE
+      _emit_untriaged_testcase_age_metric(testcase, status)
       _set_testcase_stuck_state(testcase, True)
-      _increment_untriaged_testcase_count(testcase.job_type,
-                                          PENDING_CRITICAL_TASKS)
+      _increment_untriaged_testcase_count(testcase.job_type, status)
       logs.info(
           f'Skipping testcase {testcase_id}, critical tasks still pending.')
       continue
@@ -453,7 +460,7 @@ def main():
     # metadata works well.
     if not testcase.group_id and not dates.time_has_expired(
         testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
-      _emit_untriaged_testcase_age_metric(testcase)
+      _emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
       _set_testcase_stuck_state(testcase, True)
       _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
       logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
@@ -461,7 +468,7 @@ def main():
 
     if not testcase.get_metadata('ran_grouper'):
       # Testcase should be considered by the grouper first before filing.
-      _emit_untriaged_testcase_age_metric(testcase)
+      _emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
       _set_testcase_stuck_state(testcase, True)
       _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
       logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
@@ -491,16 +498,15 @@ def main():
     # Clean up old triage messages that would be not applicable now.
     testcase.delete_metadata(TRIAGE_MESSAGE_KEY, update_testcase=False)
 
-    # A testcase is untriaged, until immediately before a bug is opened
-    _emit_untriaged_testcase_age_metric(testcase)
-    _set_testcase_stuck_state(testcase, False)
-    _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
-
     # File the bug first and then create filed bug metadata.
     if not _file_issue(testcase, issue_tracker, throttler):
+      _emit_untriaged_testcase_age_metric(testcase, PENDING_FILING)
+      _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
       logs.info(f'Issue filing failed for testcase id {testcase_id}')
       continue
 
+    _set_testcase_stuck_state(testcase, False)
+
     _create_filed_bug_metadata(testcase)
     issue_filer.notify_issue_update(testcase, 'new')
 
diff --git a/src/clusterfuzz/_internal/datastore/data_handler.py b/src/clusterfuzz/_internal/datastore/data_handler.py
index ff9d608cfc..cf1aafb5ff 100644
--- a/src/clusterfuzz/_internal/datastore/data_handler.py
+++ b/src/clusterfuzz/_internal/datastore/data_handler.py
@@ -921,7 +921,7 @@ def critical_tasks_completed(testcase):
     return testcase.minimized_keys and testcase.regression
 
   return bool(testcase.minimized_keys and testcase.regression and
-              testcase.is_impact_set_flag)
+              testcase.is_impact_set_flag and not testcase.analyze_pending)
 
 
 # ------------------------------------------------------------------------------
@@ -1380,6 +1380,7 @@ def create_user_uploaded_testcase(key,
 
   testcase.timestamp = utils.utcnow()
   testcase.created = testcase.timestamp
+  testcase.analyze_pending = True
   testcase.uploader_email = uploader_email
   testcase.put()
 
diff --git a/src/clusterfuzz/_internal/datastore/data_types.py b/src/clusterfuzz/_internal/datastore/data_types.py
index bd4c3e2e44..ed0b3c16a3 100644
--- a/src/clusterfuzz/_internal/datastore/data_types.py
+++ b/src/clusterfuzz/_internal/datastore/data_types.py
@@ -583,6 +583,10 @@ class Testcase(Model):
   # Tracks if a testcase is stuck during triage.
   stuck_in_triage = ndb.BooleanProperty(default=False)
 
+  # Tracks if analyze task is pending.
+  # Defaults to false, since most testcases are fuzzer produced.
+  analyze_pending = ndb.BooleanProperty(default=False)
+
   def is_chromium(self):
     return self.project_name in ('chromium', 'chromium-testing')
 
diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
index cac65fd4ec..edd406a6d7 100644
--- a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
+++ b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
@@ -231,17 +231,17 @@
     ],
 )
 
-TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
-    'uploaded_testcase_analysis/triage_duration_secs',
+TESTCASE_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
+    'testcase_analysis/triage_duration_hours',
     description=('Time elapsed between testcase upload and completion'
-                 ' of relevant tasks in the testcase upload lifecycle.'
+                 ' of relevant tasks in the testcase lifecycle.'
                  ' Origin can be either from a fuzzer, or a manual'
                  ' upload. Measured in hours.'),
     bucketer=monitor.GeometricBucketer(),
     field_spec=[
         monitor.StringField('step'),
         monitor.StringField('job'),
-        monitor.StringField('origin'),
+        monitor.BooleanField('from_fuzzer'),
     ],
 )
 
@@ -365,6 +365,7 @@
     field_spec=[
         monitor.StringField('job'),
         monitor.StringField('platform'),
+        monitor.StringField('step'),
     ])
 
 UNTRIAGED_TESTCASE_COUNT = monitor.GaugeMetric(