Skip to content

Commit

Permalink
[Monitoring] Extend TESTCASE_UPLOAD_TRIAGE_DURATION to account for fu…
Browse files Browse the repository at this point in the history
…zzer generated test cases (#4481)

[#4364](#4364) implemented the
tracking for the time it takes clusterfuzz to complete several steps of
the manually uploaded testcase lifecycle.

As per Chrome's request, the metric will now contain an 'origin' label,
which indicates if the testcase was 'manually_uploaded' or generated by
a 'fuzzer'.

The code was also simplified, by reusing the get_age_in_seconds method
from the TestCase entity.

Also, it adds the 'stuck_in_triage' boolean field to the testcase
entity, to facilitate figuring out what testcases are in a stuck state,
so follow up actions can be taken.

Part of #4271
  • Loading branch information
vitorguidi committed Dec 16, 2024
1 parent 800e9b9 commit 3923c6e
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 8 deletions.
24 changes: 19 additions & 5 deletions src/clusterfuzz/_internal/common/testcase_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@


def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
'''Finds out if a testcase is fuzzer generated or manually uploaded,
and emits the TESTCASE_UPLOAD_TRIAGE_DURATION metric.'''
testcase_upload_metadata = get_testcase_upload_metadata(testcase_id)
if not testcase_upload_metadata:
logs.warning(f'No upload metadata found for testcase {testcase_id},'
Expand All @@ -45,9 +47,6 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
'analyze_launched', 'analyze_completed', 'minimize_completed',
'regression_completed', 'impact_completed', 'issue_updated'
]
elapsed_time_since_upload = datetime.datetime.utcnow()
elapsed_time_since_upload -= testcase_upload_metadata.timestamp
elapsed_time_since_upload = elapsed_time_since_upload.total_seconds() / 3600

testcase = data_handler.get_testcase_by_id(testcase_id)

Expand All @@ -61,15 +60,30 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
return

from_fuzzer = not get_testcase_upload_metadata(testcase_id)

assert step in [
'analyze_launched', 'analyze_completed', 'minimize_completed',
'regression_completed', 'impact_completed', 'issue_updated'
]

if not testcase.get_age_in_seconds():
logs.warning(f'No timestamp associated to testcase {testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
return

testcase_age_in_hours = testcase.get_age_in_seconds() / 3600

logs.info('Emiting TESTCASE_UPLOAD_TRIAGE_DURATION metric for testcase '
f'{testcase_id} (age = {elapsed_time_since_upload}) '
f'{testcase_id} (age = {testcase_age_in_hours} hours.) '
'in step {step}.')

monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add(
elapsed_time_since_upload,
testcase_age_in_hours,
labels={
'job': testcase.job_type,
'step': step,
'origin': 'fuzzer' if from_fuzzer else 'manually_uploaded'
})


Expand Down
30 changes: 29 additions & 1 deletion src/clusterfuzz/_internal/cron/triage.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,13 @@ def _file_issue(testcase, issue_tracker, throttler):
return filed


def _set_testcase_stuck_state(testcase: data_types.Testcase, state: bool):
if testcase.stuck_in_triage == state:
return
testcase.stuck_in_triage = state
testcase.put()


untriaged_testcases = {}


Expand All @@ -322,7 +329,8 @@ def _increment_untriaged_testcase_count(job, status):
def _emit_untriaged_testcase_count_metric():
for (job, status) in untriaged_testcases:
monitoring_metrics.UNTRIAGED_TESTCASE_COUNT.set(
untriaged_testcases[(job, status)], labels={
untriaged_testcases[(job, status)],
labels={
'job': job,
'status': status,
})
Expand Down Expand Up @@ -383,14 +391,21 @@ def main():

# Skip if testcase's job is removed.
if testcase.job_type not in all_jobs:
_set_testcase_stuck_state(testcase, False)
logs.info(f'Skipping testcase {testcase_id}, since its job was removed '
f' ({testcase.job_type})')
continue

# Skip if testcase's job is in exclusions list.
if testcase.job_type in excluded_jobs:
_set_testcase_stuck_state(testcase, False)
logs.info(f'Skipping testcase {testcase_id}, since its job is in the'
f' exclusion list ({testcase.job_type})')
continue

# Skip if we are running progression task at this time.
if testcase.get_metadata('progression_pending'):
_set_testcase_stuck_state(testcase, True)
logs.info(f'Skipping testcase {testcase_id}, progression pending')
_emit_untriaged_testcase_age_metric(testcase)
_increment_untriaged_testcase_count(testcase.job_type,
Expand All @@ -399,17 +414,24 @@ def main():

# If the testcase has a bug filed already, no triage is needed.
if _is_bug_filed(testcase):
_set_testcase_stuck_state(testcase, False)
logs.info(
f'Skipping testcase {testcase_id}, since a bug was already filed.')
continue

# Check if the crash is important, i.e. it is either a reproducible crash
# or an unreproducible crash happening frequently.
if not _is_crash_important(testcase):
_set_testcase_stuck_state(testcase, False)
logs.info(
f'Skipping testcase {testcase_id}, since the crash is not important.')
continue

# Require that all tasks like minimizaton, regression testing, etc have
# finished.
if not critical_tasks_completed:
_emit_untriaged_testcase_age_metric(testcase)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type,
PENDING_CRITICAL_TASKS)
logs.info(
Expand All @@ -429,13 +451,15 @@ def main():
if not testcase.group_id and not dates.time_has_expired(
testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
_emit_untriaged_testcase_age_metric(testcase)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
continue

if not testcase.get_metadata('ran_grouper'):
# Testcase should be considered by the grouper first before filing.
_emit_untriaged_testcase_age_metric(testcase)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
continue
Expand All @@ -450,13 +474,17 @@ def main():
# If there are similar issues to this test case already filed or recently
# closed, skip filing a duplicate bug.
if _check_and_update_similar_bug(testcase, issue_tracker):
_set_testcase_stuck_state(testcase, False)
logs.info(f'Skipping testcase {testcase_id}, since a similar bug'
' was already filed.')
continue

# Clean up old triage messages that would be not applicable now.
testcase.delete_metadata(TRIAGE_MESSAGE_KEY, update_testcase=False)

# A testcase is untriaged, until immediately before a bug is opened
_emit_untriaged_testcase_age_metric(testcase)
_set_testcase_stuck_state(testcase, False)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)

# File the bug first and then create filed bug metadata.
Expand Down
5 changes: 5 additions & 0 deletions src/clusterfuzz/_internal/datastore/data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,9 @@ class Testcase(Model):
# corpus.
trusted = ndb.BooleanProperty(default=False)

# Tracks if a testcase is stuck during triage.
stuck_in_triage = ndb.BooleanProperty(default=False)

def is_chromium(self):
return self.project_name in ('chromium', 'chromium-testing')

Expand Down Expand Up @@ -686,6 +689,8 @@ def get_created_time(self) -> ndb.DateTimeProperty:

def get_age_in_seconds(self):
current_time = datetime.datetime.utcnow()
if not self.get_created_time():
return None
testcase_age = current_time - self.get_created_time()
return testcase_age.total_seconds()

Expand Down
6 changes: 4 additions & 2 deletions src/clusterfuzz/_internal/metrics/monitoring_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,12 +223,14 @@
TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
'uploaded_testcase_analysis/triage_duration_secs',
description=('Time elapsed between testcase upload and completion'
' of relevant tasks in the testcase upload lifecycle, '
'in hours.'),
' of relevant tasks in the testcase upload lifecycle.'
' Origin can be either from a fuzzer, or a manual'
' upload. Measured in hours.'),
bucketer=monitor.GeometricBucketer(),
field_spec=[
monitor.StringField('step'),
monitor.StringField('job'),
monitor.StringField('origin'),
],
)

Expand Down

0 comments on commit 3923c6e

Please sign in to comment.