Skip to content

Commit

Permalink
Merge remainder monitoring PRs to oss fuzz (#4563)
Browse files Browse the repository at this point in the history
This merges #4512, #4516, #4547, #4497, #4560 and #4561
  • Loading branch information
vitorguidi authored Dec 27, 2024
1 parent 0472aa7 commit 246122f
Show file tree
Hide file tree
Showing 8 changed files with 1,684 additions and 107 deletions.
1,622 changes: 1,622 additions & 0 deletions infra/terraform/monitoring.tf

Large diffs are not rendered by default.

61 changes: 11 additions & 50 deletions src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,49 +85,16 @@ def __init__(self, subtask: _Subtask):
self._labels = None
self.utask_main_failure = None
self._utask_success_conditions = [
None, # This can be a successful return value in, ie, fuzz task
uworker_msg_pb2.ErrorType.NO_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.ANALYZE_NO_CRASH, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.PROGRESSION_BAD_STATE_MIN_MAX, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_NO_CRASH, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_LOW_CONFIDENCE_IN_REGRESSION_RANGE, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.MINIMIZE_UNREPRODUCIBLE_CRASH, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.MINIMIZE_CRASH_TOO_FLAKY, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_UNREPRODUCIBLE, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.ANALYZE_CLOSE_INVALID_UPLOADED, # pylint: disable=no-member
]
self._utask_maybe_retry_conditions = [
uworker_msg_pb2.ErrorType.ANALYZE_BUILD_SETUP, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.ANALYZE_NO_REVISIONS_LIST, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.TESTCASE_SETUP, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.MINIMIZE_SETUP, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.FUZZ_DATA_BUNDLE_SETUP_FAILURE, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.FUZZ_NO_FUZZ_TARGET_SELECTED, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.PROGRESSION_NO_CRASH, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.PROGRESSION_TIMEOUT, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.PROGRESSION_BUILD_SETUP_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_BUILD_SETUP_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_TIMEOUT_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.SYMBOLIZE_BUILD_SETUP_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.MINIMIZE_DEADLINE_EXCEEDED, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.MINIMIZE_DEADLINE_EXCEEDED_IN_MAIN_FILE_PHASE, # pylint: disable=no-member
]
self._utask_failure_conditions = [
uworker_msg_pb2.ErrorType.ANALYZE_NO_REVISION_INDEX, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.UNHANDLED, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.VARIANT_BUILD_SETUP, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.FUZZ_BUILD_SETUP_FAILURE, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.FUZZ_NO_FUZZER, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.PROGRESSION_REVISION_LIST_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.PROGRESSION_BUILD_NOT_FOUND, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.PROGRESSION_BAD_BUILD, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_REVISION_LIST_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_BUILD_NOT_FOUND, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_BAD_BUILD_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_FAILED, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.CORPUS_PRUNING_FUZZER_SETUP_FAILED, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.CORPUS_PRUNING_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.FUZZ_BAD_BUILD, # pylint: disable=no-member
]

if subtask == _Subtask.PREPROCESS:
self._preprocess_start_time_ns = self.start_time_ns
Expand Down Expand Up @@ -169,17 +136,11 @@ def set_task_details(self,
# Ensure we always have a value after this method returns.
assert self._preprocess_start_time_ns is not None

def _infer_uworker_main_outcome(self, exc_type, uworker_error):
'''Infers, on a best effort basis, whether an uworker output implies
success or failure. If an unequivocal response is not possible,
classifies as maybe_retry.'''
if exc_type or uworker_error in self._utask_failure_conditions:
outcome = 'error'
elif uworker_error in self._utask_maybe_retry_conditions:
outcome = 'maybe_retry'
else:
outcome = 'success'
return outcome
def _infer_uworker_main_outcome(self, exc_type, uworker_error) -> bool:
"""Returns True if task succeeded, False otherwise."""
if exc_type or uworker_error not in self._utask_success_conditions:
return False
return True

def __exit__(self, _exc_type, _exc_value, _traceback):
# Ignore exception details, let Python continue unwinding the stack.
Expand All @@ -201,12 +162,12 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
# The only case where a task might fail without throwing, is in
# utask_main, by returning an ErrorType proto which indicates
# failure.
outcome = self._infer_uworker_main_outcome(_exc_type,
self.utask_main_failure)
task_succeeded = self._infer_uworker_main_outcome(_exc_type,
self.utask_main_failure)
monitoring_metrics.TASK_OUTCOME_COUNT.increment({
**self._labels, 'outcome': outcome
**self._labels, 'task_succeeded': task_succeeded
})
if outcome == "success":
if task_succeeded:
error_condition = 'N/A'
elif _exc_type:
error_condition = 'UNHANDLED_EXCEPTION'
Expand All @@ -218,7 +179,7 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
# labels limit recommended by gcp.
trimmed_labels = self._labels
del trimmed_labels['job']
trimmed_labels['outcome'] = outcome
trimmed_labels['task_succeeded'] = task_succeeded
trimmed_labels['error_condition'] = error_condition
monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment(
trimmed_labels)
Expand Down
2 changes: 2 additions & 0 deletions src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,8 @@ def _update_testcase(output):
if analyze_task_output.platform_id:
testcase.platform_id = analyze_task_output.platform_id

testcase.analyze_pending = False

testcase.put()


Expand Down
40 changes: 10 additions & 30 deletions src/clusterfuzz/_internal/common/testcase_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,59 +31,39 @@


def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
'''Finds out if a testcase is fuzzer generated or manually uploaded,
and emits the TESTCASE_UPLOAD_TRIAGE_DURATION metric.'''
testcase_upload_metadata = get_testcase_upload_metadata(testcase_id)
if not testcase_upload_metadata:
logs.warning(f'No upload metadata found for testcase {testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
return
if not testcase_upload_metadata.timestamp:
logs.warning(
f'No timestamp for testcase {testcase_upload_metadata.testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
return
assert step in [
'analyze_launched', 'analyze_completed', 'minimize_completed',
'regression_completed', 'impact_completed', 'issue_updated'
]

"""Finds out if a testcase is fuzzer generated or manually uploaded,
and emits the TESTCASE_TRIAGE_DURATION metric."""
testcase = data_handler.get_testcase_by_id(testcase_id)

if not testcase:
logs.warning(f'No testcase found with id {testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
' failed to emit TESTCASE_TRIAGE_DURATION metric.')
return

if not testcase.job_type:
logs.warning(f'No job_type associated to testcase {testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
' failed to emit TESTCASE_TRIAGE_DURATION metric.')
return

from_fuzzer = not get_testcase_upload_metadata(testcase_id)

assert step in [
'analyze_launched', 'analyze_completed', 'minimize_completed',
'regression_completed', 'impact_completed', 'issue_updated'
]

if not testcase.get_age_in_seconds():
logs.warning(f'No timestamp associated to testcase {testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
' failed to emit TESTCASE_TRIAGE_DURATION metric.')
return

testcase_age_in_hours = testcase.get_age_in_seconds() / 3600
testcase_age_in_hours = testcase.get_age_in_seconds() / (60 * 60)

logs.info('Emiting TESTCASE_UPLOAD_TRIAGE_DURATION metric for testcase '
logs.info('Emiting TESTCASE_TRIAGE_DURATION metric for testcase '
f'{testcase_id} (age = {testcase_age_in_hours} hours.) '
'in step {step}.')
f'in step {step}, from_fuzzer: {from_fuzzer}.')

monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add(
monitoring_metrics.TESTCASE_TRIAGE_DURATION.add(
testcase_age_in_hours,
labels={
'job': testcase.job_type,
'step': step,
'origin': 'fuzzer' if from_fuzzer else 'manually_uploaded'
'from_fuzzer': from_fuzzer
})


Expand Down
46 changes: 26 additions & 20 deletions src/clusterfuzz/_internal/cron/triage.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,8 @@ def _check_and_update_similar_bug(testcase, issue_tracker):

def _emit_bug_filing_from_testcase_elapsed_time_metric(testcase):
testcase_age = testcase.get_age_in_seconds()
if not testcase_age:
return
monitoring_metrics.BUG_FILING_FROM_TESTCASE_ELAPSED_TIME.add(
testcase_age,
labels={
Expand Down Expand Up @@ -336,27 +338,30 @@ def _emit_untriaged_testcase_count_metric():
})


def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase):
PENDING_ANALYZE = 'pending_analyze'
PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
PENDING_PROGRESSION = 'pending_progression'
PENDING_GROUPING = 'pending_grouping'
PENDING_FILING = 'pending_filing'


def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase,
step: str):
"""Emmits a metric to track age of untriaged testcases."""
if not testcase.timestamp:
if not testcase.get_age_in_seconds():
return

logs.info(f'Emiting UNTRIAGED_TESTCASE_AGE for testcase {testcase.key.id()} '
f'(age = {testcase.get_age_in_seconds()})')
f'(age = {testcase.get_age_in_seconds()}), step = {step}')
monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add(
testcase.get_age_in_seconds() / 3600,
labels={
'job': testcase.job_type,
'platform': testcase.platform,
'step': step,
})


PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
PENDING_PROGRESSION = 'pending_progression'
PENDING_GROUPING = 'pending_grouping'
PENDING_FILING = 'pending_filing'


def main():
"""Files bugs."""
try:
Expand Down Expand Up @@ -409,7 +414,7 @@ def main():
if testcase.get_metadata('progression_pending'):
_set_testcase_stuck_state(testcase, True)
logs.info(f'Skipping testcase {testcase_id}, progression pending')
_emit_untriaged_testcase_age_metric(testcase)
_emit_untriaged_testcase_age_metric(testcase, PENDING_PROGRESSION)
_increment_untriaged_testcase_count(testcase.job_type,
PENDING_PROGRESSION)
continue
Expand All @@ -432,10 +437,12 @@ def main():
# Require that all tasks like minimizaton, regression testing, etc have
# finished.
if not critical_tasks_completed:
_emit_untriaged_testcase_age_metric(testcase)
status = PENDING_CRITICAL_TASKS
if testcase.analyze_pending:
status = PENDING_ANALYZE
_emit_untriaged_testcase_age_metric(testcase, status)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type,
PENDING_CRITICAL_TASKS)
_increment_untriaged_testcase_count(testcase.job_type, status)
logs.info(
f'Skipping testcase {testcase_id}, critical tasks still pending.')
continue
Expand All @@ -452,15 +459,15 @@ def main():
# metadata works well.
if not testcase.group_id and not dates.time_has_expired(
testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
_emit_untriaged_testcase_age_metric(testcase)
_emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
continue

if not testcase.get_metadata('ran_grouper'):
# Testcase should be considered by the grouper first before filing.
_emit_untriaged_testcase_age_metric(testcase)
_emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
Expand Down Expand Up @@ -490,16 +497,15 @@ def main():
# Clean up old triage messages that would be not applicable now.
testcase.delete_metadata(TRIAGE_MESSAGE_KEY, update_testcase=False)

# A testcase is untriaged, until immediately before a bug is opened
_emit_untriaged_testcase_age_metric(testcase)
_set_testcase_stuck_state(testcase, False)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)

# File the bug first and then create filed bug metadata.
if not _file_issue(testcase, issue_tracker, throttler):
_emit_untriaged_testcase_age_metric(testcase, PENDING_FILING)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
logs.info(f'Issue filing failed for testcase id {testcase_id}')
continue

_set_testcase_stuck_state(testcase, False)

_create_filed_bug_metadata(testcase)
issue_filer.notify_issue_update(testcase, 'new')

Expand Down
3 changes: 2 additions & 1 deletion src/clusterfuzz/_internal/datastore/data_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -921,7 +921,7 @@ def critical_tasks_completed(testcase):
return testcase.minimized_keys and testcase.regression

return bool(testcase.minimized_keys and testcase.regression and
testcase.is_impact_set_flag)
testcase.is_impact_set_flag and not testcase.analyze_pending)


# ------------------------------------------------------------------------------
Expand Down Expand Up @@ -1379,6 +1379,7 @@ def create_user_uploaded_testcase(key,

testcase.timestamp = utils.utcnow()
testcase.created = testcase.timestamp
testcase.analyze_pending = True
testcase.uploader_email = uploader_email
testcase.put()

Expand Down
4 changes: 4 additions & 0 deletions src/clusterfuzz/_internal/datastore/data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,10 @@ class Testcase(Model):
# Tracks if a testcase is stuck during triage.
stuck_in_triage = ndb.BooleanProperty(default=False)

# Tracks if analyze task is pending.
# Defaults to false, since most testcases are fuzzer produced.
analyze_pending = ndb.BooleanProperty(default=False)

def is_chromium(self):
return self.project_name in ('chromium', 'chromium-testing')

Expand Down
13 changes: 7 additions & 6 deletions src/clusterfuzz/_internal/metrics/monitoring_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,17 +231,17 @@
],
)

TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
'uploaded_testcase_analysis/triage_duration_secs',
TESTCASE_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
'testcase_analysis/triage_duration_hours',
description=('Time elapsed between testcase upload and completion'
' of relevant tasks in the testcase upload lifecycle.'
' of relevant tasks in the testcase lifecycle.'
' Origin can be either from a fuzzer, or a manual'
' upload. Measured in hours.'),
bucketer=monitor.GeometricBucketer(),
field_spec=[
monitor.StringField('step'),
monitor.StringField('job'),
monitor.StringField('origin'),
monitor.BooleanField('from_fuzzer'),
],
)

Expand All @@ -263,7 +263,7 @@
monitor.StringField('subtask'),
monitor.StringField('mode'),
monitor.StringField('platform'),
monitor.StringField('outcome'),
monitor.BooleanField('task_succeeded'),
])

TASK_OUTCOME_COUNT_BY_ERROR_TYPE = monitor.CounterMetric(
Expand All @@ -274,7 +274,7 @@
monitor.StringField('subtask'),
monitor.StringField('mode'),
monitor.StringField('platform'),
monitor.StringField('outcome'),
monitor.BooleanField('task_succeeded'),
monitor.StringField('error_condition'),
])

Expand Down Expand Up @@ -365,6 +365,7 @@
field_spec=[
monitor.StringField('job'),
monitor.StringField('platform'),
monitor.StringField('step'),
])

UNTRIAGED_TESTCASE_COUNT = monitor.GaugeMetric(
Expand Down

0 comments on commit 246122f

Please sign in to comment.