diff --git a/configs/test/batch/batch.yaml b/configs/test/batch/batch.yaml index c28b0715ed..12788e149d 100644 --- a/configs/test/batch/batch.yaml +++ b/configs/test/batch/batch.yaml @@ -20,10 +20,13 @@ mapping: disk_size_gb: 110 disk_type: pd-standard service_account_email: test-clusterfuzz-service-account-email - gce_region: 'gce-region' - gce_zone: 'gce-zone' - network: 'projects/google.com:clusterfuzz/global/networks/networkname' - subnetwork: 'projects/google.com:clusterfuzz/regions/gce-region/subnetworks/subnetworkname' + subconfigs: + - + name: central1-network1 + weight: .5 + - + name: central1-network2 + weight: .5 preemptible: false machine_type: n1-standard-1 LINUX-NONPREEMPTIBLE-UNPRIVILEGED: @@ -33,13 +36,19 @@ mapping: disk_size_gb: 110 disk_type: pd-standard service_account_email: test-unpriv-clusterfuzz-service-account-email - gce_region: 'gce-region' - gce_zone: 'gce-zone' - network: 'projects/google.com:clusterfuzz/global/networks/networkname' - subnetwork: 'projects/google.com:clusterfuzz/regions/gce-region/subnetworks/subnetworkname' preemptible: false machine_type: n1-standard-1 retry: true + subconfigs: + - + name: central1-network1 + weight: .2 + - + name: central1-network2 + weight: .3 + - + name: east4-network2 + weight: .5 LINUX-PREEMPTIBLE: clusterfuzz_release: 'prod' docker_image: 'gcr.io/clusterfuzz-images/base:a2f4dd6-202202070654' @@ -47,12 +56,12 @@ mapping: disk_size_gb: 75 disk_type: pd-standard service_account_email: test-clusterfuzz-service-account-email - gce_region: 'gce-region' - gce_zone: 'gce-zone' - network: 'projects/google.com:clusterfuzz/global/networks/networkname' - subnetwork: 'projects/google.com:clusterfuzz/regions/gce-region/subnetworks/subnetworkname' preemptible: true machine_type: n1-standard-1 + subconfigs: + - + name: east4-network2 + weight: 1 LINUX-PREEMPTIBLE-UNPRIVILEGED: clusterfuzz_release: 'prod' docker_image: 'gcr.io/clusterfuzz-images/base:a2f4dd6-202202070654' @@ -60,10 +69,23 @@ mapping: disk_size_gb: 75 disk_type: pd-standard service_account_email: test-unpriv-clusterfuzz-service-account-email - gce_region: 'gce-region' - gce_zone: 'gce-zone' - network: 'projects/google.com:clusterfuzz/global/networks/networkname' - subnetwork: 'projects/google.com:clusterfuzz/regions/gce-region/subnetworks/subnetworkname' preemptible: true machine_type: n1-standard-1 + subconfigs: + - + name: east4-network2 + weight: 1 project: 'test-clusterfuzz' +subconfigs: + central1-network1: + region: 'us-central1' + network: 'projects/project_name/global/networks/networkname' + subnetwork: 'projects/project_name/regions/us-central1/subnetworks/subnetworkname' + central1-network2: + region: 'us-central1' + network: 'projects/project_name/global/networks/networkname2' + subnetwork: 'projects/project_name/regions/us-central1/subnetworks/subnetworkname2' + east4-network2: + region: 'us-east4' + network: 'projects/project_name/global/networks/networkname2' + subnetwork: 'projects/project_name/regions/us-east4/subnetworks/subnetworkname2' diff --git a/infra/terraform/monitoring.tf b/infra/terraform/monitoring.tf new file mode 100644 index 0000000000..d38064e76e --- /dev/null +++ b/infra/terraform/monitoring.tf @@ -0,0 +1,1629 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +provider "google" { + alias = "monitoring" + project = var.secondary_project_id + region = var.region +} + +resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { + provider = google.monitoring + dashboard_json = < Optional[PubSubTask]: +class PubSubTTask(PubSubTask): + """TTask that won't repeat on timeout.""" + TTASK_TIMEOUT = 30 * 60 + + @contextlib.contextmanager + def lease(self, _event=None): # pylint: disable=arguments-differ + """Maintain a lease for the task.""" + task_lease_timeout = TASK_LEASE_SECONDS_BY_COMMAND.get( + self.command, get_task_lease_timeout()) + + environment.set_value('TASK_LEASE_SECONDS', task_lease_timeout) + track_task_start(self, task_lease_timeout) + if _event is None: + _event = threading.Event() + if self.command != 'fuzz': + leaser_thread = _PubSubLeaserThread(self._pubsub_message, _event, + task_lease_timeout) + else: + leaser_thread = _PubSubLeaserThread( + self._pubsub_message, _event, self.TTASK_TIMEOUT, ack_on_timeout=True) + leaser_thread.start() + try: + yield leaser_thread + finally: + _event.set() + leaser_thread.join() + + # If we get here the task succeeded in running. Acknowledge the message. + self._pubsub_message.ack() + track_task_end() + + +def get_task_from_message(message, can_defer=True, + task_cls=None) -> Optional[PubSubTask]: """Returns a task constructed from the first of |messages| if possible.""" if message is None: return None try: - task = initialize_task(message) + task = initialize_task(message, task_cls=task_cls) except KeyError: logs.error('Received an invalid task, discarding...') message.ack() @@ -516,7 +562,7 @@ def get_task_from_message(message) -> Optional[PubSubTask]: # Check that this task should be run now (past the ETA). Otherwise we defer # its execution. - if task.defer(): + if can_defer and task.defer(): return None return task @@ -525,7 +571,7 @@ def get_task_from_message(message) -> Optional[PubSubTask]: def get_utask_mains() -> List[PubSubTask]: """Returns a list of tasks for preprocessing many utasks on this bot and then running the uworker_mains in the same batch job.""" - pubsub_puller = PubSubPuller(UTASK_MAINS_QUEUE) + pubsub_puller = PubSubPuller(UTASK_MAIN_QUEUE) messages = pubsub_puller.get_messages_time_limited(MAX_UTASKS, UTASK_QUEUE_PULL_SECONDS) return handle_multiple_utask_main_messages(messages) @@ -536,7 +582,7 @@ def handle_multiple_utask_main_messages(messages) -> List[PubSubTask]: bot.""" tasks = [] for message in messages: - task = get_task_from_message(message) + task = get_task_from_message(message, can_defer=False) if task is None: continue tasks.append(task) @@ -547,11 +593,13 @@ def handle_multiple_utask_main_messages(messages) -> List[PubSubTask]: return tasks -def initialize_task(message) -> PubSubTask: +def initialize_task(message, task_cls=None) -> PubSubTask: """Creates a task from |messages|.""" + if task_cls is None: + task_cls = PubSubTask if message.attributes.get('eventType') != 'OBJECT_FINALIZE': - return PubSubTask(message) + return task_cls(message) # Handle postprocess task. 
# The GCS API for pub/sub notifications uses the data field unlike @@ -585,13 +633,18 @@ class _PubSubLeaserThread(threading.Thread): EXTENSION_TIME_SECONDS = 10 * 60 # 10 minutes. - def __init__(self, message, done_event, max_lease_seconds): + def __init__(self, + message, + done_event, + max_lease_seconds, + ack_on_timeout=False): super().__init__() self.daemon = True self._message = message self._done_event = done_event self._max_lease_seconds = max_lease_seconds + self._ack_on_timeout = ack_on_timeout def run(self): """Run the leaser thread.""" @@ -603,6 +656,9 @@ def run(self): if time_left <= 0: logs.info('Lease reached maximum lease time of {} seconds, ' 'stopping renewal.'.format(self._max_lease_seconds)) + if self._ack_on_timeout: + logs.info('Acking on timeout') + self._message.ack() break extension_seconds = min(self.EXTENSION_TIME_SECONDS, time_left) @@ -631,18 +687,30 @@ def add_utask_main(command, input_url, job_type, wait_time=None): command, input_url, job_type, - queue=UTASK_MAINS_QUEUE, + queue=UTASK_MAIN_QUEUE, + utask_main=True, wait_time=wait_time, extra_info={'initial_command': initial_command}) -def bulk_add_tasks(tasks, queue=None, eta_now=False): +def bulk_add_tasks(tasks, queue=None, eta_now=False, utask_main=False): """Adds |tasks| in bulk to |queue|.""" # Old testcases may pass in queue=None explicitly, so we must check this here. if queue is None: queue = default_queue() + # We can preprocess on the preprocess bots regardless of queue. + if (utils.is_oss_fuzz() and queue != UTASK_MAIN_QUEUE and + tasks[0].command in UTASKS and not utask_main): + # TODO(metzman): `queue != UTASK_MAIN_QUEUE` and `not utask_main` are + # probably redundant. Get rid of the former. + # TODO(metzman): Do this everywhere, not just oss-fuzz. + logs.info(f'Using {PREPROCESS_QUEUE}.') + queue = PREPROCESS_QUEUE + for task in tasks: + assert task.command in UTASKS + # If callers want delays, they must do it themselves, because this function is # meant to be used for batch tasks which don't need this. 
# Use an ETA of right now for batch because we don't need extra delay, there @@ -665,7 +733,8 @@ def add_task(command, job_type, queue=None, wait_time=None, - extra_info=None): + extra_info=None, + utask_main=False): """Add a new task to the job queue.""" if wait_time is None: wait_time = random.randint(1, TASK_CREATION_WAIT_INTERVAL) @@ -683,7 +752,7 @@ def add_task(command, eta = utils.utcnow() + datetime.timedelta(seconds=wait_time) task = Task(command, argument, job_type, eta=eta, extra_info=extra_info) - bulk_add_tasks([task], queue=queue) + bulk_add_tasks([task], queue=queue, utask_main=utask_main) def get_task_lease_timeout(): diff --git a/src/clusterfuzz/_internal/bot/tasks/task_types.py b/src/clusterfuzz/_internal/bot/tasks/task_types.py index 39e83a536b..f001921fdc 100644 --- a/src/clusterfuzz/_internal/bot/tasks/task_types.py +++ b/src/clusterfuzz/_internal/bot/tasks/task_types.py @@ -214,3 +214,6 @@ def execute(self, task_argument, job_type, uworker_env): 'uworker_main': UworkerMainTask, 'variant': UTask, } + +for task in tasks.UTASKS: + assert COMMAND_TYPES[task] == UTask diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py index 005e9f50e8..d37194362f 100644 --- a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py +++ b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py @@ -25,8 +25,10 @@ from clusterfuzz._internal.base.tasks import task_utils from clusterfuzz._internal.bot.tasks.utasks import uworker_io from clusterfuzz._internal.bot.webserver import http_server +from clusterfuzz._internal.google_cloud_utils import storage from clusterfuzz._internal.metrics import logs from clusterfuzz._internal.metrics import monitoring_metrics +from clusterfuzz._internal.protos import uworker_msg_pb2 from clusterfuzz._internal.system import environment # Define an alias to appease pylint. @@ -74,12 +76,26 @@ class _MetricRecorder(contextlib.AbstractContextManager): Members: start_time_ns (int): The time at which this recorder was constructed, in nanoseconds since the Unix epoch. + utask_main_failure: this class stores the uworker_output.ErrorType + object returned by utask_main, and uses it to emmit a metric. """ def __init__(self, subtask: _Subtask): self.start_time_ns = time.time_ns() self._subtask = subtask self._labels = None + self.utask_main_failure = None + self._utask_success_conditions = [ + None, # This can be a successful return value in, ie, fuzz task + uworker_msg_pb2.ErrorType.NO_ERROR, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.ANALYZE_NO_CRASH, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.PROGRESSION_BAD_STATE_MIN_MAX, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.REGRESSION_NO_CRASH, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.REGRESSION_LOW_CONFIDENCE_IN_REGRESSION_RANGE, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.MINIMIZE_CRASH_TOO_FLAKY, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_UNREPRODUCIBLE, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.ANALYZE_CLOSE_INVALID_UPLOADED, # pylint: disable=no-member + ] if subtask == _Subtask.PREPROCESS: self._preprocess_start_time_ns = self.start_time_ns @@ -121,6 +137,12 @@ def set_task_details(self, # Ensure we always have a value after this method returns. 
assert self._preprocess_start_time_ns is not None + def _infer_uworker_main_outcome(self, exc_type, uworker_error) -> bool: + """Returns True if task succeeded, False otherwise.""" + if exc_type or uworker_error not in self._utask_success_conditions: + return False + return True + def __exit__(self, _exc_type, _exc_value, _traceback): # Ignore exception details, let Python continue unwinding the stack. @@ -138,6 +160,32 @@ def __exit__(self, _exc_type, _exc_value, _traceback): monitoring_metrics.UTASK_SUBTASK_E2E_DURATION_SECS.add( e2e_duration_secs, self._labels) + # The only case where a task might fail without throwing, is in + # utask_main, by returning an ErrorType proto which indicates + # failure. + task_succeeded = self._infer_uworker_main_outcome(_exc_type, + self.utask_main_failure) + monitoring_metrics.TASK_OUTCOME_COUNT.increment({ + **self._labels, 'task_succeeded': task_succeeded + }) + if task_succeeded: + error_condition = 'N/A' + elif _exc_type: + error_condition = 'UNHANDLED_EXCEPTION' + else: + error_condition = uworker_msg_pb2.ErrorType.Name( # pylint: disable=no-member + self.utask_main_failure) + # Get rid of job as a label, so we can have another metric to make + # error conditions more explicit, respecting the 30k distinct + # labels limit recommended by gcp. + trimmed_labels = { + **self._labels, 'task_succeeded': task_succeeded, + 'error_condition': error_condition + } + del trimmed_labels['job'] + monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment( + trimmed_labels) + def ensure_uworker_env_type_safety(uworker_env): """Converts all values in |uworker_env| to str types. @@ -226,6 +274,8 @@ def uworker_main_no_io(utask_module, serialized_uworker_input): return None # NOTE: Keep this in sync with `uworker_main()`. + if uworker_output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR: # pylint: disable=no-member + recorder.utask_main_failure = uworker_output.error_type uworker_output.bot_name = environment.get_value('BOT_NAME', '') uworker_output.platform_id = environment.get_platform_id() @@ -285,8 +335,13 @@ def uworker_main(input_download_url) -> None: """Executes the main part of a utask on the uworker (locally if not using remote executor).""" with _MetricRecorder(_Subtask.UWORKER_MAIN) as recorder: - uworker_input = uworker_io.download_and_deserialize_uworker_input( - input_download_url) + try: + uworker_input = uworker_io.download_and_deserialize_uworker_input( + input_download_url) + except storage.ExpiredSignedUrlError as e: + raise storage.ExpiredSignedUrlError( + 'Expired token, failed to download uworker_input: ' + f'{e.url}. {e.response_text}', e.url, e.response_text) uworker_output_upload_url = uworker_input.uworker_output_upload_url uworker_input.ClearField('uworker_output_upload_url') @@ -306,6 +361,9 @@ def uworker_main(input_download_url) -> None: logs.info('Starting utask_main: %s.' % utask_module) uworker_output = utask_module.utask_main(uworker_input) + if uworker_output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR: # pylint: disable=no-member + recorder.utask_main_failure = uworker_output.error_type + # NOTE: Keep this in sync with `uworker_main_no_io()`. 
uworker_output.bot_name = environment.get_value('BOT_NAME', '') uworker_output.platform_id = environment.get_platform_id() diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py b/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py index 785974f735..1f0b7e8f58 100644 --- a/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py +++ b/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py @@ -552,6 +552,8 @@ def _update_testcase(output): if analyze_task_output.platform_id: testcase.platform_id = analyze_task_output.platform_id + testcase.analyze_pending = False + testcase.put() diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/corpus_pruning_task.py b/src/clusterfuzz/_internal/bot/tasks/utasks/corpus_pruning_task.py index 4ed00611df..f4ae536ef8 100644 --- a/src/clusterfuzz/_internal/bot/tasks/utasks/corpus_pruning_task.py +++ b/src/clusterfuzz/_internal/bot/tasks/utasks/corpus_pruning_task.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Corpus pruning task.""" - import collections import datetime import json @@ -595,16 +594,12 @@ def _record_cross_pollination_stats(output): client.insert([big_query.Insert(row=bigquery_row, insert_id=None)]) -def do_corpus_pruning(uworker_input, context, revision) -> CorpusPruningResult: +def do_corpus_pruning(context, revision) -> CorpusPruningResult: """Run corpus pruning.""" # Set |FUZZ_TARGET| environment variable to help with unarchiving only fuzz # target and its related files. environment.set_value('FUZZ_TARGET', context.fuzz_target.binary) - if environment.is_trusted_host(): - from clusterfuzz._internal.bot.untrusted_runner import tasks_host - return tasks_host.do_corpus_pruning(uworker_input, context, revision) - if not build_manager.setup_build( revision=revision, fuzz_target=context.fuzz_target.binary): raise CorpusPruningError('Failed to setup build.') @@ -729,23 +724,7 @@ def do_corpus_pruning(uworker_input, context, revision) -> CorpusPruningResult: cross_pollination_stats=cross_pollination_stats) -def _update_crash_unit_path(context, crash): - """If running on a trusted host, updates the crash unit_path after copying - the file locally.""" - if not environment.is_trusted_host(): - return - from clusterfuzz._internal.bot.untrusted_runner import file_host - unit_path = os.path.join(context.bad_units_path, - os.path.basename(crash.unit_path)) - # Prevent the worker from escaping out of |context.bad_units_path|. - if not file_host.is_directory_parent(unit_path, context.bad_units_path): - raise CorpusPruningError('Invalid units path from worker.') - - file_host.copy_file_from_worker(crash.unit_path, unit_path) - crash.unit_path = unit_path - - -def _upload_corpus_crashes_zip(context: Context, result: CorpusPruningResult, +def _upload_corpus_crashes_zip(result: CorpusPruningResult, corpus_crashes_blob_name, corpus_crashes_upload_url): """Packs the corpus crashes in a zip file. 
The file is then uploaded @@ -754,7 +733,6 @@ def _upload_corpus_crashes_zip(context: Context, result: CorpusPruningResult, zip_filename = os.path.join(temp_dir, corpus_crashes_blob_name) with zipfile.ZipFile(zip_filename, 'w') as zip_file: for crash in result.crashes: - _update_crash_unit_path(context, crash) unit_name = os.path.basename(crash.unit_path) zip_file.write(crash.unit_path, unit_name, zipfile.ZIP_DEFLATED) @@ -766,9 +744,10 @@ def _upload_corpus_crashes_zip(context: Context, result: CorpusPruningResult, def _process_corpus_crashes(output: uworker_msg_pb2.Output): # pylint: disable=no-member """Process crashes found in the corpus.""" - if not output.corpus_pruning_task_output.crashes: - return - + # TODO(metzman): Fix this function after the holiday break. + # if not output.corpus_pruning_task_output.crashes: + return + # pylint: disable=unreachable corpus_pruning_output = output.corpus_pruning_task_output crash_revision = corpus_pruning_output.crash_revision fuzz_target = data_handler.get_fuzz_target(output.uworker_input.fuzzer_name) @@ -1007,13 +986,14 @@ def utask_main(uworker_input): uworker_output = None try: - result = do_corpus_pruning(uworker_input, context, revision) + result = do_corpus_pruning(context, revision) issue_metadata = engine_common.get_fuzz_target_issue_metadata(fuzz_target) issue_metadata = issue_metadata or {} - _upload_corpus_crashes_zip( - context, result, - uworker_input.corpus_pruning_task_input.corpus_crashes_blob_name, - uworker_input.corpus_pruning_task_input.corpus_crashes_upload_url) + # TODO(metzman): Fix this issue. + # _upload_corpus_crashes_zip( + # result, + # uworker_input.corpus_pruning_task_input.corpus_crashes_blob_name, + # uworker_input.corpus_pruning_task_input.corpus_crashes_upload_url) uworker_output = uworker_msg_pb2.Output( # pylint: disable=no-member corpus_pruning_task_output=uworker_msg_pb2.CorpusPruningTaskOutput( # pylint: disable=no-member coverage_info=_extract_coverage_information(context, result), diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/fuzz_task.py b/src/clusterfuzz/_internal/bot/tasks/utasks/fuzz_task.py index 4b0862405d..0f84d77b09 100644 --- a/src/clusterfuzz/_internal/bot/tasks/utasks/fuzz_task.py +++ b/src/clusterfuzz/_internal/bot/tasks/utasks/fuzz_task.py @@ -147,7 +147,7 @@ def from_testcase_manager_crash(cls, crash): get_unsymbolized_crash_stacktrace(crash.stack_file_path)) except Exception: logs.error( - 'Unable to read stacktrace from file %s.' % crash.stack_file_path) + f'Unable to read stacktrace from file {crash.stack_file_path}.') return None # If there are per-testcase additional flags, we need to store them. 
@@ -230,8 +230,7 @@ def __init__(self, self.crash_categories = state.crash_categories self.security_flag = crash_analyzer.is_security_issue( self.unsymbolized_crash_stacktrace, self.crash_type, self.crash_address) - self.key = '%s,%s,%s' % (self.crash_type, self.crash_state, - self.security_flag) # pylint: disable=attribute-defined-outside-init + self.key = f'{self.crash_type},{self.crash_state},{self.security_flag}' self.should_be_ignored = crash_analyzer.ignore_stacktrace( state.crash_stacktrace) @@ -278,9 +277,9 @@ def get_error(self): return f'Functional crash is ignored: {self.crash_state}' if self.should_be_ignored: - return ('False crash: %s\n\n---%s\n\n---%s' % - (self.crash_state, self.unsymbolized_crash_stacktrace, - self.crash_stacktrace)) + return (f'False crash: {self.crash_state}\n\n' + f'---{self.unsymbolized_crash_stacktrace}\n\n' + f'---{self.crash_stacktrace}') if self.is_uploaded() and not self.fuzzed_key: return f'Unable to store testcase in blobstore: {self.crash_state}' @@ -503,7 +502,7 @@ def _last_sync_time(sync_file_path): last_sync_time = datetime.datetime.utcfromtimestamp(float(file_contents)) except Exception as e: logs.error( - 'Malformed last sync file: "%s".' % str(e), + f'Malformed last sync file: "{e}".', path=sync_file_path, contents=file_contents) @@ -551,7 +550,7 @@ def sync_from_gcs(self): """Update sync state after a sync from GCS.""" already_synced = False sync_file_path = os.path.join( - self._data_directory, '.%s_sync' % self._project_qualified_target_name) + self._data_directory, f'.{self._project_qualified_target_name}_sync') # Get last time we synced corpus. if environment.is_trusted_host(): @@ -658,7 +657,7 @@ def get_testcases(testcase_count, testcase_directory, data_directory): # Create output strings. generated_testcase_string = ( - 'Generated %d/%d testcases.' % (generated_testcase_count, testcase_count)) + f'Generated {generated_testcase_count}/{testcase_count} testcases.') # Log the number of testcases generated. logs.info(generated_testcase_string) @@ -754,7 +753,7 @@ def store_fuzzer_run_results(testcase_file_paths, fuzzer, fuzzer_command, # Store fuzzer console output. bot_name = environment.get_value('BOT_NAME') if fuzzer_return_code is not None: - fuzzer_return_code_string = 'Return code (%d).' % fuzzer_return_code + fuzzer_return_code_string = f'Return code ({fuzzer_return_code}).' else: fuzzer_return_code_string = 'Fuzzer timed out.' truncated_fuzzer_output = truncate_fuzzer_output(fuzzer_output, @@ -1151,8 +1150,8 @@ def key_fn(crash): # Archiving testcase to blobstore might fail for all crashes within this # group. if not group.main_crash: - logs.info('Unable to store testcase in blobstore: %s' % - group.crashes[0].crash_state) + logs.info('Unable to store testcase in blobstore: ' + f'{group.crashes[0].crash_state}') continue group_proto = uworker_msg_pb2.FuzzTaskCrashGroup( @@ -1379,6 +1378,7 @@ def generate_blackbox_testcases( # Make sure we have a file to execute for the fuzzer. if not fuzzer.executable_path: logs.error(f'Fuzzer {fuzzer_name} does not have an executable path.') + return error_return_value # Get the fuzzer executable and chdir to its base directory. This helps to @@ -1410,7 +1410,7 @@ def generate_blackbox_testcases( fuzzer_timeout = environment.get_value('FUZZER_TIMEOUT') # Run the fuzzer. - logs.info('Running fuzzer - %s.' 
% fuzzer_command) + logs.info(f'Running fuzzer - {fuzzer_command}.') fuzzer_return_code, fuzzer_duration, fuzzer_output = ( process_handler.run_process( fuzzer_command, @@ -1655,10 +1655,10 @@ def do_blackbox_fuzzing(self, fuzzer, fuzzer_directory, job_type): trial_selector.setup_additional_args_for_app() logs.info('Starting to process testcases.') - logs.info('Redzone is %d bytes.' % self.redzone) - logs.info('Timeout multiplier is %s.' % str(self.timeout_multiplier)) - logs.info('App launch command is %s.' % - testcase_manager.get_command_line_for_application()) + logs.info(f'Redzone is {self.redzone} bytes.') + logs.info(f'Timeout multiplier is {self.timeout_multiplier}.') + logs.info('App launch command is ' + f'{testcase_manager.get_command_line_for_application()}.') # Start processing the testcases. while test_number < len(testcase_file_paths): @@ -1718,9 +1718,7 @@ def do_blackbox_fuzzing(self, fuzzer, fuzzer_directory, job_type): crashes.append(temp_queue.get()) process_handler.close_queue(temp_queue) - - logs.info('Upto %d' % test_number) - + logs.info(f'Upto {test_number}') if thread_error_occurred: break @@ -1824,10 +1822,9 @@ def run(self): self.data_directory = setup.get_data_bundle_directory( self.fuzzer, self.uworker_input.setup_input) if not self.data_directory: - logs.error( - 'Unable to setup data bundle %s.' % self.fuzzer.data_bundle_name) + logs.error(f'Unable to setup data bundle {self.fuzzer.data_bundle_name}.') return uworker_msg_pb2.Output( # pylint: disable=no-member - error_type=uworker_msg_pb2.ErrorType.FUZZ_DATA_BUNDLE_SETUP_FAILURE) # pylint: disable=no-member + error_type=uworker_msg_pb2.ErrorType.FUZZ_DATA_BUNDLE_SETUP_FAILURE) if engine_impl: crashes, fuzzer_metadata = self.do_engine_fuzzing(engine_impl) diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/minimize_task.py b/src/clusterfuzz/_internal/bot/tasks/utasks/minimize_task.py index c21c225641..9b6d46613b 100644 --- a/src/clusterfuzz/_internal/bot/tasks/utasks/minimize_task.py +++ b/src/clusterfuzz/_internal/bot/tasks/utasks/minimize_task.py @@ -293,10 +293,6 @@ def run(self, # the device. if environment.is_android(): android.device.push_testcases_to_device() - elif environment.is_trusted_host(): - from clusterfuzz._internal.bot.untrusted_runner import file_host - file_host.push_testcases_to_worker() - # If we need to write a command line file, only do so if the arguments have # changed. arguments_changed = arguments != self._previous_arguments @@ -1378,11 +1374,6 @@ def _run_libfuzzer_testcase(fuzz_target, process_handler.cleanup_stale_processes() shell.clear_temp_directory() - if environment.is_trusted_host(): - from clusterfuzz._internal.bot.untrusted_runner import file_host - file_host.copy_file_to_worker( - testcase_file_path, file_host.rebase_to_worker_root(testcase_file_path)) - test_timeout = environment.get_value('TEST_TIMEOUT', process_handler.DEFAULT_TEST_TIMEOUT) return testcase_manager.test_for_crash_with_retries( @@ -1397,15 +1388,6 @@ def _run_libfuzzer_testcase(fuzz_target, def run_libfuzzer_engine(tool_name, target_name, arguments, testcase_path, output_path, timeout): """Run the libFuzzer engine.""" - arguments = list(arguments) - if environment.is_trusted_host(): - from clusterfuzz._internal.bot.untrusted_runner import tasks_host - - # TODO(ochang): Remove hardcode. 
- return tasks_host.process_testcase('libFuzzer', tool_name, target_name, - arguments, testcase_path, output_path, - timeout) - target_path = engine_common.find_fuzzer_path( environment.get_value('BUILD_DIR'), target_name) if not target_path: @@ -1418,7 +1400,7 @@ def run_libfuzzer_engine(tool_name, target_name, arguments, testcase_path, assert tool_name == 'cleanse' func = engine_impl.cleanse - return func(target_path, arguments, testcase_path, output_path, timeout) + return func(target_path, list(arguments), testcase_path, output_path, timeout) def _run_libfuzzer_tool( diff --git a/src/clusterfuzz/_internal/bot/untrusted_runner/host.py b/src/clusterfuzz/_internal/bot/untrusted_runner/host.py index 585527fe22..1328798039 100644 --- a/src/clusterfuzz/_internal/bot/untrusted_runner/host.py +++ b/src/clusterfuzz/_internal/bot/untrusted_runner/host.py @@ -102,7 +102,6 @@ def __init__(self, channel): # The following are RPCs that execute larger tasks. Don't retry these. self.PruneCorpus = _wrap_call(self.PruneCorpus, num_retries=0) - self.ProcessTestcase = _wrap_call(self.ProcessTestcase, num_retries=0) self.EngineFuzz = _wrap_call(self.EngineFuzz, num_retries=0) self.EngineReproduce = _wrap_call(self.EngineReproduce, num_retries=0) # pylint: enable=invalid-name diff --git a/src/clusterfuzz/_internal/bot/untrusted_runner/tasks_host.py b/src/clusterfuzz/_internal/bot/untrusted_runner/tasks_host.py index 7cc2683374..1bf296a7b8 100644 --- a/src/clusterfuzz/_internal/bot/untrusted_runner/tasks_host.py +++ b/src/clusterfuzz/_internal/bot/untrusted_runner/tasks_host.py @@ -111,36 +111,6 @@ def do_corpus_pruning(uworker_input, context, cross_pollination_stats=pollination_stats) -def process_testcase(engine_name, tool_name, target_name, arguments, - testcase_path, output_path, timeout): - """Process testcase on untrusted worker.""" - if tool_name == 'minimize': - operation = untrusted_runner_pb2.ProcessTestcaseRequest.MINIMIZE - else: - operation = untrusted_runner_pb2.ProcessTestcaseRequest.CLEANSE - - rebased_testcase_path = file_host.rebase_to_worker_root(testcase_path) - file_host.copy_file_to_worker(testcase_path, rebased_testcase_path) - - request = untrusted_runner_pb2.ProcessTestcaseRequest( - engine=engine_name, - operation=operation, - target_name=target_name, - arguments=arguments, - testcase_path=file_host.rebase_to_worker_root(testcase_path), - output_path=file_host.rebase_to_worker_root(output_path), - timeout=timeout) - - response = host.stub().ProcessTestcase(request) - - rebased_output_path = file_host.rebase_to_worker_root(output_path) - file_host.copy_file_from_worker(rebased_output_path, output_path) - - return engine.ReproduceResult( - list(response.command), response.return_code, response.time_executed, - response.output) - - def _unpack_values(values): """Unpack protobuf values.""" unpacked = {} diff --git a/src/clusterfuzz/_internal/bot/untrusted_runner/tasks_impl.py b/src/clusterfuzz/_internal/bot/untrusted_runner/tasks_impl.py index 8a71ade599..3479dbd4fe 100644 --- a/src/clusterfuzz/_internal/bot/untrusted_runner/tasks_impl.py +++ b/src/clusterfuzz/_internal/bot/untrusted_runner/tasks_impl.py @@ -17,107 +17,13 @@ from google.protobuf.any_pb2 import Any # pylint: disable=no-name-in-module from clusterfuzz._internal.bot import testcase_manager -from clusterfuzz._internal.bot.tasks.utasks import corpus_pruning_task from clusterfuzz._internal.bot.tasks.utasks import fuzz_task -from clusterfuzz._internal.bot.tasks.utasks import minimize_task -from clusterfuzz._internal.datastore 
import data_types from clusterfuzz._internal.protos import untrusted_runner_pb2 from clusterfuzz.fuzz import engine # pylint:disable=no-member -def _proto_to_fuzz_target(proto): - """Convert protobuf to FuzzTarget.""" - return data_types.FuzzTarget( - engine=proto.engine, project=proto.project, binary=proto.binary) - - -def _proto_to_cross_pollinate_fuzzer(proto): - """Convert protobuf to CrossPollinateFuzzer.""" - return corpus_pruning_task.CrossPollinateFuzzer( - fuzz_target=_proto_to_fuzz_target(proto.fuzz_target), - backup_bucket_name=proto.backup_bucket_name, - corpus_engine_name=proto.corpus_engine_name) - - -def prune_corpus(request, _): - """Prune corpus.""" - context = corpus_pruning_task.Context( - request.uworker_input, _proto_to_fuzz_target(request.fuzz_target), [ - _proto_to_cross_pollinate_fuzzer(proto) - for proto in request.cross_pollinate_fuzzers - ]) - - result = corpus_pruning_task.do_corpus_pruning(request.uworker_input, context, - request.revision) - - cross_pollination_stats = None - if result.cross_pollination_stats: - cross_pollination_stats = untrusted_runner_pb2.CrossPollinationStats( - project_qualified_name=result.cross_pollination_stats. - project_qualified_name, - sources=result.cross_pollination_stats.sources, - initial_corpus_size=result.cross_pollination_stats.initial_corpus_size, - corpus_size=result.cross_pollination_stats.corpus_size, - initial_edge_coverage=result.cross_pollination_stats. - initial_edge_coverage, - edge_coverage=result.cross_pollination_stats.edge_coverage, - initial_feature_coverage=result.cross_pollination_stats. - initial_feature_coverage, - feature_coverage=result.cross_pollination_stats.feature_coverage) - - # Intentionally skip edge and function coverage values as those would come - # from fuzzer coverage cron task (see src/go/server/cron/coverage.go). - coverage_info = untrusted_runner_pb2.CoverageInfo( - corpus_size_units=result.coverage_info.corpus_size_units, - corpus_size_bytes=result.coverage_info.corpus_size_bytes, - corpus_location=result.coverage_info.corpus_location, - corpus_backup_location=result.coverage_info.corpus_backup_location, - quarantine_size_units=result.coverage_info.quarantine_size_units, - quarantine_size_bytes=result.coverage_info.quarantine_size_bytes, - quarantine_location=result.coverage_info.quarantine_location) - - crashes = [ - untrusted_runner_pb2.CorpusCrash( - crash_state=crash.crash_state, - crash_type=crash.crash_type, - crash_address=crash.crash_address, - crash_stacktrace=crash.crash_stacktrace, - unit_path=crash.unit_path, - security_flag=crash.security_flag, - ) for crash in result.crashes - ] - - return untrusted_runner_pb2.PruneCorpusResponse( - coverage_info=coverage_info, - crashes=crashes, - fuzzer_binary_name=result.fuzzer_binary_name, - revision=result.revision, - cross_pollination_stats=cross_pollination_stats) - - -def process_testcase(request, _): - """Process testcase.""" - tool_name_map = { - untrusted_runner_pb2.ProcessTestcaseRequest.MINIMIZE: 'minimize', - untrusted_runner_pb2.ProcessTestcaseRequest.CLEANSE: 'cleanse', - } - - # TODO(ochang): Support other engines. 
- assert request.engine == 'libFuzzer' - assert request.operation in tool_name_map - - result = minimize_task.run_libfuzzer_engine( - tool_name_map[request.operation], request.target_name, request.arguments, - request.testcase_path, request.output_path, request.timeout) - - return untrusted_runner_pb2.EngineReproduceResult( - return_code=result.return_code, - time_executed=result.time_executed, - output=result.output) - - def _pack_values(values): """Pack protobuf values.""" packed = {} diff --git a/src/clusterfuzz/_internal/common/testcase_utils.py b/src/clusterfuzz/_internal/common/testcase_utils.py index b0adb62134..4314dcd55f 100644 --- a/src/clusterfuzz/_internal/common/testcase_utils.py +++ b/src/clusterfuzz/_internal/common/testcase_utils.py @@ -31,41 +31,39 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str): - testcase_upload_metadata = get_testcase_upload_metadata(testcase_id) - if not testcase_upload_metadata: - logs.warning(f'No upload metadata found for testcase {testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') - return - if not testcase_upload_metadata.timestamp: - logs.warning( - f'No timestamp for testcase {testcase_upload_metadata.testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') - return - assert step in [ - 'analyze_launched', 'analyze_completed', 'minimize_completed', - 'regression_completed', 'impact_completed', 'issue_updated' - ] - elapsed_time_since_upload = datetime.datetime.utcnow() - elapsed_time_since_upload -= testcase_upload_metadata.timestamp - elapsed_time_since_upload = elapsed_time_since_upload.total_seconds() - + """Finds out if a testcase is fuzzer generated or manually uploaded, + and emits the TESTCASE_TRIAGE_DURATION metric.""" testcase = data_handler.get_testcase_by_id(testcase_id) if not testcase: logs.warning(f'No testcase found with id {testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') + ' failed to emit TESTCASE_TRIAGE_DURATION metric.') return if not testcase.job_type: logs.warning(f'No job_type associated to testcase {testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') + ' failed to emit TESTCASE_TRIAGE_DURATION metric.') + return + + from_fuzzer = not get_testcase_upload_metadata(testcase_id) + + if not testcase.get_age_in_seconds(): + logs.warning(f'No timestamp associated to testcase {testcase_id},' + ' failed to emit TESTCASE_TRIAGE_DURATION metric.') return - monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add( - elapsed_time_since_upload, + testcase_age_in_hours = testcase.get_age_in_seconds() / (60 * 60) + + logs.info('Emiting TESTCASE_TRIAGE_DURATION metric for testcase ' + f'{testcase_id} (age = {testcase_age_in_hours} hours.) ' + f'in step {step}, from_fuzzer: {from_fuzzer}.') + + monitoring_metrics.TESTCASE_TRIAGE_DURATION.add( + testcase_age_in_hours, labels={ 'job': testcase.job_type, 'step': step, + 'from_fuzzer': from_fuzzer }) diff --git a/src/clusterfuzz/_internal/cron/external_testcase_reader.py b/src/clusterfuzz/_internal/cron/external_testcase_reader.py index 9ab64c9a95..500a08e0d3 100644 --- a/src/clusterfuzz/_internal/cron/external_testcase_reader.py +++ b/src/clusterfuzz/_internal/cron/external_testcase_reader.py @@ -13,6 +13,7 @@ # limitations under the License. 
"""Automated ingestion of testcases via IssueTracker.""" +import datetime import re import requests @@ -26,56 +27,75 @@ ACCEPTED_FILETYPES = [ 'text/javascript', 'application/pdf', 'text/html', 'application/zip' ] +ISSUETRACKER_ACCEPTED_STATE = 'ACCEPTED' +ISSUETRACKER_WONTFIX_STATE = 'NOT_REPRODUCIBLE' -def close_invalid_issue(upload_request, attachment_info, description): +def close_issue_if_invalid(upload_request, attachment_info, description): """Closes any invalid upload requests with a helpful message.""" - comment_messsage = ( + comment_message = ( 'Hello, this issue is automatically closed. Please file a new bug after' - 'fixing the following issues:\n\n') + ' fixing the following issues:\n\n') invalid = False - # TODO(pgrace) remove after testing. - if upload_request.id == '373893311': + # TODO(pgrace) Remove after testing. + if upload_request.id == 373893311: return False - # TODO(pgrace) add secondary check for authorized reporters. + # TODO(pgrace) Add secondary check for authorized reporters. # Issue must have exactly one attachment. if len(attachment_info) != 1: - comment_messsage += 'Please provide exactly one attachment.\n' + comment_message += 'Please provide exactly one attachment.\n' invalid = True else: # Issue must use one of the supported testcase file types. if attachment_info[0]['contentType'] not in ACCEPTED_FILETYPES: - comment_messsage += ( + comment_message += ( 'Please provide an attachment of type: html, js, pdf, or zip.\n') invalid = True - if not attachment_info[0]['attachmentDataRef'] or \ - not attachment_info[0]['attachmentDataRef']['resourceName'] \ - or not attachment_info[0]['filename']: - comment_messsage += \ + if (not attachment_info[0]['attachmentDataRef'] or + not attachment_info[0]['attachmentDataRef']['resourceName'] or + not attachment_info[0]['filename']): + comment_message += \ 'Please check that the attachment uploaded successfully.\n' invalid = True # Issue must have valid flags as the description. 
flag_format = re.compile(r'^([ ]?\-\-[A-Za-z\-\_]*){50}$') if flag_format.match(description): - comment_messsage += ( + comment_message += ( 'Please provide flags in the format: "--test_flag_one --testflagtwo",\n' ) invalid = True if invalid: - comment_messsage += ( + comment_message += ( '\nPlease see the new bug template for more information on how to use' 'Clusterfuzz direct uploads.') - upload_request.status = 'not_reproducible' - upload_request.save(new_comment=comment_messsage, notify=True) + upload_request.status = ISSUETRACKER_WONTFIX_STATE + upload_request.save(new_comment=comment_message, notify=True) return invalid +def close_issue_if_not_reproducible(issue): + if issue.status == ISSUETRACKER_ACCEPTED_STATE and filed_one_day_ago( + issue.created_time): + comment_message = ('Clusterfuzz failed to reproduce - ' + 'please check testcase details for more info.') + issue.status = ISSUETRACKER_WONTFIX_STATE + issue.save(new_comment=comment_message, notify=True) + return True + return False + + +def filed_one_day_ago(issue_created_time_string): + created_time = datetime.datetime.strptime(issue_created_time_string, + '%Y-%m-%dT%H:%M:%S.%fZ') + return datetime.datetime.now() - created_time > datetime.timedelta(days=1) + + def submit_testcase(issue_id, file, filename, filetype, cmds): """Uploads the given testcase file to Clusterfuzz.""" if filetype == 'text/javascript': @@ -102,7 +122,7 @@ def submit_testcase(issue_id, file, filename, filetype, cmds): 'platform': 'Linux', 'csrf_token': form.generate_csrf_token(), 'upload_key': upload_info['key'], - # TODO(pgrace) replace with upload_info['bucket'] once testing complete. + # TODO(pgrace) Replace with upload_info['bucket'] once testing complete. 'bucket': 'clusterfuzz-test-bucket', 'key': upload_info['key'], 'GoogleAccessId': upload_info['google_access_id'], @@ -111,32 +131,45 @@ def submit_testcase(issue_id, file, filename, filetype, cmds): } return requests.post( - "https://clusterfuzz.com/upload-testcase/upload", data=data, timeout=10) + 'https://clusterfuzz.com/upload-testcase/upload', data=data, timeout=10) def handle_testcases(tracker): """Fetches and submits testcases from bugs or closes unnecssary bugs.""" - # TODO(pgrace) replace once testing complete with - # tracker.get_issues(["componentid:1600865"], is_open=True). - issues = [tracker.get_issue(373893311)] + # TODO(pgrace) remove ID filter once done testing. + issues = tracker.find_issues_with_filters( + keywords=[], + query_filters=['componentid:1600865', 'id:373893311'], + only_open=True) - # TODO(pgrace) implement rudimentary rate limiting + # TODO(pgrace) Implement rudimentary rate limiting. for issue in issues: - # TODO(pgrace) close out older bugs that may have failed to reproduce + # Close out older bugs that may have failed to reproduce. + if close_issue_if_not_reproducible(issue): + helpers.log('Closing issue {issue_id} as it failed to reproduce', + issue.id) + continue + # Close out invalid bugs. attachment_metadata = tracker.get_attachment_metadata(issue.id) commandline_flags = tracker.get_description(issue.id) - if close_invalid_issue(issue, attachment_metadata, commandline_flags): - helpers.log("Closing issue {issue_id} as it is invalid", issue.id) + if close_issue_if_invalid(issue, attachment_metadata, commandline_flags): + helpers.log('Closing issue {issue_id} as it is invalid', issue.id) continue + + # Submit valid testcases. 
# TODO(pgrace) replace with 0 once testing is complete attachment_metadata = attachment_metadata[6] attachment = tracker.get_attachment( attachment_metadata['attachmentDataRef']['resourceName']) submit_testcase(issue.id, attachment, attachment_metadata['filename'], attachment_metadata['contentType'], commandline_flags) - helpers.log("Submitted testcase file for issue {issue_id}", issue.id) + comment_message = 'Testcase submitted to clusterfuzz' + issue.status = ISSUETRACKER_ACCEPTED_STATE + issue.assignee = 'clusterfuzz@chromium.org' + issue.save(new_comment=comment_message, notify=True) + helpers.log('Submitted testcase file for issue {issue_id}', issue.id) def main(): diff --git a/src/clusterfuzz/_internal/cron/manage_vms.py b/src/clusterfuzz/_internal/cron/manage_vms.py index 0653299e1c..64d939dc9a 100644 --- a/src/clusterfuzz/_internal/cron/manage_vms.py +++ b/src/clusterfuzz/_internal/cron/manage_vms.py @@ -13,34 +13,20 @@ # limitations under the License. """Cron to managed VMs.""" -from collections import namedtuple from concurrent.futures import ThreadPoolExecutor import copy -import itertools import json import logging from typing import Any from typing import Dict from typing import Optional -from google.cloud import ndb - -from clusterfuzz._internal.base import utils from clusterfuzz._internal.config import local_config from clusterfuzz._internal.cron.helpers import bot_manager -from clusterfuzz._internal.datastore import data_types -from clusterfuzz._internal.datastore import ndb_utils from clusterfuzz._internal.google_cloud_utils import compute_engine_projects -PROJECT_MIN_CPUS = 1 - -# This is the maximum number of instances supported in a single instance group. -PROJECT_MAX_CPUS = 1000 - NUM_THREADS = 8 -WorkerInstance = namedtuple('WorkerInstance', ['name', 'project']) - class ManageVmsError(Exception): """Base exception class.""" @@ -51,21 +37,6 @@ def _get_project_ids(): return list(local_config.Config(local_config.GCE_CLUSTERS_PATH).get().keys()) -def _instance_name_from_url(instance_url): - """Extract instance name from url.""" - return instance_url.split('/')[-1] - - -def get_resource_name(prefix, project_name): - """Get a name that can be used for GCE resources.""" - # https://cloud.google.com/compute/docs/reference/latest/instanceGroupManagers - max_name_length = 58 - - project_name = project_name.lower().replace('_', '-') - name = prefix + '-' + project_name - return name[:max_name_length] - - def get_template_body(gce_project, template_name, task_tag=None, @@ -321,449 +292,9 @@ def update_cluster(self, str(e)) -class OssFuzzClustersManager(ClustersManager): - """Manager for clusters in OSS-Fuzz.""" - - def __init__(self, project_id): - super().__init__(project_id) - self.worker_to_assignment = {} - for assignment in self.gce_project.host_worker_assignments: - self.worker_to_assignment[assignment.worker] = assignment - - self.all_host_names = set() - - def update_clusters(self): - """Update all clusters in a project.""" - self.start_thread_pool() - - all_projects = list(data_types.OssFuzzProject.query().order( - data_types.OssFuzzProject.name)) - - self.cleanup_old_projects([project.name for project in all_projects]) - - projects = [project for project in all_projects if not project.high_end] - high_end_projects = [ - project for project in all_projects if project.high_end - ] - - project_infos = [ - self.get_or_create_project_info(project.name) for project in projects - ] - - high_end_project_infos = [ - self.get_or_create_project_info(project.name) - for project in 
high_end_projects - ] - - for project, project_info in itertools.chain( - list(zip(projects, project_infos)), - list(zip(high_end_projects, high_end_project_infos))): - self.cleanup_clusters(project, project_info) - - for cluster in self.gce_project.clusters: - self.update_project_cpus(projects, project_infos, high_end_projects, - high_end_project_infos, cluster) - - self.cleanup_old_assignments(self.all_host_names) - self.finish_updates() - - def get_or_create_project_info(self, project_name): - """Get OSS-Fuzz CPU info by project name (or create a new one if it doesn't - exist).""" - key = ndb.Key(data_types.OssFuzzProjectInfo, project_name) - project_info = key.get() - if not project_info: - project_info = data_types.OssFuzzProjectInfo( - name=project_name, id=project_name) - project_info.put() - - return project_info - - def get_or_create_host_worker_assignment(self, host_name, instance_num): - """Get OSS-Fuzz host worker assignment (or create a new one if it doesn't - exist).""" - key_id = '%s-%d' % (host_name, instance_num) - key = ndb.Key(data_types.HostWorkerAssignment, key_id) - assignment = key.get() - if not assignment: - assignment = data_types.HostWorkerAssignment( - host_name=host_name, instance_num=instance_num, id=key_id) - assignment.put() - - return assignment - - def cleanup_old_assignments(self, host_names): - """Remove old OSS-Fuzz host worker assignment entries.""" - to_delete = [] - for assignment in data_types.HostWorkerAssignment.query(): - if assignment.host_name not in host_names: - to_delete.append(assignment.key) - - ndb_utils.delete_multi(to_delete) - - def distribute_cpus(self, projects, total_cpus): - """Distribute OSS-Fuzz CPUs for each project by weight. - - |projects| should be sorted - alphabetically by name to ensure determinism for the same set of CPUs. - """ - available_cpus = total_cpus - total_weight = sum(project.cpu_weight for project in projects) - - cpu_count = [] - - for project in projects: - if total_weight: - share = project.cpu_weight / total_weight - else: - share = 0.0 - - share_cpus = int(total_cpus * share) - share_cpus = max(PROJECT_MIN_CPUS, share_cpus) - share_cpus = min(PROJECT_MAX_CPUS, share_cpus) - - if share_cpus <= available_cpus: - cpu_count.append(share_cpus) - available_cpus -= share_cpus - else: - cpu_count.append(0) - - # indexes into |project| sorted by highest weight first. - indexes_by_weight = sorted( - list(range(len(projects))), - key=lambda k: projects[k].cpu_weight, - reverse=True) - - # Distribute the remainder from rounding errors (and capping) up to the cap, - # preferring projects with a higher weight first. - while available_cpus: - cpus_allocated = 0 - - for i in range(len(cpu_count)): - project_index = indexes_by_weight[i] - - if cpu_count[project_index] < PROJECT_MAX_CPUS: - cpu_count[project_index] += 1 - cpus_allocated += 1 - - if cpus_allocated >= available_cpus: - break - - if not cpus_allocated: - # Hit the cap for each project. Realistically, this shouldn't ever - # happen. - break - - available_cpus -= cpus_allocated - - if available_cpus: - logging.warning('%d CPUs are not being used.', available_cpus) - - return cpu_count - - def do_assign_hosts_to_workers(self, host_names, worker_instances, - workers_per_host): - """Assign OSS-Fuzz host instances to workers.""" - # Sort host and worker instance names to make assignment deterministic for - # the same initial set of host and workers. 
- host_names.sort() - worker_instances.sort(key=lambda w: w.name) - - # Algorithm: - # For each host instance, - # - If there is already an assignment, and a worker with the same name - # still exists, do nothing. - # - Otherwise, assign it to the first unassigned worker (in alphabetical - # order). - # This should ensure that a worker is reassigned only if it was - # reimaged/new. - current_worker_names = {worker.name for worker in worker_instances} - previous_assigned_workers = set() - - new_assignments = [] - - for host_name in host_names: - for i in range(0, workers_per_host): - assignment = self.get_or_create_host_worker_assignment(host_name, i) - if (assignment.worker_name and - assignment.worker_name in current_worker_names): - # Existing assignment is still valid. Don't do anything for these. - logging.info('Keeping old assignment of %s(%d) -> %s.', host_name, i, - assignment.worker_name) - previous_assigned_workers.add(assignment.worker_name) - continue - - # This host instance was either unassigned or the worker it was - # connected to no longer exists, so we need to assign it to a new - # worker. - new_assignments.append(assignment) - - new_workers = [ - worker for worker in worker_instances - if worker.name not in previous_assigned_workers - ] - - assert len(new_assignments) == len(new_workers) - for assignment, worker in zip(new_assignments, new_workers): - assignment.worker_name = worker.name - assignment.project_name = worker.project - logging.info('New assignment: %s(%d) - >%s.', assignment.host_name, - assignment.instance_num, assignment.worker_name) - - return new_assignments - - def delete_gce_resources(self, project_info, cluster_info): - """Delete instance templates and instance groups.""" - manager = bot_manager.BotManager(self.gce_project.project_id, - cluster_info.gce_zone) - - resource_name = get_resource_name(cluster_info.cluster, project_info.name) - - try: - manager.instance_group(resource_name).delete() - except bot_manager.NotFoundError: - logging.info('Instance group %s already deleted.', resource_name) - - try: - manager.instance_template(resource_name).delete() - except bot_manager.NotFoundError: - logging.info('Instance template %s already deleted.', resource_name) - - def cleanup_old_projects(self, existing_project_names): - """Cleanup old projects.""" - to_delete = [] - - for project_info in list(data_types.OssFuzzProjectInfo.query()): - if project_info.name in existing_project_names: - continue - - logging.info('Deleting %s', project_info.name) - - for cluster_info in project_info.clusters: - self.delete_gce_resources(project_info, cluster_info) - - to_delete.append(project_info.key) - - ndb_utils.delete_multi(to_delete) - - def cleanup_clusters(self, project, project_info): - """Remove nonexistant clusters.""" - existing_cluster_names = [ - cluster.name for cluster in self.gce_project.clusters - ] - - # Delete clusters that no longer exist, or the if the high end flag changed - # for a project. 
- to_delete = [ - cluster_info for cluster_info in project_info.clusters if - (cluster_info.cluster not in existing_cluster_names or project.high_end - != self.gce_project.get_cluster(cluster_info.cluster).high_end) - ] - if not to_delete: - return - - for cluster_info in to_delete: - logging.info('Deleting old cluster %s for %s.', cluster_info.cluster, - project_info.name) - self.delete_gce_resources(project_info, cluster_info) - - project_info.clusters = [ - cluster_info for cluster_info in project_info.clusters - if cluster_info.cluster in existing_cluster_names - ] - project_info.put() - - def update_project_cluster(self, - project, - project_info, - cluster, - cpu_count, - disk_size_gb=None): - """Update cluster allocation for a project.""" - service_account = None - tls_cert = None - - if cluster.worker: - # If this cluster is for untrusted workers, use the project service - # account. - service_account = project.service_account - tls_cert = ndb.Key(data_types.WorkerTlsCert, project.name).get() - if not tls_cert: - logging.warning('TLS certs not set up yet for %s.', project.name) - return - - cluster_info = project_info.get_cluster_info(cluster.name) - if not cluster_info: - project_info.clusters.append( - data_types.OssFuzzProjectInfo.ClusterInfo( - cluster=cluster.name, - gce_zone=cluster.gce_zone, - cpu_count=cpu_count)) - cluster_info = project_info.clusters[-1] - - # Get a name that can be used for the instance template and instance group. - resource_name = get_resource_name(cluster.name, project_info.name) - - def do_update(): - """Update the cluster and cpu count info.""" - self.update_cluster( - cluster, - resource_name, - cpu_count, - task_tag=project_info.name, - disk_size_gb=disk_size_gb, - service_account=service_account, - tls_cert=tls_cert) - - cluster_info.cpu_count = cpu_count - - self.pending_updates.append(self.thread_pool.submit(do_update)) - - def update_project_cpus(self, projects, project_infos, high_end_projects, - high_end_project_infos, cluster): - """Update CPU allocations for each project.""" - # Calculate CPUs in each cluster. - if not cluster.distribute: - self.pending_updates.append( - self.thread_pool.submit(self.update_cluster, cluster, cluster.name, - cluster.instance_count)) - return - - if cluster.high_end: - current_projects = high_end_projects - current_project_infos = high_end_project_infos - else: - current_projects = projects - current_project_infos = project_infos - - cpu_counts = self.distribute_cpus(current_projects, cluster.instance_count) - - # Resize projects starting with ones that reduce number of CPUs. This is - # so that we always have quota when we're resizing a project cluster. - # pylint: disable=cell-var-from-loop - def _cpu_diff_key(index): - cluster_info = current_project_infos[index].get_cluster_info(cluster.name) - if cluster_info and cluster_info.cpu_count is not None: - old_cpu_count = cluster_info.cpu_count - else: - old_cpu_count = 0 - - return cpu_counts[index] - old_cpu_count - - resize_order = sorted(list(range(len(cpu_counts))), key=_cpu_diff_key) - for i in resize_order: - project = current_projects[i] - project_info = current_project_infos[i] - self.update_project_cluster( - project, - project_info, - cluster, - cpu_counts[i], - disk_size_gb=project.disk_size_gb) - - self.wait_updates() - ndb_utils.put_multi(project_infos) - ndb_utils.put_multi(high_end_project_infos) - - # If the workers are done, we're ready to assign them. - # Note: This assumes that hosts are always specified before workers. 
- if cluster.name in self.worker_to_assignment: - self.assign_hosts_to_workers(self.worker_to_assignment[cluster.name]) - - def get_all_workers_in_cluster(self, manager, cluster_name): - """Get all workers in a cluster.""" - workers = [] - project_infos = list(data_types.OssFuzzProjectInfo.query().order( - data_types.OssFuzzProjectInfo.name)) - - for project_info in project_infos: - cluster_info = next((cluster for cluster in project_info.clusters - if cluster.cluster == cluster_name), None) - if not cluster_info or cluster_info.cpu_count == 0: - continue - - worker_group_name = get_resource_name(cluster_info.cluster, - project_info.name) - worker_instance_group = manager.instance_group(worker_group_name) - if not worker_instance_group.exists(): - logging.error('Worker instance group %s does not exist.', - worker_group_name) - continue - - instances = list(worker_instance_group.list_managed_instances()) - if len(instances) != cluster_info.cpu_count: - logging.error( - 'Number of instances in instance group %s does not match.' - 'Expected %d, got %d.', worker_group_name, cluster_info.cpu_count, - len(instances)) - raise ManageVmsError('Inconsistent instance count in group.') - - for instance in instances: - workers.append( - WorkerInstance( - name=_instance_name_from_url(instance['instance']), - project=project_info.name)) - - return workers - - def assign_hosts_to_workers(self, assignment): - """Assign host instances to workers.""" - host_cluster = self.gce_project.get_cluster(assignment.host) - worker_cluster = self.gce_project.get_cluster(assignment.worker) - - if host_cluster.gce_zone != worker_cluster.gce_zone: - logging.error('Mismatching zones for %s and %s.', assignment.host, - assignment.worker) - return - - if (host_cluster.instance_count * assignment.workers_per_host != - worker_cluster.instance_count): - logging.error('Invalid host/worker cluster size for %s and %s.', - assignment.host, assignment.worker) - return - - if host_cluster.high_end != worker_cluster.high_end: - logging.error('Mismatching high end setting for %s and %s', - assignment.host, assignment.worker) - return - - manager = bot_manager.BotManager(self.gce_project.project_id, - host_cluster.gce_zone) - host_instance_group = manager.instance_group(host_cluster.name) - - if not host_instance_group.exists(): - logging.error('Host instance group %s does not exist.', host_cluster.name) - return - - host_names = [ - _instance_name_from_url(instance['instance']) - for instance in host_instance_group.list_managed_instances() - ] - self.all_host_names.update(host_names) - worker_instances = self.get_all_workers_in_cluster(manager, - worker_cluster.name) - - if len(worker_instances) != worker_cluster.instance_count: - logging.error( - 'Actual number of worker instances for %s did not match. 
' - 'Expected %d, got %d.', worker_cluster.name, - worker_cluster.instance_count, len(worker_instances)) - return - - new_assignments = self.do_assign_hosts_to_workers( - host_names, worker_instances, assignment.workers_per_host) - ndb_utils.put_multi(new_assignments) - - def main(): """CPU distributor for OSS-Fuzz projects.""" - if utils.is_oss_fuzz(): - manager_class = OssFuzzClustersManager - else: - manager_class = ClustersManager - for project_id in _get_project_ids(): - manager = manager_class(project_id) - manager.update_clusters() + ClustersManager(project_id).update_clusters() logging.info('Mange VMs succeeded.') return True diff --git a/src/clusterfuzz/_internal/cron/schedule_fuzz.py b/src/clusterfuzz/_internal/cron/schedule_fuzz.py index 2245b2e04d..fb495adbe7 100644 --- a/src/clusterfuzz/_internal/cron/schedule_fuzz.py +++ b/src/clusterfuzz/_internal/cron/schedule_fuzz.py @@ -14,10 +14,13 @@ """Cron job to schedule fuzz tasks that run on batch.""" import collections +import multiprocessing import random import time from typing import Dict +from typing import List +from google.cloud import monitoring_v3 from googleapiclient import discovery from clusterfuzz._internal.base import tasks @@ -25,20 +28,54 @@ from clusterfuzz._internal.config import local_config from clusterfuzz._internal.datastore import data_types from clusterfuzz._internal.datastore import ndb_utils +from clusterfuzz._internal.google_cloud_utils import batch from clusterfuzz._internal.google_cloud_utils import credentials from clusterfuzz._internal.metrics import logs +# TODO(metzman): Actually implement this. +CPUS_PER_FUZZ_JOB = 2 -def _get_quotas(project, region): - gcp_credentials = credentials.get_default()[0] - compute = discovery.build('compute', 'v1', credentials=gcp_credentials) + +def _get_quotas(creds, project, region): + compute = discovery.build('compute', 'v1', credentials=creds) return compute.regions().get( # pylint: disable=no-member region=region, project=project).execute()['quotas'] -def get_available_cpus(project: str, region: str) -> int: +def count_unacked(creds, project_id, subscription_id): + """Counts the unacked messages in |subscription_id|.""" + # TODO(metzman): Not all of these are fuzz_tasks. Deal with that. + metric = 'pubsub.googleapis.com/subscription/num_undelivered_messages' + query_filter = (f'metric.type="{metric}" AND ' + f'resource.labels.subscription_id="{subscription_id}"') + time_now = time.time() + # Get the last 5 minutes. + time_interval = monitoring_v3.TimeInterval( + end_time={'seconds': int(time_now)}, + start_time={'seconds': int(time_now - 5 * 60)}, + ) + client = monitoring_v3.MetricServiceClient(credentials=creds) + results = client.list_time_series( + request={ + 'filter': query_filter, + 'interval': time_interval, + 'name': f'projects/{project_id}', + 'view': monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL, + }) + # Get the latest point. + for result in results: + if len(result.points) == 0: + continue + size = int(result.points[0].value.int64_value) + logs.info(f'Unacked in {subscription_id}: {result}') + return size + return 0 + + +def get_available_cpus_for_region(creds, project: str, region: str) -> int: """Returns the number of available CPUs in the current GCE region.""" - quotas = _get_quotas(project, region) + + quotas = _get_quotas(creds, project, region) # Sometimes, the preemptible quota is 0, which means the number of preemptible # CPUs is actually limited by the CPU quota. 
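A minimal usage sketch of the two helpers introduced here, assuming default application credentials and a placeholder project ID; the 'preprocess' and 'utask_main' subscription names and the region names mirror values used later in this file, and the import is only needed when calling the helpers from outside this module:

    from clusterfuzz._internal.cron.schedule_fuzz import (
        count_unacked, get_available_cpus_for_region)
    from clusterfuzz._internal.google_cloud_utils import credentials

    creds = credentials.get_default()[0]
    project = 'my-clusterfuzz-project'  # Placeholder project ID.

    # Fuzz work already queued in Pub/Sub but not yet delivered to bots.
    backlog = count_unacked(creds, project, 'preprocess')
    backlog += count_unacked(creds, project, 'utask_main')

    # Quota headroom across the regions where fuzz tasks may run.
    free_cpus = sum(
        get_available_cpus_for_region(creds, project, region)
        for region in ('us-central1', 'us-east4'))

    print(f'Pub/Sub backlog: {backlog}, free CPUs: {free_cpus}')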
@@ -59,13 +96,18 @@ def get_available_cpus(project: str, region: str) -> int:
   assert preemptible_quota or cpu_quota
 
   if not preemptible_quota['limit']:
-    # Preemptible quota is not set. Obey the CPU quota since that limitss us.
+    # Preemptible quota is not set. Obey the CPU quota since that limits us.
     quota = cpu_quota
   else:
     quota = preemptible_quota
 
   assert quota['limit'], quota
 
-  return quota['limit'] - quota['usage']
+  # TODO(metzman): Do this in a more configurable way.
+  # We need this because us-central1 and us-east4 have different numbers of
+  # cores allotted to us in their quota. Treat them the same to simplify things.
+  limit = quota['limit']
+  limit -= quota['usage']
+  return min(limit, 100_000)
 
 
 class BaseFuzzTaskScheduler:
@@ -79,8 +121,7 @@ def get_fuzz_tasks(self):
 
   def _get_cpus_per_fuzz_job(self, job_name):
     del job_name
-    # TODO(metzman): Actually implement this.
-    return 2
+    return CPUS_PER_FUZZ_JOB
 
 
 class FuzzTaskCandidate:
@@ -182,26 +223,71 @@ def get_fuzz_tasks(available_cpus: int) -> [tasks.Task]:
 
 
 def get_batch_regions(batch_config):
-  mapping = batch_config.get('mapping')
-  return list(set(config['gce_region'] for config in mapping.values()))
+  fuzz_subconf_names = {
+      subconf['name'] for subconf in batch_config.get(
+          'mapping.LINUX-PREEMPTIBLE-UNPRIVILEGED.subconfigs')
+  }
+
+  subconfs = batch_config.get('subconfigs')
+  return list(
+      set(subconfs[subconf]['region']
+          for subconf in subconfs
+          if subconf in fuzz_subconf_names))
+
+
+def get_available_cpus(project: str, regions: List[str]) -> int:
+  """Returns the available CPUs for fuzz tasks."""
+  # TODO(metzman): This doesn't distinguish between fuzz and non-fuzz
+  # tasks (nor preemptible and non-preemptible CPUs). Fix this.
+  # Get total scheduled and queued.
+  creds = credentials.get_default()[0]
+  count_args = ((project, region) for region in regions)
+  with multiprocessing.Pool(2) as pool:
+    # These calls are extremely slow (about 1 minute total).
+    result = pool.starmap_async(  # pylint: disable=no-member
+        batch.count_queued_or_scheduled_tasks, count_args)
+    waiting_tasks = count_unacked(creds, project, 'preprocess')
+    waiting_tasks += count_unacked(creds, project, 'utask_main')
+    region_counts = zip(*result.get())  # Group all queued and all scheduled.
+
+  # Add up all queued and scheduled.
+  region_counts = [sum(tup) for tup in region_counts]
+  logs.info(f'Region counts: {region_counts}')
+  if region_counts[0] > 5000:
+    # Check queued tasks.
+    logs.info('Too many jobs queued, not scheduling more fuzzing.')
+    return 0
+  waiting_tasks += sum(region_counts)  # Add up queued and scheduled.
+  soon_occupied_cpus = waiting_tasks * CPUS_PER_FUZZ_JOB
+  logs.info(f'Soon occupied CPUs: {soon_occupied_cpus}')
+  available_cpus = sum(
+      get_available_cpus_for_region(creds, project, region)
+      for region in regions)
+  logs.info('Actually free CPUs (before subtracting soon '
            f'occupied): {available_cpus}')
+  available_cpus = max(available_cpus - soon_occupied_cpus, 0)
+
+  # Cap how much we schedule at once, so we don't overload batch.
+  logs.info(f'Scheduling across {len(regions)} regions.')
+  # This number is arbitrary, but we aren't at full capacity at lower numbers.
+  available_cpus = min(available_cpus, 27_500 * len(regions))
+  return available_cpus
 
 
 def schedule_fuzz_tasks() -> bool:
   """Schedules fuzz tasks."""
-  # TODO(metzman): Remove this when we are ready to run on Chrome.
- start = time.time() - + multiprocessing.set_start_method('spawn') batch_config = local_config.BatchConfig() - regions = get_batch_regions(batch_config) - # TODO(metzman): Make it possible to use multiple regions. - if len(regions) > 1: - region = 'us-central1' - else: - region = regions[0] project = batch_config.get('project') - available_cpus = get_available_cpus(project, region) - # TODO(metzman): Remove this as we move from experimental code to production. - available_cpus = min(available_cpus, 2500) + regions = get_batch_regions(batch_config) + start = time.time() + available_cpus = get_available_cpus(project, regions) + logs.error(f'{available_cpus} available CPUs.') + if not available_cpus: + # Not clear if this should be False or True. + # TODO(metzman): Revisit this. + return False + fuzz_tasks = get_fuzz_tasks(available_cpus) if not fuzz_tasks: logs.error('No fuzz tasks found to schedule.') diff --git a/src/clusterfuzz/_internal/cron/triage.py b/src/clusterfuzz/_internal/cron/triage.py index 1ed9778b60..f7d10b3c77 100644 --- a/src/clusterfuzz/_internal/cron/triage.py +++ b/src/clusterfuzz/_internal/cron/triage.py @@ -98,6 +98,60 @@ def _is_bug_filed(testcase): return False +def _is_blocking_progress_android(testcase): + """Checks the crash frequency if it is reported on libfuzzer""" + if testcase.job_type.startswith('libfuzzer'): + # Get crash statistics data on this unreproducible crash for last X days. + last_hour = crash_stats.get_last_successful_hour() + if not last_hour: + # No crash stats available, skip. + return False + + _, rows = crash_stats.get( + end=last_hour, + block='day', + days=data_types.FILE_CONSISTENT_UNREPRODUCIBLE_TESTCASE_DEADLINE, + group_by='reproducible_flag', + where_clause=( + 'crash_type = %s AND crash_state = %s AND security_flag = %s' % + (json.dumps(testcase.crash_type), json.dumps(testcase.crash_state), + json.dumps(testcase.security_flag))), + group_having_clause='', + sort_by='total_count', + offset=0, + limit=1) + + # Calculate total crash count and crash days count. + crash_days_indices = set() + total_crash_count = 0 + for row in rows: + if 'groups' not in row: + continue + + total_crash_count += row['totalCount'] + for group in row['groups']: + for index in group['indices']: + crash_days_indices.add(index['hour']) + + crash_days_count = len(crash_days_indices) + # Considers an unreproducible testcase as important if the crash + # occurred at least once everyday for the last 14 days and total + # crash count exceeded 14. 
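+    # For example, a libFuzzer crash seen on every one of the last 14 days
+    # with a total of 20 occurrences satisfies both conditions below, while
+    # one seen on only 10 of those days does not.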
+    return (crash_days_count ==
+            data_types.FILE_CONSISTENT_UNREPRODUCIBLE_TESTCASE_DEADLINE and
+            total_crash_count >=
+            data_types.FILE_UNREPRODUCIBLE_TESTCASE_MIN_STARTUP_CRASH_THRESHOLD)
+
+  return False
+
+
+def is_crash_important_android(testcase):
+  """Indicates if the Android crash is important to file."""
+  if _is_blocking_progress_android(testcase):
+    return True
+  return False
+
+
 def _is_crash_important(testcase):
   """Indicate if the crash is important to file."""
   if not testcase.one_time_crasher_flag:
@@ -256,6 +310,8 @@ def _check_and_update_similar_bug(testcase, issue_tracker):
 
 def _emit_bug_filing_from_testcase_elapsed_time_metric(testcase):
   testcase_age = testcase.get_age_in_seconds()
+  if not testcase_age:
+    return
   monitoring_metrics.BUG_FILING_FROM_TESTCASE_ELAPSED_TIME.add(
       testcase_age,
       labels={
@@ -309,19 +365,54 @@ def _file_issue(testcase, issue_tracker, throttler):
   return filed
 
 
-def _emit_untriaged_testcase_age_metric(critical_tasks_completed: bool,
-                                        testcase: data_types.Testcase):
-  """Emmits a metric to track age of untriaged testcases."""
-  if critical_tasks_completed:
+def _set_testcase_stuck_state(testcase: data_types.Testcase, state: bool):
+  if testcase.stuck_in_triage == state:
     return
-  if not testcase.timestamp:
+  testcase.stuck_in_triage = state
+  testcase.put()
+
+
+untriaged_testcases = {}
+
+
+def _increment_untriaged_testcase_count(job, status):
+  identifier = (job, status)
+  if identifier not in untriaged_testcases:
+    untriaged_testcases[identifier] = 0
+  untriaged_testcases[identifier] += 1
+
+
+def _emit_untriaged_testcase_count_metric():
+  for (job, status) in untriaged_testcases:
+    monitoring_metrics.UNTRIAGED_TESTCASE_COUNT.set(
+        untriaged_testcases[(job, status)],
+        labels={
+            'job': job,
+            'status': status,
+        })
+
+
+PENDING_ANALYZE = 'pending_analyze'
+PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
+PENDING_PROGRESSION = 'pending_progression'
+PENDING_GROUPING = 'pending_grouping'
+PENDING_FILING = 'pending_filing'
+
+
+def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase,
+                                        step: str):
+  """Emits a metric to track the age of untriaged testcases."""
+  if not testcase.get_age_in_seconds():
    return
+  logs.info(f'Emitting UNTRIAGED_TESTCASE_AGE for testcase {testcase.key.id()} '
+            f'(age = {testcase.get_age_in_seconds()}), step = {step}')
   monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add(
-      testcase.get_age_in_seconds(),
+      testcase.get_age_in_seconds() / 3600,
       labels={
           'job': testcase.job_type,
           'platform': testcase.platform,
+          'step': step,
      })
 
 
@@ -361,26 +452,30 @@ def main():
     # Skip if testcase's job is removed.
     if testcase.job_type not in all_jobs:
+      _set_testcase_stuck_state(testcase, False)
       logs.info(f'Skipping testcase {testcase_id}, since its job was removed '
                 f' ({testcase.job_type})')
       continue
 
     # Skip if testcase's job is in exclusions list.
     if testcase.job_type in excluded_jobs:
+      _set_testcase_stuck_state(testcase, False)
       logs.info(f'Skipping testcase {testcase_id}, since its job is in the'
                 f' exclusion list ({testcase.job_type})')
       continue
 
-    # Emmit the metric for testcases that should be triaged.
-    _emit_untriaged_testcase_age_metric(critical_tasks_completed, testcase)
-
     # Skip if we are running progression task at this time.
if testcase.get_metadata('progression_pending'): + _set_testcase_stuck_state(testcase, True) logs.info(f'Skipping testcase {testcase_id}, progression pending') + _emit_untriaged_testcase_age_metric(testcase, PENDING_PROGRESSION) + _increment_untriaged_testcase_count(testcase.job_type, + PENDING_PROGRESSION) continue # If the testcase has a bug filed already, no triage is needed. if _is_bug_filed(testcase): + _set_testcase_stuck_state(testcase, False) logs.info( f'Skipping testcase {testcase_id}, since a bug was already filed.') continue @@ -388,13 +483,28 @@ def main(): # Check if the crash is important, i.e. it is either a reproducible crash # or an unreproducible crash happening frequently. if not _is_crash_important(testcase): - logs.info( - f'Skipping testcase {testcase_id}, since the crash is not important.') - continue + # Check if the crash is a startup crash, i.e. it is causing the fuzzer + # to crash on startup and not allowing the fuzzer to run longer + if testcase.platform == "android" and is_crash_important_android( + testcase): + logs.info( + f'Considering testcase {testcase_id}, since it is a startup crash' + ' on android platform.') + else: + _set_testcase_stuck_state(testcase, False) + logs.info( + f'Skipping testcase {testcase_id}, as the crash is not important.') + continue # Require that all tasks like minimizaton, regression testing, etc have # finished. if not critical_tasks_completed: + status = PENDING_CRITICAL_TASKS + if testcase.analyze_pending: + status = PENDING_ANALYZE + _emit_untriaged_testcase_age_metric(testcase, status) + _set_testcase_stuck_state(testcase, True) + _increment_untriaged_testcase_count(testcase.job_type, status) logs.info( f'Skipping testcase {testcase_id}, critical tasks still pending.') continue @@ -411,11 +521,17 @@ def main(): # metadata works well. if not testcase.group_id and not dates.time_has_expired( testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT): + _emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING) + _set_testcase_stuck_state(testcase, True) + _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING) logs.info(f'Skipping testcase {testcase_id}, pending grouping.') continue if not testcase.get_metadata('ran_grouper'): # Testcase should be considered by the grouper first before filing. + _emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING) + _set_testcase_stuck_state(testcase, True) + _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING) logs.info(f'Skipping testcase {testcase_id}, pending grouping.') continue @@ -435,6 +551,7 @@ def main(): # If there are similar issues to this test case already filed or recently # closed, skip filing a duplicate bug. if _check_and_update_similar_bug(testcase, issue_tracker): + _set_testcase_stuck_state(testcase, False) logs.info(f'Skipping testcase {testcase_id}, since a similar bug' ' was already filed.') continue @@ -444,15 +561,20 @@ def main(): # File the bug first and then create filed bug metadata. if not _file_issue(testcase, issue_tracker, throttler): + _emit_untriaged_testcase_age_metric(testcase, PENDING_FILING) + _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING) logs.info(f'Issue filing failed for testcase id {testcase_id}') continue + _set_testcase_stuck_state(testcase, False) + _create_filed_bug_metadata(testcase) issue_filer.notify_issue_update(testcase, 'new') logs.info('Filed new issue %s for testcase %d.' 
% (testcase.bug_information, testcase_id)) + _emit_untriaged_testcase_count_metric() logs.info('Triage testcases succeeded.') return True diff --git a/src/clusterfuzz/_internal/datastore/data_handler.py b/src/clusterfuzz/_internal/datastore/data_handler.py index abe8d5c677..cf1aafb5ff 100644 --- a/src/clusterfuzz/_internal/datastore/data_handler.py +++ b/src/clusterfuzz/_internal/datastore/data_handler.py @@ -921,7 +921,7 @@ def critical_tasks_completed(testcase): return testcase.minimized_keys and testcase.regression return bool(testcase.minimized_keys and testcase.regression and - testcase.is_impact_set_flag) + testcase.is_impact_set_flag and not testcase.analyze_pending) # ------------------------------------------------------------------------------ @@ -977,7 +977,8 @@ def add_build_metadata(job_type, def create_data_bundle_bucket_and_iams(data_bundle_name, emails): """Creates a data bundle bucket and adds iams for access.""" bucket_name = get_data_bundle_bucket_name(data_bundle_name) - if not storage.create_bucket_if_needed(bucket_name): + location = local_config.ProjectConfig().get('data_bundle_bucket_location') + if not storage.create_bucket_if_needed(bucket_name, location=location): return False client = storage.create_discovery_storage_client() @@ -1379,6 +1380,7 @@ def create_user_uploaded_testcase(key, testcase.timestamp = utils.utcnow() testcase.created = testcase.timestamp + testcase.analyze_pending = True testcase.uploader_email = uploader_email testcase.put() diff --git a/src/clusterfuzz/_internal/datastore/data_types.py b/src/clusterfuzz/_internal/datastore/data_types.py index 31c6f41348..921c40aec9 100644 --- a/src/clusterfuzz/_internal/datastore/data_types.py +++ b/src/clusterfuzz/_internal/datastore/data_types.py @@ -58,6 +58,9 @@ # Minimum number of unreproducible crashes to see before filing it. FILE_UNREPRODUCIBLE_TESTCASE_MIN_CRASH_THRESHOLD = 100 +# Minimum number of unreproducible crashes to see before filing it for android. +FILE_UNREPRODUCIBLE_TESTCASE_MIN_STARTUP_CRASH_THRESHOLD = 14 + # Heartbeat wait interval. HEARTBEAT_WAIT_INTERVAL = 10 * 60 @@ -580,6 +583,13 @@ class Testcase(Model): # corpus. trusted = ndb.BooleanProperty(default=False) + # Tracks if a testcase is stuck during triage. + stuck_in_triage = ndb.BooleanProperty(default=False) + + # Tracks if analyze task is pending. + # Defaults to false, since most testcases are fuzzer produced. + analyze_pending = ndb.BooleanProperty(default=False) + def is_chromium(self): return self.project_name in ('chromium', 'chromium-testing') @@ -686,6 +696,8 @@ def get_created_time(self) -> ndb.DateTimeProperty: def get_age_in_seconds(self): current_time = datetime.datetime.utcnow() + if not self.get_created_time(): + return None testcase_age = current_time - self.get_created_time() return testcase_age.total_seconds() diff --git a/src/clusterfuzz/_internal/fuzzing/corpus_manager.py b/src/clusterfuzz/_internal/fuzzing/corpus_manager.py index ec9ba0a6cc..757e7c7c9f 100644 --- a/src/clusterfuzz/_internal/fuzzing/corpus_manager.py +++ b/src/clusterfuzz/_internal/fuzzing/corpus_manager.py @@ -663,21 +663,21 @@ def get_proto_corpus(bucket_name, # again. 
if max_download_urls is not None: urls = itertools.islice(urls, max_download_urls) - corpus_urls = dict( - storage.sign_urls_for_existing_files(urls, include_delete_urls)) - + corpus_urls = storage.sign_urls_for_existing_files(urls, include_delete_urls) upload_urls = storage.get_arbitrary_signed_upload_urls( gcs_url, num_uploads=max_upload_urls) corpus = uworker_msg_pb2.Corpus( # pylint: disable=no-member - corpus_urls=corpus_urls, - upload_urls=upload_urls, - gcs_url=gcs_url, - ) + gcs_url=gcs_url,) last_updated = _last_updated(_get_gcs_url(bucket_name, bucket_path)) if last_updated: timestamp = timestamp_pb2.Timestamp() # pylint: disable=no-member timestamp.FromDatetime(last_updated) corpus.last_updated_time.CopyFrom(timestamp) + # Iterate over imap_unordered results. + for upload_url in upload_urls: + corpus.upload_urls.append(upload_url) + for download_url, delete_url in corpus_urls: + corpus.corpus_urls[download_url] = delete_url return corpus diff --git a/src/clusterfuzz/_internal/google_cloud_utils/batch.py b/src/clusterfuzz/_internal/google_cloud_utils/batch.py index 553ebb2994..9c90fa23f4 100644 --- a/src/clusterfuzz/_internal/google_cloud_utils/batch.py +++ b/src/clusterfuzz/_internal/google_cloud_utils/batch.py @@ -14,7 +14,9 @@ """Cloud Batch helpers.""" import collections import threading +from typing import Dict from typing import List +from typing import Tuple import uuid from google.cloud import batch_v1 as batch @@ -25,6 +27,7 @@ from clusterfuzz._internal.base.tasks import task_utils from clusterfuzz._internal.config import local_config from clusterfuzz._internal.datastore import data_types +from clusterfuzz._internal.datastore import ndb_utils from clusterfuzz._internal.metrics import logs # TODO(metzman): Change to from . import credentials when we are done @@ -35,8 +38,6 @@ DEFAULT_RETRY_COUNT = 0 -TASK_BUNCH_SIZE = 20 - # Controls how many containers (ClusterFuzz tasks) can run on a single VM. # THIS SHOULD BE 1 OR THERE WILL BE SECURITY PROBLEMS. 
TASK_COUNT_PER_NODE = 1 @@ -54,7 +55,6 @@ 'subnetwork', 'preemptible', 'project', - 'gce_zone', 'machine_type', 'network', 'gce_region', @@ -66,10 +66,7 @@ def _create_batch_client_new(): """Creates a batch client.""" - creds, project = credentials.get_default() - if not project: - project = utils.get_application_id() - + creds, _ = credentials.get_default() return batch.BatchServiceClient(credentials=creds) @@ -108,9 +105,10 @@ def create_uworker_main_batch_job(module, job_type, input_download_url): def create_uworker_main_batch_jobs(batch_tasks: List[BatchTask]): """Creates batch jobs.""" job_specs = collections.defaultdict(list) + specs = _get_specs_from_config(batch_tasks) for batch_task in batch_tasks: logs.info(f'Scheduling {batch_task.command}, {batch_task.job_type}.') - spec = _get_spec_from_config(batch_task.command, batch_task.job_type) + spec = specs[(batch_task.command, batch_task.job_type)] job_specs[spec].append(batch_task.input_download_url) logs.info('Creating batch jobs.') @@ -119,7 +117,7 @@ def create_uworker_main_batch_jobs(batch_tasks: List[BatchTask]): logs.info('Batching utask_mains.') for spec, input_urls in job_specs.items(): for input_urls_portion in utils.batched(input_urls, - MAX_CONCURRENT_VMS_PER_JOB): + MAX_CONCURRENT_VMS_PER_JOB - 1): jobs.append(_create_job(spec, input_urls_portion)) return jobs @@ -209,7 +207,6 @@ def _create_job(spec, input_urls): job = batch.Job() job.task_groups = [task_group] job.allocation_policy = _get_allocation_policy(spec) - job.labels = {'env': 'testing', 'type': 'container'} job.logs_policy = batch.LogsPolicy() job.logs_policy.destination = batch.LogsPolicy.Destination.CLOUD_LOGGING job.priority = spec.priority @@ -251,27 +248,33 @@ def is_no_privilege_workload(command, job_name): def is_remote_task(command, job_name): try: - _get_spec_from_config(command, job_name) + _get_specs_from_config([BatchTask(command, job_name, None)]) return True except ValueError: return False -def _get_config_name(command, job_name): - """Returns the name of the config for |command| and |job_name|.""" - job = _get_job(job_name) - # As of this writing, batch only supports LINUX. - if utils.is_oss_fuzz(): - # TODO(b/377885331): In OSS-Fuzz, the platform can't be used because, as of - # it includes the project name. - config_name = 'LINUX' - else: - config_name = job.platform - if command == 'fuzz': - config_name += '-PREEMPTIBLE-UNPRIVILEGED' - else: - config_name += '-NONPREEMPTIBLE-UNPRIVILEGED' - return config_name +def _get_config_names( + batch_tasks: List[BatchTask]) -> Dict[Tuple[str, str], str]: + """"Gets the name of the configs for each batch_task. 
Returns a dict + that is indexed by command and job_type for efficient lookup.""" + job_names = {task.job_type for task in batch_tasks} + query = data_types.Job.query(data_types.Job.name.IN(list(job_names))) + jobs = ndb_utils.get_all_from_query(query) + job_map = {job.name: job for job in jobs} + config_map = {} + for task in batch_tasks: + if task.job_type not in job_map: + logs.error(f'{task.job_type} doesn\'t exist.') + continue + if task.command == 'fuzz': + suffix = '-PREEMPTIBLE-UNPRIVILEGED' + else: + suffix = '-NONPREEMPTIBLE-UNPRIVILEGED' + job = job_map[task.job_type] + platform = job.platform if not utils.is_oss_fuzz() else 'LINUX' + config_map[(task.command, task.job_type)] = f'{platform}{suffix}' + return config_map def _get_task_duration(command): @@ -279,46 +282,89 @@ def _get_task_duration(command): tasks.TASK_LEASE_SECONDS) -def _get_spec_from_config(command, job_name): +WeightedSubconfig = collections.namedtuple('WeightedSubconfig', + ['name', 'weight']) + + +def _get_subconfig(batch_config, instance_spec): + # TODO(metzman): Make this pick one at random or based on conditions. + all_subconfigs = batch_config.get('subconfigs', {}) + instance_subconfigs = instance_spec['subconfigs'] + weighted_subconfigs = [ + WeightedSubconfig(subconfig['name'], subconfig['weight']) + for subconfig in instance_subconfigs + ] + weighted_subconfig = utils.random_weighted_choice(weighted_subconfigs) + return all_subconfigs[weighted_subconfig.name] + + +def _get_specs_from_config(batch_tasks) -> Dict: """Gets the configured specifications for a batch workload.""" - config_name = _get_config_name(command, job_name) + if not batch_tasks: + return {} batch_config = _get_batch_config() - instance_spec = batch_config.get('mapping').get(config_name, None) - if instance_spec is None: - raise ValueError(f'No mapping for {config_name}') - project_name = batch_config.get('project') - docker_image = instance_spec['docker_image'] - user_data = instance_spec['user_data'] - should_retry = instance_spec.get('retry', False) - clusterfuzz_release = instance_spec.get('clusterfuzz_release', 'prod') - - # Lower numbers are lower priority. From: - # https://cloud.google.com/batch/docs/reference/rest/v1/projects.locations.jobs - low_priority = command == 'fuzz' - priority = 0 if low_priority else 1 - - max_run_duration = f'{_get_task_duration(command)}s' - if command == 'corpus_pruning': - should_retry = False # It is naturally retried the next day. - - spec = BatchWorkloadSpec( - clusterfuzz_release=clusterfuzz_release, - docker_image=docker_image, - user_data=user_data, - disk_size_gb=instance_spec['disk_size_gb'], - disk_type=instance_spec['disk_type'], - service_account_email=instance_spec['service_account_email'], - # TODO(metzman): Get rid of zone so that we can more easily run in - # multiple regions. - gce_zone=instance_spec['gce_zone'], - gce_region=instance_spec['gce_region'], - project=project_name, - network=instance_spec['network'], - subnetwork=instance_spec['subnetwork'], - preemptible=instance_spec['preemptible'], - machine_type=instance_spec['machine_type'], - priority=priority, - max_run_duration=max_run_duration, - retry=should_retry, - ) - return spec + config_map = _get_config_names(batch_tasks) + specs = {} + subconfig_map = {} + for task in batch_tasks: + if (task.command, task.job_type) in specs: + # Don't repeat work for no reason. 
+ continue + config_name = config_map[(task.command, task.job_type)] + + instance_spec = batch_config.get('mapping').get(config_name) + if instance_spec is None: + raise ValueError(f'No mapping for {config_name}') + config_name = config_map[(task.command, task.job_type)] + project_name = batch_config.get('project') + clusterfuzz_release = instance_spec.get('clusterfuzz_release', 'prod') + # Lower numbers are a lower priority, meaning less likely to run From: + # https://cloud.google.com/batch/docs/reference/rest/v1/projects.locations.jobs + priority = 0 if task.command == 'fuzz' else 1 + max_run_duration = f'{_get_task_duration(task.command)}s' + # This saves us time and reduces fragementation, e.g. every linux fuzz task + # run in this call will run in the same zone. + if config_name not in subconfig_map: + subconfig = _get_subconfig(batch_config, instance_spec) + subconfig_map[config_name] = subconfig + + should_retry = instance_spec.get('retry', False) + if should_retry and task.command == 'corpus_pruning': + should_retry = False # It is naturally retried the next day. + + subconfig = subconfig_map[config_name] + spec = BatchWorkloadSpec( + docker_image=instance_spec['docker_image'], + disk_size_gb=instance_spec['disk_size_gb'], + disk_type=instance_spec['disk_type'], + user_data=instance_spec['user_data'], + service_account_email=instance_spec['service_account_email'], + preemptible=instance_spec['preemptible'], + machine_type=instance_spec['machine_type'], + gce_region=subconfig['region'], + network=subconfig['network'], + subnetwork=subconfig['subnetwork'], + project=project_name, + clusterfuzz_release=clusterfuzz_release, + priority=priority, + max_run_duration=max_run_duration, + retry=should_retry, + ) + specs[(task.command, task.job_type)] = spec + return specs + + +def count_queued_or_scheduled_tasks(project: str, + region: str) -> Tuple[int, int]: + """Counts the number of queued and scheduled tasks.""" + region = f'projects/{project}/locations/{region}' + jobs_filter = 'Status.State="SCHEDULED" OR Status.State="QUEUED"' + req = batch.types.ListJobsRequest(parent=region, filter=jobs_filter) + queued = 0 + scheduled = 0 + for job in _batch_client().list_jobs(request=req): + if job.status.state == batch.JobStatus.State.SCHEDULED: + scheduled += job.task_groups[0].task_count + elif job.status.state == batch.JobStatus.State.QUEUED: + queued += job.task_groups[0].task_count + return (queued, scheduled) diff --git a/src/clusterfuzz/_internal/google_cloud_utils/storage.py b/src/clusterfuzz/_internal/google_cloud_utils/storage.py index b4365b2ff9..ca703f69ce 100644 --- a/src/clusterfuzz/_internal/google_cloud_utils/storage.py +++ b/src/clusterfuzz/_internal/google_cloud_utils/storage.py @@ -14,6 +14,7 @@ """Functions for managing Google Cloud Storage.""" import collections +from concurrent import futures import copy import datetime import json @@ -24,6 +25,7 @@ from typing import List from typing import Tuple import uuid +from xml.etree import ElementTree as ET import google.auth.exceptions from googleapiclient.discovery import build @@ -32,6 +34,7 @@ import requests.exceptions from clusterfuzz._internal.base import concurrency +from clusterfuzz._internal.base import errors from clusterfuzz._internal.base import memoize from clusterfuzz._internal.base import retry from clusterfuzz._internal.base import utils @@ -117,7 +120,7 @@ class StorageProvider: """Core storage provider interface.""" - def create_bucket(self, name, object_lifecycle, cors): + def create_bucket(self, name, 
object_lifecycle, cors, location): """Create a new bucket.""" raise NotImplementedError @@ -198,7 +201,7 @@ def _chunk_size(self): return None - def create_bucket(self, name, object_lifecycle, cors): + def create_bucket(self, name, object_lifecycle, cors, location): """Create a new bucket.""" project_id = utils.get_application_id() request_body = {'name': name} @@ -208,6 +211,9 @@ def create_bucket(self, name, object_lifecycle, cors): if cors: request_body['cors'] = cors + if location: + request_body['location'] = location + client = create_discovery_storage_client() try: client.buckets().insert(project=project_id, body=request_body).execute() @@ -237,7 +243,6 @@ def list_blobs(self, remote_path, recursive=True, names_only=False): client = _storage_client() bucket = client.bucket(bucket_name) - properties = {} if recursive: delimiter = None @@ -249,23 +254,37 @@ def list_blobs(self, remote_path, recursive=True, names_only=False): else: fields = None - iterator = bucket.list_blobs( - prefix=path, delimiter=delimiter, fields=fields) - for blob in iterator: - properties['bucket'] = bucket_name - properties['name'] = blob.name - properties['updated'] = blob.updated - properties['size'] = blob.size - - yield properties - - if not recursive: - # When doing delimiter listings, the "directories" will be in `prefixes`. - for prefix in iterator.prefixes: - properties['bucket'] = bucket_name - properties['name'] = prefix + iterations = 0 + while True: + iterations += 1 + iterator = bucket.list_blobs( + prefix=path, delimiter=delimiter, fields=fields) + for blob in iterator: + properties = { + 'bucket': bucket_name, + 'name': blob.name, + 'updated': blob.updated, + 'size': blob.size, + } + yield properties + if not recursive: + # When doing delimiter listings, the "directories" will be in + # `prefixes`. 
+ for prefix in iterator.prefixes: + properties = { + 'bucket': bucket_name, + 'name': prefix, + } + yield properties + + next_page_token = iterator.next_page_token + if next_page_token is None: + break + if iterations and iterations % 50 == 0: + logs.error('Might be infinite looping.') + def copy_file_from(self, remote_path, local_path): """Copy file from a remote path to a local path.""" client = _storage_client() @@ -543,7 +562,7 @@ def convert_path_for_write(self, remote_path, directory=OBJECTS_DIR): return fs_path - def create_bucket(self, name, object_lifecycle, cors): + def create_bucket(self, name, object_lifecycle, cors, location): """Create a new bucket.""" bucket_path = self._fs_bucket_path(name) if os.path.exists(bucket_path): @@ -905,13 +924,16 @@ def set_bucket_iam_policy(client, bucket_name, iam_policy): return None -def create_bucket_if_needed(bucket_name, object_lifecycle=None, cors=None): +def create_bucket_if_needed(bucket_name, + object_lifecycle=None, + cors=None, + location=None): """Creates a GCS bucket.""" provider = _provider() if provider.get_bucket(bucket_name): return True - if not provider.create_bucket(bucket_name, object_lifecycle, cors): + if not provider.create_bucket(bucket_name, object_lifecycle, cors, location): return False time.sleep(CREATE_BUCKET_DELAY) @@ -1077,7 +1099,8 @@ def get_blobs(cloud_storage_path, recursive=True): exception_types=_TRANSIENT_ERRORS) def list_blobs(cloud_storage_path, recursive=True): """Return blob names under the given cloud storage path.""" - for blob in _provider().list_blobs(cloud_storage_path, recursive=recursive): + for blob in _provider().list_blobs( + cloud_storage_path, recursive=recursive, names_only=True): yield blob['name'] @@ -1195,6 +1218,15 @@ def _integration_test_env_doesnt_support_signed_urls(): 'UNTRUSTED_RUNNER_TESTS') +class ExpiredSignedUrlError(errors.Error): + """Expired Signed URL.""" + + def __init__(self, message, url=None, response_text=None): + super().__init__(message) + self.url = url + self.response_text = response_text + + # Don't retry so hard. We don't want to slow down corpus downloading. @retry.wrap( retries=1, @@ -1205,11 +1237,21 @@ def _download_url(url): """Downloads |url| and returns the contents.""" if _integration_test_env_doesnt_support_signed_urls(): return read_data(url) - request = requests.get(url, timeout=HTTP_TIMEOUT_SECONDS) - if not request.ok: + response = requests.get(url, timeout=HTTP_TIMEOUT_SECONDS) + if not response.ok: + try: + element_tree = ET.fromstring(response.text) + error = element_tree.find('Code').text + if error == 'ExpiredToken': + raise ExpiredSignedUrlError('Expired token for signed URL.', url, + response.text) + except ExpiredSignedUrlError: + raise + except: + pass raise RuntimeError('Request to %s failed. Code: %d. 
Reason: %s' % - (url, request.status_code, request.reason)) - return request.content + (url, response.status_code, response.text)) + return response.content @retry.wrap( @@ -1233,7 +1275,6 @@ def str_to_bytes(data): def download_signed_url_to_file(url, filepath): - # print('filepath', filepath) contents = download_signed_url(url) os.makedirs(os.path.dirname(filepath), exist_ok=True) with open(filepath, 'wb') as fp: @@ -1364,11 +1405,10 @@ def _mappable_sign_urls_for_existing_file(url_and_include_delete_urls): return _sign_urls_for_existing_file(url, include_delete_urls) -def sign_urls_for_existing_files(urls, - include_delete_urls) -> List[Tuple[str, str]]: +def sign_urls_for_existing_files(urls, include_delete_urls): logs.info('Signing URLs for existing files.') args = ((url, include_delete_urls) for url in urls) - result = maybe_parallel_map(_sign_urls_for_existing_file, args) + result = parallel_map(_sign_urls_for_existing_file, args) logs.info('Done signing URLs for existing files.') return result @@ -1378,21 +1418,27 @@ def get_arbitrary_signed_upload_url(remote_directory): get_arbitrary_signed_upload_urls(remote_directory, num_uploads=1))[0] -def maybe_parallel_map(func, arguments): +def parallel_map(func, argument_list): """Wrapper around pool.map so we don't do it on OSS-Fuzz hosts which will OOM.""" - if not environment.is_tworker(): - # TODO(b/metzman): When the rearch is done, internal google CF won't have - # tworkers, but maybe should be using parallel. - return list(map(func, arguments)) - max_size = 2 - with concurrency.make_pool(cpu_bound=True, max_pool_size=max_size) as pool: - return list(pool.map(func, arguments)) - - -def get_arbitrary_signed_upload_urls(remote_directory: str, - num_uploads: int) -> List[str]: + timeout = 120 + with concurrency.make_pool(max_pool_size=max_size) as pool: + calls = {pool.submit(func, argument) for argument in argument_list} + while calls: + finished_calls, _ = futures.wait( + calls, timeout=timeout, return_when=futures.FIRST_COMPLETED) + if not finished_calls: + logs.error('No call completed.') + for call in calls: + call.cancel() + raise TimeoutError(f'Nothing completed within {timeout} seconds') + for call in finished_calls: + calls.remove(call) + yield call.result(timeout=timeout) + + +def get_arbitrary_signed_upload_urls(remote_directory: str, num_uploads: int): """Returns |num_uploads| number of signed upload URLs to upload files with unique arbitrary names to remote_directory.""" # We don't verify there are no collisions for uuid4s because it's extremely @@ -1408,6 +1454,6 @@ def get_arbitrary_signed_upload_urls(remote_directory: str, urls = (f'{base_path}-{idx}' for idx in range(num_uploads)) logs.info('Signing URLs for arbitrary uploads.') - result = maybe_parallel_map(get_signed_upload_url, urls) + result = parallel_map(get_signed_upload_url, urls) logs.info('Done signing URLs for arbitrary uploads.') return result diff --git a/src/clusterfuzz/_internal/issue_management/google_issue_tracker/issue_tracker.py b/src/clusterfuzz/_internal/issue_management/google_issue_tracker/issue_tracker.py index f32694abf1..f97de2a03f 100644 --- a/src/clusterfuzz/_internal/issue_management/google_issue_tracker/issue_tracker.py +++ b/src/clusterfuzz/_internal/issue_management/google_issue_tracker/issue_tracker.py @@ -425,6 +425,11 @@ def assignee(self, new_assignee): self._changed.add('assignee') self._data['issueState']['assignee'] = _make_user(new_assignee) + @property + def created_time(self): + """The time at which this issue was created.""" + 
return self._data['createdTime'] + @property def ccs(self): """The issue CC list.""" diff --git a/src/clusterfuzz/_internal/metrics/monitor.py b/src/clusterfuzz/_internal/metrics/monitor.py index 859732d7b0..5c278119cb 100644 --- a/src/clusterfuzz/_internal/metrics/monitor.py +++ b/src/clusterfuzz/_internal/metrics/monitor.py @@ -87,6 +87,7 @@ def _create_time_series(name: str, time_series: List[_TimeSeries]): _monitoring_v3_client.create_time_series(name=name, time_series=time_series) except Exception as e: logs.warning(f'Error uploading time series: {e}') + logs.warning(f'Time series - {name} - contents: {time_series}') class _MockMetric: @@ -313,9 +314,8 @@ def monitoring_v3_metric(self, metric, labels=None): for key, value in labels.items(): metric.labels[key] = str(value) - if not environment.is_running_on_k8s(): - bot_name = environment.get_value('BOT_NAME', None) - metric.labels['region'] = _get_region(bot_name) + bot_name = environment.get_value('BOT_NAME', None) + metric.labels['region'] = _get_region(bot_name) return metric @@ -569,6 +569,8 @@ def _initialize_monitored_resource(): # Use bot name here instance as that's more useful to us. if environment.is_running_on_k8s(): instance_name = environment.get_value('HOSTNAME') + elif environment.is_running_on_app_engine(): + instance_name = environment.get_value('GAE_INSTANCE') else: instance_name = environment.get_value('BOT_NAME') _monitored_resource.labels['instance_id'] = instance_name @@ -623,6 +625,9 @@ def metrics_store(): def _get_region(bot_name): """Get bot region.""" + if not bot_name: + return 'unknown' + try: regions = local_config.MonitoringRegionsConfig() except errors.BadConfigError: diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py index 58489eae45..dfd92e100e 100644 --- a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py +++ b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py @@ -231,16 +231,20 @@ ], ) -TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric( - 'uploaded_testcase_analysis/triage_duration_secs', +TESTCASE_TRIAGE_DURATION = monitor.CumulativeDistributionMetric( + 'testcase_analysis/triage_duration_hours', description=('Time elapsed between testcase upload and completion' - ' of relevant tasks in the testcase upload lifecycle.'), + ' of relevant tasks in the testcase lifecycle.' + ' Origin can be either from a fuzzer, or a manual' + ' upload. 
Measured in hours.'), bucketer=monitor.GeometricBucketer(), field_spec=[ monitor.StringField('step'), monitor.StringField('job'), + monitor.BooleanField('from_fuzzer'), ], ) + TASK_RATE_LIMIT_COUNT = monitor.CounterMetric( 'task/rate_limit', description=('Counter for rate limit events.'), @@ -250,6 +254,30 @@ monitor.StringField('argument'), ]) +TASK_OUTCOME_COUNT = monitor.CounterMetric( + 'task/outcome', + description=('Counter metric for task outcome (success/failure).'), + field_spec=[ + monitor.StringField('task'), + monitor.StringField('job'), + monitor.StringField('subtask'), + monitor.StringField('mode'), + monitor.StringField('platform'), + monitor.BooleanField('task_succeeded'), + ]) + +TASK_OUTCOME_COUNT_BY_ERROR_TYPE = monitor.CounterMetric( + 'task/outcome_by_error_type', + description=('Counter metric for task outcome, with error type.'), + field_spec=[ + monitor.StringField('task'), + monitor.StringField('subtask'), + monitor.StringField('mode'), + monitor.StringField('platform'), + monitor.BooleanField('task_succeeded'), + monitor.StringField('error_condition'), + ]) + UTASK_SUBTASK_E2E_DURATION_SECS = monitor.CumulativeDistributionMetric( 'utask/subtask_e2e_duration_secs', description=( @@ -332,13 +360,25 @@ 'issues/untriaged_testcase_age', description='Age of testcases that were not yet triaged ' '(have not yet completed analyze, regression,' - ' minimization, impact task), in seconds.', + ' minimization, impact task), in hours.', bucketer=monitor.GeometricBucketer(), field_spec=[ monitor.StringField('job'), monitor.StringField('platform'), + monitor.StringField('step'), ]) +UNTRIAGED_TESTCASE_COUNT = monitor.GaugeMetric( + 'issues/untriaged_testcase_count', + description='Number of testcases that were not yet triaged ' + '(have not yet completed analyze, regression,' + ' minimization, impact task), in hours.', + field_spec=[ + monitor.StringField('job'), + monitor.StringField('status'), + ], +) + ANALYZE_TASK_REPRODUCIBILITY = monitor.CounterMetric( 'task/analyze/reproducibility', description='Outcome count for analyze task.', diff --git a/src/clusterfuzz/_internal/platforms/android/adb.py b/src/clusterfuzz/_internal/platforms/android/adb.py index b406bf2799..405a8366ac 100755 --- a/src/clusterfuzz/_internal/platforms/android/adb.py +++ b/src/clusterfuzz/_internal/platforms/android/adb.py @@ -242,6 +242,10 @@ def get_fastboot_command_line(fastboot_cmd): def get_fastboot_path(): """Return path to fastboot binary.""" + fastboot_path = environment.get_value('FASTBOOT') + if fastboot_path: + return fastboot_path + return os.path.join(environment.get_platform_resources_directory(), 'fastboot') diff --git a/src/clusterfuzz/_internal/platforms/linux/gestures.py b/src/clusterfuzz/_internal/platforms/linux/gestures.py index 9330ba550c..2c5aae3921 100644 --- a/src/clusterfuzz/_internal/platforms/linux/gestures.py +++ b/src/clusterfuzz/_internal/platforms/linux/gestures.py @@ -22,7 +22,6 @@ from clusterfuzz._internal.system import shell MAX_CHARS_TO_TYPE = 20 -RELOAD_GESTURE = 'key,F5' COORDINATE_DELTA_MIN = -100 COORDINATE_DELTA_MAX = 200 diff --git a/src/clusterfuzz/_internal/platforms/windows/gestures.py b/src/clusterfuzz/_internal/platforms/windows/gestures.py index e35cb50372..be490d30dd 100644 --- a/src/clusterfuzz/_internal/platforms/windows/gestures.py +++ b/src/clusterfuzz/_internal/platforms/windows/gestures.py @@ -26,8 +26,6 @@ # This can be imported from appengine, so make sure we don't exception out. 
pass -RELOAD_GESTURE = 'key,{F5}' - def find_windows_for_process(process_id): """Return visible windows belonging to a process.""" diff --git a/src/clusterfuzz/_internal/system/environment.py b/src/clusterfuzz/_internal/system/environment.py index abaae14a2e..bfaeef7dab 100644 --- a/src/clusterfuzz/_internal/system/environment.py +++ b/src/clusterfuzz/_internal/system/environment.py @@ -695,9 +695,11 @@ def get_engine_for_job(job_name=None): def is_minimization_supported(): """Return True if the current job supports minimization. - Currently blackbox-fuzzer jobs or libfuzzer support minimization. + Currently blackbox-fuzzer jobs or libfuzzer support minimization, unless + skipped using the SKIP_MINIMIZATION environment variable. """ - return not is_engine_fuzzer_job() or is_libfuzzer_job() + skipped = get_value('SKIP_MINIMIZATION', False) + return not skipped and (not is_engine_fuzzer_job() or is_libfuzzer_job()) def is_posix(): diff --git a/src/clusterfuzz/_internal/tests/appengine/handlers/cron/external_testcase_reader_test.py b/src/clusterfuzz/_internal/tests/appengine/handlers/cron/external_testcase_reader_test.py index b0dc1d6b33..276b411695 100644 --- a/src/clusterfuzz/_internal/tests/appengine/handlers/cron/external_testcase_reader_test.py +++ b/src/clusterfuzz/_internal/tests/appengine/handlers/cron/external_testcase_reader_test.py @@ -17,6 +17,8 @@ from unittest import mock from clusterfuzz._internal.cron import external_testcase_reader +from clusterfuzz._internal.issue_management.google_issue_tracker import \ + issue_tracker BASIC_ATTACHMENT = { 'attachmentId': '60127668', @@ -34,90 +36,107 @@ class ExternalTestcaseReaderTest(unittest.TestCase): """external_testcase_reader tests.""" def setUp(self): - self.issue_tracker = mock.MagicMock() - self.mock_submit_testcase = mock.MagicMock() - self.mock_close_invalid_issue = mock.MagicMock() + self.mock_basic_issue = mock.MagicMock() + self.mock_basic_issue.created_time = '2024-06-25T01:29:30.021Z' + self.mock_basic_issue.status = 'NEW' + external_testcase_reader.submit_testcase = mock.MagicMock() def test_handle_testcases(self): - """Test a basic handle_testcases where issue is valid.""" - mock_iter = mock.MagicMock() - mock_iter.__iter__.return_value = [mock.MagicMock()] - self.issue_tracker.find_issues.return_value = mock_iter - self.mock_close_invalid_issue.return_value = False - external_testcase_reader.close_invalid_issue = self.mock_close_invalid_issue - external_testcase_reader.submit_testcase = self.mock_submit_testcase - - external_testcase_reader.handle_testcases(self.issue_tracker) - self.mock_close_invalid_issue.assert_called_once() - self.issue_tracker.get_attachment.assert_called_once() - self.mock_submit_testcase.assert_called_once() + """Test a basic handle_testcases where issue is fit for submission.""" + mock_it = mock.create_autospec(issue_tracker.IssueTracker) + mock_it.find_issues_with_filters.return_value = [self.mock_basic_issue] + external_testcase_reader.close_issue_if_invalid = mock.MagicMock() + external_testcase_reader.close_issue_if_invalid.return_value = False + + external_testcase_reader.handle_testcases(mock_it) + external_testcase_reader.close_issue_if_invalid.assert_called_once() + mock_it.get_attachment.assert_called_once() + external_testcase_reader.submit_testcase.assert_called_once() def test_handle_testcases_invalid(self): """Test a basic handle_testcases where issue is invalid.""" - mock_iter = mock.MagicMock() - mock_iter.__iter__.return_value = [mock.MagicMock()] - 
self.issue_tracker.find_issues.return_value = mock_iter - self.mock_close_invalid_issue.return_value = True - external_testcase_reader.close_invalid_issue = self.mock_close_invalid_issue - external_testcase_reader.submit_testcase = self.mock_submit_testcase - - external_testcase_reader.handle_testcases(self.issue_tracker) - self.mock_close_invalid_issue.assert_called_once() - self.issue_tracker.get_attachment.assert_not_called() - self.mock_submit_testcase.assert_not_called() + mock_it = mock.create_autospec(issue_tracker.IssueTracker) + mock_it.find_issues_with_filters.return_value = [self.mock_basic_issue] + external_testcase_reader.close_issue_if_invalid = mock.MagicMock() + external_testcase_reader.close_issue_if_invalid.return_value = True + + external_testcase_reader.handle_testcases(mock_it) + external_testcase_reader.close_issue_if_invalid.assert_called_once() + mock_it.get_attachment.assert_not_called() + external_testcase_reader.submit_testcase.assert_not_called() + + def test_handle_testcases_not_reproducible(self): + """Test a basic handle_testcases where issue is not reprodiclbe.""" + mock_it = mock.create_autospec(issue_tracker.IssueTracker) + mock_it.find_issues_with_filters.return_value = [self.mock_basic_issue] + external_testcase_reader.close_issue_if_not_reproducible = mock.MagicMock() + external_testcase_reader.close_issue_if_not_reproducible.return_value = True + external_testcase_reader.close_issue_if_invalid = mock.MagicMock() + + external_testcase_reader.handle_testcases(mock_it) + external_testcase_reader.close_issue_if_invalid.assert_not_called() + mock_it.get_attachment.assert_not_called() + external_testcase_reader.submit_testcase.assert_not_called() def test_handle_testcases_no_issues(self): """Test a basic handle_testcases that returns no issues.""" - self.issue_tracker.find_issues.return_value = None - - external_testcase_reader.handle_testcases(self.issue_tracker) - self.mock_close_invalid_issue.assert_not_called() - self.issue_tracker.get_attachment.assert_not_called() - self.mock_submit_testcase.assert_not_called() + mock_it = mock.create_autospec(issue_tracker.IssueTracker) + mock_it.find_issues_with_filters.return_value = [] + external_testcase_reader.close_issue_if_invalid = mock.MagicMock() + + external_testcase_reader.handle_testcases(mock_it) + external_testcase_reader.close_issue_if_invalid.assert_not_called() + mock_it.get_attachment.assert_not_called() + external_testcase_reader.submit_testcase.assert_not_called() + + def test_close_issue_if_not_reproducible_true(self): + """Test a basic close_issue_if_invalid with valid flags.""" + external_testcase_reader.filed_one_day_ago = mock.MagicMock() + external_testcase_reader.filed_one_day_ago.return_value = True + self.mock_basic_issue.status = 'ACCEPTED' + self.assertEqual( + True, + external_testcase_reader.close_issue_if_not_reproducible( + self.mock_basic_issue)) - def test_close_invalid_issue_basic(self): - """Test a basic _close_invalid_issue with valid flags.""" - upload_request = mock.Mock() + def test_close_issue_if_invalid_basic(self): + """Test a basic close_issue_if_invalid with valid flags.""" attachment_info = [BASIC_ATTACHMENT] description = '--flag-one --flag_two' self.assertEqual( False, - external_testcase_reader.close_invalid_issue( - upload_request, attachment_info, description)) + external_testcase_reader.close_issue_if_invalid( + self.mock_basic_issue, attachment_info, description)) - def test_close_invalid_issue_no_flag(self): - """Test a basic _close_invalid_issue with no 
flags.""" - upload_request = mock.Mock() + def test_close_issue_if_invalid_no_flag(self): + """Test a basic close_issue_if_invalid with no flags.""" attachment_info = [BASIC_ATTACHMENT] description = '' self.assertEqual( False, - external_testcase_reader.close_invalid_issue( - upload_request, attachment_info, description)) + external_testcase_reader.close_issue_if_invalid( + self.mock_basic_issue, attachment_info, description)) - def test_close_invalid_issue_too_many_attachments(self): - """Test _close_invalid_issue with too many attachments.""" - upload_request = mock.Mock() + def test_close_issue_if_invalid_too_many_attachments(self): + """Test close_issue_if_invalid with too many attachments.""" attachment_info = [BASIC_ATTACHMENT, BASIC_ATTACHMENT] description = '' self.assertEqual( True, - external_testcase_reader.close_invalid_issue( - upload_request, attachment_info, description)) + external_testcase_reader.close_issue_if_invalid( + self.mock_basic_issue, attachment_info, description)) - def test_close_invalid_issue_no_attachments(self): - """Test _close_invalid_issue with no attachments.""" - upload_request = mock.Mock() + def test_close_issue_if_invalid_no_attachments(self): + """Test close_issue_if_invalid with no attachments.""" attachment_info = [] description = '' self.assertEqual( True, - external_testcase_reader.close_invalid_issue( - upload_request, attachment_info, description)) + external_testcase_reader.close_issue_if_invalid( + self.mock_basic_issue, attachment_info, description)) - def test_close_invalid_issue_invalid_upload(self): - """Test _close_invalid_issue with an invalid upload.""" - upload_request = mock.Mock() + def test_close_issue_if_invalid_invalid_upload(self): + """Test close_issue_if_invalid with an invalid upload.""" attachment_info = [{ 'attachmentId': '60127668', 'contentType': 'application/octet-stream', @@ -129,12 +148,11 @@ def test_close_invalid_issue_invalid_upload(self): description = '' self.assertEqual( True, - external_testcase_reader.close_invalid_issue( - upload_request, attachment_info, description)) + external_testcase_reader.close_issue_if_invalid( + self.mock_basic_issue, attachment_info, description)) - def test_close_invalid_issue_invalid_content_type(self): - """Test _close_invalid_issue with an invalid content type.""" - upload_request = mock.Mock() + def test_close_issue_if_invalid_invalid_content_type(self): + """Test close_issue_if_invalid with an invalid content type.""" attachment_info = [{ 'attachmentId': '60127668', 'contentType': 'application/octet-stream', @@ -148,5 +166,5 @@ def test_close_invalid_issue_invalid_content_type(self): description = '' self.assertEqual( True, - external_testcase_reader.close_invalid_issue( - upload_request, attachment_info, description)) + external_testcase_reader.close_issue_if_invalid( + self.mock_basic_issue, attachment_info, description)) diff --git a/src/clusterfuzz/_internal/tests/appengine/handlers/cron/manage_vms_test.py b/src/clusterfuzz/_internal/tests/appengine/handlers/cron/manage_vms_test.py index 34226c3d5f..aa5f84444d 100644 --- a/src/clusterfuzz/_internal/tests/appengine/handlers/cron/manage_vms_test.py +++ b/src/clusterfuzz/_internal/tests/appengine/handlers/cron/manage_vms_test.py @@ -13,1165 +13,6 @@ # limitations under the License. 
"""manage_vms tests.""" -import copy -import functools -import unittest -from unittest import mock - -from google.cloud import ndb - -from clusterfuzz._internal.cron import manage_vms -from clusterfuzz._internal.cron.helpers import bot_manager -from clusterfuzz._internal.datastore import data_types -from clusterfuzz._internal.google_cloud_utils import compute_engine_projects -from clusterfuzz._internal.tests.test_libs import helpers as test_helpers -from clusterfuzz._internal.tests.test_libs import test_utils - -AUTO_HEALING_POLICY = compute_engine_projects.AutoHealingPolicy( - health_check='global/healthChecks/example-check', - initial_delay_sec=300, -) - -AUTO_HEALING_POLICY_DICT = { - 'healthCheck': 'global/healthChecks/example-check', - 'initialDelaySec': 300 -} - -INSTANCE_GROUPS = { - 'oss-fuzz-linux-zone2-pre-proj2': { - 'targetSize': 1, - 'autoHealingPolicies': [AUTO_HEALING_POLICY], - }, - 'oss-fuzz-linux-zone2-pre-proj3': { - 'targetSize': 499, - }, - 'oss-fuzz-linux-zone2-pre-proj4': { - 'targetSize': 99, - }, - 'oss-fuzz-linux-zone2-pre-proj5': { - 'targetSize': 99, - } -} - -INSTANCE_TEMPLATES = { - 'oss-fuzz-linux-zone2-pre-proj2': { - 'description': '{"version": 1}', - 'properties': { - 'metadata': { - 'items': [], - }, - 'disks': [{ - 'initializeParams': { - 'diskSizeGb': '30', - }, - }], - 'serviceAccounts': [{ - 'email': - 'email', - 'scopes': [ - 'https://www.googleapis.com/auth/' - 'devstorage.full_control', - 'https://www.googleapis.com/auth/logging.write', - 'https://www.googleapis.com/auth/userinfo.email', - 'https://www.googleapis.com/auth/appengine.apis', - 'https://www.googleapis.com/auth/prodxmon', - 'https://www.googleapis.com/auth/bigquery', - ] - }], - } - }, - 'oss-fuzz-linux-zone2-pre-proj3': { - 'description': '{"version": 1}', - 'properties': { - 'metadata': { - 'items': [], - }, - 'disks': [{ - 'initializeParams': { - 'diskSizeGb': '30', - }, - }], - 'serviceAccounts': [{ - 'email': - 'email', - 'scopes': [ - 'https://www.googleapis.com/auth/' - 'devstorage.full_control', - 'https://www.googleapis.com/auth/logging.write', - 'https://www.googleapis.com/auth/userinfo.email', - 'https://www.googleapis.com/auth/appengine.apis', - 'https://www.googleapis.com/auth/prodxmon', - 'https://www.googleapis.com/auth/bigquery', - ] - }], - } - }, - 'oss-fuzz-linux-zone2-pre-proj4': { - 'description': '{"version": 0}', - 'properties': { - 'metadata': { - 'items': [], - }, - 'disks': [{ - 'initializeParams': { - 'diskSizeGb': '30', - }, - }], - 'serviceAccounts': [{ - 'email': - 'email', - 'scopes': [ - 'https://www.googleapis.com/auth/' - 'devstorage.full_control', - 'https://www.googleapis.com/auth/logging.write', - 'https://www.googleapis.com/auth/userinfo.email', - 'https://www.googleapis.com/auth/appengine.apis', - 'https://www.googleapis.com/auth/prodxmon', - 'https://www.googleapis.com/auth/bigquery', - ] - }], - } - }, - 'oss-fuzz-linux-zone2-pre-proj5': { - 'description': '{"version": 1}', - 'properties': { - 'metadata': { - 'items': [], - }, - 'disks': [{ - 'initializeParams': { - 'diskSizeGb': '30', - }, - }], - 'serviceAccounts': [{ - 'email': - 'email', - 'scopes': [ - 'https://www.googleapis.com/auth/' - 'devstorage.full_control', - 'https://www.googleapis.com/auth/logging.write', - 'https://www.googleapis.com/auth/userinfo.email', - 'https://www.googleapis.com/auth/appengine.apis', - 'https://www.googleapis.com/auth/prodxmon', - 'https://www.googleapis.com/auth/bigquery', - ] - }], - } - } -} - -INSTANCES = { - 'oss-fuzz-linux-zone3-host': [{ - 'instance': 
'https://blah/oss-fuzz-linux-zone3-host-abcd', - }, { - 'instance': 'https://blah/oss-fuzz-linux-zone3-host-efgh', - }], - 'oss-fuzz-linux-zone3-worker-proj1': [{ - 'instance': 'https://blah/oss-fuzz-linux-zone3-worker-proj1-%04d' % i - } for i in range(1, 2)], - 'oss-fuzz-linux-zone3-worker-proj2': [{ - 'instance': 'https://blah/oss-fuzz-linux-zone3-worker-proj2-%04d' % i - } for i in range(1, 5)], - 'oss-fuzz-linux-zone3-worker-proj3': [{ - 'instance': 'https://blah/oss-fuzz-linux-zone3-worker-proj3-%04d' % i - } for i in range(1, 10)], - 'oss-fuzz-linux-zone3-worker-proj4': [{ - 'instance': 'https://blah/oss-fuzz-linux-zone3-worker-proj4-%04d' % i - } for i in range(1, 2)], - 'oss-fuzz-linux-zone3-worker-proj5': [{ - 'instance': 'https://blah/oss-fuzz-linux-zone3-worker-proj5-%04d' % i - } for i in range(1, 2)], - 'oss-fuzz-linux-zone3-host-high-end': [{ - 'instance': 'https://blah/oss-fuzz-linux-zone3-host-high-end-1' - }], - 'oss-fuzz-linux-zone3-worker-high-end-proj6': [{ - 'instance': ('https://blah/' - 'oss-fuzz-linux-zone3-worker-high-end-proj6-%04d' % i) - } for i in range(1, 3)], -} - -OSS_FUZZ_CLUSTERS = compute_engine_projects.Project( - project_id='clusterfuzz-external', - clusters=[ - compute_engine_projects.Cluster( - name='oss-fuzz-linux-zone2-pre', - gce_zone='us-east2-a', - instance_count=997, - instance_template='external-pre-zone2', - distribute=True, - auto_healing_policy=AUTO_HEALING_POLICY, - worker=False, - high_end=False), - compute_engine_projects.Cluster( - name='oss-fuzz-linux-zone3-host', - gce_zone='us-central1-d', - instance_count=2, - instance_template='host-zone3', - distribute=False, - worker=False, - high_end=False), - compute_engine_projects.Cluster( - name='oss-fuzz-linux-zone3-worker', - gce_zone='us-central1-d', - instance_count=16, - instance_template='worker-zone3', - distribute=True, - worker=True, - high_end=False), - compute_engine_projects.Cluster( - name='oss-fuzz-linux-zone3-host-high-end', - gce_zone='us-central1-d', - instance_count=1, - instance_template='host-high-end-zone3', - distribute=False, - worker=False, - high_end=True), - compute_engine_projects.Cluster( - name='oss-fuzz-linux-zone3-worker-high-end', - gce_zone='us-central1-d', - instance_count=2, - instance_template='worker-zone3', - distribute=True, - worker=True, - high_end=True), - ], - instance_templates=[ - { - 'name': 'external-pre-zone2', - 'description': '{"version": 1}', - 'properties': { - 'metadata': { - 'items': [], - }, - 'disks': [{ - 'initializeParams': { - 'diskSizeGb': 30, - }, - }], - 'serviceAccounts': [{ - 'email': - 'email', - 'scopes': [ - 'https://www.googleapis.com/auth/' - 'devstorage.full_control', - 'https://www.googleapis.com/auth/logging.write', - 'https://www.googleapis.com/auth/userinfo.email', - 'https://www.googleapis.com/auth/appengine.apis', - 'https://www.googleapis.com/auth/prodxmon', - 'https://www.googleapis.com/auth/bigquery', - ] - }], - } - }, - { - 'name': 'host-zone3', - 'description': '{"version": 1}', - 'properties': { - 'metadata': { - 'items': [], - }, - 'disks': [{ - 'initializeParams': { - 'diskSizeGb': 30, - }, - }], - 'serviceAccounts': [{ - 'email': - 'email', - 'scopes': [ - 'https://www.googleapis.com/auth/' - 'devstorage.full_control', - 'https://www.googleapis.com/auth/logging.write', - 'https://www.googleapis.com/auth/userinfo.email', - 'https://www.googleapis.com/auth/appengine.apis', - 'https://www.googleapis.com/auth/prodxmon', - 'https://www.googleapis.com/auth/bigquery', - ] - }], - } - }, - { - 'name': 
'worker-zone3', - 'description': '{"version": 1}', - 'properties': { - 'metadata': { - 'items': [], - }, - 'disks': [{ - 'initializeParams': { - 'diskSizeGb': 30, - }, - }], - 'serviceAccounts': [{ - 'email': - 'email', - 'scopes': [ - 'https://www.googleapis.com/auth/' - 'devstorage.full_control', - 'https://www.googleapis.com/auth/logging.write', - 'https://www.googleapis.com/auth/userinfo.email', - 'https://www.googleapis.com/auth/prodxmon', - ] - }], - } - }, - { - 'name': 'host-high-end-zone3', - 'description': '{"version": 1}', - 'properties': { - 'metadata': { - 'items': [], - }, - 'disks': [{ - 'initializeParams': { - 'diskSizeGb': 100, - }, - }], - 'serviceAccounts': [{ - 'email': - 'email', - 'scopes': [ - 'https://www.googleapis.com/auth/' - 'devstorage.full_control', - 'https://www.googleapis.com/auth/logging.write', - 'https://www.googleapis.com/auth/userinfo.email', - 'https://www.googleapis.com/auth/prodxmon', - ] - }], - } - }, - ], - host_worker_assignments=[ - compute_engine_projects.HostWorkerAssignment( - host='oss-fuzz-linux-zone3-host', - worker='oss-fuzz-linux-zone3-worker', - workers_per_host=8), - compute_engine_projects.HostWorkerAssignment( - host='oss-fuzz-linux-zone3-host-high-end', - worker='oss-fuzz-linux-zone3-worker-high-end', - workers_per_host=2), - ]) - - -def mock_resource(spec): - """Mock resource.""" - resource = mock.Mock(spec=spec) - resource.created = False - resource.body = None - - def create(*args, **kwargs): # pylint: disable=unused-argument - if resource.created: - raise bot_manager.AlreadyExistsError - - resource.created = True - - def get(): - if resource.created: - return resource.body - - raise bot_manager.NotFoundError - - def exists(): - return resource.created - - def delete(): - if not resource.created: - raise bot_manager.NotFoundError - - resource.created = False - - resource.create.side_effect = create - resource.get.side_effect = get - resource.exists.side_effect = exists - resource.delete.side_effect = delete - - return resource - - -class MockBotManager: - """Mock BotManager.""" - - def __init__(self, project_id, zone, instance_groups, instance_templates): - self.project_id = project_id - self.zone = zone - self.instance_groups = instance_groups - self.instance_templates = instance_templates - - def _get_resource(self, name, cache, values, spec): - """Get resource.""" - if name in cache: - return cache[name] - - resource = mock_resource(spec=spec) - if name in values: - resource.created = True - resource.body = values[name] - - cache[name] = resource - return resource - - def instance_group(self, name): - """Get an InstanceGroup resource with the given name.""" - resource = self._get_resource(name, self.instance_groups, INSTANCE_GROUPS, - bot_manager.InstanceGroup) - - if name in INSTANCES: - resource.list_managed_instances.return_value = INSTANCES[name] - - return resource - - def instance_template(self, name): - """Get an InstanceTemplate resource with the given name.""" - return self._get_resource(name, self.instance_templates, INSTANCE_TEMPLATES, - bot_manager.InstanceTemplate) - - -def expected_instance_template(gce_project_name, - name, - project_name, - disk_size_gb=None, - service_account=None, - tls_cert=False): - """Get the expected instance template for a project.""" - gce_project = compute_engine_projects.load_project(gce_project_name) - expected = copy.deepcopy(gce_project.get_instance_template(name)) - expected['properties']['metadata']['items'].append({ - 'key': 'task-tag', - 'value': project_name, - }) - - if 
disk_size_gb: - disk = expected['properties']['disks'][0] - disk['initializeParams']['diskSizeGb'] = disk_size_gb - - if service_account: - expected['properties']['serviceAccounts'][0]['email'] = service_account - - if tls_cert: - expected['properties']['metadata']['items'].extend([{ - 'key': 'tls-cert', - 'value': project_name + '_cert', - }, { - 'key': 'tls-key', - 'value': project_name + '_key', - }]) - - return expected - - -def expected_host_instance_template(gce_project_name, name): - """Get the expected instance template for a project.""" - gce_project = compute_engine_projects.load_project(gce_project_name) - return copy.deepcopy(gce_project.get_instance_template(name)) - - -@test_utils.with_cloud_emulators('datastore') -class CronTest(unittest.TestCase): - """Test manage_vms cron.""" - - def setUp(self): - test_helpers.patch_environ(self) - test_helpers.patch(self, [ - 'clusterfuzz._internal.base.utils.is_oss_fuzz', - 'clusterfuzz._internal.cron.helpers.bot_manager.BotManager', - 'clusterfuzz._internal.system.environment.is_running_on_app_engine', - 'clusterfuzz._internal.google_cloud_utils.compute_engine_projects.load_project', - ]) - - self.mock.is_oss_fuzz.return_value = True - self.mock.is_running_on_app_engine.return_value = True - self.mock.load_project.return_value = OSS_FUZZ_CLUSTERS - - data_types.OssFuzzProject( - id='proj1', - name='proj1', - cpu_weight=1.0, - service_account='proj1@serviceaccount.com').put() - - data_types.OssFuzzProject( - id='proj2', - name='proj2', - cpu_weight=2.0, - service_account='proj2@serviceaccount.com').put() - - data_types.OssFuzzProject( - id='proj3', - name='proj3', - cpu_weight=5.0, - service_account='proj3@serviceaccount.com').put() - - data_types.OssFuzzProject( - id='proj4', - name='proj4', - cpu_weight=1.0, - service_account='proj4@serviceaccount.com').put() - - data_types.OssFuzzProject( - id='proj5', - name='proj5', - cpu_weight=1.0, - service_account='proj5@serviceaccount.com', - disk_size_gb=10).put() - - data_types.OssFuzzProject( - id='proj6', - name='proj6', - cpu_weight=1.0, - service_account='proj6@serviceaccount.com', - high_end=True).put() - - for j in range(1, 7): - project_name = 'proj%d' % j - data_types.WorkerTlsCert( - id=project_name, - project_name=project_name, - cert_contents=project_name.encode() + b'_cert', - key_contents=project_name.encode() + b'_key').put() - - data_types.OssFuzzProjectInfo(id='old_proj', name='old_proj').put() - - data_types.OssFuzzProjectInfo( - id='proj2', - name='proj2', - clusters=[ - data_types.OssFuzzProjectInfo.ClusterInfo( - cluster='oss-fuzz-linux-zone2-pre', - gce_zone='us-east2-a', - cpu_count=1, - ), - data_types.OssFuzzProjectInfo.ClusterInfo( - cluster='old-cluster', - gce_zone='us-east2-a', - cpu_count=1, - ), - ]).put() - - data_types.OssFuzzProjectInfo( - id='proj3', - name='proj3', - clusters=[ - data_types.OssFuzzProjectInfo.ClusterInfo( - cluster='oss-fuzz-linux-zone2-pre', - gce_zone='us-east2-a', - cpu_count=499, - ) - ]).put() - - data_types.OssFuzzProjectInfo( - id='proj4', - name='proj4', - clusters=[ - data_types.OssFuzzProjectInfo.ClusterInfo( - cluster='oss-fuzz-linux-zone2-pre', - gce_zone='us-east2-a', - cpu_count=99, - ) - ]).put() - - data_types.OssFuzzProjectInfo( - id='proj5', - name='proj5', - clusters=[ - data_types.OssFuzzProjectInfo.ClusterInfo( - cluster='oss-fuzz-linux-zone2-pre', - gce_zone='us-east2-a', - cpu_count=99, - ) - ]).put() - - data_types.OssFuzzProjectInfo( - id='old_proj', - name='old_proj', - clusters=[ - 
data_types.OssFuzzProjectInfo.ClusterInfo( - cluster='oss-fuzz-linux-zone2-pre', - gce_zone='us-east2-a', - cpu_count=5, - ) - ]).put() - - data_types.HostWorkerAssignment( - id='old-host-0', - host_name='old-host', - worker_name='worker', - instance_num=0).put() - - instance_groups = {} - instance_templates = {} - self.mock.BotManager.side_effect = functools.partial( - MockBotManager, - instance_groups=instance_groups, - instance_templates=instance_templates) - - def test_update_cpus(self): - """Tests CPU distribution cron.""" - self.maxDiff = None - manager = manage_vms.OssFuzzClustersManager('clusterfuzz-external') - manager.update_clusters() - - proj1 = ndb.Key(data_types.OssFuzzProjectInfo, 'proj1').get() - self.assertIsNotNone(proj1) - self.assertDictEqual({ - 'name': - 'proj1', - 'clusters': [{ - 'cluster': 'oss-fuzz-linux-zone2-pre', - 'cpu_count': 100, - 'gce_zone': 'us-east2-a', - }, { - 'cluster': 'oss-fuzz-linux-zone3-worker', - 'cpu_count': 1, - 'gce_zone': 'us-central1-d', - }], - }, proj1.to_dict()) - - proj2 = ndb.Key(data_types.OssFuzzProjectInfo, 'proj2').get() - self.assertIsNotNone(proj2) - self.assertDictEqual({ - 'name': - 'proj2', - 'clusters': [{ - 'cluster': 'oss-fuzz-linux-zone2-pre', - 'cpu_count': 200, - 'gce_zone': 'us-east2-a', - }, { - 'cluster': 'oss-fuzz-linux-zone3-worker', - 'cpu_count': 4, - 'gce_zone': 'us-central1-d', - }], - }, proj2.to_dict()) - - proj3 = ndb.Key(data_types.OssFuzzProjectInfo, 'proj3').get() - self.assertIsNotNone(proj3) - self.assertDictEqual({ - 'name': - 'proj3', - 'clusters': [{ - 'cluster': 'oss-fuzz-linux-zone2-pre', - 'cpu_count': 499, - 'gce_zone': 'us-east2-a', - }, { - 'cluster': 'oss-fuzz-linux-zone3-worker', - 'cpu_count': 9, - 'gce_zone': 'us-central1-d', - }], - }, proj3.to_dict()) - - proj4 = ndb.Key(data_types.OssFuzzProjectInfo, 'proj4').get() - self.assertIsNotNone(proj4) - self.assertDictEqual({ - 'name': - 'proj4', - 'clusters': [{ - 'cluster': 'oss-fuzz-linux-zone2-pre', - 'cpu_count': 99, - 'gce_zone': 'us-east2-a', - }, { - 'cluster': 'oss-fuzz-linux-zone3-worker', - 'cpu_count': 1, - 'gce_zone': 'us-central1-d', - }], - }, proj4.to_dict()) - - proj5 = ndb.Key(data_types.OssFuzzProjectInfo, 'proj5').get() - self.assertIsNotNone(proj5) - self.assertDictEqual({ - 'name': - 'proj5', - 'clusters': [{ - 'cluster': 'oss-fuzz-linux-zone2-pre', - 'cpu_count': 99, - 'gce_zone': 'us-east2-a', - }, { - 'cluster': 'oss-fuzz-linux-zone3-worker', - 'cpu_count': 1, - 'gce_zone': 'us-central1-d', - }], - }, proj5.to_dict()) - - proj6 = ndb.Key(data_types.OssFuzzProjectInfo, 'proj6').get() - self.assertIsNotNone(proj6) - self.assertDictEqual({ - 'name': - 'proj6', - 'clusters': [{ - 'cluster': 'oss-fuzz-linux-zone3-worker-high-end', - 'cpu_count': 2, - 'gce_zone': 'us-central1-d', - }], - }, proj6.to_dict()) - - old_proj = ndb.Key(data_types.OssFuzzProjectInfo, 'old_proj').get() - self.assertIsNone(old_proj) - - mock_bot_manager = self.mock.BotManager('clusterfuzz-external', - 'us-east2-a') - - # proj1: new project. 
- mock_bot_manager.instance_template( - 'oss-fuzz-linux-zone2-pre-proj1').create.assert_called_with( - expected_instance_template('clusterfuzz-external', - 'external-pre-zone2', 'proj1')) - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone2-pre-proj1').create.assert_called_with( - 'oss-fuzz-linux-zone2-pre-proj1', - 'oss-fuzz-linux-zone2-pre-proj1', - size=100, - auto_healing_policy=AUTO_HEALING_POLICY_DICT, - wait_for_instances=False) - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone2-pre-proj1').resize.assert_not_called() - - # proj2: already exists. needs a resize. old cluster should be deleted. - mock_bot_manager.instance_template( - 'oss-fuzz-linux-zone2-pre-proj2').create.assert_not_called() - mock_bot_manager.instance_template( - 'oss-fuzz-linux-zone2-pre-proj2').delete.assert_not_called() - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone2-pre-proj2').create.assert_not_called() - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone2-pre-proj2').delete.assert_not_called() - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone2-pre-proj2').resize.assert_called_with( - 200, wait_for_instances=False) - mock_bot_manager.instance_template( - 'old-cluster-proj2').delete.assert_called() - mock_bot_manager.instance_group('old-cluster-proj2').delete.assert_called() - - # proj3: already exists. no changes needed. - mock_bot_manager.instance_template( - 'oss-fuzz-linux-zone2-pre-proj3').delete.assert_not_called() - mock_bot_manager.instance_template( - 'oss-fuzz-linux-zone2-pre-proj3').create.assert_not_called() - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone2-pre-proj3').create.assert_not_called() - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone2-pre-proj3').resize.assert_not_called() - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone2-pre-proj3').delete.assert_not_called() - - # proj4: needs a template update (version change). - mock_bot_manager.instance_template( - 'oss-fuzz-linux-zone2-pre-proj4').delete.assert_called() - mock_bot_manager.instance_template( - 'oss-fuzz-linux-zone2-pre-proj4').create.assert_called_with( - expected_instance_template('clusterfuzz-external', - 'external-pre-zone2', 'proj4')) - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone2-pre-proj4').delete.assert_called() - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone2-pre-proj4').create.assert_called_with( - 'oss-fuzz-linux-zone2-pre-proj4', - 'oss-fuzz-linux-zone2-pre-proj4', - size=99, - auto_healing_policy=AUTO_HEALING_POLICY_DICT, - wait_for_instances=False) - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone2-pre-proj4').resize.assert_not_called() - - # proj5: needs a template update (disk size change). - mock_bot_manager.instance_template( - 'oss-fuzz-linux-zone2-pre-proj5').delete.assert_called() - mock_bot_manager.instance_template( - 'oss-fuzz-linux-zone2-pre-proj5').create.assert_called_with( - expected_instance_template( - 'clusterfuzz-external', - 'external-pre-zone2', - 'proj5', - disk_size_gb=10)) - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone2-pre-proj5').delete.assert_called() - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone2-pre-proj5').create.assert_called_with( - 'oss-fuzz-linux-zone2-pre-proj5', - 'oss-fuzz-linux-zone2-pre-proj5', - size=99, - auto_healing_policy=AUTO_HEALING_POLICY_DICT, - wait_for_instances=False) - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone2-pre-proj5').resize.assert_not_called() - - # proj6: high end project. 
- for j in range(1, 6): - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone3-worker-high-end-proj' + - str(j)).create.assert_not_called() - - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone3-worker-high-end-proj6').create.assert_called() - - # old_proj: deleted. - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone2-pre-old-proj').create.assert_not_called() - mock_bot_manager.instance_template( - 'oss-fuzz-linux-zone2-pre-old-proj').delete.assert_called() - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone2-pre-old-proj').delete.assert_called() - - # host instances: created. - mock_bot_manager.instance_template( - 'oss-fuzz-linux-zone3-host').create.assert_called_with( - expected_host_instance_template('clusterfuzz-external', - 'host-zone3')) - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone3-host').create.assert_called_with( - 'oss-fuzz-linux-zone3-host', - 'oss-fuzz-linux-zone3-host', - size=2, - auto_healing_policy=None, - wait_for_instances=False) - - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone3-host-high-end').create.assert_called_with( - 'oss-fuzz-linux-zone3-host-high-end', - 'oss-fuzz-linux-zone3-host-high-end', - size=1, - auto_healing_policy=None, - wait_for_instances=False) - - # Worker instances: created. - mock_bot_manager.instance_template( - 'oss-fuzz-linux-zone3-worker-proj1').create.assert_called_with( - expected_instance_template( - 'clusterfuzz-external', - 'worker-zone3', - 'proj1', - service_account='proj1@serviceaccount.com', - tls_cert=True)) - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone3-worker-proj1').create.assert_called_with( - 'oss-fuzz-linux-zone3-worker-proj1', - 'oss-fuzz-linux-zone3-worker-proj1', - size=1, - auto_healing_policy=None, - wait_for_instances=False) - - mock_bot_manager.instance_template( - 'oss-fuzz-linux-zone3-worker-proj2').create.assert_called_with( - expected_instance_template( - 'clusterfuzz-external', - 'worker-zone3', - 'proj2', - service_account='proj2@serviceaccount.com', - tls_cert=True)) - mock_bot_manager.instance_group( - 'oss-fuzz-linux-zone3-worker-proj2').create.assert_called_with( - 'oss-fuzz-linux-zone3-worker-proj2', - 'oss-fuzz-linux-zone3-worker-proj2', - size=4, - auto_healing_policy=None, - wait_for_instances=False) - - self.assertCountEqual([{ - 'instance_num': 0, - 'worker_name': 'oss-fuzz-linux-zone3-worker-proj1-0001', - 'project_name': 'proj1', - 'host_name': 'oss-fuzz-linux-zone3-host-abcd' - }, { - 'instance_num': 1, - 'worker_name': 'oss-fuzz-linux-zone3-worker-proj2-0001', - 'project_name': 'proj2', - 'host_name': 'oss-fuzz-linux-zone3-host-abcd' - }, { - 'instance_num': 2, - 'worker_name': 'oss-fuzz-linux-zone3-worker-proj2-0002', - 'project_name': 'proj2', - 'host_name': 'oss-fuzz-linux-zone3-host-abcd' - }, { - 'instance_num': 3, - 'worker_name': 'oss-fuzz-linux-zone3-worker-proj2-0003', - 'project_name': 'proj2', - 'host_name': 'oss-fuzz-linux-zone3-host-abcd' - }, { - 'instance_num': 4, - 'worker_name': 'oss-fuzz-linux-zone3-worker-proj2-0004', - 'project_name': 'proj2', - 'host_name': 'oss-fuzz-linux-zone3-host-abcd' - }, { - 'instance_num': 5, - 'worker_name': 'oss-fuzz-linux-zone3-worker-proj3-0001', - 'project_name': 'proj3', - 'host_name': 'oss-fuzz-linux-zone3-host-abcd' - }, { - 'instance_num': 6, - 'worker_name': 'oss-fuzz-linux-zone3-worker-proj3-0002', - 'project_name': 'proj3', - 'host_name': 'oss-fuzz-linux-zone3-host-abcd' - }, { - 'instance_num': 7, - 'worker_name': 'oss-fuzz-linux-zone3-worker-proj3-0003', - 'project_name': 'proj3', - 
'host_name': 'oss-fuzz-linux-zone3-host-abcd' - }, { - 'instance_num': 0, - 'worker_name': 'oss-fuzz-linux-zone3-worker-proj3-0004', - 'project_name': 'proj3', - 'host_name': 'oss-fuzz-linux-zone3-host-efgh' - }, { - 'instance_num': 1, - 'worker_name': 'oss-fuzz-linux-zone3-worker-proj3-0005', - 'project_name': 'proj3', - 'host_name': 'oss-fuzz-linux-zone3-host-efgh' - }, { - 'instance_num': 2, - 'worker_name': 'oss-fuzz-linux-zone3-worker-proj3-0006', - 'project_name': 'proj3', - 'host_name': 'oss-fuzz-linux-zone3-host-efgh' - }, { - 'instance_num': 3, - 'worker_name': 'oss-fuzz-linux-zone3-worker-proj3-0007', - 'project_name': 'proj3', - 'host_name': 'oss-fuzz-linux-zone3-host-efgh' - }, { - 'instance_num': 4, - 'worker_name': 'oss-fuzz-linux-zone3-worker-proj3-0008', - 'project_name': 'proj3', - 'host_name': 'oss-fuzz-linux-zone3-host-efgh' - }, { - 'instance_num': 5, - 'worker_name': 'oss-fuzz-linux-zone3-worker-proj3-0009', - 'project_name': 'proj3', - 'host_name': 'oss-fuzz-linux-zone3-host-efgh' - }, { - 'instance_num': 6, - 'worker_name': 'oss-fuzz-linux-zone3-worker-proj4-0001', - 'project_name': 'proj4', - 'host_name': 'oss-fuzz-linux-zone3-host-efgh' - }, { - 'instance_num': 7, - 'worker_name': 'oss-fuzz-linux-zone3-worker-proj5-0001', - 'project_name': 'proj5', - 'host_name': 'oss-fuzz-linux-zone3-host-efgh' - }, { - 'instance_num': 0, - 'worker_name': 'oss-fuzz-linux-zone3-worker-high-end-proj6-0001', - 'project_name': 'proj6', - 'host_name': 'oss-fuzz-linux-zone3-host-high-end-1' - }, { - 'instance_num': 1, - 'worker_name': 'oss-fuzz-linux-zone3-worker-high-end-proj6-0002', - 'project_name': 'proj6', - 'host_name': 'oss-fuzz-linux-zone3-host-high-end-1' - }], [ - assignment.to_dict() - for assignment in data_types.HostWorkerAssignment.query() - ]) - - -class OssFuzzDistributeCpusTest(unittest.TestCase): - """Tests OSS-Fuzz CPU distribution.""" - - def setUp(self): - test_helpers.patch(self, [ - 'clusterfuzz._internal.google_cloud_utils.compute_engine_projects.load_project', - ]) - self.mock.load_project.return_value = OSS_FUZZ_CLUSTERS - - def test_equal(self): - """Tests for each project receiving equal share.""" - projects = [ - data_types.OssFuzzProject(name='proj1', cpu_weight=1.0), - data_types.OssFuzzProject(name='proj2', cpu_weight=1.0), - data_types.OssFuzzProject(name='proj3', cpu_weight=1.0), - ] - - result = manage_vms.OssFuzzClustersManager( - 'clusterfuzz-external').distribute_cpus(projects, 30) - self.assertListEqual([10, 10, 10], result) - - def test_equal_uneven(self): - """Tests for each project receiving equal share with an uneven division.""" - projects = [ - data_types.OssFuzzProject(name='proj1', cpu_weight=1.0), - data_types.OssFuzzProject(name='proj2', cpu_weight=1.0), - data_types.OssFuzzProject(name='proj3', cpu_weight=1.0), - ] - - result = manage_vms.OssFuzzClustersManager( - 'clusterfuzz-external').distribute_cpus(projects, 31) - self.assertListEqual([11, 10, 10], result) - - result = manage_vms.OssFuzzClustersManager( - 'clusterfuzz-external').distribute_cpus(projects, 32) - self.assertListEqual([11, 11, 10], result) - - def test_weight_preference(self): - """Tests that remainders are given to projects with higher weights - - first. 
- """ - projects = [ - data_types.OssFuzzProject(name='proj1', cpu_weight=1.0), - data_types.OssFuzzProject(name='proj2', cpu_weight=1.01), - data_types.OssFuzzProject(name='proj3', cpu_weight=1.1), - ] - - result = manage_vms.OssFuzzClustersManager( - 'clusterfuzz-external').distribute_cpus(projects, 4) - self.assertListEqual([1, 1, 2], result) - - result = manage_vms.OssFuzzClustersManager( - 'clusterfuzz-external').distribute_cpus(projects, 5) - self.assertListEqual([1, 2, 2], result) - - def test_not_enough(self): - """Tests allocation with not enough CPUs.""" - projects = [ - data_types.OssFuzzProject(name='proj1', cpu_weight=1.0), - data_types.OssFuzzProject(name='proj2', cpu_weight=1.0), - data_types.OssFuzzProject(name='proj3', cpu_weight=1.0), - ] - - result = manage_vms.OssFuzzClustersManager( - 'clusterfuzz-external').distribute_cpus(projects, 1) - self.assertListEqual([1, 0, 0], result) - - result = manage_vms.OssFuzzClustersManager( - 'clusterfuzz-external').distribute_cpus(projects, 2) - self.assertListEqual([1, 1, 0], result) - - def test_minimum(self): - """Tests that projects are given a minimum share.""" - projects = [ - data_types.OssFuzzProject(name='proj1', cpu_weight=0.0), - data_types.OssFuzzProject(name='proj2', cpu_weight=0.0), - data_types.OssFuzzProject(name='proj3', cpu_weight=0.0), - ] - - result = manage_vms.OssFuzzClustersManager( - 'clusterfuzz-external').distribute_cpus(projects, 3) - self.assertListEqual([1, 1, 1], result) - - result = manage_vms.OssFuzzClustersManager( - 'clusterfuzz-external').distribute_cpus(projects, 10) - self.assertListEqual([4, 3, 3], result) - - def test_maximum(self): - """Tests that projects are capped at the maximum share.""" - projects = [ - data_types.OssFuzzProject(name='proj1', cpu_weight=1.0), - data_types.OssFuzzProject(name='proj2', cpu_weight=1.0), - data_types.OssFuzzProject(name='proj3', cpu_weight=1.0), - ] - - result = manage_vms.OssFuzzClustersManager( - 'clusterfuzz-external').distribute_cpus(projects, 10000) - self.assertListEqual([1000, 1000, 1000], result) - - def test_primes(self): - """Test a bunch of different distributions.""" - projects = [ - data_types.OssFuzzProject(name='proj1', cpu_weight=2.0), - data_types.OssFuzzProject(name='proj2', cpu_weight=3.0), - data_types.OssFuzzProject(name='proj3', cpu_weight=5.0), - data_types.OssFuzzProject(name='proj4', cpu_weight=7.0), - data_types.OssFuzzProject(name='proj5', cpu_weight=11.0), - ] - - result = manage_vms.OssFuzzClustersManager( - 'clusterfuzz-external').distribute_cpus(projects, 101) - self.assertListEqual([7, 10, 18, 26, 40], result) - self.assertEqual(101, sum(result)) - - result = manage_vms.OssFuzzClustersManager( - 'clusterfuzz-external').distribute_cpus(projects, 887) - self.assertListEqual([63, 95, 158, 222, 349], result) - self.assertEqual(887, sum(result)) - - result = manage_vms.OssFuzzClustersManager( - 'clusterfuzz-external').distribute_cpus(projects, 2741) - self.assertListEqual([214, 313, 509, 705, 1000], result) - self.assertEqual(2741, sum(result)) - - result = manage_vms.OssFuzzClustersManager( - 'clusterfuzz-external').distribute_cpus(projects, 3571) - self.assertListEqual([356, 483, 738, 994, 1000], result) - self.assertEqual(3571, sum(result)) - - -@test_utils.with_cloud_emulators('datastore') -class AssignHostWorkerTest(unittest.TestCase): - """Tests host -> worker assignment.""" - - def setUp(self): - test_helpers.patch(self, [ - 'clusterfuzz._internal.google_cloud_utils.compute_engine_projects.load_project', - ]) - 
self.mock.load_project.return_value = OSS_FUZZ_CLUSTERS - - def test_assign_keep_existing(self): - """Test that assignment keeps existing assignments.""" - host_names = ['host'] - worker_instances = [ - manage_vms.WorkerInstance(name='worker-proj-0', project='proj'), - manage_vms.WorkerInstance(name='worker-proj-1', project='proj'), - manage_vms.WorkerInstance(name='worker-proj-2', project='proj'), - manage_vms.WorkerInstance(name='worker-proj-3', project='proj'), - manage_vms.WorkerInstance(name='worker-proj-4', project='proj'), - manage_vms.WorkerInstance(name='worker-proj-5', project='proj'), - manage_vms.WorkerInstance(name='worker-proj-6', project='proj'), - manage_vms.WorkerInstance(name='worker-proj-7', project='proj'), - ] - - data_types.HostWorkerAssignment( - host_name='host', - instance_num=2, - worker_name='worker-proj-6', - project_name='proj', - id='host-2').put() - - data_types.HostWorkerAssignment( - host_name='host', - instance_num=3, - worker_name='worker-proj-1', - project_name='proj', - id='host-3').put() - - data_types.HostWorkerAssignment( - host_name='host', - instance_num=0, - worker_name='worker-nonexistent-1', - project_name='nonexistent', - id='host-0').put() - - manager = manage_vms.OssFuzzClustersManager('clusterfuzz-external') - new_assignments = manager.do_assign_hosts_to_workers( - host_names, worker_instances, 8) - self.assertListEqual([ - { - 'host_name': 'host', - 'instance_num': 0, - 'project_name': 'proj', - 'worker_name': 'worker-proj-0' - }, - { - 'host_name': 'host', - 'instance_num': 1, - 'project_name': 'proj', - 'worker_name': 'worker-proj-2' - }, - { - 'host_name': 'host', - 'instance_num': 4, - 'project_name': 'proj', - 'worker_name': 'worker-proj-3' - }, - { - 'host_name': 'host', - 'instance_num': 5, - 'project_name': 'proj', - 'worker_name': 'worker-proj-4' - }, - { - 'host_name': 'host', - 'instance_num': 6, - 'project_name': 'proj', - 'worker_name': 'worker-proj-5' - }, - { - 'host_name': 'host', - 'instance_num': 7, - 'project_name': 'proj', - 'worker_name': 'worker-proj-7' - }, - ], [assignment.to_dict() for assignment in new_assignments]) +# TODO(unassigned): Write some tests for this module. +# All of the old tests were for the oss-fuzz manager that was deleted during the +# batch migration. 
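The TODO above could be seeded with a minimal smoke test that mirrors the conventions of the deleted suite. The sketch below is illustrative only: it assumes the rewritten manage_vms module still exposes a main() entry point (as run_cron.py expects of cron modules), and the patched dependency name is a hypothetical placeholder rather than the module's real API.

"""Sketch of a minimal manage_vms smoke test (assumptions noted above)."""

import unittest

from clusterfuzz._internal.cron import manage_vms
from clusterfuzz._internal.tests.test_libs import helpers as test_helpers
from clusterfuzz._internal.tests.test_libs import test_utils


@test_utils.with_cloud_emulators('datastore')
class ManageVmsSmokeTest(unittest.TestCase):
  """Smoke test for the batch-based manage_vms cron."""

  def setUp(self):
    test_helpers.patch_environ(self)
    # Hypothetical patch target: substitute the module's real GCE/Batch
    # dependencies once the post-migration interface is settled.
    test_helpers.patch(self, [
        'clusterfuzz._internal.cron.manage_vms.get_vm_distribution',
    ])

  def test_main_smoke(self):
    """main() completes and reports success with cloud calls mocked out."""
    self.assertTrue(manage_vms.main())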
diff --git a/src/clusterfuzz/_internal/tests/appengine/handlers/cron/schedule_fuzz_test.py b/src/clusterfuzz/_internal/tests/appengine/handlers/cron/schedule_fuzz_test.py index a849875c0b..7cd22829c5 100644 --- a/src/clusterfuzz/_internal/tests/appengine/handlers/cron/schedule_fuzz_test.py +++ b/src/clusterfuzz/_internal/tests/appengine/handlers/cron/schedule_fuzz_test.py @@ -17,6 +17,7 @@ from clusterfuzz._internal.cron import schedule_fuzz from clusterfuzz._internal.datastore import data_types +from clusterfuzz._internal.google_cloud_utils import credentials from clusterfuzz._internal.tests.test_libs import helpers as test_helpers from clusterfuzz._internal.tests.test_libs import test_utils @@ -78,24 +79,27 @@ def test_get_fuzz_tasks(self): self.assertListEqual(comparable_results, expected_results) -class TestGetAvailableCpus(unittest.TestCase): - """Tests for get_available_cpus.""" +class TestGetAvailableCpusForRegion(unittest.TestCase): + """Tests for get_available_cpus_for_region.""" def setUp(self): test_helpers.patch(self, ['clusterfuzz._internal.cron.schedule_fuzz._get_quotas']) + self.creds = credentials.get_default() def test_usage(self): - """Tests that get_available_cpus handles usage properly.""" + """Tests that get_available_cpus_for_region handles usage properly.""" self.mock._get_quotas.return_value = [{ 'metric': 'PREEMPTIBLE_CPUS', 'limit': 5, 'usage': 2 }] - self.assertEqual(schedule_fuzz.get_available_cpus('project', 'region'), 3) + self.assertEqual( + schedule_fuzz.get_available_cpus_for_region(self.creds, 'project', + 'region'), 3) def test_cpus_and_preemptible_cpus(self): - """Tests that get_available_cpus handles usage properly.""" + """Tests that get_available_cpus_for_region handles usage properly.""" self.mock._get_quotas.return_value = [{ 'metric': 'PREEMPTIBLE_CPUS', 'limit': 5, @@ -105,4 +109,6 @@ def test_cpus_and_preemptible_cpus(self): 'limit': 5, 'usage': 5 }] - self.assertEqual(schedule_fuzz.get_available_cpus('region', 'project'), 5) + self.assertEqual( + schedule_fuzz.get_available_cpus_for_region(self.creds, 'region', + 'project'), 5) diff --git a/src/clusterfuzz/_internal/tests/appengine/handlers/cron/triage_test.py b/src/clusterfuzz/_internal/tests/appengine/handlers/cron/triage_test.py index 5ab7e9d8e7..25476979d9 100644 --- a/src/clusterfuzz/_internal/tests/appengine/handlers/cron/triage_test.py +++ b/src/clusterfuzz/_internal/tests/appengine/handlers/cron/triage_test.py @@ -25,6 +25,67 @@ from clusterfuzz._internal.tests.test_libs import test_utils +@test_utils.with_cloud_emulators('datastore') +class AndroidCrashImportantTest(unittest.TestCase): + """Tests for is_crash_important_android.""" + + def setUp(self): + helpers.patch(self, [ + 'clusterfuzz._internal.metrics.crash_stats.get_last_successful_hour', + 'clusterfuzz._internal.metrics.crash_stats.get', + 'clusterfuzz._internal.base.utils.utcnow', + ]) + self.mock.utcnow.return_value = test_utils.CURRENT_TIME + + def test_is_crash_important_android_1(self): + """If this unreproducible testcase (libfuzzer) is crashing frequently, + then it is an important crash.""" + self.mock.get_last_successful_hour.return_value = 417325 + indices = [{ + 'count': 1, + 'hour': day_index + } for day_index in range(417325, 416989, -24)] + + self.mock.get.return_value = (1, [{ + 'totalCount': 14, + 'groups': [{ + 'indices': indices, + 'name': 'false', + },] + }]) + testcase = test_utils.create_generic_testcase() + testcase.job_type = 'libfuzzer_test' + testcase.platform = 'android' + testcase.one_time_crasher_flag 
= True
+    testcase.put()
+
+    self.assertTrue(triage.is_crash_important_android(testcase))
+
+  def test_is_crash_important_android_2(self):
+    """If this unreproducible testcase (libfuzzer) crashes less often than the
+    total crash threshold, then it is not important."""
+    self.mock.get_last_successful_hour.return_value = 417325
+    indices = [{
+        'count': day_index % 5 == 0,
+        'hour': day_index
+    } for day_index in range(417325, 416989, -24)]
+
+    self.mock.get.return_value = (1, [{
+        'totalCount': 3,
+        'groups': [{
+            'indices': indices,
+            'name': 'false',
+        },]
+    }])
+    testcase = test_utils.create_generic_testcase()
+    testcase.job_type = 'libfuzzer_test'
+    testcase.platform = 'android'
+    testcase.one_time_crasher_flag = True
+    testcase.put()
+
+    self.assertFalse(triage.is_crash_important_android(testcase))
+
+
 @test_utils.with_cloud_emulators('datastore')
 class CrashImportantTest(unittest.TestCase):
   """Tests for _is_crash_important."""
diff --git a/src/clusterfuzz/_internal/tests/appengine/libs/issue_management/google_issue_tracker/google_issue_tracker_test.py b/src/clusterfuzz/_internal/tests/appengine/libs/issue_management/google_issue_tracker/google_issue_tracker_test.py
index 6b3d176510..e1913810c6 100644
--- a/src/clusterfuzz/_internal/tests/appengine/libs/issue_management/google_issue_tracker/google_issue_tracker_test.py
+++ b/src/clusterfuzz/_internal/tests/appengine/libs/issue_management/google_issue_tracker/google_issue_tracker_test.py
@@ -120,6 +120,7 @@
     self.assertCountEqual([], issue.components)
     self.assertCountEqual([], issue.ccs)
     self.assertEqual('test body', issue.body)
+    self.assertEqual('2019-06-25T01:29:30.021Z', issue.created_time)

   def test_closed(self):
     """Test a closed issue."""
diff --git a/src/clusterfuzz/_internal/tests/core/base/errors_test.py b/src/clusterfuzz/_internal/tests/core/base/errors_test.py
new file mode 100644
index 0000000000..a53ea23b7c
--- /dev/null
+++ b/src/clusterfuzz/_internal/tests/core/base/errors_test.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for errors.py.""" + +import unittest + +from clusterfuzz._internal.base import errors +from clusterfuzz._internal.google_cloud_utils import storage + + +class TestErrorInList(unittest.TestCase): + """Tests error_in_list.""" + + def test_error_in_list(self): + try: + raise storage.ExpiredSignedUrlError( + 'Expired token, failed to download uworker_input: https://google.com', + 'https://google.com', 'Response text here') + except storage.ExpiredSignedUrlError as e: + self.assertTrue( + errors.error_in_list(str(e), errors.BOT_ERROR_TERMINATION_LIST)) + + def test_arbitrary_error(self): + """Tests proper handling of errors not in the list.""" + self.assertFalse( + errors.error_in_list('RuntimeError', errors.BOT_ERROR_TERMINATION_LIST)) diff --git a/src/clusterfuzz/_internal/tests/core/bot/tasks/utasks/corpus_pruning_task_test.py b/src/clusterfuzz/_internal/tests/core/bot/tasks/utasks/corpus_pruning_task_test.py index 7130b2bb19..3ff5a7fd38 100644 --- a/src/clusterfuzz/_internal/tests/core/bot/tasks/utasks/corpus_pruning_task_test.py +++ b/src/clusterfuzz/_internal/tests/core/bot/tasks/utasks/corpus_pruning_task_test.py @@ -57,7 +57,7 @@ def setUp(self): self.local_gcs_buckets_path = tempfile.mkdtemp() os.environ['LOCAL_GCS_BUCKETS_PATH'] = self.local_gcs_buckets_path os.environ['TEST_BLOBS_BUCKET'] = 'blobs-bucket' - storage._provider().create_bucket('blobs-bucket', None, None) + storage._provider().create_bucket('blobs-bucket', None, None, None) helpers.patch(self, [ 'clusterfuzz._internal.bot.fuzzers.engine_common.unpack_seed_corpus_if_needed', 'clusterfuzz._internal.bot.tasks.task_creation.create_tasks', @@ -234,14 +234,15 @@ def test_prune(self): '6fa8c57336628a7d733f684dc9404fbd09020543', ], corpus) - testcases = list(data_types.Testcase.query()) - self.assertEqual(1, len(testcases)) - self.assertEqual('Null-dereference WRITE', testcases[0].crash_type) - self.assertEqual('Foo\ntest_fuzzer.cc\n', testcases[0].crash_state) - self.assertEqual(1337, testcases[0].crash_revision) - self.assertEqual('test_fuzzer', - testcases[0].get_metadata('fuzzer_binary_name')) - self.assertEqual('label1,label2', testcases[0].get_metadata('issue_labels')) + # TODO(metzman): Re-enable this when we re-enable corpus crash reporting. 
+ # testcases = list(data_types.Testcase.query()) + # self.assertEqual(1, len(testcases)) + # self.assertEqual('Null-dereference WRITE', testcases[0].crash_type) + # self.assertEqual('Foo\ntest_fuzzer.cc\n', testcases[0].crash_state) + # self.assertEqual(1337, testcases[0].crash_revision) + # self.assertEqual('test_fuzzer', + # testcases[0].get_metadata('fuzzer_binary_name')) + # self.assertEqual('label1,label2', testcases[0].get_metadata('issue_labels')) today = datetime.datetime.utcnow().date() # get_coverage_information on test_fuzzer rather than libFuzzer_test_fuzzer @@ -644,8 +645,7 @@ def test_upload_corpus_crashes_zip(self): cross_pollination_stats=None) corpus_pruning_task._upload_corpus_crashes_zip( - None, result, self.corpus_crashes_blob_name, - self.corpus_crashes_upload_url) + result, self.corpus_crashes_blob_name, self.corpus_crashes_upload_url) corpus_crashes_zip_local_path = os.path.join( self.temp_dir, f'{self.corpus_crashes_blob_name}.zip') diff --git a/src/clusterfuzz/_internal/tests/core/crash_analysis/stack_parsing/stack_analyzer_data/notreached_log_message.txt b/src/clusterfuzz/_internal/tests/core/crash_analysis/stack_parsing/stack_analyzer_data/notreached_log_message.txt new file mode 100644 index 0000000000..4290f8e2d0 --- /dev/null +++ b/src/clusterfuzz/_internal/tests/core/crash_analysis/stack_parsing/stack_analyzer_data/notreached_log_message.txt @@ -0,0 +1,52 @@ +[1201/130354.771719:FATAL:url_idna_icu.cc(58)] NOTREACHED hit. failed to open UTS46 data with error: U_FILE_ACCESS_ERROR. If you see this error message in a test environment your test environment likely lacks the required data tables for libicu. See https://crbug.com/778929. +#0 0x7ab1dca041f2 base::debug::CollectStackTrace() +#1 0x7ab1dc9d69fd base::debug::StackTrace::StackTrace() +#2 0x7ab1dc847fd3 logging::LogMessage::~LogMessage() +#3 0x7ab1dc849348 logging::LogMessage::~LogMessage() +#4 0x7ab1dc810b84 logging::CheckError::~CheckError() +#5 0x7ab1dbbdb465 url::(anonymous namespace)::CreateIDNA() +#6 0x7ab1dbbdb2fa url::IDNToASCII() +#7 0x7ab1dbbb9a31 url::(anonymous namespace)::DoIDNHost() +#8 0x7ab1dbbb8ce1 url::(anonymous namespace)::DoComplexHost() +#9 0x7ab1dbbb7b9f url::(anonymous namespace)::DoHost<>() +#10 0x7ab1dbbb7adc url::CanonicalizeHost() +#11 0x7ab1dbbca588 url::(anonymous namespace)::DoCanonicalizeStandardURL<>() +#12 0x7ab1dbbca202 url::CanonicalizeStandardURL() +#13 0x7ab1dbbd2e06 url::(anonymous namespace)::DoCanonicalize<>() +#14 0x7ab1dbbd26be url::Canonicalize() +#15 0x7ab1dbb9c9cf GURL::InitCanonical<>() +#16 0x563f6f97750e privacy_sandbox::ParseAttestationsFromStream() +#17 0x563f6f976306 LLVMFuzzerTestOneInput +#18 0x563f6f99fbdc fuzzer::Fuzzer::ExecuteCallback() +#19 0x563f6f98b720 fuzzer::RunOneTest() +#20 0x563f6f990370 fuzzer::FuzzerDriver() +#21 0x563f6f984b2b main +#22 0x7ab1c6c42083 __libc_start_main +#23 0x563f6f95eb4a _start +UndefinedBehaviorSanitizer:DEADLYSIGNAL +==2282163==ERROR: UndefinedBehaviorSanitizer: TRAP on unknown address 0x000000000000 (pc 0x7ab1dc849177 bp 0x7ffc8f7379c0 sp 0x7ffc8f736920 T2282163) + #0 0x7ab1dc849177 in ImmediateCrash base/immediate_crash.h:146:3 + #1 0x7ab1dc849177 in logging::LogMessage::~LogMessage() base/logging.cc:954:7 + #2 0x7ab1dc849347 in logging::LogMessage::~LogMessage() base/logging.cc:699:27 + #3 0x7ab1dc810b83 in logging::NotReachedError::~NotReachedError() base/check.cc:267:3 + #4 0x7ab1dbbdb464 in url::(anonymous namespace)::CreateIDNA(bool) url/url_idna_icu.cc:58:5 + #5 0x7ab1dbbdb2f9 in GetUIDNA 
url/url_idna_icu.cc:0 + #6 0x7ab1dbbdb2f9 in url::IDNToASCII(std::__Cr::basic_string_view>, url::CanonOutputT*) url/url_idna_icu.cc:97:18 + #7 0x7ab1dbbb9a30 in url::(anonymous namespace)::DoIDNHost(char16_t const*, unsigned long, url::CanonOutputT*) url/url_canon_host.cc:217:8 + #8 0x7ab1dbbb8ce0 in url::(anonymous namespace)::DoComplexHost(char const*, unsigned long, bool, bool, url::CanonOutputT*) url/url_canon_host.cc:318:10 + #9 0x7ab1dbbb7b9e in void url::(anonymous namespace)::DoHost(char const*, url::Component const&, url::CanonOutputT*, url::CanonHostInfo*) url/url_canon_host.cc:393:7 + #10 0x7ab1dbbb7adb in url::CanonicalizeHost(char const*, url::Component const&, url::CanonOutputT*, url::Component*) url/url_canon_host.cc:424:3 + #11 0x7ab1dbbca587 in bool url::(anonymous namespace)::DoCanonicalizeStandardURL(url::URLComponentSource const&, url::Parsed const&, url::SchemeType, url::CharsetConverter*, url::CanonOutputT*, url::Parsed*) url/url_canon_stdurl.cc:57:16 + #12 0x7ab1dbbca201 in url::CanonicalizeStandardURL(char const*, int, url::Parsed const&, url::SchemeType, url::CharsetConverter*, url::CanonOutputT*, url::Parsed*) url/url_canon_stdurl.cc:152:10 + #13 0x7ab1dbbd2e05 in bool url::(anonymous namespace)::DoCanonicalize(char const*, int, bool, url::(anonymous namespace)::WhitespaceRemovalPolicy, url::CharsetConverter*, url::CanonOutputT*, url::Parsed*) url/url_util.cc:283:15 + #14 0x7ab1dbbd26bd in url::Canonicalize(char const*, int, bool, url::CharsetConverter*, url::CanonOutputT*, url::Parsed*) url/url_util.cc:774:10 + #15 0x7ab1dbb9c9ce in void GURL::InitCanonical>, char>(std::__Cr::basic_string_view>, bool) url/gurl.cc:76:15 + #16 0x563f6f97750d in privacy_sandbox::ParseAttestationsFromStream(std::__Cr::basic_istream>&) components/privacy_sandbox/privacy_sandbox_attestations/privacy_sandbox_attestations_parser.cc:86:28 + #17 0x563f6f976305 in TestOneProtoInput components/privacy_sandbox/privacy_sandbox_attestations/privacy_sandbox_attestations_parser_proto_fuzzer.cc:29:3 + #18 0x563f6f976305 in LLVMFuzzerTestOneInput components/privacy_sandbox/privacy_sandbox_attestations/privacy_sandbox_attestations_parser_proto_fuzzer.cc:19:1 + #19 0x563f6f99fbdb in fuzzer::Fuzzer::ExecuteCallback(unsigned char const*, unsigned long) third_party/libFuzzer/src/FuzzerLoop.cpp:614:13 + #20 0x563f6f98b71f in fuzzer::RunOneTest(fuzzer::Fuzzer*, char const*, unsigned long) third_party/libFuzzer/src/FuzzerDriver.cpp:327:6 + #21 0x563f6f99036f in fuzzer::FuzzerDriver(int*, char***, int (*)(unsigned char const*, unsigned long)) third_party/libFuzzer/src/FuzzerDriver.cpp:862:9 + #22 0x563f6f984b2a in main third_party/libFuzzer/src/FuzzerMain.cpp:20:10 + #23 0x7ab1c6c42082 in __libc_start_main /build/glibc-BHL3KM/glibc-2.31/csu/libc-start.c:308:16 + #24 0x563f6f95eb49 in _start diff --git a/src/clusterfuzz/_internal/tests/core/crash_analysis/stack_parsing/stack_analyzer_test.py b/src/clusterfuzz/_internal/tests/core/crash_analysis/stack_parsing/stack_analyzer_test.py index bcf4fe4e58..6bdc2618c2 100644 --- a/src/clusterfuzz/_internal/tests/core/crash_analysis/stack_parsing/stack_analyzer_test.py +++ b/src/clusterfuzz/_internal/tests/core/crash_analysis/stack_parsing/stack_analyzer_test.py @@ -2751,6 +2751,23 @@ def test_check_log_message(self): expected_state, expected_stacktrace, expected_security_flag) + def test_notreached_log_message(self): + """Tests Chromium NOTREACHED()s as CHECK failures.""" + data = self._read_test_data('notreached_log_message.txt') + expected_type = 'CHECK failure' + 
expected_address = '' + expected_state = ( + 'failed to open UTS46 data with error: U_FILE_ACCESS_ERROR. If you see this error\n' + 'url::CreateIDNA\n' + 'url::IDNToASCII\n') + expected_stacktrace = data + expected_security_flag = False + + environment.set_value('ASSERTS_HAVE_SECURITY_IMPLICATION', False) + self._validate_get_crash_data(data, expected_type, expected_address, + expected_state, expected_stacktrace, + expected_security_flag) + def test_asan_container_overflow(self): """Test an ASan container overflow.""" data = self._read_test_data('asan_container_overflow_read.txt') diff --git a/src/clusterfuzz/_internal/tests/core/datastore/data_handler_test.py b/src/clusterfuzz/_internal/tests/core/datastore/data_handler_test.py index 178c0bd498..6a9edc6786 100644 --- a/src/clusterfuzz/_internal/tests/core/datastore/data_handler_test.py +++ b/src/clusterfuzz/_internal/tests/core/datastore/data_handler_test.py @@ -23,7 +23,6 @@ import parameterized from pyfakefs import fake_filesystem_unittest -from clusterfuzz._internal.config import local_config from clusterfuzz._internal.datastore import data_handler from clusterfuzz._internal.datastore import data_types from clusterfuzz._internal.google_cloud_utils import blobs @@ -73,14 +72,27 @@ class DataHandlerTest(unittest.TestCase): def setUp(self): helpers.patch_environ(self) - project_config_get = local_config.ProjectConfig.get helpers.patch(self, [ 'clusterfuzz._internal.base.utils.default_project_name', 'clusterfuzz._internal.config.db_config.get', - ('project_config_get', - 'clusterfuzz._internal.config.local_config.ProjectConfig.get'), + 'clusterfuzz._internal.config.local_config.ProjectConfig', + ('get_storage_provider', + 'clusterfuzz._internal.google_cloud_utils.storage._provider'), + 'clusterfuzz._internal.google_cloud_utils.storage.create_discovery_storage_client', + 'clusterfuzz._internal.google_cloud_utils.storage.get_bucket_iam_policy', ]) + self.mock.default_project_name.return_value = 'project' + + self.storage_provider = mock.Mock() + self.mock.get_storage_provider.return_value = self.storage_provider + + self.project_config = {} + self.mock.ProjectConfig.return_value = self.project_config + + # Disable artificial delay when creating buckets. 
+ storage.CREATE_BUCKET_DELAY = 0 + self.job = data_types.Job( name='linux_asan_chrome', environment_string=('SUMMARY_PREFIX = project\n' @@ -175,8 +187,6 @@ def setUp(self): environment.set_value('FUZZ_DATA', '/tmp/inputs/fuzzer-common-data-bundles') environment.set_value('FUZZERS_DIR', '/tmp/inputs/fuzzers') - self.mock.default_project_name.return_value = 'project' - self.mock.project_config_get.side_effect = project_config_get def test_find_testcase(self): """Ensure that find_testcase behaves as expected.""" @@ -449,6 +459,26 @@ def test_get_issue_summary_bad_cast_without_crash_function(self): summary, 'project: Bad-cast to blink::LayoutBlock from ' 'blink::LayoutTableSection') + def test_create_data_bundle_bucket_and_iams(self): + self.storage_provider.get_bucket.return_value = None + self.storage_provider.create_bucket.return_value = True + + self.assertTrue(data_handler.create_data_bundle_bucket_and_iams('test', [])) + + self.storage_provider.create_bucket.assert_called_with( + 'test-corpus.test-clusterfuzz.appspot.com', None, None, None) + + def test_create_data_bundle_bucket_and_iams_with_location(self): + self.storage_provider.get_bucket.return_value = None + self.storage_provider.create_bucket.return_value = True + + self.project_config['data_bundle_bucket_location'] = 'NORTH-POLE' + + self.assertTrue(data_handler.create_data_bundle_bucket_and_iams('test', [])) + + self.storage_provider.create_bucket.assert_called_with( + 'test-corpus.test-clusterfuzz.appspot.com', None, None, 'NORTH-POLE') + def test_get_data_bundle_name_default(self): """Test getting the default data bundle bucket name.""" self.assertEqual('test-corpus.test-clusterfuzz.appspot.com', @@ -456,8 +486,7 @@ def test_get_data_bundle_name_default(self): def test_get_data_bundle_name_custom_suffix(self): """Test getting the data bundle bucket name with custom suffix.""" - self.mock.project_config_get.side_effect = None - self.mock.project_config_get.return_value = 'custom.suffix.com' + self.project_config['bucket_domain_suffix'] = 'custom.suffix.com' self.assertEqual('test-corpus.custom.suffix.com', data_handler.get_data_bundle_bucket_name('test')) @@ -485,7 +514,7 @@ def test_filter_stack_trace_upload(self): exceeds limit and an upload_url is provided.""" blob_name = blobs.generate_new_blob_name() blobs_bucket = 'blobs_bucket' - storage._provider().create_bucket(blobs_bucket, None, None) # pylint: disable=protected-access + storage._provider().create_bucket(blobs_bucket, None, None, None) # pylint: disable=protected-access gcs_path = storage.get_cloud_storage_file_path(blobs_bucket, blob_name) signed_upload_url = storage.get_signed_upload_url(gcs_path) diff --git a/src/clusterfuzz/_internal/tests/core/google_cloud_utils/batch_test.py b/src/clusterfuzz/_internal/tests/core/google_cloud_utils/batch_test.py index 5b9135103a..02f4209086 100644 --- a/src/clusterfuzz/_internal/tests/core/google_cloud_utils/batch_test.py +++ b/src/clusterfuzz/_internal/tests/core/google_cloud_utils/batch_test.py @@ -17,24 +17,32 @@ from clusterfuzz._internal.datastore import data_types from clusterfuzz._internal.google_cloud_utils import batch +from clusterfuzz._internal.tests.test_libs import helpers from clusterfuzz._internal.tests.test_libs import test_utils # pylint: disable=protected-access @test_utils.with_cloud_emulators('datastore') -class GetSpecFromConfigTest(unittest.TestCase): +class GetSpecsFromConfigTest(unittest.TestCase): """Tests for get_spec_from_config.""" def setUp(self): self.maxDiff = None self.job = 
data_types.Job(name='libfuzzer_chrome_asan', platform='LINUX') self.job.put() + helpers.patch(self, [ + 'clusterfuzz._internal.base.utils.random_weighted_choice', + ]) + self.mock.random_weighted_choice.return_value = batch.WeightedSubconfig( + name='east4-network2', + weight=1, + ) def test_nonpreemptible(self): """Tests that get_spec_from_config works for non-preemptibles as expected.""" - spec = batch._get_spec_from_config('analyze', self.job.name) + spec = _get_spec_from_config('analyze', self.job.name) expected_spec = batch.BatchWorkloadSpec( clusterfuzz_release='prod', docker_image='gcr.io/clusterfuzz-images/base:a2f4dd6-202202070654', @@ -43,10 +51,9 @@ def test_nonpreemptible(self): disk_type='pd-standard', service_account_email='test-unpriv-clusterfuzz-service-account-email', subnetwork= - 'projects/google.com:clusterfuzz/regions/gce-region/subnetworks/subnetworkname', - network='projects/google.com:clusterfuzz/global/networks/networkname', - gce_region='gce-region', - gce_zone='gce-zone', + 'projects/project_name/regions/us-east4/subnetworks/subnetworkname2', + network='projects/project_name/global/networks/networkname2', + gce_region='us-east4', project='test-clusterfuzz', preemptible=False, machine_type='n1-standard-1', @@ -61,7 +68,7 @@ def test_fuzz_get_spec_from_config(self): """Tests that get_spec_from_config works for fuzz tasks as expected.""" job = data_types.Job(name='libfuzzer_chrome_asan', platform='LINUX') job.put() - spec = batch._get_spec_from_config('fuzz', job.name) # pylint: disable=protected-access + spec = _get_spec_from_config('fuzz', job.name) expected_spec = batch.BatchWorkloadSpec( clusterfuzz_release='prod', docker_image='gcr.io/clusterfuzz-images/base:a2f4dd6-202202070654', @@ -70,10 +77,9 @@ def test_fuzz_get_spec_from_config(self): disk_type='pd-standard', service_account_email='test-unpriv-clusterfuzz-service-account-email', subnetwork= - 'projects/google.com:clusterfuzz/regions/gce-region/subnetworks/subnetworkname', - network='projects/google.com:clusterfuzz/global/networks/networkname', - gce_zone='gce-zone', - gce_region='gce-region', + 'projects/project_name/regions/us-east4/subnetworks/subnetworkname2', + network='projects/project_name/global/networks/networkname2', + gce_region='us-east4', project='test-clusterfuzz', preemptible=True, machine_type='n1-standard-1', @@ -87,13 +93,19 @@ def test_fuzz_get_spec_from_config(self): def test_corpus_pruning(self): """Tests that corpus pruning uses a spec of 24 hours and a different one than normal.""" - pruning_spec = batch._get_spec_from_config('corpus_pruning', self.job.name) + pruning_spec = _get_spec_from_config('corpus_pruning', self.job.name) self.assertEqual(pruning_spec.max_run_duration, f'{24 * 60 * 60}s') - normal_spec = batch._get_spec_from_config('analyze', self.job.name) + normal_spec = _get_spec_from_config('analyze', self.job.name) self.assertNotEqual(pruning_spec, normal_spec) job = data_types.Job(name='libfuzzer_chrome_msan', platform='LINUX') job.put() # This behavior is important for grouping batch alike tasks into a single # batch job. 
- pruning_spec2 = batch._get_spec_from_config('corpus_pruning', job.name) + pruning_spec2 = _get_spec_from_config('corpus_pruning', job.name) self.assertEqual(pruning_spec, pruning_spec2) + + +def _get_spec_from_config(command, job_name): + return list( + batch._get_specs_from_config([batch.BatchTask(command, job_name, + None)]).values())[0] diff --git a/src/clusterfuzz/_internal/tests/core/google_cloud_utils/blobs_test.py b/src/clusterfuzz/_internal/tests/core/google_cloud_utils/blobs_test.py index 01a7cb83ab..56942e6800 100644 --- a/src/clusterfuzz/_internal/tests/core/google_cloud_utils/blobs_test.py +++ b/src/clusterfuzz/_internal/tests/core/google_cloud_utils/blobs_test.py @@ -182,7 +182,7 @@ def setUp(self): test_utils.set_up_pyfakefs(self) os.environ['LOCAL_GCS_BUCKETS_PATH'] = '/local' os.environ['TEST_BLOBS_BUCKET'] = 'blobs-bucket' - self.provider.create_bucket('blobs-bucket', None, None) + self.provider.create_bucket('blobs-bucket', None, None, None) def test_get_blob_signed_upload_url_then_delete_blob(self): """Tests get_blob_signed_upload_url.""" diff --git a/src/clusterfuzz/_internal/tests/core/google_cloud_utils/storage_test.py b/src/clusterfuzz/_internal/tests/core/google_cloud_utils/storage_test.py index 175e6ac6cb..db438d9874 100644 --- a/src/clusterfuzz/_internal/tests/core/google_cloud_utils/storage_test.py +++ b/src/clusterfuzz/_internal/tests/core/google_cloud_utils/storage_test.py @@ -82,7 +82,7 @@ def setUp(self): def test_create_bucket(self): """Test create_bucket.""" - self.provider.create_bucket('test-bucket', None, None) + self.provider.create_bucket('test-bucket', None, None, None) self.assertTrue(os.path.isdir('/local/test-bucket')) def test_get_bucket(self): @@ -281,7 +281,7 @@ def test_download_signed_url(self): def test_upload_signed_url(self): """Tests upload_signed_url.""" contents = b'aa' - self.provider.create_bucket('test-bucket', None, None) + self.provider.create_bucket('test-bucket', None, None, None) self.provider.upload_signed_url(contents, 'gs://test-bucket/a') with open('/local/test-bucket/objects/a', 'rb') as fp: return self.assertEqual(fp.read(), contents) diff --git a/src/clusterfuzz/stacktraces/constants.py b/src/clusterfuzz/stacktraces/constants.py index baf121c007..1bca06c2fd 100644 --- a/src/clusterfuzz/stacktraces/constants.py +++ b/src/clusterfuzz/stacktraces/constants.py @@ -85,7 +85,7 @@ CFI_NODEBUG_ERROR_MARKER_REGEX = re.compile( r'CFI: Most likely a control flow integrity violation;.*') CHROME_CHECK_FAILURE_REGEX = re.compile( - r'\s*\[[^\]]*[:]([^\](]*).*\].*Check failed[:]\s*(.*)') + r'\s*\[[^\]]*[:]([^\](]*).*\].*(?:Check failed:|NOTREACHED hit.)\s*(.*)') CHROME_STACK_FRAME_REGEX = re.compile( r'[ ]*(#(?P[0-9]+)[ ]' # frame id (2) r'([xX0-9a-fA-F]+)[ ])' # addr (3) diff --git a/src/local/butler/deploy.py b/src/local/butler/deploy.py index 0898ba0c06..444a82a5c8 100644 --- a/src/local/butler/deploy.py +++ b/src/local/butler/deploy.py @@ -23,6 +23,8 @@ import tempfile import time +import pytz + from local.butler import appengine from local.butler import common from local.butler import constants @@ -47,9 +49,9 @@ Version = namedtuple('Version', ['id', 'deploy_time', 'traffic_split']) -def now(): +def now(tz=None): """Used for mocks.""" - return datetime.datetime.now() + return datetime.datetime.now(tz) def _get_services(paths): @@ -449,6 +451,27 @@ def _deploy_terraform(config_dir): common.execute(f'rm -rf {terraform_dir}/.terraform*') +def _is_safe_deploy_day(): + time_now_in_ny = now(pytz.timezone('America/New_York')) + 
+  day_now_in_ny = time_now_in_ny.weekday()
+  # weekday() is 0-indexed from Monday, so 4, 5 and 6 are Friday to Sunday.
+  return day_now_in_ny not in {4, 5, 6}
+
+
+def _enforce_safe_day_to_deploy():
+  """Checks that it is not an unsafe day (Friday, Saturday, or Sunday) to
+  deploy for Chrome ClusterFuzz."""
+
+  config = local_config.Config()
+  if config.get('weekend_deploy_allowed', True):
+    return
+
+  if not _is_safe_deploy_day():
+    raise RuntimeError('Cannot deploy Fri-Sun to this CF instance except for '
+                       'urgent fixes. See b/384493595. If needed, temporarily '
+                       'delete+commit this. You are not too l33t for this '
+                       'rule. Do not break it!')
+
+
 def _deploy_k8s(config_dir):
   """Deploys all k8s workloads."""
   k8s_dir = os.path.join('infra', 'k8s')
@@ -498,6 +521,8 @@ def execute(args):
     print('gsutil not found in PATH.')
     sys.exit(1)
 
+  _enforce_safe_day_to_deploy()
+
   # Build templates before deployment.
   appengine.build_templates()
 
diff --git a/src/python/bot/startup/run_bot.py b/src/python/bot/startup/run_bot.py
index 8c674fba2c..132437cef5 100644
--- a/src/python/bot/startup/run_bot.py
+++ b/src/python/bot/startup/run_bot.py
@@ -194,9 +194,6 @@ def main():
       sys.exit(-1)
 
     fuzzers_init.run()
-
-    logs.info(f'PID is {os.getpid()}')
-
     if environment.is_trusted_host(ensure_connected=False):
       from clusterfuzz._internal.bot.untrusted_runner import host
       host.init()
@@ -226,6 +223,7 @@ def main():
         clean_exit or errors.error_in_list(error_stacktrace,
                                            errors.BOT_ERROR_TERMINATION_LIST))
     if should_terminate:
+      logs.info('Not retrying.')
       return
 
     logs.error(
diff --git a/src/python/bot/startup/run_cron.py b/src/python/bot/startup/run_cron.py
index 6a83dca108..3d94cbfa0d 100644
--- a/src/python/bot/startup/run_cron.py
+++ b/src/python/bot/startup/run_cron.py
@@ -60,8 +60,12 @@ def main():
   task_module_name = f'clusterfuzz._internal.cron.{task}'
 
   with monitor.wrap_with_monitoring(), ndb_init.context():
-    task_module = importlib.import_module(task_module_name)
-    return 0 if task_module.main() else 1
+    try:
+      task_module = importlib.import_module(task_module_name)
+      return 0 if task_module.main() else 1
+    except Exception as e:
+      logs.error(f'Unhandled exception in cron: {e}')
+      return 1
 
 
 if __name__ == '__main__':
diff --git a/src/python/other-bots/chromium-tests-syncer/run.py b/src/python/other-bots/chromium-tests-syncer/run.py
index f29f395a31..44c4e89adf 100644
--- a/src/python/other-bots/chromium-tests-syncer/run.py
+++ b/src/python/other-bots/chromium-tests-syncer/run.py
@@ -23,6 +23,7 @@
 import os
 import re
 import subprocess
+import tarfile
 import time
 
 from clusterfuzz._internal.base import utils
@@ -208,6 +209,39 @@ def create_gecko_tests_directory(tests_directory, gecko_checkout_subdirectory,
                                  target_subdirectory)
 
 
+def create_fuzzilli_tests_directory(tests_directory):
+  """Create Fuzzilli tests directory from the autozilli GCS archives."""
+  logs.info('Syncing fuzzilli tests.')
+  fuzzilli_tests_directory = os.path.join(tests_directory, 'fuzzilli')
+  remote_archive_tmpl = 'gs://autozilli/autozilli-%d.tgz'
+
+  # Ensure we have an empty directory with no leftovers from a previous run.
+  shell.remove_directory(fuzzilli_tests_directory, recreate=True)
+
+  def filter_members(member, path):
+    # We only need the JS files and settings.json from the archive; skip the
+    # .fzil files and fuzzer stats.
+    if member.name.endswith('fzil') or member.name.startswith('fuzzdir/stats'):
+      return None
+    return tarfile.data_filter(member, path)
+
+  for i in range(1, 10):
+    # Download archives number 1-9.
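+    # Each pass downloads one archive, e.g. gs://autozilli/autozilli-1.tgz,
+    # extracts the JS test cases and settings, and leaves them under
+    # <tests_directory>/fuzzilli/fuzzdir-<i>.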
+ remote_archive = remote_archive_tmpl % i + logs.info(f'Processing {remote_archive}') + local_archive = os.path.join(fuzzilli_tests_directory, 'tmp.tgz') + subprocess.check_call(['gsutil', 'cp', remote_archive, local_archive]) + + # Extract relevant files. + with tarfile.open(local_archive) as tar: + tar.extractall(path=fuzzilli_tests_directory, filter=filter_members) + + # Clean up. + os.rename( + os.path.join(fuzzilli_tests_directory, 'fuzzdir'), + os.path.join(fuzzilli_tests_directory, f'fuzzdir-{i}')) + shell.remove_file(local_archive) + + def sync_tests(tests_archive_bucket: str, tests_archive_name: str, tests_directory: str): """Main sync routine.""" @@ -239,6 +273,8 @@ def sync_tests(tests_archive_bucket: str, tests_archive_name: str, create_gecko_tests_directory(tests_directory, 'gecko-dev', 'gecko-tests') + create_fuzzilli_tests_directory(tests_directory) + # Upload tests archive to google cloud storage. logs.info('Uploading tests archive to cloud.') tests_archive_local = os.path.join(tests_directory, tests_archive_name) @@ -264,6 +300,7 @@ def sync_tests(tests_archive_bucket: str, tests_archive_name: str, 'WebKit/JSTests/es6', 'WebKit/JSTests/stress', 'WebKit/LayoutTests', + 'fuzzilli', 'gecko-tests', 'v8/test/mjsunit', 'spidermonkey',