From e4b0136139c658a3830132c200d500fc05609f84 Mon Sep 17 00:00:00 2001 From: Titouan Rigoudy Date: Tue, 17 Dec 2024 05:14:05 +0100 Subject: [PATCH] Allow configuring default corpora bucket location. (#4479) Allow instances to specify the GCS bucket location for data bundle buckets in `project.yaml` as a new key: `data_bundle_bucket_location`. This will allow creating regional buckets instead of using the default `US` multi-region which results in high data transfer costs in Chrome's instance. --- .../_internal/datastore/data_handler.py | 3 +- .../_internal/google_cloud_utils/storage.py | 16 +++++-- .../tasks/utasks/corpus_pruning_task_test.py | 2 +- .../tests/core/datastore/data_handler_test.py | 47 +++++++++++++++---- .../core/google_cloud_utils/blobs_test.py | 2 +- .../core/google_cloud_utils/storage_test.py | 4 +- 6 files changed, 55 insertions(+), 19 deletions(-) diff --git a/src/clusterfuzz/_internal/datastore/data_handler.py b/src/clusterfuzz/_internal/datastore/data_handler.py index abe8d5c677..ff9d608cfc 100644 --- a/src/clusterfuzz/_internal/datastore/data_handler.py +++ b/src/clusterfuzz/_internal/datastore/data_handler.py @@ -977,7 +977,8 @@ def add_build_metadata(job_type, def create_data_bundle_bucket_and_iams(data_bundle_name, emails): """Creates a data bundle bucket and adds iams for access.""" bucket_name = get_data_bundle_bucket_name(data_bundle_name) - if not storage.create_bucket_if_needed(bucket_name): + location = local_config.ProjectConfig().get('data_bundle_bucket_location') + if not storage.create_bucket_if_needed(bucket_name, location=location): return False client = storage.create_discovery_storage_client() diff --git a/src/clusterfuzz/_internal/google_cloud_utils/storage.py b/src/clusterfuzz/_internal/google_cloud_utils/storage.py index b4365b2ff9..b8ae76fc93 100644 --- a/src/clusterfuzz/_internal/google_cloud_utils/storage.py +++ b/src/clusterfuzz/_internal/google_cloud_utils/storage.py @@ -117,7 +117,7 @@ class StorageProvider: """Core storage provider interface.""" - def create_bucket(self, name, object_lifecycle, cors): + def create_bucket(self, name, object_lifecycle, cors, location): """Create a new bucket.""" raise NotImplementedError @@ -198,7 +198,7 @@ def _chunk_size(self): return None - def create_bucket(self, name, object_lifecycle, cors): + def create_bucket(self, name, object_lifecycle, cors, location): """Create a new bucket.""" project_id = utils.get_application_id() request_body = {'name': name} @@ -208,6 +208,9 @@ def create_bucket(self, name, object_lifecycle, cors): if cors: request_body['cors'] = cors + if location: + request_body['location'] = location + client = create_discovery_storage_client() try: client.buckets().insert(project=project_id, body=request_body).execute() @@ -543,7 +546,7 @@ def convert_path_for_write(self, remote_path, directory=OBJECTS_DIR): return fs_path - def create_bucket(self, name, object_lifecycle, cors): + def create_bucket(self, name, object_lifecycle, cors, location): """Create a new bucket.""" bucket_path = self._fs_bucket_path(name) if os.path.exists(bucket_path): @@ -905,13 +908,16 @@ def set_bucket_iam_policy(client, bucket_name, iam_policy): return None -def create_bucket_if_needed(bucket_name, object_lifecycle=None, cors=None): +def create_bucket_if_needed(bucket_name, + object_lifecycle=None, + cors=None, + location=None): """Creates a GCS bucket.""" provider = _provider() if provider.get_bucket(bucket_name): return True - if not provider.create_bucket(bucket_name, object_lifecycle, cors): + if not provider.create_bucket(bucket_name, object_lifecycle, cors, location): return False time.sleep(CREATE_BUCKET_DELAY) diff --git a/src/clusterfuzz/_internal/tests/core/bot/tasks/utasks/corpus_pruning_task_test.py b/src/clusterfuzz/_internal/tests/core/bot/tasks/utasks/corpus_pruning_task_test.py index 7130b2bb19..8b72c0cf07 100644 --- a/src/clusterfuzz/_internal/tests/core/bot/tasks/utasks/corpus_pruning_task_test.py +++ b/src/clusterfuzz/_internal/tests/core/bot/tasks/utasks/corpus_pruning_task_test.py @@ -57,7 +57,7 @@ def setUp(self): self.local_gcs_buckets_path = tempfile.mkdtemp() os.environ['LOCAL_GCS_BUCKETS_PATH'] = self.local_gcs_buckets_path os.environ['TEST_BLOBS_BUCKET'] = 'blobs-bucket' - storage._provider().create_bucket('blobs-bucket', None, None) + storage._provider().create_bucket('blobs-bucket', None, None, None) helpers.patch(self, [ 'clusterfuzz._internal.bot.fuzzers.engine_common.unpack_seed_corpus_if_needed', 'clusterfuzz._internal.bot.tasks.task_creation.create_tasks', diff --git a/src/clusterfuzz/_internal/tests/core/datastore/data_handler_test.py b/src/clusterfuzz/_internal/tests/core/datastore/data_handler_test.py index 178c0bd498..6a9edc6786 100644 --- a/src/clusterfuzz/_internal/tests/core/datastore/data_handler_test.py +++ b/src/clusterfuzz/_internal/tests/core/datastore/data_handler_test.py @@ -23,7 +23,6 @@ import parameterized from pyfakefs import fake_filesystem_unittest -from clusterfuzz._internal.config import local_config from clusterfuzz._internal.datastore import data_handler from clusterfuzz._internal.datastore import data_types from clusterfuzz._internal.google_cloud_utils import blobs @@ -73,14 +72,27 @@ class DataHandlerTest(unittest.TestCase): def setUp(self): helpers.patch_environ(self) - project_config_get = local_config.ProjectConfig.get helpers.patch(self, [ 'clusterfuzz._internal.base.utils.default_project_name', 'clusterfuzz._internal.config.db_config.get', - ('project_config_get', - 'clusterfuzz._internal.config.local_config.ProjectConfig.get'), + 'clusterfuzz._internal.config.local_config.ProjectConfig', + ('get_storage_provider', + 'clusterfuzz._internal.google_cloud_utils.storage._provider'), + 'clusterfuzz._internal.google_cloud_utils.storage.create_discovery_storage_client', + 'clusterfuzz._internal.google_cloud_utils.storage.get_bucket_iam_policy', ]) + self.mock.default_project_name.return_value = 'project' + + self.storage_provider = mock.Mock() + self.mock.get_storage_provider.return_value = self.storage_provider + + self.project_config = {} + self.mock.ProjectConfig.return_value = self.project_config + + # Disable artificial delay when creating buckets. + storage.CREATE_BUCKET_DELAY = 0 + self.job = data_types.Job( name='linux_asan_chrome', environment_string=('SUMMARY_PREFIX = project\n' @@ -175,8 +187,6 @@ def setUp(self): environment.set_value('FUZZ_DATA', '/tmp/inputs/fuzzer-common-data-bundles') environment.set_value('FUZZERS_DIR', '/tmp/inputs/fuzzers') - self.mock.default_project_name.return_value = 'project' - self.mock.project_config_get.side_effect = project_config_get def test_find_testcase(self): """Ensure that find_testcase behaves as expected.""" @@ -449,6 +459,26 @@ def test_get_issue_summary_bad_cast_without_crash_function(self): summary, 'project: Bad-cast to blink::LayoutBlock from ' 'blink::LayoutTableSection') + def test_create_data_bundle_bucket_and_iams(self): + self.storage_provider.get_bucket.return_value = None + self.storage_provider.create_bucket.return_value = True + + self.assertTrue(data_handler.create_data_bundle_bucket_and_iams('test', [])) + + self.storage_provider.create_bucket.assert_called_with( + 'test-corpus.test-clusterfuzz.appspot.com', None, None, None) + + def test_create_data_bundle_bucket_and_iams_with_location(self): + self.storage_provider.get_bucket.return_value = None + self.storage_provider.create_bucket.return_value = True + + self.project_config['data_bundle_bucket_location'] = 'NORTH-POLE' + + self.assertTrue(data_handler.create_data_bundle_bucket_and_iams('test', [])) + + self.storage_provider.create_bucket.assert_called_with( + 'test-corpus.test-clusterfuzz.appspot.com', None, None, 'NORTH-POLE') + def test_get_data_bundle_name_default(self): """Test getting the default data bundle bucket name.""" self.assertEqual('test-corpus.test-clusterfuzz.appspot.com', @@ -456,8 +486,7 @@ def test_get_data_bundle_name_default(self): def test_get_data_bundle_name_custom_suffix(self): """Test getting the data bundle bucket name with custom suffix.""" - self.mock.project_config_get.side_effect = None - self.mock.project_config_get.return_value = 'custom.suffix.com' + self.project_config['bucket_domain_suffix'] = 'custom.suffix.com' self.assertEqual('test-corpus.custom.suffix.com', data_handler.get_data_bundle_bucket_name('test')) @@ -485,7 +514,7 @@ def test_filter_stack_trace_upload(self): exceeds limit and an upload_url is provided.""" blob_name = blobs.generate_new_blob_name() blobs_bucket = 'blobs_bucket' - storage._provider().create_bucket(blobs_bucket, None, None) # pylint: disable=protected-access + storage._provider().create_bucket(blobs_bucket, None, None, None) # pylint: disable=protected-access gcs_path = storage.get_cloud_storage_file_path(blobs_bucket, blob_name) signed_upload_url = storage.get_signed_upload_url(gcs_path) diff --git a/src/clusterfuzz/_internal/tests/core/google_cloud_utils/blobs_test.py b/src/clusterfuzz/_internal/tests/core/google_cloud_utils/blobs_test.py index 01a7cb83ab..56942e6800 100644 --- a/src/clusterfuzz/_internal/tests/core/google_cloud_utils/blobs_test.py +++ b/src/clusterfuzz/_internal/tests/core/google_cloud_utils/blobs_test.py @@ -182,7 +182,7 @@ def setUp(self): test_utils.set_up_pyfakefs(self) os.environ['LOCAL_GCS_BUCKETS_PATH'] = '/local' os.environ['TEST_BLOBS_BUCKET'] = 'blobs-bucket' - self.provider.create_bucket('blobs-bucket', None, None) + self.provider.create_bucket('blobs-bucket', None, None, None) def test_get_blob_signed_upload_url_then_delete_blob(self): """Tests get_blob_signed_upload_url.""" diff --git a/src/clusterfuzz/_internal/tests/core/google_cloud_utils/storage_test.py b/src/clusterfuzz/_internal/tests/core/google_cloud_utils/storage_test.py index 175e6ac6cb..db438d9874 100644 --- a/src/clusterfuzz/_internal/tests/core/google_cloud_utils/storage_test.py +++ b/src/clusterfuzz/_internal/tests/core/google_cloud_utils/storage_test.py @@ -82,7 +82,7 @@ def setUp(self): def test_create_bucket(self): """Test create_bucket.""" - self.provider.create_bucket('test-bucket', None, None) + self.provider.create_bucket('test-bucket', None, None, None) self.assertTrue(os.path.isdir('/local/test-bucket')) def test_get_bucket(self): @@ -281,7 +281,7 @@ def test_download_signed_url(self): def test_upload_signed_url(self): """Tests upload_signed_url.""" contents = b'aa' - self.provider.create_bucket('test-bucket', None, None) + self.provider.create_bucket('test-bucket', None, None, None) self.provider.upload_signed_url(contents, 'gs://test-bucket/a') with open('/local/test-bucket/objects/a', 'rb') as fp: return self.assertEqual(fp.read(), contents)