From 734a0f048a92315d0ee0f3794c6679d3d9aa6fd0 Mon Sep 17 00:00:00 2001 From: Paul Semel Date: Tue, 8 Oct 2024 18:39:44 +0200 Subject: [PATCH] [build_manager] add support for remote zip (#4263) This adds support for remote ZIP. As of now, performances are quite good locally, and the read ahead mechanism should keep reasonable performance. Also, given that the ClusterFuzz bots are having HDD, numbers might even be better there, as we're only storing on disk when unpacking the build. The memory consumption of this new feature is contant: it uses at most (and most of the time) 50 MB of RAM. --- .../_internal/bot/fuzzers/utils.py | 48 +++--- .../build_management/build_archive.py | 60 +++++-- .../build_management/build_manager.py | 128 +++++++++++--- src/clusterfuzz/_internal/system/archive.py | 162 +++++++++++++++++- .../tests/core/bot/fuzzers/utils_test.py | 44 +++-- .../build_management/build_archive_test.py | 6 +- .../build_management/build_manager_test.py | 102 ++++++++++- src/clusterfuzz/fuzz/__init__.py | 4 +- 8 files changed, 463 insertions(+), 91 deletions(-) diff --git a/src/clusterfuzz/_internal/bot/fuzzers/utils.py b/src/clusterfuzz/_internal/bot/fuzzers/utils.py index 62742e06c1..ffe8d4eac1 100644 --- a/src/clusterfuzz/_internal/bot/fuzzers/utils.py +++ b/src/clusterfuzz/_internal/bot/fuzzers/utils.py @@ -13,10 +13,13 @@ # limitations under the License. """Fuzzer utils.""" +import functools import os import re import stat import tempfile +from typing import Callable +from typing import Optional from clusterfuzz._internal.base import utils from clusterfuzz._internal.metrics import logs @@ -30,7 +33,7 @@ EXTRA_BUILD_DIR = '__extra_build' -def is_fuzz_target_local(file_path, file_handle=None): +def is_fuzz_target(file_path, file_opener: Optional[Callable] = None): """Returns whether |file_path| is a fuzz target binary (local path).""" if '@' in file_path: # GFT targets often have periods in the name that get misinterpreted as an @@ -53,7 +56,7 @@ def is_fuzz_target_local(file_path, file_handle=None): # Ignore files with disallowed extensions (to prevent opening e.g. .zips). return False - if not file_handle and not os.path.exists(file_path): + if not file_opener and not os.path.exists(file_path): # Ignore non-existent files for cases when we don't have a file handle. return False @@ -72,24 +75,27 @@ def is_fuzz_target_local(file_path, file_handle=None): logs.warning('Tried to read from non-regular file: %s.' % file_path) return False - # Use already provided file handle or open the file. - local_file_handle = file_handle or open(file_path, 'rb') - - result = False - for pattern in FUZZ_TARGET_SEARCH_BYTES: - # TODO(metzman): Bound this call so we don't read forever if something went - # wrong. - local_file_handle.seek(0) - result = utils.search_bytes_in_file(pattern, local_file_handle) - if result: - break - - if not file_handle: - # If this local file handle is owned by our function, close it now. - # Otherwise, it is caller's responsibility. - local_file_handle.close() - - return result + # Either use the file opener or open the file ourselves. + if not file_opener: + file_opener = functools.partial(open, mode='rb') + try: + with file_opener(file_path) as file_handle: + result = False + for pattern in FUZZ_TARGET_SEARCH_BYTES: + # TODO(metzman): Bound this call so we don't read forever if something + # went wrong. + file_handle.seek(0) + result = utils.search_bytes_in_file(pattern, file_handle) + if result: + break + + file_handle.close() + + return result + except Exception as e: + # In case we could not open the file, we consider it's not a fuzzer. + logs.warning(f'Could not open {file_path}: {e}') + return False def get_fuzz_targets_local(path): @@ -103,7 +109,7 @@ def get_fuzz_targets_local(path): continue file_path = os.path.join(root, filename) - if is_fuzz_target_local(file_path): + if is_fuzz_target(file_path): fuzz_target_paths.append(file_path) return fuzz_target_paths diff --git a/src/clusterfuzz/_internal/build_management/build_archive.py b/src/clusterfuzz/_internal/build_management/build_archive.py index 0e15bcba34..133fc1c2c1 100644 --- a/src/clusterfuzz/_internal/build_management/build_archive.py +++ b/src/clusterfuzz/_internal/build_management/build_archive.py @@ -190,14 +190,10 @@ def list_fuzz_targets(self) -> List[str]: from clusterfuzz._internal.bot.fuzzers import utils as fuzzer_utils for archive_file in self.list_members(): - file_content = self.try_open(archive_file.name) - if fuzzer_utils.is_fuzz_target_local(archive_file.name, file_content): + if fuzzer_utils.is_fuzz_target(archive_file.name, self.open): fuzz_target = fuzzer_utils.normalize_target_name(archive_file.name) self._fuzz_targets[fuzz_target] = archive_file.name - if file_content: - file_content.close() - return list(self._fuzz_targets.keys()) def unpacked_size(self, fuzz_target: Optional[str] = None) -> int: @@ -299,23 +295,19 @@ def get_target_dependencies( return res -# pylint: disable=redefined-builtin -def open(archive_path: str) -> BuildArchive: - """Opens the archive and gets the appropriate build archive based on the - `archive_path`. The resulting object is usable as a normal archive reader, - but provides additional feature related to build handling. +def open_with_reader(reader: archive.ArchiveReader) -> BuildArchive: + """Open the archive and gets the appropriate build archive based on the + provided archive information. Args: - archive_path: the path to the archive. + reader: the archive reader. Raises: - If the file could not be opened or if the archive type cannot be handled. + If the archive reader cannot be handled. Returns: - the build archive. + The build archive. """ - reader = archive.open(archive_path) - # Unfortunately, there is no good heuristic for determining which build # archive implementation to use. # Hopefully, we can search in the archive whether some files are present and @@ -328,3 +320,41 @@ def open(archive_path: str) -> BuildArchive: if reader.file_exists(args_gn_path): return ChromeBuildArchive(reader) return DefaultBuildArchive(reader) + + +def open(archive_path: str) -> BuildArchive: # pylint: disable=redefined-builtin + """Opens the archive and gets the appropriate build archive based on the + `archive_path`. The resulting object is usable as a normal archive reader, + but provides additional feature related to build handling. + + Args: + archive_path: the path to the archive. + + Raises: + If the file could not be opened or if the archive type cannot be handled. + + Returns: + The build archive. + """ + reader = archive.open(archive_path) + return open_with_reader(reader) + + +def open_uri(uri: str) -> BuildArchive: + """Opens a build archive over HTTP. This is only compatible with chromium as + of now. + + Args: + uri: the URI pointing to the zip file. + + Returns: + The build archive. + """ + reader = archive.ZipArchiveReader(archive.HttpZipFile(uri)) + return open_with_reader(reader) + + +def unzip_over_http_compatible(build_url: str) -> bool: + """Whether the build URL is compatible with unzipping over HTTP. + """ + return archive.HttpZipFile.is_uri_compatible(build_url) diff --git a/src/clusterfuzz/_internal/build_management/build_manager.py b/src/clusterfuzz/_internal/build_management/build_manager.py index af969d78a4..f6fd13dc28 100644 --- a/src/clusterfuzz/_internal/build_management/build_manager.py +++ b/src/clusterfuzz/_internal/build_management/build_manager.py @@ -14,11 +14,13 @@ """Build manager.""" from collections import namedtuple +import contextlib import os import re import shutil import subprocess import time +from typing import Optional from clusterfuzz._internal.base import errors from clusterfuzz._internal.base import utils @@ -402,23 +404,19 @@ def _post_setup_success(self, update_revision=True): if instrumented_library_paths: self._patch_rpaths(instrumented_library_paths) - def _unpack_build(self, base_build_dir, build_dir, build_url): - """Unpacks a build from a build url into the build directory.""" - # Track time taken to unpack builds so that it doesn't silently regress. - start_time = time.time() - - logs.info(f'Unpacking build from {build_url} into {build_dir}.') + @contextlib.contextmanager + def _download_and_open_build_archive(self, base_build_dir: str, + build_dir: str, build_url: str): + """Downloads the build archive at `build_url` and opens it. - # Free up memory. - utils.python_gc() - - # Remove the current build. - logs.info(f'Removing build directory {build_dir}.') - if not shell.remove_directory(build_dir, recreate=True): - logs.error(f'Unable to clear build directory {build_dir}.') - _handle_unrecoverable_error_on_windows() - return False + Args: + base_build_dir: the base build directory + build_dir: the current build directory + build_url: the build URL + Yields: + the build archive + """ # Download build archive locally. build_local_archive = os.path.join(build_dir, os.path.basename(build_url)) @@ -431,15 +429,83 @@ def _unpack_build(self, base_build_dir, build_dir, build_url): 'Failed to make space for download. ' 'Cleared all data directories to free up space, exiting.') - logs.info(f'Downloading build from {build_url}.') + logs.info(f'Downloading build from {build_url} to {build_local_archive}.') try: storage.copy_file_from(build_url, build_local_archive) except Exception as e: logs.error(f'Unable to download build from {build_url}: {e}') - return False + raise try: with build_archive.open(build_local_archive) as build: + yield build + finally: + shell.remove_file(build_local_archive) + + def _open_build_archive(self, base_build_dir: str, build_dir: str, + build_url: str, http_build_url: Optional[str], + unpack_everything: Optional[bool]): + """Gets a handle on a build archive for the current build. Depending on the + provided parameters, this function might download the build archive into + the build directory or directly use remote HTTP archive. + + Args: + unpack_everything: wether we should unpack the whole archive or try + selective unpacking. + base_build_dir: the base build directory. + build_dir: the current build directory. + build_url: the build URL. + http_build_url: the HTTP build URL. + + Raises: + if an error occurred while accessing the file over HTTP or while + downloading the file on disk. + + Returns: + the build archive. + """ + # We only want to use remote unzipping if we're not unpacking everything and + # if the HTTP URL is compatible with remote unzipping. + allow_unpack_over_http = environment.get_value( + 'ALLOW_UNPACK_OVER_HTTP', default_value=False) + can_unzip_over_http = ( + allow_unpack_over_http and not unpack_everything and http_build_url and + build_archive.unzip_over_http_compatible(http_build_url)) + + if not can_unzip_over_http: + return self._download_and_open_build_archive(base_build_dir, build_dir, + build_url) + logs.info("Opening an archive over HTTP, skipping archive download.") + assert http_build_url + return build_archive.open_uri(http_build_url) + + def _unpack_build(self, + base_build_dir, + build_dir, + build_url, + http_build_url=None): + """Unpacks a build from a build url into the build directory.""" + # Track time taken to unpack builds so that it doesn't silently regress. + start_time = time.time() + + unpack_everything = environment.get_value( + 'UNPACK_ALL_FUZZ_TARGETS_AND_FILES') + + logs.info(f'Unpacking build from {build_url} into {build_dir}.') + + # Free up memory. + utils.python_gc() + + # Remove the current build. + logs.info(f'Removing build directory {build_dir}.') + if not shell.remove_directory(build_dir, recreate=True): + logs.error(f'Unable to clear build directory {build_dir}.') + _handle_unrecoverable_error_on_windows() + return False + + try: + with self._open_build_archive(base_build_dir, build_dir, build_url, + http_build_url, unpack_everything) as build: unpack_everything = environment.get_value( 'UNPACK_ALL_FUZZ_TARGETS_AND_FILES') @@ -463,8 +529,7 @@ def _unpack_build(self, base_build_dir, build_dir, build_url): 'Cleared all data directories to free up space, exiting.') # Unpack the local build archive. - logs.info( - f'Unpacking build archive {build_local_archive} to {build_dir}.') + logs.info(f'Unpacking build archive {build_url} to {build_dir}.') trusted = not utils.is_oss_fuzz() build.unpack( @@ -473,7 +538,7 @@ def _unpack_build(self, base_build_dir, build_dir, build_url): trusted=trusted) except Exception as e: - logs.error(f'Unable to unpack build archive {build_local_archive}: {e}') + logs.error(f'Unable to unpack build archive {build_url}: {e}') return False if unpack_everything: @@ -484,9 +549,6 @@ def _unpack_build(self, base_build_dir, build_dir, build_url): partial_build_file_path = os.path.join(build_dir, PARTIAL_BUILD_FILE) utils.write_data_to_file('', partial_build_file_path) - # No point in keeping the archive around. - shell.remove_file(build_local_archive) - elapsed_time = time.time() - start_time elapsed_mins = elapsed_time / 60. log_func = logs.warning if elapsed_time > UNPACK_TIME_LIMIT else logs.info @@ -605,10 +667,20 @@ def __init__(self, revision, build_url, build_prefix='', - fuzz_target=None): + fuzz_target=None, + http_build_url=None): + """RegularBuild constructor. See Build constructor for other parameters. + + Args: + http_build_url: the http build URL. E.g. + http://storage.com/foo/bar.zip. Defaults to None. + build_url: the GCS bucket URL where the build is stored. E.g. + gs://foo/bar.zip. + """ super().__init__( base_build_dir, revision, build_prefix, fuzz_target=fuzz_target) self.build_url = build_url + self.http_build_url = http_build_url if build_prefix: self.build_dir_name = build_prefix.lower() @@ -630,7 +702,7 @@ def setup(self): build_update = not self.exists() if build_update: if not self._unpack_build(self.base_build_dir, self.build_dir, - self.build_url): + self.build_url, self.http_build_url): return False logs.info('Retrieved build r%d.' % self.revision) @@ -1116,6 +1188,9 @@ def setup_regular_build(revision, return None + # build_url points to a GCP bucket, and we're only converting it to its HTTP + # endpoint so that we can use remote unzipping. + http_build_url = build_url.replace('gs://', 'https://storage.googleapis.com/') base_build_dir = _base_build_dir(bucket_path) build_class = RegularBuild @@ -1133,7 +1208,8 @@ def setup_regular_build(revision, revision, build_url, build_prefix=build_prefix, - fuzz_target=fuzz_target) + fuzz_target=fuzz_target, + http_build_url=http_build_url) if build.setup(): result = build else: diff --git a/src/clusterfuzz/_internal/system/archive.py b/src/clusterfuzz/_internal/system/archive.py index e3a7ed185d..f0b180be27 100644 --- a/src/clusterfuzz/_internal/system/archive.py +++ b/src/clusterfuzz/_internal/system/archive.py @@ -15,6 +15,7 @@ import abc import dataclasses +import io import os import tarfile from typing import BinaryIO @@ -24,6 +25,8 @@ from typing import Union import zipfile +import requests + from clusterfuzz._internal.metrics import logs FILE_ATTRIBUTE = '0o10' @@ -39,6 +42,11 @@ ARCHIVE_FILE_EXTENSIONS = ( ZIP_FILE_EXTENSIONS + TAR_FILE_EXTENSIONS + LZMA_FILE_EXTENSIONS) +# This is the size of the internal buffer that we're using to cache HTTP +# range bytes. +HTTP_BUFFER_SIZE = 50 * 1024 * 1024 # 50 MB +HTTP_REQUEST_TIMEOUT = 5 # 5 seconds + StrBytesPathLike = Union[str, bytes, os.PathLike] MatchCallback = Callable[[str], bool] @@ -373,9 +381,9 @@ class ArchiveError(Exception): """ArchiveError""" -# pylint: disable=redefined-builtin -def open(archive_path: str, - file_obj: Optional[BinaryIO] = None) -> ArchiveReader: +def open( # pylint: disable=redefined-builtin + archive_path: str, + file_obj: Optional[BinaryIO] = None) -> ArchiveReader: """Opens the archive and gets the appropriate archive reader based on the `archive_path`. If `file_obj` is not none, the binary file-like object will be used to read the archive instead of opening `archive_path`. @@ -407,6 +415,154 @@ class ArchiveType: TAR_LZMA = 3 +@dataclasses.dataclass +class CacheBlock: + """Represents a cache entry for the HttpZipFile. + Members: + start: the start of the byte range in the file. + end: the end of the byte range in the file (inclusive). + content: the sequence of bytes for this range. + """ + start: int + end: int + content: bytes + + +class HttpZipFile(io.IOBase): + """This class is a very simple file-like object representation of a zip file + over HTTP. + It uses the 'Accept-Ranges' feature of HTTP to fetch parts (or all) of the + file. + See https://docs.python.org/3/glossary.html#term-file-like-object for the + definition of a file-like object. + """ + + @staticmethod + def is_uri_compatible(uri: str) -> bool: + try: + res = requests.head(uri, timeout=HTTP_REQUEST_TIMEOUT) + return res.headers.get('Accept-Ranges') is not None + except Exception as e: + logs.warning(f'Request to {uri} failed: {e}') + return False + + def __init__(self, uri: str): + self.uri = uri + self.session = requests.Session() + # This slows down the overall performances, because compressing something + # that's already encoded is unlikely to shrink its size, and we'll just + # waste time decoding the data twice. + self.session.headers.pop('Accept-Encoding') + resp = self.session.head(self.uri) + self.file_size = int(resp.headers.get('Content-Length', default=0)) + self._current_block = CacheBlock(0, 0, b'') + self._pos = 0 + assert resp.headers.get('Accept-Ranges') is not None + + def seekable(self) -> bool: + """Whether this is seekable.""" + return True + + def readable(self) -> bool: + """Whether this stream is readable.""" + return True + + def seek(self, offset: int, from_what: int = 0) -> int: + """Provides a seek implementation. + + Args: + offset: the offset + from_what: from where the offset should be computed. Defaults to 0. + """ + if from_what == 0: + self._pos = offset + elif from_what == 1: + self._pos = self._pos + offset + else: + self._pos = self.file_size + offset + self._pos = max(min(self._pos, self.file_size), 0) + return self._pos + + def tell(self) -> int: + """Provides a tell implementation. Returns the current cursor position. + + Returns: + the current cursor position. + """ + return self._pos + + def _fetch_from_http(self, start: int, end: int) -> bytes: + """Fetches the bytes from the HTTP URI using the ranges. + + Args: + start: the start of the range. + end (int): the end of the range (inclusive). + + Returns: + the read bytes. + """ + resp = self.session.get(self.uri, headers={'Range': f'bytes={start}-{end}'}) + return resp.content + + def _fetch_from_cache(self, start: int, end: int) -> bytes: + """Fetches the bytes from the cache. If the cache does not contain the + bytes, it will read bytes from HTTP and cache the result. + + Args: + start: the start of the range. + end: the end of the range (inclusive). + + Returns: + the read bytes. + """ + if self._current_block.start > start or self._current_block.end < end: + # We're reading at most `HTTP_BUFFER_SIZE` from the HTTP range request. + read_ahead_end = min(self.file_size - 1, start + HTTP_BUFFER_SIZE - 1) + self._current_block = CacheBlock( + start, read_ahead_end, self._fetch_from_http(start, read_ahead_end)) + inner_start = start - self._current_block.start + inner_end = end - self._current_block.start + return self._current_block.content[inner_start:inner_end + 1] + + def read(self, size: Optional[int] = -1) -> bytes: + """Read into this file-object. + + Args: + size: the size of the read. If not specified, reads all. + + Returns: + the read bytes. + """ + if not size or size == -1: + size = self.file_size - self._pos + read_size = min(self.file_size - self._pos, size) + end_range = self._pos + read_size - 1 + # If the request if greater than the buffer size, we're not caching + # the request, because it is likely that we are trying to read a big + # file in the archive, in which case it's unlikely that we read it again + # in a subsequent read. For small reads, however, it is likely that we are, + # for example, unpacking a whole directory. In this case, reading ahead + # helps reducing the number of HTTP requests that are being made. + if read_size > HTTP_BUFFER_SIZE: + content = self._fetch_from_http(self._pos, end_range) + else: + content = self._fetch_from_cache(self._pos, end_range) + self._pos += read_size + return content + + def read1(self, size: Optional[int] = -1) -> bytes: + """Read into the file-object in at most on system call. + This is exactly similar to the read implementation in our case. + + Args: + size: The size to read. Defaults to -1. + + Returns: + the read bytes. + """ + return self.read(size) + + def get_archive_type(archive_path: str) -> ArchiveType: """Get the type of the archive. diff --git a/src/clusterfuzz/_internal/tests/core/bot/fuzzers/utils_test.py b/src/clusterfuzz/_internal/tests/core/bot/fuzzers/utils_test.py index eaf399866b..30b3951e5c 100644 --- a/src/clusterfuzz/_internal/tests/core/bot/fuzzers/utils_test.py +++ b/src/clusterfuzz/_internal/tests/core/bot/fuzzers/utils_test.py @@ -24,7 +24,7 @@ class IsFuzzTargetLocalTest(unittest.TestCase): - """is_fuzz_target_local tests.""" + """is_fuzz_target tests.""" def setUp(self): test_helpers.patch_environ(self) @@ -42,74 +42,84 @@ def _create_file(self, name, contents=b''): def test_not_a_fuzzer_invalid_name(self): path = self._create_file('abc$_fuzzer', contents=b'LLVMFuzzerTestOneInput') - self.assertFalse(utils.is_fuzz_target_local(path)) + self.assertFalse(utils.is_fuzz_target(path)) def test_not_a_fuzzer_blocklisted_name(self): path = self._create_file( 'jazzer_driver', contents=b'LLVMFuzzerTestOneInput') - self.assertFalse(utils.is_fuzz_target_local(path)) + self.assertFalse(utils.is_fuzz_target(path)) def test_not_a_fuzzer_without_extension(self): path = self._create_file('abc', contents=b'anything') - self.assertFalse(utils.is_fuzz_target_local(path)) + self.assertFalse(utils.is_fuzz_target(path)) def test_not_a_fuzzer_with_extension(self): path = self._create_file('abc.dict', contents=b'LLVMFuzzerTestOneInput') - self.assertFalse(utils.is_fuzz_target_local(path)) + self.assertFalse(utils.is_fuzz_target(path)) def test_not_a_fuzzer_with_extension_and_suffix(self): path = self._create_file( 'abc_fuzzer.dict', contents=b'LLVMFuzzerTestOneInput') - self.assertFalse(utils.is_fuzz_target_local(path)) + self.assertFalse(utils.is_fuzz_target(path)) def test_fuzzer_posix(self): path = self._create_file('abc_fuzzer', contents=b'anything') - self.assertTrue(utils.is_fuzz_target_local(path)) + self.assertTrue(utils.is_fuzz_target(path)) def test_fuzzer_win(self): path = self._create_file('abc_fuzzer.exe', contents=b'anything') - self.assertTrue(utils.is_fuzz_target_local(path)) + self.assertTrue(utils.is_fuzz_target(path)) def test_fuzzer_py(self): path = self._create_file('abc_fuzzer.par', contents=b'anything') - self.assertTrue(utils.is_fuzz_target_local(path)) + self.assertTrue(utils.is_fuzz_target(path)) def test_fuzzer_not_exist(self): - self.assertFalse(utils.is_fuzz_target_local('/not_exist_fuzzer')) + self.assertFalse(utils.is_fuzz_target('/not_exist_fuzzer')) def test_fuzzer_without_suffix(self): path = self._create_file( 'abc', contents=b'anything\nLLVMFuzzerTestOneInput') - self.assertTrue(utils.is_fuzz_target_local(path)) + self.assertTrue(utils.is_fuzz_target(path)) def test_fuzzer_with_name_regex_match(self): environment.set_value('FUZZER_NAME_REGEX', '.*_custom$') path = self._create_file('a_custom', contents=b'anything') - self.assertTrue(utils.is_fuzz_target_local(path)) + self.assertTrue(utils.is_fuzz_target(path)) def test_fuzzer_with_file_string_and_without_name_regex_match(self): environment.set_value('FUZZER_NAME_REGEX', '.*_custom$') path = self._create_file( 'nomatch', contents=b'anything\nLLVMFuzzerTestOneInput') - self.assertFalse(utils.is_fuzz_target_local(path)) + self.assertFalse(utils.is_fuzz_target(path)) def test_fuzzer_without_file_string_and_without_name_regex_match(self): environment.set_value('FUZZER_NAME_REGEX', '.*_custom$') path = self._create_file('nomatch', contents=b'anything') - self.assertFalse(utils.is_fuzz_target_local(path)) + self.assertFalse(utils.is_fuzz_target(path)) def test_fuzzer_with_fuzzer_name_and_without_name_regex_match(self): environment.set_value('FUZZER_NAME_REGEX', '.*_custom$') path = self._create_file( 'a_fuzzer', contents=b'anything\nLLVMFuzzerTestOneInput') - self.assertTrue(utils.is_fuzz_target_local(path)) + self.assertTrue(utils.is_fuzz_target(path)) def test_file_handle(self): """Test with a file handle.""" + + class MockFileOpener: + + def __init__(self, fileobj): + self.fileobj = fileobj + + def __call__(self, _): + return self.fileobj + path = self._create_file( 'abc', contents=b'anything\nLLVMFuzzerTestOneInput') - with open(path, 'rb') as f: - self.assertTrue(utils.is_fuzz_target_local('name', f)) + f_opener = MockFileOpener(open(path, 'rb')) + self.assertTrue(utils.is_fuzz_target('name', f_opener)) + self.assertTrue(f_opener.fileobj.closed) class GetSupportingFileTest(unittest.TestCase): diff --git a/src/clusterfuzz/_internal/tests/core/build_management/build_archive_test.py b/src/clusterfuzz/_internal/tests/core/build_management/build_archive_test.py index 1f96e5d39f..c56180ea1f 100644 --- a/src/clusterfuzz/_internal/tests/core/build_management/build_archive_test.py +++ b/src/clusterfuzz/_internal/tests/core/build_management/build_archive_test.py @@ -132,15 +132,15 @@ def setUp(self): test_helpers.patch(self, [ 'clusterfuzz._internal.system.archive.ArchiveReader', 'clusterfuzz._internal.system.archive.open', - 'clusterfuzz._internal.bot.fuzzers.utils.is_fuzz_target_local', + 'clusterfuzz._internal.bot.fuzzers.utils.is_fuzz_target', ]) self.mock.open.return_value.list_members.return_value = [] - self.mock.is_fuzz_target_local.side_effect = self._mock_is_fuzz_target_local + self.mock.is_fuzz_target.side_effect = self._mock_is_fuzz_target self.build = build_archive.ChromeBuildArchive(self.mock.open.return_value) self._declared_fuzzers = [] self.maxDiff = None - def _mock_is_fuzz_target_local(self, target, _=None): + def _mock_is_fuzz_target(self, target, _=None): target = os.path.basename(target) return target in self._declared_fuzzers diff --git a/src/clusterfuzz/_internal/tests/core/build_management/build_manager_test.py b/src/clusterfuzz/_internal/tests/core/build_management/build_manager_test.py index cd1dd9c670..c8f273cc24 100644 --- a/src/clusterfuzz/_internal/tests/core/build_management/build_manager_test.py +++ b/src/clusterfuzz/_internal/tests/core/build_management/build_manager_test.py @@ -46,6 +46,7 @@ def _mock_unpack_build( _, build_dir, build_url, + http_build_url=None, ): """Mock _unpack_build.""" if not shell.remove_directory(build_dir, recreate=True): @@ -305,7 +306,8 @@ def test_setup(self): self.mock._unpack_build.assert_called_once_with( mock.ANY, '/builds/path_be4c9ca0267afcd38b7c1a3eebb5998d0908f025', '/builds/path_be4c9ca0267afcd38b7c1a3eebb5998d0908f025/revisions', - 'gs://path/file-release-2.zip') + 'gs://path/file-release-2.zip', + 'https://storage.googleapis.com/path/file-release-2.zip') self._assert_env_vars() self.assertEqual(os.environ['APP_REVISION'], '2') @@ -323,6 +325,20 @@ def test_setup(self): # Non-existent revisions do not result in any builds being set up. self.assertIsNone(build_manager.setup_regular_build(3)) + def test_setup_with_http_url(self): + """Tests setup build with compatible http remote zipping.""" + os.environ['RELEASE_BUILD_BUCKET_PATH'] = ( + 'gs://path/file-release-([0-9]+).zip') + self.mock.get_build_urls_list.return_value = [ + 'gs://path/file-release-10.zip', + 'gs://path/file-release-2.zip', + 'gs://path/file-release-1.zip', + ] + self.mock.time.return_value = 1000.0 + build = build_manager.setup_regular_build(2) + self.assertEqual(build.http_build_url, + 'https://storage.googleapis.com/path/file-release-2.zip') + def test_setup_with_extra(self): """Tests setting up a build with an extra build set.""" os.environ['RELEASE_BUILD_BUCKET_PATH'] = ( @@ -355,12 +371,13 @@ def mock_get_build_urls_list(bucket_path, reverse=True): mock.call( mock.ANY, '/builds/path_be4c9ca0267afcd38b7c1a3eebb5998d0908f025', '/builds/path_be4c9ca0267afcd38b7c1a3eebb5998d0908f025/revisions', - 'gs://path/file-release-2.zip'), + 'gs://path/file-release-2.zip', + 'https://storage.googleapis.com/path/file-release-2.zip'), mock.call( mock.ANY, '/builds/path_be4c9ca0267afcd38b7c1a3eebb5998d0908f025/revisions', '/builds/path_be4c9ca0267afcd38b7c1a3eebb5998d0908f025/revisions/__extra_build', - 'gs://path2/file-release-2.zip') + 'gs://path2/file-release-2.zip', None) ]) self._assert_env_vars() @@ -411,6 +428,8 @@ def setUp(self): 'clusterfuzz._internal.bot.fuzzers.utils.get_fuzz_targets', 'clusterfuzz._internal.build_management.build_archive.BuildArchive', 'clusterfuzz._internal.build_management.build_archive.open', + 'clusterfuzz._internal.build_management.build_archive.open_uri', + 'clusterfuzz._internal.build_management.build_archive.unzip_over_http_compatible', 'clusterfuzz._internal.build_management.build_manager.get_build_urls_list', 'clusterfuzz._internal.build_management.build_manager._make_space', 'clusterfuzz._internal.system.shell.clear_temp_directory', @@ -440,9 +459,14 @@ def setUp(self): self.mock.open.return_value.__enter__.return_value.list_fuzz_targets.return_value = [ 'target1', 'target2', 'target3' ] + self.mock.open_uri.return_value.__enter__.return_value.list_fuzz_targets.return_value = [ + 'target1', 'target2', 'target3' + ] + self.mock.unzip_over_http_compatible.return_value = False self.mock._make_space.return_value = True self.mock.open.return_value.__enter__.return_value.unpack.return_value = True + self.mock.open_uri.return_value.__enter__.return_value.unpack.return_value = True self.mock.time.return_value = 1000.0 os.environ['RELEASE_BUILD_BUCKET_PATH'] = ( @@ -671,6 +695,76 @@ def __eq__(_, fuzz_target): # pylint: disable=no-self-argument trusted=True) ]) + def test_setup_fuzz_over_http(self): + """Tests setup fuzzing with compatible remote unzipping.""" + os.environ['TASK_NAME'] = 'fuzz' + os.environ['RELEASE_BUILD_URL_PATTERN'] = ( + 'https://example.com/path/file-release-([0-9]+).zip') + os.environ['ALLOW_UNPACK_OVER_HTTP'] = "True" + self.mock.unzip_over_http_compatible.return_value = True + self.mock.time.return_value = 1000.0 + build = build_manager.setup_regular_build(2) + self.assertIsInstance(build, build_manager.RegularBuild) + self.assertEqual(_get_timestamp(build.base_build_dir), 1000.0) + + self._assert_env_vars() + self.assertEqual(os.environ['APP_REVISION'], '2') + + self.assertEqual( + 1, + self.mock.open_uri.return_value.__enter__.return_value.unpack.call_count + ) + self.assertEqual(1, self.mock.unzip_over_http_compatible.call_count) + + # Test setting up build again. + self.mock.time.return_value = 1005.0 + build = build_manager.setup_regular_build(2) + + self.assertIsInstance(build, build_manager.RegularBuild) + + self.assertEqual(_get_timestamp(build.base_build_dir), 1005.0) + + # Since it was a partial build, the unpack should be called again. + self.assertEqual( + 2, + self.mock.open_uri.return_value.__enter__.return_value.unpack.call_count + ) + + self.assertCountEqual(['target1', 'target2', 'target3'], build.fuzz_targets) + + def test_setup_fuzz_over_http_unpack_all(self): + """Tests setup fuzzing with compatible remote unzipping.""" + os.environ['UNPACK_ALL_FUZZ_TARGETS_AND_FILES'] = 'True' + os.environ['TASK_NAME'] = 'fuzz' + os.environ['RELEASE_BUILD_URL_PATTERN'] = ( + 'https://example.com/path/file-release-([0-9]+).zip') + self.mock.unzip_over_http_compatible.return_value = True + self.mock.time.return_value = 1000.0 + build = build_manager.setup_regular_build(2) + self.assertIsInstance(build, build_manager.RegularBuild) + self.assertEqual(_get_timestamp(build.base_build_dir), 1000.0) + + self._assert_env_vars() + self.assertEqual(os.environ['APP_REVISION'], '2') + + self.assertEqual( + 1, self.mock.open.return_value.__enter__.return_value.unpack.call_count) + + # Test setting up build again. + os.environ['FUZZ_TARGET'] = '' + self.mock.time.return_value = 1005.0 + build = build_manager.setup_regular_build(2) + + self.assertIsInstance(build, build_manager.RegularBuild) + + self.assertEqual(_get_timestamp(build.base_build_dir), 1005.0) + + # Not a partial build, so unpack shouldn't be called again. + self.assertEqual( + 1, self.mock.open.return_value.__enter__.return_value.unpack.call_count) + + self.assertCountEqual(['target1', 'target2', 'target3'], build.fuzz_targets) + def test_delete(self): """Test deleting this build.""" os.environ['FUZZ_TARGET'] = 'fuzz_target' @@ -1237,7 +1331,7 @@ def tearDown(self): # pylint: disable=unused-argument def mock_unpack_build(self, test_build_dir, actual_self, base_build_dir, - build_dir, url): + build_dir, url, http_build_url): test_data_dir = os.path.join( os.path.dirname(os.path.abspath(__file__)), 'build_manager_data', test_build_dir) diff --git a/src/clusterfuzz/fuzz/__init__.py b/src/clusterfuzz/fuzz/__init__.py index c5316c0358..6279b798a9 100644 --- a/src/clusterfuzz/fuzz/__init__.py +++ b/src/clusterfuzz/fuzz/__init__.py @@ -42,9 +42,9 @@ def get_engine(name): return engine_impl -def is_fuzz_target(file_path, file_handle=None): +def is_fuzz_target(file_path, file_opener=None): """Returns whether |file_path| is a fuzz target.""" - return utils.is_fuzz_target_local(file_path, file_handle) + return utils.is_fuzz_target(file_path, file_opener) def get_fuzz_targets(directory):