Skip to content

Commit

Permalink
[build_manager] add support for remote zip (#4263)
Browse files Browse the repository at this point in the history
This adds support for remote ZIP.

As of now, performance is quite good locally, and the read-ahead
mechanism should keep it reasonable. Also, given that the
ClusterFuzz bots have HDDs, numbers might even be better there, as
we only write to disk when unpacking the build.

The memory consumption of this new feature is constant: it uses at most
(and most of the time exactly) 50 MB of RAM.
  • Loading branch information
paulsemel authored Oct 8, 2024
1 parent bc7fc9e commit 734a0f0
Show file tree
Hide file tree
Showing 8 changed files with 463 additions and 91 deletions.
48 changes: 27 additions & 21 deletions src/clusterfuzz/_internal/bot/fuzzers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,13 @@
# limitations under the License.
"""Fuzzer utils."""

import functools
import os
import re
import stat
import tempfile
from typing import Callable
from typing import Optional

from clusterfuzz._internal.base import utils
from clusterfuzz._internal.metrics import logs
Expand All @@ -30,7 +33,7 @@
EXTRA_BUILD_DIR = '__extra_build'


def is_fuzz_target_local(file_path, file_handle=None):
def is_fuzz_target(file_path, file_opener: Optional[Callable] = None):
"""Returns whether |file_path| is a fuzz target binary (local path)."""
if '@' in file_path:
# GFT targets often have periods in the name that get misinterpreted as an
Expand All @@ -53,7 +56,7 @@ def is_fuzz_target_local(file_path, file_handle=None):
# Ignore files with disallowed extensions (to prevent opening e.g. .zips).
return False

if not file_handle and not os.path.exists(file_path):
if not file_opener and not os.path.exists(file_path):
# Ignore non-existent files for cases when we don't have a file handle.
return False

Expand All @@ -72,24 +75,27 @@ def is_fuzz_target_local(file_path, file_handle=None):
logs.warning('Tried to read from non-regular file: %s.' % file_path)
return False

# Use already provided file handle or open the file.
local_file_handle = file_handle or open(file_path, 'rb')

result = False
for pattern in FUZZ_TARGET_SEARCH_BYTES:
# TODO(metzman): Bound this call so we don't read forever if something went
# wrong.
local_file_handle.seek(0)
result = utils.search_bytes_in_file(pattern, local_file_handle)
if result:
break

if not file_handle:
# If this local file handle is owned by our function, close it now.
# Otherwise, it is caller's responsibility.
local_file_handle.close()

return result
# Either use the file opener or open the file ourselves.
if not file_opener:
file_opener = functools.partial(open, mode='rb')
try:
with file_opener(file_path) as file_handle:
result = False
for pattern in FUZZ_TARGET_SEARCH_BYTES:
# TODO(metzman): Bound this call so we don't read forever if something
# went wrong.
file_handle.seek(0)
result = utils.search_bytes_in_file(pattern, file_handle)
if result:
break

file_handle.close()

return result
except Exception as e:
# In case we could not open the file, we consider it's not a fuzzer.
logs.warning(f'Could not open {file_path}: {e}')
return False


def get_fuzz_targets_local(path):
Expand All @@ -103,7 +109,7 @@ def get_fuzz_targets_local(path):
continue

file_path = os.path.join(root, filename)
if is_fuzz_target_local(file_path):
if is_fuzz_target(file_path):
fuzz_target_paths.append(file_path)

return fuzz_target_paths
Expand Down
60 changes: 45 additions & 15 deletions src/clusterfuzz/_internal/build_management/build_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,14 +190,10 @@ def list_fuzz_targets(self) -> List[str]:
from clusterfuzz._internal.bot.fuzzers import utils as fuzzer_utils

for archive_file in self.list_members():
file_content = self.try_open(archive_file.name)
if fuzzer_utils.is_fuzz_target_local(archive_file.name, file_content):
if fuzzer_utils.is_fuzz_target(archive_file.name, self.open):
fuzz_target = fuzzer_utils.normalize_target_name(archive_file.name)
self._fuzz_targets[fuzz_target] = archive_file.name

if file_content:
file_content.close()

return list(self._fuzz_targets.keys())

def unpacked_size(self, fuzz_target: Optional[str] = None) -> int:
Expand Down Expand Up @@ -299,23 +295,19 @@ def get_target_dependencies(
return res


# pylint: disable=redefined-builtin
def open(archive_path: str) -> BuildArchive:
"""Opens the archive and gets the appropriate build archive based on the
`archive_path`. The resulting object is usable as a normal archive reader,
but provides additional feature related to build handling.
def open_with_reader(reader: archive.ArchiveReader) -> BuildArchive:
"""Open the archive and gets the appropriate build archive based on the
provided archive information.
Args:
archive_path: the path to the archive.
reader: the archive reader.
Raises:
If the file could not be opened or if the archive type cannot be handled.
If the archive reader cannot be handled.
Returns:
the build archive.
The build archive.
"""
reader = archive.open(archive_path)

# Unfortunately, there is no good heuristic for determining which build
# archive implementation to use.
# Hopefully, we can search in the archive whether some files are present and
Expand All @@ -328,3 +320,41 @@ def open(archive_path: str) -> BuildArchive:
if reader.file_exists(args_gn_path):
return ChromeBuildArchive(reader)
return DefaultBuildArchive(reader)


def open(archive_path: str) -> BuildArchive:  # pylint: disable=redefined-builtin
  """Opens the build archive located at `archive_path`.

  The returned object behaves like a regular archive reader, with extra
  build-handling features layered on top.

  Args:
    archive_path: the path to the archive.

  Raises:
    If the file could not be opened or if the archive type cannot be handled.

  Returns:
    The build archive.
  """
  return open_with_reader(archive.open(archive_path))


def open_uri(uri: str) -> BuildArchive:
  """Opens a build archive served over HTTP.

  Only chromium builds are compatible with this for now.

  Args:
    uri: the URI pointing to the zip file.

  Returns:
    The build archive.
  """
  http_file = archive.HttpZipFile(uri)
  return open_with_reader(archive.ZipArchiveReader(http_file))


def unzip_over_http_compatible(build_url: str) -> bool:
  """Returns whether `build_url` can be unzipped remotely over HTTP."""
  return archive.HttpZipFile.is_uri_compatible(build_url)
128 changes: 102 additions & 26 deletions src/clusterfuzz/_internal/build_management/build_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@
"""Build manager."""

from collections import namedtuple
import contextlib
import os
import re
import shutil
import subprocess
import time
from typing import Optional

from clusterfuzz._internal.base import errors
from clusterfuzz._internal.base import utils
Expand Down Expand Up @@ -402,23 +404,19 @@ def _post_setup_success(self, update_revision=True):
if instrumented_library_paths:
self._patch_rpaths(instrumented_library_paths)

def _unpack_build(self, base_build_dir, build_dir, build_url):
"""Unpacks a build from a build url into the build directory."""
# Track time taken to unpack builds so that it doesn't silently regress.
start_time = time.time()

logs.info(f'Unpacking build from {build_url} into {build_dir}.')
@contextlib.contextmanager
def _download_and_open_build_archive(self, base_build_dir: str,
build_dir: str, build_url: str):
"""Downloads the build archive at `build_url` and opens it.
# Free up memory.
utils.python_gc()

# Remove the current build.
logs.info(f'Removing build directory {build_dir}.')
if not shell.remove_directory(build_dir, recreate=True):
logs.error(f'Unable to clear build directory {build_dir}.')
_handle_unrecoverable_error_on_windows()
return False
Args:
base_build_dir: the base build directory
build_dir: the current build directory
build_url: the build URL
Yields:
the build archive
"""
# Download build archive locally.
build_local_archive = os.path.join(build_dir, os.path.basename(build_url))

Expand All @@ -431,15 +429,83 @@ def _unpack_build(self, base_build_dir, build_dir, build_url):
'Failed to make space for download. '
'Cleared all data directories to free up space, exiting.')

logs.info(f'Downloading build from {build_url}.')
logs.info(f'Downloading build from {build_url} to {build_local_archive}.')
try:
storage.copy_file_from(build_url, build_local_archive)
except Exception as e:
logs.error(f'Unable to download build from {build_url}: {e}')
return False
raise

try:
with build_archive.open(build_local_archive) as build:
yield build
finally:
shell.remove_file(build_local_archive)

def _open_build_archive(self, base_build_dir: str, build_dir: str,
                        build_url: str, http_build_url: Optional[str],
                        unpack_everything: Optional[bool]):
  """Gets a handle on a build archive for the current build.

  Depending on the provided parameters, this either downloads the build
  archive into the build directory or directly uses the remote HTTP archive.

  Args:
    base_build_dir: the base build directory.
    build_dir: the current build directory.
    build_url: the build URL.
    http_build_url: the HTTP build URL.
    unpack_everything: whether we should unpack the whole archive or try
      selective unpacking.

  Raises:
    if an error occurred while accessing the file over HTTP or while
    downloading the file on disk.

  Returns:
    the build archive.
  """
  # Remote unzipping is only eligible when it is explicitly enabled, we are
  # not unpacking everything, and the HTTP URL supports it.
  allow_unpack_over_http = environment.get_value(
      'ALLOW_UNPACK_OVER_HTTP', default_value=False)
  use_remote_archive = (
      allow_unpack_over_http and not unpack_everything and http_build_url and
      build_archive.unzip_over_http_compatible(http_build_url))

  if use_remote_archive:
    logs.info("Opening an archive over HTTP, skipping archive download.")
    assert http_build_url
    return build_archive.open_uri(http_build_url)

  return self._download_and_open_build_archive(base_build_dir, build_dir,
                                               build_url)

def _unpack_build(self,
base_build_dir,
build_dir,
build_url,
http_build_url=None):
"""Unpacks a build from a build url into the build directory."""
# Track time taken to unpack builds so that it doesn't silently regress.
start_time = time.time()

unpack_everything = environment.get_value(
'UNPACK_ALL_FUZZ_TARGETS_AND_FILES')

logs.info(f'Unpacking build from {build_url} into {build_dir}.')

# Free up memory.
utils.python_gc()

# Remove the current build.
logs.info(f'Removing build directory {build_dir}.')
if not shell.remove_directory(build_dir, recreate=True):
logs.error(f'Unable to clear build directory {build_dir}.')
_handle_unrecoverable_error_on_windows()
return False

try:
with self._open_build_archive(base_build_dir, build_dir, build_url,
http_build_url, unpack_everything) as build:
unpack_everything = environment.get_value(
'UNPACK_ALL_FUZZ_TARGETS_AND_FILES')

Expand All @@ -463,8 +529,7 @@ def _unpack_build(self, base_build_dir, build_dir, build_url):
'Cleared all data directories to free up space, exiting.')

# Unpack the local build archive.
logs.info(
f'Unpacking build archive {build_local_archive} to {build_dir}.')
logs.info(f'Unpacking build archive {build_url} to {build_dir}.')
trusted = not utils.is_oss_fuzz()

build.unpack(
Expand All @@ -473,7 +538,7 @@ def _unpack_build(self, base_build_dir, build_dir, build_url):
trusted=trusted)

except Exception as e:
logs.error(f'Unable to unpack build archive {build_local_archive}: {e}')
logs.error(f'Unable to unpack build archive {build_url}: {e}')
return False

if unpack_everything:
Expand All @@ -484,9 +549,6 @@ def _unpack_build(self, base_build_dir, build_dir, build_url):
partial_build_file_path = os.path.join(build_dir, PARTIAL_BUILD_FILE)
utils.write_data_to_file('', partial_build_file_path)

# No point in keeping the archive around.
shell.remove_file(build_local_archive)

elapsed_time = time.time() - start_time
elapsed_mins = elapsed_time / 60.
log_func = logs.warning if elapsed_time > UNPACK_TIME_LIMIT else logs.info
Expand Down Expand Up @@ -605,10 +667,20 @@ def __init__(self,
revision,
build_url,
build_prefix='',
fuzz_target=None):
fuzz_target=None,
http_build_url=None):
"""RegularBuild constructor. See Build constructor for other parameters.
Args:
http_build_url: the http build URL. E.g.
http://storage.com/foo/bar.zip. Defaults to None.
build_url: the GCS bucket URL where the build is stored. E.g.
gs://foo/bar.zip.
"""
super().__init__(
base_build_dir, revision, build_prefix, fuzz_target=fuzz_target)
self.build_url = build_url
self.http_build_url = http_build_url

if build_prefix:
self.build_dir_name = build_prefix.lower()
Expand All @@ -630,7 +702,7 @@ def setup(self):
build_update = not self.exists()
if build_update:
if not self._unpack_build(self.base_build_dir, self.build_dir,
self.build_url):
self.build_url, self.http_build_url):
return False

logs.info('Retrieved build r%d.' % self.revision)
Expand Down Expand Up @@ -1116,6 +1188,9 @@ def setup_regular_build(revision,

return None

# build_url points to a GCP bucket, and we're only converting it to its HTTP
# endpoint so that we can use remote unzipping.
http_build_url = build_url.replace('gs://', 'https://storage.googleapis.com/')
base_build_dir = _base_build_dir(bucket_path)

build_class = RegularBuild
Expand All @@ -1133,7 +1208,8 @@ def setup_regular_build(revision,
revision,
build_url,
build_prefix=build_prefix,
fuzz_target=fuzz_target)
fuzz_target=fuzz_target,
http_build_url=http_build_url)
if build.setup():
result = build
else:
Expand Down
Loading

0 comments on commit 734a0f0

Please sign in to comment.