From 39f52c356a7c02ce472e9775d9538dfe02b82acf Mon Sep 17 00:00:00 2001 From: Xinhao Yuan Date: Fri, 6 Dec 2024 11:20:10 -0800 Subject: [PATCH] Use `fuzz_tests_in_current_shard` instead of computing the sharding again. This should be non-functional change for now, but it would smooth the transition to the unified execution model where each update-corpus invocation operates on a single test and fuzz_tests_in_current_shard contains exactly that test. PiperOrigin-RevId: 703558945 --- centipede/BUILD | 5 ++ centipede/centipede_default_callbacks.cc | 4 ++ centipede/centipede_interface.cc | 32 ++++----- centipede/seed_corpus_maker_lib.cc | 88 +++++++++++++++++++++--- centipede/seed_corpus_maker_lib.h | 7 +- centipede/seed_corpus_maker_lib_test.cc | 65 +++++++++++++++++ common/remote_file_oss.cc | 4 ++ 7 files changed, 177 insertions(+), 28 deletions(-) diff --git a/centipede/BUILD b/centipede/BUILD index 6835c2ed..39282425 100644 --- a/centipede/BUILD +++ b/centipede/BUILD @@ -913,6 +913,7 @@ cc_library( ":environment", ":mutation_input", ":runner_result", + ":stop", "@com_google_absl//absl/log", "@com_google_absl//absl/log:check", "@com_google_fuzztest//common:defs", @@ -1121,6 +1122,7 @@ cc_library( ":thread_pool", ":util", ":workdir", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/log", "@com_google_absl//absl/log:check", "@com_google_absl//absl/random", @@ -1688,8 +1690,11 @@ cc_test( ":feature", ":seed_corpus_maker_lib", ":workdir", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/strings", + "@com_google_fuzztest//common:defs", "@com_google_fuzztest//common:logging", + "@com_google_fuzztest//common:remote_file", "@com_google_fuzztest//common:test_util", "@com_google_googletest//:gtest_main", ], diff --git a/centipede/centipede_default_callbacks.cc b/centipede/centipede_default_callbacks.cc index 6d211215..1b2464bd 100644 --- a/centipede/centipede_default_callbacks.cc +++ b/centipede/centipede_default_callbacks.cc @@ -25,6 +25,7 @@ #include "./centipede/environment.h" #include "./centipede/mutation_input.h" #include "./centipede/runner_result.h" +#include "./centipede/stop.h" #include "./common/defs.h" #include "./common/logging.h" // IWYU pragma: keep @@ -83,6 +84,9 @@ void CentipedeDefaultCallbacks::Mutate( LOG_FIRST_N(WARNING, 5) << "Custom mutator returned no mutants: falling back to internal " "default mutator"; + } else if (ShouldStop()) { + LOG(INFO) << "Stop condition detected. Ignoring mutation errors."; + return; } else { LOG(WARNING) << "Custom mutator undetected or misbehaving:"; CHECK(!custom_mutator_is_usable_.has_value()) diff --git a/centipede/centipede_interface.cc b/centipede/centipede_interface.cc index dbd309cb..154295da 100644 --- a/centipede/centipede_interface.cc +++ b/centipede/centipede_interface.cc @@ -483,12 +483,15 @@ int UpdateCorpusDatabaseForFuzzTests( CentipedeCallbacksFactory &callbacks_factory) { env.UpdateWithTargetConfig(fuzztest_config); + std::vector fuzz_tests_to_run = + fuzztest_config.fuzz_tests_in_current_shard; + std::sort(fuzz_tests_to_run.begin(), fuzz_tests_to_run.end()); + absl::Time start_time = absl::Now(); LOG(INFO) << "Starting the update of the corpus database for fuzz tests:" << "\nBinary: " << env.binary << "\nCorpus database: " << fuzztest_config.corpus_database - << "\nFuzz tests: " - << absl::StrJoin(fuzztest_config.fuzz_tests, ", "); + << "\nFuzz tests: " << absl::StrJoin(fuzz_tests_to_run, ", "); // Step 1: Preliminary set up of test sharding, binary info, etc. const auto [test_shard_index, total_test_shards] = SetUpTestSharding(); @@ -523,9 +526,8 @@ int UpdateCorpusDatabaseForFuzzTests( // Find the last index of a fuzz test for which we already have a workdir. bool is_resuming = false; int resuming_fuzztest_idx = 0; - for (int i = 0; i < fuzztest_config.fuzz_tests.size(); ++i) { - if (i % total_test_shards != test_shard_index) continue; - env.workdir = base_workdir_path / fuzztest_config.fuzz_tests[i]; + for (int i = 0; i < fuzz_tests_to_run.size(); ++i) { + env.workdir = base_workdir_path / fuzz_tests_to_run[i]; // Check the existence of the coverage path to not only make sure the // workdir exists, but also that it was created for the same binary as in // this run. @@ -536,19 +538,17 @@ int UpdateCorpusDatabaseForFuzzTests( } LOG_IF(INFO, is_resuming) << "Resuming from the fuzz test " - << fuzztest_config.fuzz_tests[resuming_fuzztest_idx] + << fuzz_tests_to_run[resuming_fuzztest_idx] << " (index: " << resuming_fuzztest_idx << ")"; // Step 3: Iterate over the fuzz tests and run them. const std::string binary = env.binary; - for (int i = resuming_fuzztest_idx; i < fuzztest_config.fuzz_tests.size(); - ++i) { - if (i % total_test_shards != test_shard_index) continue; + for (int i = resuming_fuzztest_idx; i < fuzz_tests_to_run.size(); ++i) { ReportErrorWhenNotEnoughTimeToRunEverything( start_time, fuzztest_config.time_limit, /*executed_tests_in_shard=*/i / total_test_shards, fuzztest_config.fuzz_tests.size(), total_test_shards); - env.workdir = base_workdir_path / fuzztest_config.fuzz_tests[i]; + env.workdir = base_workdir_path / fuzz_tests_to_run[i]; if (RemotePathExists(env.workdir) && !is_resuming) { // This could be a workdir from a failed run that used a different version // of the binary. We delete it so that we don't have to deal with the @@ -562,7 +562,7 @@ int UpdateCorpusDatabaseForFuzzTests( // Seed the fuzzing session with the latest coverage corpus from the // previous fuzzing session. const std::filesystem::path fuzztest_db_path = - corpus_database_path / fuzztest_config.fuzz_tests[i]; + corpus_database_path / fuzz_tests_to_run[i]; const std::filesystem::path coverage_dir = fuzztest_db_path / "coverage"; if (RemotePathExists(coverage_dir.c_str()) && !is_resuming) { CHECK_OK(GenerateSeedCorpusFromConfig( @@ -572,8 +572,8 @@ int UpdateCorpusDatabaseForFuzzTests( // TODO: b/338217594 - Call the FuzzTest binary in a flag-agnostic way. constexpr std::string_view kFuzzTestFuzzFlag = "--fuzz="; - env.binary = absl::StrCat(binary, " ", kFuzzTestFuzzFlag, - fuzztest_config.fuzz_tests[i]); + env.binary = + absl::StrCat(binary, " ", kFuzzTestFuzzFlag, fuzz_tests_to_run[i]); absl::Duration time_limit = fuzztest_config.GetTimeLimitPerTest(); absl::Duration time_spent = absl::ZeroDuration(); @@ -585,8 +585,8 @@ int UpdateCorpusDatabaseForFuzzTests( } is_resuming = false; - LOG(INFO) << "Fuzzing " << fuzztest_config.fuzz_tests[i] << " for " - << time_limit << "\n\tTest binary: " << env.binary; + LOG(INFO) << "Fuzzing " << fuzz_tests_to_run[i] << " for " << time_limit + << "\n\tTest binary: " << env.binary; const absl::Time start_time = absl::Now(); ClearEarlyStopRequestAndSetStopTime(/*stop_time=*/start_time + time_limit); @@ -597,7 +597,7 @@ int UpdateCorpusDatabaseForFuzzTests( record_fuzzing_time.Stop(); if (!stats_root_path.empty()) { - const auto stats_dir = stats_root_path / fuzztest_config.fuzz_tests[i]; + const auto stats_dir = stats_root_path / fuzz_tests_to_run[i]; CHECK_OK(RemoteMkdir(stats_dir.c_str())); CHECK_OK(RemotePathRename( workdir.FuzzingStatsPath(), diff --git a/centipede/seed_corpus_maker_lib.cc b/centipede/seed_corpus_maker_lib.cc index 677ebc4c..66296a0f 100644 --- a/centipede/seed_corpus_maker_lib.cc +++ b/centipede/seed_corpus_maker_lib.cc @@ -36,6 +36,7 @@ #include #include +#include "absl/container/flat_hash_set.h" #include "absl/log/check.h" #include "absl/log/log.h" #include "absl/random/random.h" @@ -115,21 +116,54 @@ absl::Status SampleSeedCorpusElementsFromSource( // LOG(INFO) << "Selected " << src_dirs.size() << " corpus dir(s)"; } - // Find all the corpus shard files in the found dirs. + // Find all the corpus shard and legecy input files in the found dirs. std::vector corpus_shard_fnames; + std::vector legacy_input_fnames; for (const auto& dir : src_dirs) { - const std::string shards_glob = fs::path{dir} / source.shard_rel_glob; - // NOTE: `RemoteGlobMatch` appends to the output list. - const auto prev_num_shards = corpus_shard_fnames.size(); - RETURN_IF_NOT_OK(RemoteGlobMatch(shards_glob, corpus_shard_fnames)); - LOG(INFO) << "Found " << (corpus_shard_fnames.size() - prev_num_shards) - << " shard(s) matching " << shards_glob; + absl::flat_hash_set current_corpus_shard_fnames; + if (!source.shard_rel_glob.empty()) { + std::vector matched_fnames; + const std::string glob = fs::path{dir} / source.shard_rel_glob; + const auto match_status = RemoteGlobMatch(glob, matched_fnames); + if (!match_status.ok() && !absl::IsNotFound(match_status)) { + LOG(ERROR) << "Got error when glob-matching in " << dir << ": " + << match_status; + } else { + current_corpus_shard_fnames.insert(matched_fnames.begin(), + matched_fnames.end()); + corpus_shard_fnames.insert(corpus_shard_fnames.end(), + matched_fnames.begin(), + matched_fnames.end()); + LOG(INFO) << "Found " << matched_fnames.size() << " shard(s) matching " + << glob; + } + } + if (!source.legacy_input_rel_glob.empty()) { + std::vector matched_fnames; + const std::string glob = fs::path{dir} / source.legacy_input_rel_glob; + const auto match_status = RemoteGlobMatch(glob, matched_fnames); + if (!match_status.ok() && !absl::IsNotFound(match_status)) { + LOG(ERROR) << "Got error when glob-matching in " << dir << ": " + << match_status; + } else { + size_t num_added_legacy_inputs = 0; + for (auto& fname : matched_fnames) { + if (current_corpus_shard_fnames.contains(fname)) continue; + if (RemotePathIsDirectory(fname)) continue; + ++num_added_legacy_inputs; + legacy_input_fnames.push_back(std::move(fname)); + } + LOG(INFO) << "Found " << num_added_legacy_inputs + << " legacy input(s) with glob " << glob; + } + } } - LOG(INFO) << "Found " << corpus_shard_fnames.size() - << " shard(s) total in source " << source.dir_glob; + LOG(INFO) << "Found " << corpus_shard_fnames.size() << " shard(s) and " + << legacy_input_fnames.size() << " legacy input(s) total in source " + << source.dir_glob; - if (corpus_shard_fnames.empty()) { + if (corpus_shard_fnames.empty() && legacy_input_fnames.empty()) { LOG(WARNING) << "Skipping empty source " << source.dir_glob; return absl::OkStatus(); } @@ -195,9 +229,41 @@ absl::Status SampleSeedCorpusElementsFromSource( // } } - RPROF_SNAPSHOT_AND_LOG("Done reading"); + RPROF_SNAPSHOT_AND_LOG("Done reading shards"); InputAndFeaturesVec src_elts; + + if (!legacy_input_fnames.empty()) { + constexpr size_t kLeastNumLegacyInputsPerBatch = 256; + const size_t legacy_input_batches = std::max( + size_t{1}, legacy_input_fnames.size() / kLeastNumLegacyInputsPerBatch); + const size_t batch_size = + (legacy_input_fnames.size() - 1) / legacy_input_batches + 1; + constexpr int kMaxReadThreads = 32; + ThreadPool threads{std::min(kMaxReadThreads, legacy_input_batches)}; + src_elts.resize(legacy_input_fnames.size()); + + for (int batch = 0; batch < legacy_input_batches; ++batch) { + threads.Schedule([batch, batch_size, &legacy_input_fnames, &src_elts] { + for (int index = batch * batch_size; index < (batch + 1) * batch_size && + index < legacy_input_fnames.size(); + ++index) { + ByteArray input; + const auto& path = legacy_input_fnames[index]; + const auto read_status = RemoteFileGetContents(path, input); + if (!read_status.ok()) { + LOG(WARNING) << "Error while reading legacy input path " << path + << ": " << read_status; + continue; + } + src_elts[index] = {input, {}}; + } + }); + } + } + + RPROF_SNAPSHOT_AND_LOG("Done reading"); + size_t src_num_features = 0; for (int s = 0; s < num_shards; ++s) { diff --git a/centipede/seed_corpus_maker_lib.h b/centipede/seed_corpus_maker_lib.h index 68604fdb..f5e272f0 100644 --- a/centipede/seed_corpus_maker_lib.h +++ b/centipede/seed_corpus_maker_lib.h @@ -33,11 +33,16 @@ namespace centipede { // Native struct used by the seed corpus library for seed corpus source. // // TODO(b/362576261): Currently this is mirroring the `proto::SeedCorpusSource` -// proto. But in the future it may change with the core seeding API. +// proto. But in the future it may change with the core seeding API - any +// difference is commented below. struct SeedCorpusSource { std::string dir_glob; uint32_t num_recent_dirs; std::string shard_rel_glob; + // If non-empty, will be used to glob the legacy input files (with one input + // in each file) in the source dirs. Any files matching `shard_rel_glob` will + // be skipped. + std::string legacy_input_rel_glob; std::variant sampled_fraction_or_count; }; diff --git a/centipede/seed_corpus_maker_lib_test.cc b/centipede/seed_corpus_maker_lib_test.cc index a2e2b124..cdd38b48 100644 --- a/centipede/seed_corpus_maker_lib_test.cc +++ b/centipede/seed_corpus_maker_lib_test.cc @@ -21,13 +21,17 @@ #include // NOLINT #include #include +#include #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "absl/log/check.h" #include "absl/strings/str_cat.h" #include "./centipede/feature.h" #include "./centipede/workdir.h" +#include "./common/defs.h" #include "./common/logging.h" // IWYU pragma: keep +#include "./common/remote_file.h" #include "./common/test_util.h" namespace centipede { @@ -36,6 +40,7 @@ namespace { namespace fs = std::filesystem; using ::testing::IsSubsetOf; +using ::testing::IsSupersetOf; inline constexpr auto kIdxDigits = WorkDir::kDigitsInShardIndex; @@ -178,5 +183,65 @@ TEST(SeedCorpusMakerLibTest, RoundTripWriteReadWrite) { } } +TEST(SeedCorpusMakerLibTest, LoadsBothLegacyInputsAndShardsFromSource) { + const fs::path test_dir = GetTestTempDir(test_info_->name()); + chdir(test_dir.c_str()); + + const InputAndFeaturesVec kShardedInputs = { + {{0}, {}}, + {{1}, {feature_domains::kNoFeature}}, + {{0, 1}, {0x11, 0x23}}, + }; + constexpr std::string_view kCovBin = "bin"; + constexpr std::string_view kCovHash = "hash"; + constexpr std::string_view kRelDir1 = "dir/foo"; + + const std::vector kLegacyInputs = {{0, 1, 2}, {0, 1, 2, 3}}; + // Write sharded inputs. + { + constexpr size_t kNumShards = 2; + const SeedCorpusDestination destination = { + .dir_path = std::string(kRelDir1), + .shard_rel_glob = absl::StrCat("distilled-", kCovBin, ".*"), + .shard_index_digits = kIdxDigits, + .num_shards = kNumShards, + }; + CHECK_OK(WriteSeedCorpusElementsToDestination( // + kShardedInputs, kCovBin, kCovHash, destination)); + const std::string workdir = (test_dir / kRelDir1).c_str(); + ASSERT_NO_FATAL_FAILURE(VerifyShardsExist( // + workdir, kCovBin, kCovHash, kNumShards, ShardType::kDistilled)); + } + + // Write legacy inputs + for (int i = 0; i < kLegacyInputs.size(); ++i) { + const auto path = std::filesystem::path(test_dir) / kRelDir1 / + absl::StrCat("legacy_input_", i); + CHECK_OK(RemoteFileSetContents(path.string(), kLegacyInputs[i])); + } + + // Test that sharded and legacy inputs matches what we wrote. + { + InputAndFeaturesVec elements; + ASSERT_OK(SampleSeedCorpusElementsFromSource( // + SeedCorpusSource{ + .dir_glob = std::string(kRelDir1), + .num_recent_dirs = 1, + .shard_rel_glob = absl::StrCat("distilled-", kCovBin, ".*"), + // Intentionally try to match the shard files and test if they will + // be read as legacy inputs. + .legacy_input_rel_glob = "*", + .sampled_fraction_or_count = 1.0f, + }, + kCovBin, kCovHash, elements)); + EXPECT_EQ(elements.size(), kShardedInputs.size() + kLegacyInputs.size()); + EXPECT_THAT(elements, IsSupersetOf(kShardedInputs)); + EXPECT_THAT(elements, IsSupersetOf(InputAndFeaturesVec{ + {{0, 1, 2}, {}}, + {{0, 1, 2, 3}, {}}, + })); + } +} + } // namespace } // namespace centipede diff --git a/common/remote_file_oss.cc b/common/remote_file_oss.cc index 2ea73afe..646d5fa6 100644 --- a/common/remote_file_oss.cc +++ b/common/remote_file_oss.cc @@ -349,6 +349,10 @@ absl::Status RemoteGlobMatch(std::string_view glob, if (int ret = ::glob(std::string{glob}.c_str(), GLOB_TILDE, HandleGlobError, &glob_ret); ret != 0) { + if (ret == GLOB_NOMATCH) { + return absl::NotFoundError(absl::StrCat( + "glob() returned NOMATCH for pattern: ", std::string(glob))); + } return absl::UnknownError(absl::StrCat( "glob() failed, pattern: ", std::string(glob), ", returned: ", ret)); }