From 2dbca59f8198c2ba62b3da2e434864d09901ddfd Mon Sep 17 00:00:00 2001 From: Xinhao Yuan Date: Fri, 6 Dec 2024 11:20:10 -0800 Subject: [PATCH] #Centipede Extend the seed corpus maker library to support legacy input glob. This is the easiest way to support seeding regression/crash inputs for replaying. This feature may also be useful in other contexts. PiperOrigin-RevId: 703558948 --- centipede/BUILD | 5 ++ centipede/centipede_default_callbacks.cc | 4 ++ centipede/seed_corpus_maker_lib.cc | 88 +++++++++++++++++++++--- centipede/seed_corpus_maker_lib.h | 7 +- centipede/seed_corpus_maker_lib_test.cc | 65 +++++++++++++++++ common/remote_file_oss.cc | 4 ++ 6 files changed, 161 insertions(+), 12 deletions(-) diff --git a/centipede/BUILD b/centipede/BUILD index 6835c2ed..39282425 100644 --- a/centipede/BUILD +++ b/centipede/BUILD @@ -913,6 +913,7 @@ cc_library( ":environment", ":mutation_input", ":runner_result", + ":stop", "@com_google_absl//absl/log", "@com_google_absl//absl/log:check", "@com_google_fuzztest//common:defs", @@ -1121,6 +1122,7 @@ cc_library( ":thread_pool", ":util", ":workdir", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/log", "@com_google_absl//absl/log:check", "@com_google_absl//absl/random", @@ -1688,8 +1690,11 @@ cc_test( ":feature", ":seed_corpus_maker_lib", ":workdir", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/strings", + "@com_google_fuzztest//common:defs", "@com_google_fuzztest//common:logging", + "@com_google_fuzztest//common:remote_file", "@com_google_fuzztest//common:test_util", "@com_google_googletest//:gtest_main", ], diff --git a/centipede/centipede_default_callbacks.cc b/centipede/centipede_default_callbacks.cc index 6d211215..1b2464bd 100644 --- a/centipede/centipede_default_callbacks.cc +++ b/centipede/centipede_default_callbacks.cc @@ -25,6 +25,7 @@ #include "./centipede/environment.h" #include "./centipede/mutation_input.h" #include "./centipede/runner_result.h" +#include "./centipede/stop.h" #include "./common/defs.h" #include "./common/logging.h" // IWYU pragma: keep @@ -83,6 +84,9 @@ void CentipedeDefaultCallbacks::Mutate( LOG_FIRST_N(WARNING, 5) << "Custom mutator returned no mutants: falling back to internal " "default mutator"; + } else if (ShouldStop()) { + LOG(INFO) << "Stop condition detected. Ignoring mutation errors."; + return; } else { LOG(WARNING) << "Custom mutator undetected or misbehaving:"; CHECK(!custom_mutator_is_usable_.has_value()) diff --git a/centipede/seed_corpus_maker_lib.cc b/centipede/seed_corpus_maker_lib.cc index 677ebc4c..caa0d92f 100644 --- a/centipede/seed_corpus_maker_lib.cc +++ b/centipede/seed_corpus_maker_lib.cc @@ -36,6 +36,7 @@ #include #include +#include "absl/container/flat_hash_set.h" #include "absl/log/check.h" #include "absl/log/log.h" #include "absl/random/random.h" @@ -115,21 +116,54 @@ absl::Status SampleSeedCorpusElementsFromSource( // LOG(INFO) << "Selected " << src_dirs.size() << " corpus dir(s)"; } - // Find all the corpus shard files in the found dirs. + // Find all the corpus shard and legecy input files in the found dirs. std::vector corpus_shard_fnames; + std::vector legacy_input_fnames; for (const auto& dir : src_dirs) { - const std::string shards_glob = fs::path{dir} / source.shard_rel_glob; - // NOTE: `RemoteGlobMatch` appends to the output list. - const auto prev_num_shards = corpus_shard_fnames.size(); - RETURN_IF_NOT_OK(RemoteGlobMatch(shards_glob, corpus_shard_fnames)); - LOG(INFO) << "Found " << (corpus_shard_fnames.size() - prev_num_shards) - << " shard(s) matching " << shards_glob; + absl::flat_hash_set current_corpus_shard_fnames; + if (!source.shard_rel_glob.empty()) { + std::vector matched_fnames; + const std::string glob = fs::path{dir} / source.shard_rel_glob; + const auto match_status = RemoteGlobMatch(glob, matched_fnames); + if (!match_status.ok() && !absl::IsNotFound(match_status)) { + LOG(ERROR) << "Got error when glob-matching in " << dir << ": " + << match_status; + } else { + current_corpus_shard_fnames.insert(matched_fnames.begin(), + matched_fnames.end()); + corpus_shard_fnames.insert(corpus_shard_fnames.end(), + matched_fnames.begin(), + matched_fnames.end()); + LOG(INFO) << "Found " << matched_fnames.size() << " shard(s) matching " + << glob; + } + } + if (!source.legacy_input_rel_glob.empty()) { + std::vector matched_fnames; + const std::string glob = fs::path{dir} / source.legacy_input_rel_glob; + const auto match_status = RemoteGlobMatch(glob, matched_fnames); + if (!match_status.ok() && !absl::IsNotFound(match_status)) { + LOG(ERROR) << "Got error when glob-matching in " << dir << ": " + << match_status; + } else { + size_t num_inserted_fnames = 0; + for (auto& fname : matched_fnames) { + if (current_corpus_shard_fnames.contains(fname)) continue; + if (RemotePathIsDirectory(fname)) continue; + ++num_inserted_fnames; + legacy_input_fnames.push_back(std::move(fname)); + } + LOG(INFO) << "Found " << num_inserted_fnames + << " legacy input(s) with glob " << glob; + } + } } - LOG(INFO) << "Found " << corpus_shard_fnames.size() - << " shard(s) total in source " << source.dir_glob; + LOG(INFO) << "Found " << corpus_shard_fnames.size() << " shard(s) and " + << legacy_input_fnames.size() << " legacy input(s) total in source " + << source.dir_glob; - if (corpus_shard_fnames.empty()) { + if (corpus_shard_fnames.empty() && legacy_input_fnames.empty()) { LOG(WARNING) << "Skipping empty source " << source.dir_glob; return absl::OkStatus(); } @@ -195,9 +229,41 @@ absl::Status SampleSeedCorpusElementsFromSource( // } } - RPROF_SNAPSHOT_AND_LOG("Done reading"); + RPROF_SNAPSHOT_AND_LOG("Done reading shards"); InputAndFeaturesVec src_elts; + + if (!legacy_input_fnames.empty()) { + constexpr size_t kLeastNumLegacyInputsPerBatch = 256; + const size_t legacy_input_batches = std::max( + size_t{1}, legacy_input_fnames.size() / kLeastNumLegacyInputsPerBatch); + const size_t batch_size = + (legacy_input_fnames.size() - 1) / legacy_input_batches + 1; + constexpr int kMaxReadThreads = 32; + ThreadPool threads{std::min(kMaxReadThreads, legacy_input_batches)}; + src_elts.resize(legacy_input_fnames.size()); + + for (int batch = 0; batch < legacy_input_batches; ++batch) { + threads.Schedule([batch, batch_size, &legacy_input_fnames, &src_elts] { + for (int index = batch * batch_size; index < (batch + 1) * batch_size && + index < legacy_input_fnames.size(); + ++index) { + ByteArray input; + const auto& path = legacy_input_fnames[index]; + const auto read_status = RemoteFileGetContents(path, input); + if (!read_status.ok()) { + LOG(WARNING) << "Error while reading legacy input path " << path + << ": " << read_status; + continue; + } + src_elts[index] = {input, {}}; + } + }); + } + } + + RPROF_SNAPSHOT_AND_LOG("Done reading"); + size_t src_num_features = 0; for (int s = 0; s < num_shards; ++s) { diff --git a/centipede/seed_corpus_maker_lib.h b/centipede/seed_corpus_maker_lib.h index 68604fdb..f5e272f0 100644 --- a/centipede/seed_corpus_maker_lib.h +++ b/centipede/seed_corpus_maker_lib.h @@ -33,11 +33,16 @@ namespace centipede { // Native struct used by the seed corpus library for seed corpus source. // // TODO(b/362576261): Currently this is mirroring the `proto::SeedCorpusSource` -// proto. But in the future it may change with the core seeding API. +// proto. But in the future it may change with the core seeding API - any +// difference is commented below. struct SeedCorpusSource { std::string dir_glob; uint32_t num_recent_dirs; std::string shard_rel_glob; + // If non-empty, will be used to glob the legacy input files (with one input + // in each file) in the source dirs. Any files matching `shard_rel_glob` will + // be skipped. + std::string legacy_input_rel_glob; std::variant sampled_fraction_or_count; }; diff --git a/centipede/seed_corpus_maker_lib_test.cc b/centipede/seed_corpus_maker_lib_test.cc index a2e2b124..cdd38b48 100644 --- a/centipede/seed_corpus_maker_lib_test.cc +++ b/centipede/seed_corpus_maker_lib_test.cc @@ -21,13 +21,17 @@ #include // NOLINT #include #include +#include #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "absl/log/check.h" #include "absl/strings/str_cat.h" #include "./centipede/feature.h" #include "./centipede/workdir.h" +#include "./common/defs.h" #include "./common/logging.h" // IWYU pragma: keep +#include "./common/remote_file.h" #include "./common/test_util.h" namespace centipede { @@ -36,6 +40,7 @@ namespace { namespace fs = std::filesystem; using ::testing::IsSubsetOf; +using ::testing::IsSupersetOf; inline constexpr auto kIdxDigits = WorkDir::kDigitsInShardIndex; @@ -178,5 +183,65 @@ TEST(SeedCorpusMakerLibTest, RoundTripWriteReadWrite) { } } +TEST(SeedCorpusMakerLibTest, LoadsBothLegacyInputsAndShardsFromSource) { + const fs::path test_dir = GetTestTempDir(test_info_->name()); + chdir(test_dir.c_str()); + + const InputAndFeaturesVec kShardedInputs = { + {{0}, {}}, + {{1}, {feature_domains::kNoFeature}}, + {{0, 1}, {0x11, 0x23}}, + }; + constexpr std::string_view kCovBin = "bin"; + constexpr std::string_view kCovHash = "hash"; + constexpr std::string_view kRelDir1 = "dir/foo"; + + const std::vector kLegacyInputs = {{0, 1, 2}, {0, 1, 2, 3}}; + // Write sharded inputs. + { + constexpr size_t kNumShards = 2; + const SeedCorpusDestination destination = { + .dir_path = std::string(kRelDir1), + .shard_rel_glob = absl::StrCat("distilled-", kCovBin, ".*"), + .shard_index_digits = kIdxDigits, + .num_shards = kNumShards, + }; + CHECK_OK(WriteSeedCorpusElementsToDestination( // + kShardedInputs, kCovBin, kCovHash, destination)); + const std::string workdir = (test_dir / kRelDir1).c_str(); + ASSERT_NO_FATAL_FAILURE(VerifyShardsExist( // + workdir, kCovBin, kCovHash, kNumShards, ShardType::kDistilled)); + } + + // Write legacy inputs + for (int i = 0; i < kLegacyInputs.size(); ++i) { + const auto path = std::filesystem::path(test_dir) / kRelDir1 / + absl::StrCat("legacy_input_", i); + CHECK_OK(RemoteFileSetContents(path.string(), kLegacyInputs[i])); + } + + // Test that sharded and legacy inputs matches what we wrote. + { + InputAndFeaturesVec elements; + ASSERT_OK(SampleSeedCorpusElementsFromSource( // + SeedCorpusSource{ + .dir_glob = std::string(kRelDir1), + .num_recent_dirs = 1, + .shard_rel_glob = absl::StrCat("distilled-", kCovBin, ".*"), + // Intentionally try to match the shard files and test if they will + // be read as legacy inputs. + .legacy_input_rel_glob = "*", + .sampled_fraction_or_count = 1.0f, + }, + kCovBin, kCovHash, elements)); + EXPECT_EQ(elements.size(), kShardedInputs.size() + kLegacyInputs.size()); + EXPECT_THAT(elements, IsSupersetOf(kShardedInputs)); + EXPECT_THAT(elements, IsSupersetOf(InputAndFeaturesVec{ + {{0, 1, 2}, {}}, + {{0, 1, 2, 3}, {}}, + })); + } +} + } // namespace } // namespace centipede diff --git a/common/remote_file_oss.cc b/common/remote_file_oss.cc index 2ea73afe..646d5fa6 100644 --- a/common/remote_file_oss.cc +++ b/common/remote_file_oss.cc @@ -349,6 +349,10 @@ absl::Status RemoteGlobMatch(std::string_view glob, if (int ret = ::glob(std::string{glob}.c_str(), GLOB_TILDE, HandleGlobError, &glob_ret); ret != 0) { + if (ret == GLOB_NOMATCH) { + return absl::NotFoundError(absl::StrCat( + "glob() returned NOMATCH for pattern: ", std::string(glob))); + } return absl::UnknownError(absl::StrCat( "glob() failed, pattern: ", std::string(glob), ", returned: ", ret)); }