diff --git a/centipede/BUILD b/centipede/BUILD index eb7b1d4e..39282425 100644 --- a/centipede/BUILD +++ b/centipede/BUILD @@ -1122,6 +1122,7 @@ cc_library( ":thread_pool", ":util", ":workdir", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/log", "@com_google_absl//absl/log:check", "@com_google_absl//absl/random", @@ -1689,8 +1690,11 @@ cc_test( ":feature", ":seed_corpus_maker_lib", ":workdir", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/strings", + "@com_google_fuzztest//common:defs", "@com_google_fuzztest//common:logging", + "@com_google_fuzztest//common:remote_file", "@com_google_fuzztest//common:test_util", "@com_google_googletest//:gtest_main", ], diff --git a/centipede/seed_corpus_maker_lib.cc b/centipede/seed_corpus_maker_lib.cc index 677ebc4c..4ad20f78 100644 --- a/centipede/seed_corpus_maker_lib.cc +++ b/centipede/seed_corpus_maker_lib.cc @@ -36,6 +36,7 @@ #include #include +#include "absl/container/flat_hash_set.h" #include "absl/log/check.h" #include "absl/log/log.h" #include "absl/random/random.h" @@ -115,21 +116,54 @@ absl::Status SampleSeedCorpusElementsFromSource( // LOG(INFO) << "Selected " << src_dirs.size() << " corpus dir(s)"; } - // Find all the corpus shard files in the found dirs. + // Find all the corpus shard and individual input files in the found dirs. std::vector corpus_shard_fnames; + std::vector individual_input_fnames; for (const auto& dir : src_dirs) { - const std::string shards_glob = fs::path{dir} / source.shard_rel_glob; - // NOTE: `RemoteGlobMatch` appends to the output list. - const auto prev_num_shards = corpus_shard_fnames.size(); - RETURN_IF_NOT_OK(RemoteGlobMatch(shards_glob, corpus_shard_fnames)); - LOG(INFO) << "Found " << (corpus_shard_fnames.size() - prev_num_shards) - << " shard(s) matching " << shards_glob; + absl::flat_hash_set current_corpus_shard_fnames; + if (!source.shard_rel_glob.empty()) { + std::vector matched_fnames; + const std::string glob = fs::path{dir} / source.shard_rel_glob; + const auto match_status = RemoteGlobMatch(glob, matched_fnames); + if (!match_status.ok() && !absl::IsNotFound(match_status)) { + LOG(ERROR) << "Got error when glob-matching in " << dir << ": " + << match_status; + } else { + current_corpus_shard_fnames.insert(matched_fnames.begin(), + matched_fnames.end()); + corpus_shard_fnames.insert(corpus_shard_fnames.end(), + matched_fnames.begin(), + matched_fnames.end()); + LOG(INFO) << "Found " << matched_fnames.size() << " shard(s) matching " + << glob; + } + } + if (!source.individual_input_rel_glob.empty()) { + std::vector matched_fnames; + const std::string glob = fs::path{dir} / source.individual_input_rel_glob; + const auto match_status = RemoteGlobMatch(glob, matched_fnames); + if (!match_status.ok() && !absl::IsNotFound(match_status)) { + LOG(ERROR) << "Got error when glob-matching in " << dir << ": " + << match_status; + } else { + size_t num_added_individual_inputs = 0; + for (auto& fname : matched_fnames) { + if (current_corpus_shard_fnames.contains(fname)) continue; + if (RemotePathIsDirectory(fname)) continue; + ++num_added_individual_inputs; + individual_input_fnames.push_back(std::move(fname)); + } + LOG(INFO) << "Found " << num_added_individual_inputs + << " individual input(s) with glob " << glob; + } + } } - LOG(INFO) << "Found " << corpus_shard_fnames.size() - << " shard(s) total in source " << source.dir_glob; + LOG(INFO) << "Found " << corpus_shard_fnames.size() << " shard(s) and " + << individual_input_fnames.size() + << " individual input(s) total in source " << source.dir_glob; - if (corpus_shard_fnames.empty()) { + if (corpus_shard_fnames.empty() && individual_input_fnames.empty()) { LOG(WARNING) << "Skipping empty source " << source.dir_glob; return absl::OkStatus(); } @@ -140,10 +174,12 @@ absl::Status SampleSeedCorpusElementsFromSource( // const auto num_shards = corpus_shard_fnames.size(); std::vector src_elts_per_shard(num_shards); std::vector src_elts_with_features_per_shard(num_shards, 0); + InputAndFeaturesVec src_elts; { constexpr int kMaxReadThreads = 32; - ThreadPool threads{std::min(kMaxReadThreads, num_shards)}; + ThreadPool threads{std::min( + kMaxReadThreads, std::max(num_shards, individual_input_fnames.size()))}; for (int shard = 0; shard < num_shards; ++shard) { const auto& corpus_fname = corpus_shard_fnames[shard]; @@ -193,11 +229,27 @@ absl::Status SampleSeedCorpusElementsFromSource( // threads.Schedule(read_shard); } + + RPROF_SNAPSHOT_AND_LOG("Done reading shards"); + + src_elts.resize(individual_input_fnames.size()); + for (size_t index = 0; index < individual_input_fnames.size(); ++index) { + threads.Schedule([index, &individual_input_fnames, &src_elts] { + ByteArray input; + const auto& path = individual_input_fnames[index]; + const auto read_status = RemoteFileGetContents(path, input); + if (!read_status.ok()) { + LOG(WARNING) << "Skipping individual input path " << path + << " due to read error: " << read_status; + return; + } + src_elts[index] = {std::move(input), {}}; + }); + } } RPROF_SNAPSHOT_AND_LOG("Done reading"); - InputAndFeaturesVec src_elts; size_t src_num_features = 0; for (int s = 0; s < num_shards; ++s) { @@ -217,6 +269,16 @@ absl::Status SampleSeedCorpusElementsFromSource( // RPROF_SNAPSHOT_AND_LOG("Done merging"); + // Remove empty inputs possibly due to read errors. + auto remove_it = + std::remove_if(src_elts.begin(), src_elts.end(), + [](const auto& elt) { return std::get<0>(elt).empty(); }); + if (remove_it != src_elts.end()) { + LOG(WARNING) << "Removed " << std::distance(remove_it, src_elts.end()) + << " empty inputs"; + src_elts.erase(remove_it, src_elts.end()); + } + LOG(INFO) << "Read total of " << src_elts.size() << " elements (" << src_num_features << " with features) from source " << source.dir_glob; diff --git a/centipede/seed_corpus_maker_lib.h b/centipede/seed_corpus_maker_lib.h index 68604fdb..5c3bb1c6 100644 --- a/centipede/seed_corpus_maker_lib.h +++ b/centipede/seed_corpus_maker_lib.h @@ -33,11 +33,16 @@ namespace centipede { // Native struct used by the seed corpus library for seed corpus source. // // TODO(b/362576261): Currently this is mirroring the `proto::SeedCorpusSource` -// proto. But in the future it may change with the core seeding API. +// proto. But in the future it may change with the core seeding API - any +// difference is commented below. struct SeedCorpusSource { std::string dir_glob; uint32_t num_recent_dirs; std::string shard_rel_glob; + // If non-empty, will be used to glob the individual input files (with one + // input in each file) in the source dirs. Any files matching `shard_rel_glob` + // will be skipped. + std::string individual_input_rel_glob; std::variant sampled_fraction_or_count; }; diff --git a/centipede/seed_corpus_maker_lib_test.cc b/centipede/seed_corpus_maker_lib_test.cc index a2e2b124..4064c3e3 100644 --- a/centipede/seed_corpus_maker_lib_test.cc +++ b/centipede/seed_corpus_maker_lib_test.cc @@ -21,13 +21,17 @@ #include // NOLINT #include #include +#include #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "absl/log/check.h" #include "absl/strings/str_cat.h" #include "./centipede/feature.h" #include "./centipede/workdir.h" +#include "./common/defs.h" #include "./common/logging.h" // IWYU pragma: keep +#include "./common/remote_file.h" #include "./common/test_util.h" namespace centipede { @@ -36,6 +40,7 @@ namespace { namespace fs = std::filesystem; using ::testing::IsSubsetOf; +using ::testing::IsSupersetOf; inline constexpr auto kIdxDigits = WorkDir::kDigitsInShardIndex; @@ -178,5 +183,69 @@ TEST(SeedCorpusMakerLibTest, RoundTripWriteReadWrite) { } } +TEST(SeedCorpusMakerLibTest, LoadsBothIndividualInputsAndShardsFromSource) { + const fs::path test_dir = GetTestTempDir(test_info_->name()); + chdir(test_dir.c_str()); + + const InputAndFeaturesVec kShardedInputs = { + {{0}, {}}, + {{1}, {feature_domains::kNoFeature}}, + {{0, 1}, {0x11, 0x23}}, + }; + constexpr std::string_view kCovBin = "bin"; + constexpr std::string_view kCovHash = "hash"; + constexpr std::string_view kRelDir = "dir/foo"; + + const std::vector kIndividualInputs = { + {0, 1, 2}, + {0, 1, 2, 3}, + // Empty input expected to be not in the sample result. + {}}; + // Write sharded inputs. + { + constexpr size_t kNumShards = 2; + const SeedCorpusDestination destination = { + .dir_path = std::string(kRelDir), + .shard_rel_glob = absl::StrCat("distilled-", kCovBin, ".*"), + .shard_index_digits = kIdxDigits, + .num_shards = kNumShards, + }; + CHECK_OK(WriteSeedCorpusElementsToDestination( // + kShardedInputs, kCovBin, kCovHash, destination)); + const std::string workdir = (test_dir / kRelDir).c_str(); + ASSERT_NO_FATAL_FAILURE(VerifyShardsExist( // + workdir, kCovBin, kCovHash, kNumShards, ShardType::kDistilled)); + } + + // Write individual inputs + for (int i = 0; i < kIndividualInputs.size(); ++i) { + const auto path = std::filesystem::path(test_dir) / kRelDir / + absl::StrCat("individual_input_", i); + CHECK_OK(RemoteFileSetContents(path.string(), kIndividualInputs[i])); + } + + // Test that sharded and individual inputs matches what we wrote. + { + InputAndFeaturesVec elements; + ASSERT_OK(SampleSeedCorpusElementsFromSource( // + SeedCorpusSource{ + .dir_glob = std::string(kRelDir), + .num_recent_dirs = 1, + .shard_rel_glob = absl::StrCat("distilled-", kCovBin, ".*"), + // Intentionally try to match the shard files and test if they will + // be read as individual inputs. + .individual_input_rel_glob = "*", + .sampled_fraction_or_count = 1.0f, + }, + kCovBin, kCovHash, elements)); + EXPECT_EQ(elements.size(), 5); // Non-empty inputs + EXPECT_THAT(elements, IsSupersetOf(kShardedInputs)); + EXPECT_THAT(elements, IsSupersetOf(InputAndFeaturesVec{ + {{0, 1, 2}, {}}, + {{0, 1, 2, 3}, {}}, + })); + } +} + } // namespace } // namespace centipede