Skip to content

Commit

Permalink
#Centipede Extend the seed corpus maker library to support individual…
Browse files Browse the repository at this point in the history
… input globs.

This is the easiest way to support seeding regression/crash inputs for replaying. This feature may also be useful in other contexts.

PiperOrigin-RevId: 703558948
  • Loading branch information
xinhaoyuan authored and copybara-github committed Dec 16, 2024
1 parent 4a1a58c commit ca88829
Show file tree
Hide file tree
Showing 4 changed files with 153 additions and 13 deletions.
4 changes: 4 additions & 0 deletions centipede/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -1122,6 +1122,7 @@ cc_library(
":thread_pool",
":util",
":workdir",
"@com_google_absl//absl/container:flat_hash_set",
"@com_google_absl//absl/log",
"@com_google_absl//absl/log:check",
"@com_google_absl//absl/random",
Expand Down Expand Up @@ -1689,8 +1690,11 @@ cc_test(
":feature",
":seed_corpus_maker_lib",
":workdir",
"@com_google_absl//absl/log:check",
"@com_google_absl//absl/strings",
"@com_google_fuzztest//common:defs",
"@com_google_fuzztest//common:logging",
"@com_google_fuzztest//common:remote_file",
"@com_google_fuzztest//common:test_util",
"@com_google_googletest//:gtest_main",
],
Expand Down
86 changes: 74 additions & 12 deletions centipede/seed_corpus_maker_lib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <variant>
#include <vector>

#include "absl/container/flat_hash_set.h"
#include "absl/log/check.h"
#include "absl/log/log.h"
#include "absl/random/random.h"
Expand Down Expand Up @@ -115,21 +116,54 @@ absl::Status SampleSeedCorpusElementsFromSource( //
LOG(INFO) << "Selected " << src_dirs.size() << " corpus dir(s)";
}

// Find all the corpus shard files in the found dirs.
// Find all the corpus shard and individual input files in the found dirs.

std::vector<std::string> corpus_shard_fnames;
std::vector<std::string> individual_input_fnames;
for (const auto& dir : src_dirs) {
const std::string shards_glob = fs::path{dir} / source.shard_rel_glob;
// NOTE: `RemoteGlobMatch` appends to the output list.
const auto prev_num_shards = corpus_shard_fnames.size();
RETURN_IF_NOT_OK(RemoteGlobMatch(shards_glob, corpus_shard_fnames));
LOG(INFO) << "Found " << (corpus_shard_fnames.size() - prev_num_shards)
<< " shard(s) matching " << shards_glob;
absl::flat_hash_set<std::string> current_corpus_shard_fnames;
if (!source.shard_rel_glob.empty()) {
std::vector<std::string> matched_fnames;
const std::string glob = fs::path{dir} / source.shard_rel_glob;
const auto match_status = RemoteGlobMatch(glob, matched_fnames);
if (!match_status.ok() && !absl::IsNotFound(match_status)) {
LOG(ERROR) << "Got error when glob-matching in " << dir << ": "
<< match_status;
} else {
current_corpus_shard_fnames.insert(matched_fnames.begin(),
matched_fnames.end());
corpus_shard_fnames.insert(corpus_shard_fnames.end(),
matched_fnames.begin(),
matched_fnames.end());
LOG(INFO) << "Found " << matched_fnames.size() << " shard(s) matching "
<< glob;
}
}
if (!source.individual_input_rel_glob.empty()) {
std::vector<std::string> matched_fnames;
const std::string glob = fs::path{dir} / source.individual_input_rel_glob;
const auto match_status = RemoteGlobMatch(glob, matched_fnames);
if (!match_status.ok() && !absl::IsNotFound(match_status)) {
LOG(ERROR) << "Got error when glob-matching in " << dir << ": "
<< match_status;
} else {
size_t num_added_individual_inputs = 0;
for (auto& fname : matched_fnames) {
if (current_corpus_shard_fnames.contains(fname)) continue;
if (RemotePathIsDirectory(fname)) continue;
++num_added_individual_inputs;
individual_input_fnames.push_back(std::move(fname));
}
LOG(INFO) << "Found " << num_added_individual_inputs
<< " individual input(s) with glob " << glob;
}
}
}
LOG(INFO) << "Found " << corpus_shard_fnames.size()
<< " shard(s) total in source " << source.dir_glob;
LOG(INFO) << "Found " << corpus_shard_fnames.size() << " shard(s) and "
<< individual_input_fnames.size()
<< " individual input(s) total in source " << source.dir_glob;

if (corpus_shard_fnames.empty()) {
if (corpus_shard_fnames.empty() && individual_input_fnames.empty()) {
LOG(WARNING) << "Skipping empty source " << source.dir_glob;
return absl::OkStatus();
}
Expand All @@ -140,10 +174,12 @@ absl::Status SampleSeedCorpusElementsFromSource( //
const auto num_shards = corpus_shard_fnames.size();
std::vector<InputAndFeaturesVec> src_elts_per_shard(num_shards);
std::vector<size_t> src_elts_with_features_per_shard(num_shards, 0);
InputAndFeaturesVec src_elts;

{
constexpr int kMaxReadThreads = 32;
ThreadPool threads{std::min<int>(kMaxReadThreads, num_shards)};
ThreadPool threads{std::min<int>(
kMaxReadThreads, std::max(num_shards, individual_input_fnames.size()))};

for (int shard = 0; shard < num_shards; ++shard) {
const auto& corpus_fname = corpus_shard_fnames[shard];
Expand Down Expand Up @@ -193,11 +229,27 @@ absl::Status SampleSeedCorpusElementsFromSource( //

threads.Schedule(read_shard);
}

RPROF_SNAPSHOT_AND_LOG("Done reading shards");

src_elts.resize(individual_input_fnames.size());
for (size_t index = 0; index < individual_input_fnames.size(); ++index) {
threads.Schedule([index, &individual_input_fnames, &src_elts] {
ByteArray input;
const auto& path = individual_input_fnames[index];
const auto read_status = RemoteFileGetContents(path, input);
if (!read_status.ok()) {
LOG(WARNING) << "Skipping individual input path " << path
<< " due to read error: " << read_status;
return;
}
src_elts[index] = {std::move(input), {}};
});
}
}

RPROF_SNAPSHOT_AND_LOG("Done reading");

InputAndFeaturesVec src_elts;
size_t src_num_features = 0;

for (int s = 0; s < num_shards; ++s) {
Expand All @@ -217,6 +269,16 @@ absl::Status SampleSeedCorpusElementsFromSource( //

RPROF_SNAPSHOT_AND_LOG("Done merging");

// Remove empty inputs possibly due to read errors.
auto remove_it =
std::remove_if(src_elts.begin(), src_elts.end(),
[](const auto& elt) { return std::get<0>(elt).empty(); });
if (remove_it != src_elts.end()) {
LOG(WARNING) << "Removed " << std::distance(remove_it, src_elts.end())
<< " empty inputs";
src_elts.erase(remove_it, src_elts.end());
}

LOG(INFO) << "Read total of " << src_elts.size() << " elements ("
<< src_num_features << " with features) from source "
<< source.dir_glob;
Expand Down
7 changes: 6 additions & 1 deletion centipede/seed_corpus_maker_lib.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,16 @@ namespace centipede {
// Native struct used by the seed corpus library for seed corpus source.
//
// TODO(b/362576261): Currently this is mirroring the `proto::SeedCorpusSource`
// proto. But in the future it may change with the core seeding API.
// proto. But in the future it may change with the core seeding API - any
// difference is commented below.
struct SeedCorpusSource {
std::string dir_glob;
uint32_t num_recent_dirs;
std::string shard_rel_glob;
// If non-empty, will be used to glob the individual input files (with one
// input in each file) in the source dirs. Any files matching `shard_rel_glob`
// will be skipped.
std::string individual_input_rel_glob;
std::variant<float, uint32_t> sampled_fraction_or_count;
};

Expand Down
69 changes: 69 additions & 0 deletions centipede/seed_corpus_maker_lib_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,17 @@
#include <filesystem> // NOLINT
#include <string>
#include <string_view>
#include <vector>

#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/log/check.h"
#include "absl/strings/str_cat.h"
#include "./centipede/feature.h"
#include "./centipede/workdir.h"
#include "./common/defs.h"
#include "./common/logging.h" // IWYU pragma: keep
#include "./common/remote_file.h"
#include "./common/test_util.h"

namespace centipede {
Expand All @@ -36,6 +40,7 @@ namespace {
namespace fs = std::filesystem;

using ::testing::IsSubsetOf;
using ::testing::IsSupersetOf;

inline constexpr auto kIdxDigits = WorkDir::kDigitsInShardIndex;

Expand Down Expand Up @@ -178,5 +183,69 @@ TEST(SeedCorpusMakerLibTest, RoundTripWriteReadWrite) {
}
}

TEST(SeedCorpusMakerLibTest, LoadsBothIndividualInputsAndShardsFromSource) {
const fs::path test_dir = GetTestTempDir(test_info_->name());
chdir(test_dir.c_str());

const InputAndFeaturesVec kShardedInputs = {
{{0}, {}},
{{1}, {feature_domains::kNoFeature}},
{{0, 1}, {0x11, 0x23}},
};
constexpr std::string_view kCovBin = "bin";
constexpr std::string_view kCovHash = "hash";
constexpr std::string_view kRelDir = "dir/foo";

const std::vector<ByteArray> kIndividualInputs = {
{0, 1, 2},
{0, 1, 2, 3},
// Empty input expected to be not in the sample result.
{}};
// Write sharded inputs.
{
constexpr size_t kNumShards = 2;
const SeedCorpusDestination destination = {
.dir_path = std::string(kRelDir),
.shard_rel_glob = absl::StrCat("distilled-", kCovBin, ".*"),
.shard_index_digits = kIdxDigits,
.num_shards = kNumShards,
};
CHECK_OK(WriteSeedCorpusElementsToDestination( //
kShardedInputs, kCovBin, kCovHash, destination));
const std::string workdir = (test_dir / kRelDir).c_str();
ASSERT_NO_FATAL_FAILURE(VerifyShardsExist( //
workdir, kCovBin, kCovHash, kNumShards, ShardType::kDistilled));
}

// Write individual inputs
for (int i = 0; i < kIndividualInputs.size(); ++i) {
const auto path = std::filesystem::path(test_dir) / kRelDir /
absl::StrCat("individual_input_", i);
CHECK_OK(RemoteFileSetContents(path.string(), kIndividualInputs[i]));
}

// Test that sharded and individual inputs matches what we wrote.
{
InputAndFeaturesVec elements;
ASSERT_OK(SampleSeedCorpusElementsFromSource( //
SeedCorpusSource{
.dir_glob = std::string(kRelDir),
.num_recent_dirs = 1,
.shard_rel_glob = absl::StrCat("distilled-", kCovBin, ".*"),
// Intentionally try to match the shard files and test if they will
// be read as individual inputs.
.individual_input_rel_glob = "*",
.sampled_fraction_or_count = 1.0f,
},
kCovBin, kCovHash, elements));
EXPECT_EQ(elements.size(), 5); // Non-empty inputs
EXPECT_THAT(elements, IsSupersetOf(kShardedInputs));
EXPECT_THAT(elements, IsSupersetOf(InputAndFeaturesVec{
{{0, 1, 2}, {}},
{{0, 1, 2, 3}, {}},
}));
}
}

} // namespace
} // namespace centipede

0 comments on commit ca88829

Please sign in to comment.