Skip to content

Commit

Permalink
#Centipede Extend the seed corpus maker library to support legacy inp…
Browse files Browse the repository at this point in the history
…ut glob.

This is the easiest way to support seeding regression/crash inputs for replaying. This feature may also be useful in other contexts.

PiperOrigin-RevId: 703558948
  • Loading branch information
xinhaoyuan authored and copybara-github committed Dec 6, 2024
1 parent f5868e2 commit 2dbca59
Show file tree
Hide file tree
Showing 6 changed files with 161 additions and 12 deletions.
5 changes: 5 additions & 0 deletions centipede/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -913,6 +913,7 @@ cc_library(
":environment",
":mutation_input",
":runner_result",
":stop",
"@com_google_absl//absl/log",
"@com_google_absl//absl/log:check",
"@com_google_fuzztest//common:defs",
Expand Down Expand Up @@ -1121,6 +1122,7 @@ cc_library(
":thread_pool",
":util",
":workdir",
"@com_google_absl//absl/container:flat_hash_set",
"@com_google_absl//absl/log",
"@com_google_absl//absl/log:check",
"@com_google_absl//absl/random",
Expand Down Expand Up @@ -1688,8 +1690,11 @@ cc_test(
":feature",
":seed_corpus_maker_lib",
":workdir",
"@com_google_absl//absl/log:check",
"@com_google_absl//absl/strings",
"@com_google_fuzztest//common:defs",
"@com_google_fuzztest//common:logging",
"@com_google_fuzztest//common:remote_file",
"@com_google_fuzztest//common:test_util",
"@com_google_googletest//:gtest_main",
],
Expand Down
4 changes: 4 additions & 0 deletions centipede/centipede_default_callbacks.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "./centipede/environment.h"
#include "./centipede/mutation_input.h"
#include "./centipede/runner_result.h"
#include "./centipede/stop.h"
#include "./common/defs.h"
#include "./common/logging.h" // IWYU pragma: keep

Expand Down Expand Up @@ -83,6 +84,9 @@ void CentipedeDefaultCallbacks::Mutate(
LOG_FIRST_N(WARNING, 5)
<< "Custom mutator returned no mutants: falling back to internal "
"default mutator";
} else if (ShouldStop()) {
LOG(INFO) << "Stop condition detected. Ignoring mutation errors.";
return;
} else {
LOG(WARNING) << "Custom mutator undetected or misbehaving:";
CHECK(!custom_mutator_is_usable_.has_value())
Expand Down
88 changes: 77 additions & 11 deletions centipede/seed_corpus_maker_lib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <variant>
#include <vector>

#include "absl/container/flat_hash_set.h"
#include "absl/log/check.h"
#include "absl/log/log.h"
#include "absl/random/random.h"
Expand Down Expand Up @@ -115,21 +116,54 @@ absl::Status SampleSeedCorpusElementsFromSource( //
LOG(INFO) << "Selected " << src_dirs.size() << " corpus dir(s)";
}

// Find all the corpus shard files in the found dirs.
// Find all the corpus shard and legacy input files in the found dirs.

std::vector<std::string> corpus_shard_fnames;
std::vector<std::string> legacy_input_fnames;
for (const auto& dir : src_dirs) {
const std::string shards_glob = fs::path{dir} / source.shard_rel_glob;
// NOTE: `RemoteGlobMatch` appends to the output list.
const auto prev_num_shards = corpus_shard_fnames.size();
RETURN_IF_NOT_OK(RemoteGlobMatch(shards_glob, corpus_shard_fnames));
LOG(INFO) << "Found " << (corpus_shard_fnames.size() - prev_num_shards)
<< " shard(s) matching " << shards_glob;
absl::flat_hash_set<std::string> current_corpus_shard_fnames;
if (!source.shard_rel_glob.empty()) {
std::vector<std::string> matched_fnames;
const std::string glob = fs::path{dir} / source.shard_rel_glob;
const auto match_status = RemoteGlobMatch(glob, matched_fnames);
if (!match_status.ok() && !absl::IsNotFound(match_status)) {
LOG(ERROR) << "Got error when glob-matching in " << dir << ": "
<< match_status;
} else {
current_corpus_shard_fnames.insert(matched_fnames.begin(),
matched_fnames.end());
corpus_shard_fnames.insert(corpus_shard_fnames.end(),
matched_fnames.begin(),
matched_fnames.end());
LOG(INFO) << "Found " << matched_fnames.size() << " shard(s) matching "
<< glob;
}
}
if (!source.legacy_input_rel_glob.empty()) {
std::vector<std::string> matched_fnames;
const std::string glob = fs::path{dir} / source.legacy_input_rel_glob;
const auto match_status = RemoteGlobMatch(glob, matched_fnames);
if (!match_status.ok() && !absl::IsNotFound(match_status)) {
LOG(ERROR) << "Got error when glob-matching in " << dir << ": "
<< match_status;
} else {
size_t num_inserted_fnames = 0;
for (auto& fname : matched_fnames) {
if (current_corpus_shard_fnames.contains(fname)) continue;
if (RemotePathIsDirectory(fname)) continue;
++num_inserted_fnames;
legacy_input_fnames.push_back(std::move(fname));
}
LOG(INFO) << "Found " << num_inserted_fnames
<< " legacy input(s) with glob " << glob;
}
}
}
LOG(INFO) << "Found " << corpus_shard_fnames.size()
<< " shard(s) total in source " << source.dir_glob;
LOG(INFO) << "Found " << corpus_shard_fnames.size() << " shard(s) and "
<< legacy_input_fnames.size() << " legacy input(s) total in source "
<< source.dir_glob;

if (corpus_shard_fnames.empty()) {
if (corpus_shard_fnames.empty() && legacy_input_fnames.empty()) {
LOG(WARNING) << "Skipping empty source " << source.dir_glob;
return absl::OkStatus();
}
Expand Down Expand Up @@ -195,9 +229,41 @@ absl::Status SampleSeedCorpusElementsFromSource( //
}
}

RPROF_SNAPSHOT_AND_LOG("Done reading");
RPROF_SNAPSHOT_AND_LOG("Done reading shards");

InputAndFeaturesVec src_elts;

if (!legacy_input_fnames.empty()) {
constexpr size_t kLeastNumLegacyInputsPerBatch = 256;
const size_t legacy_input_batches = std::max(
size_t{1}, legacy_input_fnames.size() / kLeastNumLegacyInputsPerBatch);
const size_t batch_size =
(legacy_input_fnames.size() - 1) / legacy_input_batches + 1;
constexpr int kMaxReadThreads = 32;
ThreadPool threads{std::min<int>(kMaxReadThreads, legacy_input_batches)};
src_elts.resize(legacy_input_fnames.size());

for (int batch = 0; batch < legacy_input_batches; ++batch) {
threads.Schedule([batch, batch_size, &legacy_input_fnames, &src_elts] {
for (int index = batch * batch_size; index < (batch + 1) * batch_size &&
index < legacy_input_fnames.size();
++index) {
ByteArray input;
const auto& path = legacy_input_fnames[index];
const auto read_status = RemoteFileGetContents(path, input);
if (!read_status.ok()) {
LOG(WARNING) << "Error while reading legacy input path " << path
<< ": " << read_status;
continue;
}
src_elts[index] = {input, {}};
}
});
}
}

RPROF_SNAPSHOT_AND_LOG("Done reading");

size_t src_num_features = 0;

for (int s = 0; s < num_shards; ++s) {
Expand Down
7 changes: 6 additions & 1 deletion centipede/seed_corpus_maker_lib.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,16 @@ namespace centipede {
// Native struct used by the seed corpus library for seed corpus source.
//
// TODO(b/362576261): Currently this is mirroring the `proto::SeedCorpusSource`
// proto. But in the future it may change with the core seeding API.
// proto. But in the future it may change with the core seeding API - any
// difference is commented below.
struct SeedCorpusSource {
  // Glob identifying the source corpus dir(s).
  std::string dir_glob;
  // NOTE(review): presumably limits how many of the most recent dirs matching
  // `dir_glob` are used - confirm against the sampling code. Explicitly
  // zero-initialized so that a default-initialized `SeedCorpusSource` never
  // carries an indeterminate value.
  uint32_t num_recent_dirs = 0;
  // If non-empty, will be joined onto each source dir and used to glob the
  // corpus shard files in it.
  std::string shard_rel_glob;
  // If non-empty, will be used to glob the legacy input files (with one input
  // in each file) in the source dirs. Any files matching `shard_rel_glob` will
  // be skipped.
  std::string legacy_input_rel_glob;
  // Holds either a `float` or a `uint32_t`.
  // NOTE(review): presumably a sampled fraction vs. an absolute element count
  // - confirm against the sampling code.
  std::variant<float, uint32_t> sampled_fraction_or_count;
};

Expand Down
65 changes: 65 additions & 0 deletions centipede/seed_corpus_maker_lib_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,17 @@
#include <filesystem> // NOLINT
#include <string>
#include <string_view>
#include <vector>

#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/log/check.h"
#include "absl/strings/str_cat.h"
#include "./centipede/feature.h"
#include "./centipede/workdir.h"
#include "./common/defs.h"
#include "./common/logging.h" // IWYU pragma: keep
#include "./common/remote_file.h"
#include "./common/test_util.h"

namespace centipede {
Expand All @@ -36,6 +40,7 @@ namespace {
namespace fs = std::filesystem;

using ::testing::IsSubsetOf;
using ::testing::IsSupersetOf;

inline constexpr auto kIdxDigits = WorkDir::kDigitsInShardIndex;

Expand Down Expand Up @@ -178,5 +183,65 @@ TEST(SeedCorpusMakerLibTest, RoundTripWriteReadWrite) {
}
}

// Verifies that a single source dir containing both corpus shard files and
// legacy (one-input-per-file) files is sampled into the union of the two, and
// that shard files matched by the legacy glob are not re-read as legacy inputs.
TEST(SeedCorpusMakerLibTest, LoadsBothLegacyInputsAndShardsFromSource) {
  const fs::path test_dir = GetTestTempDir(test_info_->name());
  // The source/destination dirs below are relative paths, so the test is only
  // meaningful if the working directory was actually switched.
  ASSERT_EQ(chdir(test_dir.c_str()), 0);

  const InputAndFeaturesVec kShardedInputs = {
      {{0}, {}},
      {{1}, {feature_domains::kNoFeature}},
      {{0, 1}, {0x11, 0x23}},
  };
  constexpr std::string_view kCovBin = "bin";
  constexpr std::string_view kCovHash = "hash";
  constexpr std::string_view kRelDir1 = "dir/foo";

  const std::vector<ByteArray> kLegacyInputs = {{0, 1, 2}, {0, 1, 2, 3}};
  // Write sharded inputs.
  {
    constexpr size_t kNumShards = 2;
    const SeedCorpusDestination destination = {
        .dir_path = std::string(kRelDir1),
        .shard_rel_glob = absl::StrCat("distilled-", kCovBin, ".*"),
        .shard_index_digits = kIdxDigits,
        .num_shards = kNumShards,
    };
    CHECK_OK(WriteSeedCorpusElementsToDestination(  //
        kShardedInputs, kCovBin, kCovHash, destination));
    const std::string workdir = (test_dir / kRelDir1).string();
    ASSERT_NO_FATAL_FAILURE(VerifyShardsExist(  //
        workdir, kCovBin, kCovHash, kNumShards, ShardType::kDistilled));
  }

  // Write legacy inputs, one input per file, next to the shard files.
  for (size_t i = 0; i < kLegacyInputs.size(); ++i) {
    const auto path = std::filesystem::path(test_dir) / kRelDir1 /
                      absl::StrCat("legacy_input_", i);
    CHECK_OK(RemoteFileSetContents(path.string(), kLegacyInputs[i]));
  }

  // Test that sharded and legacy inputs match what we wrote.
  {
    InputAndFeaturesVec elements;
    ASSERT_OK(SampleSeedCorpusElementsFromSource(  //
        SeedCorpusSource{
            .dir_glob = std::string(kRelDir1),
            .num_recent_dirs = 1,
            .shard_rel_glob = absl::StrCat("distilled-", kCovBin, ".*"),
            // Intentionally try to match the shard files and test if they will
            // be read as legacy inputs.
            .legacy_input_rel_glob = "*",
            .sampled_fraction_or_count = 1.0f,
        },
        kCovBin, kCovHash, elements));
    // The size check ensures no shard file was also counted as a legacy input.
    EXPECT_EQ(elements.size(), kShardedInputs.size() + kLegacyInputs.size());
    EXPECT_THAT(elements, IsSupersetOf(kShardedInputs));
    // Legacy inputs are expected to come back with empty feature vectors.
    EXPECT_THAT(elements, IsSupersetOf(InputAndFeaturesVec{
                              {{0, 1, 2}, {}},
                              {{0, 1, 2, 3}, {}},
                          }));
  }
}

} // namespace
} // namespace centipede
4 changes: 4 additions & 0 deletions common/remote_file_oss.cc
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,10 @@ absl::Status RemoteGlobMatch(std::string_view glob,
if (int ret = ::glob(std::string{glob}.c_str(), GLOB_TILDE, HandleGlobError,
&glob_ret);
ret != 0) {
if (ret == GLOB_NOMATCH) {
return absl::NotFoundError(absl::StrCat(
"glob() returned NOMATCH for pattern: ", std::string(glob)));
}
return absl::UnknownError(absl::StrCat(
"glob() failed, pattern: ", std::string(glob), ", returned: ", ret));
}
Expand Down

0 comments on commit 2dbca59

Please sign in to comment.