Skip to content

Commit

Permalink
#Centipede Extend the seed corpus maker library to support legacy inp…
Browse files Browse the repository at this point in the history
…ut globs.

This is the easiest way to support seeding regression/crash inputs for replaying. This feature may also be useful in other contexts.

PiperOrigin-RevId: 703558948
  • Loading branch information
xinhaoyuan authored and copybara-github committed Dec 13, 2024
1 parent 7b10721 commit 4a4f6ff
Show file tree
Hide file tree
Showing 4 changed files with 153 additions and 13 deletions.
4 changes: 4 additions & 0 deletions centipede/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -1122,6 +1122,7 @@ cc_library(
":thread_pool",
":util",
":workdir",
"@com_google_absl//absl/container:flat_hash_set",
"@com_google_absl//absl/log",
"@com_google_absl//absl/log:check",
"@com_google_absl//absl/random",
Expand Down Expand Up @@ -1689,8 +1690,11 @@ cc_test(
":feature",
":seed_corpus_maker_lib",
":workdir",
"@com_google_absl//absl/log:check",
"@com_google_absl//absl/strings",
"@com_google_fuzztest//common:defs",
"@com_google_fuzztest//common:logging",
"@com_google_fuzztest//common:remote_file",
"@com_google_fuzztest//common:test_util",
"@com_google_googletest//:gtest_main",
],
Expand Down
86 changes: 74 additions & 12 deletions centipede/seed_corpus_maker_lib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <variant>
#include <vector>

#include "absl/container/flat_hash_set.h"
#include "absl/log/check.h"
#include "absl/log/log.h"
#include "absl/random/random.h"
Expand Down Expand Up @@ -115,21 +116,54 @@ absl::Status SampleSeedCorpusElementsFromSource( //
LOG(INFO) << "Selected " << src_dirs.size() << " corpus dir(s)";
}

// Find all the corpus shard files in the found dirs.
// Find all the corpus shard and individual input files in the found dirs.

std::vector<std::string> corpus_shard_fnames;
std::vector<std::string> individual_input_fnames;
for (const auto& dir : src_dirs) {
const std::string shards_glob = fs::path{dir} / source.shard_rel_glob;
// NOTE: `RemoteGlobMatch` appends to the output list.
const auto prev_num_shards = corpus_shard_fnames.size();
RETURN_IF_NOT_OK(RemoteGlobMatch(shards_glob, corpus_shard_fnames));
LOG(INFO) << "Found " << (corpus_shard_fnames.size() - prev_num_shards)
<< " shard(s) matching " << shards_glob;
absl::flat_hash_set<std::string> current_corpus_shard_fnames;
if (!source.shard_rel_glob.empty()) {
std::vector<std::string> matched_fnames;
const std::string glob = fs::path{dir} / source.shard_rel_glob;
const auto match_status = RemoteGlobMatch(glob, matched_fnames);
if (!match_status.ok() && !absl::IsNotFound(match_status)) {
LOG(ERROR) << "Got error when glob-matching in " << dir << ": "
<< match_status;
} else {
current_corpus_shard_fnames.insert(matched_fnames.begin(),
matched_fnames.end());
corpus_shard_fnames.insert(corpus_shard_fnames.end(),
matched_fnames.begin(),
matched_fnames.end());
LOG(INFO) << "Found " << matched_fnames.size() << " shard(s) matching "
<< glob;
}
}
if (!source.individual_input_rel_glob.empty()) {
std::vector<std::string> matched_fnames;
const std::string glob = fs::path{dir} / source.individual_input_rel_glob;
const auto match_status = RemoteGlobMatch(glob, matched_fnames);
if (!match_status.ok() && !absl::IsNotFound(match_status)) {
LOG(ERROR) << "Got error when glob-matching in " << dir << ": "
<< match_status;
} else {
size_t num_added_individual_inputs = 0;
for (auto& fname : matched_fnames) {
if (current_corpus_shard_fnames.contains(fname)) continue;
if (RemotePathIsDirectory(fname)) continue;
++num_added_individual_inputs;
individual_input_fnames.push_back(std::move(fname));
}
LOG(INFO) << "Found " << num_added_individual_inputs
<< " individual input(s) with glob " << glob;
}
}
}
LOG(INFO) << "Found " << corpus_shard_fnames.size()
<< " shard(s) total in source " << source.dir_glob;
LOG(INFO) << "Found " << corpus_shard_fnames.size() << " shard(s) and "
<< individual_input_fnames.size()
<< " individual input(s) total in source " << source.dir_glob;

if (corpus_shard_fnames.empty()) {
if (corpus_shard_fnames.empty() && individual_input_fnames.empty()) {
LOG(WARNING) << "Skipping empty source " << source.dir_glob;
return absl::OkStatus();
}
Expand All @@ -140,10 +174,12 @@ absl::Status SampleSeedCorpusElementsFromSource( //
const auto num_shards = corpus_shard_fnames.size();
std::vector<InputAndFeaturesVec> src_elts_per_shard(num_shards);
std::vector<size_t> src_elts_with_features_per_shard(num_shards, 0);
InputAndFeaturesVec src_elts;

{
constexpr int kMaxReadThreads = 32;
ThreadPool threads{std::min<int>(kMaxReadThreads, num_shards)};
ThreadPool threads{std::min<int>(
kMaxReadThreads, std::max(num_shards, individual_input_fnames.size()))};

for (int shard = 0; shard < num_shards; ++shard) {
const auto& corpus_fname = corpus_shard_fnames[shard];
Expand Down Expand Up @@ -193,11 +229,27 @@ absl::Status SampleSeedCorpusElementsFromSource( //

threads.Schedule(read_shard);
}

RPROF_SNAPSHOT_AND_LOG("Done reading shards");

src_elts.resize(individual_input_fnames.size());
for (size_t index = 0; index < individual_input_fnames.size(); ++index) {
threads.Schedule([index, &individual_input_fnames, &src_elts] {
ByteArray input;
const auto& path = individual_input_fnames[index];
const auto read_status = RemoteFileGetContents(path, input);
if (!read_status.ok()) {
LOG(WARNING) << "Skipping individual input path " << path
<< " due to read error: " << read_status;
return;
}
src_elts[index] = {std::move(input), {}};
});
}
}

RPROF_SNAPSHOT_AND_LOG("Done reading");

InputAndFeaturesVec src_elts;
size_t src_num_features = 0;

for (int s = 0; s < num_shards; ++s) {
Expand All @@ -217,6 +269,16 @@ absl::Status SampleSeedCorpusElementsFromSource( //

RPROF_SNAPSHOT_AND_LOG("Done merging");

// Remove empty inputs possibly due to read errors.
auto remove_it =
std::remove_if(src_elts.begin(), src_elts.end(),
[](const auto& elt) { return std::get<0>(elt).empty(); });
if (remove_it != src_elts.end()) {
LOG(WARNING) << "Removed " << std::distance(remove_it, src_elts.end())
<< " empty inputs";
src_elts.erase(remove_it, src_elts.end());
}

LOG(INFO) << "Read total of " << src_elts.size() << " elements ("
<< src_num_features << " with features) from source "
<< source.dir_glob;
Expand Down
7 changes: 6 additions & 1 deletion centipede/seed_corpus_maker_lib.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,16 @@ namespace centipede {
// Native struct used by the seed corpus library for seed corpus source.
//
// TODO(b/362576261): Currently this is mirroring the `proto::SeedCorpusSource`
// proto. But in the future it may change with the core seeding API.
// proto. But in the future it may change with the core seeding API - any
// difference is commented below.
struct SeedCorpusSource {
  // Glob for the source corpus dir(s).
  // NOTE(review): the matching of this glob happens outside the visible code;
  // confirm whether it is resolved relative to the current working dir.
  std::string dir_glob;
  // NOTE(review): presumably limits the source to the N most recent dirs
  // matching `dir_glob` - confirm against the implementation.
  // Zero-initialized so that a default-constructed source never carries an
  // indeterminate value (this also mirrors the default of an unset proto
  // `uint32` field).
  uint32_t num_recent_dirs = 0;
  // If non-empty, will be used to glob the corpus shard files in each source
  // dir (the glob is joined onto the dir path).
  std::string shard_rel_glob;
  // If non-empty, will be used to glob the individual input files (with one
  // input in each file) in the source dirs. Any files matching `shard_rel_glob`
  // will be skipped.
  std::string individual_input_rel_glob;
  // Either a fraction (`float`) or an absolute count (`uint32_t`) of elements
  // to sample from the source.
  // NOTE(review): the sampling logic is not visible here - confirm the valid
  // range of the fraction.
  std::variant<float, uint32_t> sampled_fraction_or_count;
};

Expand Down
69 changes: 69 additions & 0 deletions centipede/seed_corpus_maker_lib_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,17 @@
#include <filesystem> // NOLINT
#include <string>
#include <string_view>
#include <vector>

#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/log/check.h"
#include "absl/strings/str_cat.h"
#include "./centipede/feature.h"
#include "./centipede/workdir.h"
#include "./common/defs.h"
#include "./common/logging.h" // IWYU pragma: keep
#include "./common/remote_file.h"
#include "./common/test_util.h"

namespace centipede {
Expand All @@ -36,6 +40,7 @@ namespace {
namespace fs = std::filesystem;

using ::testing::IsSubsetOf;
using ::testing::IsSupersetOf;

inline constexpr auto kIdxDigits = WorkDir::kDigitsInShardIndex;

Expand Down Expand Up @@ -178,5 +183,69 @@ TEST(SeedCorpusMakerLibTest, RoundTripWriteReadWrite) {
}
}

// Verifies that `SampleSeedCorpusElementsFromSource()` returns the elements
// from both the corpus shards and the individual input files of one source
// dir, that shard files matched by the individual input glob are not read a
// second time as individual inputs, and that empty inputs are dropped.
TEST(SeedCorpusMakerLibTest, LoadsBothIndividualInputsAndShardsFromSource) {
  const fs::path test_dir = GetTestTempDir(test_info_->name());
  // The destination and source dirs below are relative paths, so everything
  // that follows is only meaningful from inside `test_dir`: fail fast instead
  // of silently reading/writing somewhere else if the dir change fails.
  ASSERT_EQ(chdir(test_dir.c_str()), 0)
      << "Failed to chdir to " << test_dir.string();

  const InputAndFeaturesVec kShardedInputs = {
      {{0}, {}},
      {{1}, {feature_domains::kNoFeature}},
      {{0, 1}, {0x11, 0x23}},
  };
  constexpr std::string_view kCovBin = "bin";
  constexpr std::string_view kCovHash = "hash";
  constexpr std::string_view kRelDir = "dir/foo";

  const std::vector<ByteArray> kIndividualInputs = {
      {0, 1, 2},
      {0, 1, 2, 3},
      // Empty input expected to be not in the sample result.
      {}};
  // Write sharded inputs.
  {
    constexpr size_t kNumShards = 2;
    const SeedCorpusDestination destination = {
        .dir_path = std::string(kRelDir),
        .shard_rel_glob = absl::StrCat("distilled-", kCovBin, ".*"),
        .shard_index_digits = kIdxDigits,
        .num_shards = kNumShards,
    };
    CHECK_OK(WriteSeedCorpusElementsToDestination(  //
        kShardedInputs, kCovBin, kCovHash, destination));
    const std::string workdir = (test_dir / kRelDir).c_str();
    ASSERT_NO_FATAL_FAILURE(VerifyShardsExist(  //
        workdir, kCovBin, kCovHash, kNumShards, ShardType::kDistilled));
  }

  // Write individual inputs, one file per input, next to the shard files.
  for (size_t i = 0; i < kIndividualInputs.size(); ++i) {
    const auto path = test_dir / kRelDir / absl::StrCat("individual_input_", i);
    CHECK_OK(RemoteFileSetContents(path.string(), kIndividualInputs[i]));
  }

  // Test that sharded and individual inputs match what we wrote.
  {
    InputAndFeaturesVec elements;
    ASSERT_OK(SampleSeedCorpusElementsFromSource(  //
        SeedCorpusSource{
            .dir_glob = std::string(kRelDir),
            .num_recent_dirs = 1,
            .shard_rel_glob = absl::StrCat("distilled-", kCovBin, ".*"),
            // Intentionally try to match the shard files and test if they will
            // be read as individual inputs.
            .individual_input_rel_glob = "*",
            .sampled_fraction_or_count = 1.0f,
        },
        kCovBin, kCovHash, elements));
    // 3 sharded inputs + 2 non-empty individual inputs.
    EXPECT_EQ(elements.size(), 5);  // Non-empty inputs
    EXPECT_THAT(elements, IsSupersetOf(kShardedInputs));
    EXPECT_THAT(elements, IsSupersetOf(InputAndFeaturesVec{
                              {{0, 1, 2}, {}},
                              {{0, 1, 2, 3}, {}},
                          }));
  }
}

} // namespace
} // namespace centipede

0 comments on commit 4a4f6ff

Please sign in to comment.