Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add population support to GRG samples #3

Merged
merged 1 commit into from
Jul 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,13 @@ endif()

set(GRGL_TEST_SOURCES
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_common.cpp
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_construct.cpp
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_bloom_filter.cpp
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_grg.cpp
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_main.cpp
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_mutation.cpp
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_serialize.cpp
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_util.cpp
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_visitor.cpp
)

Expand Down Expand Up @@ -201,6 +203,6 @@ if(ENABLE_TESTS)

# Now simply link against gtest or gtest_main as needed. Eg
add_executable(grgl_test ${GRGL_TEST_SOURCES})
target_link_libraries(grgl_test gtest_main tskit grgl)
target_link_libraries(grgl_test gtest_main tskit grgl z ${BGEN_LIBS})
add_test(NAME grgl_test COMMAND grgl_test)
endif()
7 changes: 5 additions & 2 deletions include/grgl/mut_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ class MutationIterator {

virtual void getMetadata(size_t& ploidy, size_t& numIndividuals, bool& isPhased) = 0;

virtual std::vector<std::string> getIndividualIds() = 0;

virtual size_t countMutations() const = 0;

bool next(MutationAndSamples& mutAndSamples, size_t& totalSamples);
Expand Down Expand Up @@ -114,6 +116,7 @@ class VCFMutationIterator : public MutationIterator {

void getMetadata(size_t& ploidy, size_t& numIndividuals, bool& isPhased) override;
size_t countMutations() const override;
std::vector<std::string> getIndividualIds() override;

protected:
void buffer_next(size_t& totalSamples) override;
Expand All @@ -132,8 +135,8 @@ class IGDMutationIterator : public MutationIterator {
const char* filename, FloatRange genomeRange, bool binaryMutations, bool emitMissingData, bool flipRefMajor);

void getMetadata(size_t& ploidy, size_t& numIndividuals, bool& isPhased) override;

size_t countMutations() const override;
std::vector<std::string> getIndividualIds() override;

protected:
void buffer_next(size_t& totalSamples) override;
Expand All @@ -157,8 +160,8 @@ class BGENMutationIterator : public MutationIterator {
~BGENMutationIterator() override;

void getMetadata(size_t& ploidy, size_t& numIndividuals, bool& isPhased) override;

size_t countMutations() const override;
std::vector<std::string> getIndividualIds() override;

protected:
void buffer_next(size_t& totalSamples) override;
Expand Down
21 changes: 13 additions & 8 deletions pygrgl/clicmd/construct.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,23 @@ def add_options(subparser):
subparser.add_argument("--parts", "-p", type=int, default=8,
help="The number of parts to split the sequence into; defaults to 8")
subparser.add_argument("--jobs", "-j", type=int, default=1,
help="Number of jobs (threads/cores) to use. Defaults to 1.")
help="Number of jobs (threads/cores) to use. Defaults to 1.")
subparser.add_argument("--trees", "-t", type=int, default=1,
help="Number of trees to use during shape construction. Defaults to 1.")
help="Number of trees to use during shape construction. Defaults to 1.")
subparser.add_argument("--binary-muts", "-b", action="store_true",
help="Use binary mutations (don't track specific alternate alleles).")
help="Use binary mutations (don't track specific alternate alleles).")
subparser.add_argument("--no-file-cleanup", "-c", action="store_true",
help="Do not cleanup intermediate files (for debugging, e.g.).")
help="Do not cleanup intermediate files (for debugging, e.g.).")
subparser.add_argument("--no-maf-flip", action="store_true",
help="Do not switch the reference allele with the major allele")
help="Do not switch the reference allele with the major allele")
subparser.add_argument("--shape-lf-filter", "-f", type=float, default=10.0,
help="During shape construction ignore mutations with counts less than this."
"If value is <1.0 then it is treated as a frequency. Defaults to 10 (count).")
help="During shape construction ignore mutations with counts less than this."
"If value is <1.0 then it is treated as a frequency. Defaults to 10 (count).")
subparser.add_argument("--population-ids", default=None,
help="Format: \"filename:fieldname\". Read population ids from the given "
"tab-separate file, using the given fieldname.")
subparser.add_argument("--verbose", "-v", action="store_true",
help="Verbose output, including timing information.")
help="Verbose output, including timing information.")

grgl_exe = which("grgl")
grg_merge_exe = which("grg-merge")
Expand Down Expand Up @@ -56,6 +59,8 @@ def build_shape(range_triple, args, input_file):
command = [grgl_exe, input_file]
if args.no_maf_flip:
command.append("--no-maf-flip")
if args.population_ids:
command.extend(["--population-ids", args.population_ids])
command.extend(["--lf-filter", str(args.shape_lf_filter)])
command.extend(["-l", "-s", "-r", f"{base}:{base+pspans}",
"-o", out_filename_tree(input_file, part, tnum)])
Expand Down
37 changes: 36 additions & 1 deletion src/build_shape.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <chrono>
#include <deque>
#include <limits>
#include <stdexcept>
#include <unordered_map>

#include "grgl/grg.h"
Expand Down Expand Up @@ -232,7 +233,8 @@ MutableGRGPtr createEmptyGRGFromSamples(const std::string& sampleFile,
const bool useBinaryMuts,
const bool emitMissingData,
const bool flipRefMajor,
const double dropBelowThreshold) {
const double dropBelowThreshold,
const std::map<std::string, std::string>& indivIdToPop) {
MutableGRGPtr result;
NodeToHapVect hashIndex;
std::cout << "Building genotype hash index..." << std::endl;
Expand All @@ -252,6 +254,39 @@ MutableGRGPtr createEmptyGRGFromSamples(const std::string& sampleFile,

std::cout << "Done" << std::endl;
result = std::make_shared<MutableGRG>(hashIndex.size());
if (!indivIdToPop.empty()) {
size_t ploidy = 0;
size_t numIndividuals = 0;
bool isPhased = false;
mutationIterator->getMetadata(ploidy, numIndividuals, isPhased);
std::vector<std::string> indivIds = mutationIterator->getIndividualIds();
release_assert(indivIds.size() == numIndividuals);
std::map<std::string, size_t> popDescriptionMap;
for (NodeID individual = 0; individual < indivIds.size(); individual++) {
const auto& stringId = indivIds[individual];
const auto& findIt = indivIdToPop.find(stringId);
if (findIt == indivIdToPop.end()) {
std::stringstream ssErr;
ssErr << "Could not find population mapping for individual " << stringId;
throw std::runtime_error(ssErr.str());
}
const auto& popDescription = findIt->second;
const size_t nextPopId = popDescriptionMap.size();
const auto& findPopIt = popDescriptionMap.emplace(popDescription, nextPopId);
const auto popId = findPopIt.first->second;
if (findPopIt.second) {
release_assert(popId == nextPopId);
result->addPopulation(popDescription);
} else {
release_assert(popId != nextPopId);
}
for (NodeID offset = 0; offset < ploidy; offset++) {
const NodeID sampleId = (individual * ploidy) + offset;
release_assert(sampleId < result->numSamples());
result->getNodeData(sampleId).populationId = popId;
}
}
}
std::cout << "Adding GRG shape from genotype hashes..." << std::endl;
const bool buildBinaryTrees = true;
addGrgShapeFromHashing(result, hashIndex, result->getSampleNodes(), buildBinaryTrees);
Expand Down
3 changes: 2 additions & 1 deletion src/build_shape.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ MutableGRGPtr createEmptyGRGFromSamples(const std::string& sampleFile,
bool useBinaryMuts,
bool emitMissingData,
bool flipRefMajor,
double dropBelowThreshold);
double dropBelowThreshold,
const std::map<std::string, std::string>& indivIdToPop);

}

Expand Down
66 changes: 63 additions & 3 deletions src/gconverter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,26 @@
#include "picovcf.hpp"
#include "util.h"

static grgl::NodeIDList trimIndividuals(grgl::NodeIDList sampleList,
const size_t ploidy,
const std::unordered_map<grgl::NodeID, grgl::NodeID>& keepIndividuals) {
if (!keepIndividuals.empty()) {
grgl::NodeIDList newList;
newList.reserve(keepIndividuals.size() * ploidy);
for (auto sampleId : sampleList) {
const auto extra = sampleId % ploidy;
const auto indivId = sampleId / ploidy;
const auto& findIt = keepIndividuals.find(indivId);
if (findIt != keepIndividuals.end()) {
grgl::NodeID newID = (findIt->second * ploidy) + extra;
newList.emplace_back(newID);
}
}
return std::move(newList);
}
return std::move(sampleList);
}

static void trim(grgl::NodeIDList& sampleList, size_t newNumSamples) {
if (0 != newNumSamples) {
size_t trimTo = sampleList.size();
Expand Down Expand Up @@ -90,6 +110,7 @@ static void mutationIteratorToIGD(const std::string& inFilename,
const double fpPerVariant = 0,
const double fnPerVariant = 0,
const size_t trimToSamples = 0,
std::string keepIndividualFile = "",
bool verbose = true) {
constexpr size_t EMIT_EVERY = 10000;

Expand All @@ -114,7 +135,39 @@ static void mutationIteratorToIGD(const std::string& inFilename,

const size_t numSamples = numIndividuals * ploidy;
release_assert(trimToSamples == 0 || (trimToSamples % ploidy == 0));
const size_t effectiveSamples = (0 == trimToSamples) ? numSamples : trimToSamples;
size_t effectiveSamples = (0 == trimToSamples) ? numSamples : trimToSamples;

std::unordered_map<grgl::NodeID, grgl::NodeID> keepIndividuals;
auto individualIds = iterator->getIndividualIds();
if (!keepIndividualFile.empty()) {
if (individualIds.empty()) {
throw grgl::BadInputFileFailure("Individual filtering requires the input file to contain individual IDs");
}
std::map<std::string, grgl::NodeID> idToNodeId;
for (size_t i = 0; i < individualIds.size(); i++) {
idToNodeId.emplace(individualIds[i], i);
}
std::vector<std::string> newIndividualIds;
std::ifstream filterFile(keepIndividualFile);
std::string indivId;
size_t count = 0;
while (std::getline(filterFile, indivId)) {
release_assert(!indivId.empty());
const auto& findIt = idToNodeId.find(indivId);
if (findIt == idToNodeId.end()) {
std::stringstream ssErr;
ssErr << "Could not find individual with id " << indivId;
throw grgl::BadInputFileFailure(ssErr.str().c_str());
}
keepIndividuals.emplace(findIt->second, count++);
newIndividualIds.push_back(indivId);
}
individualIds = std::move(newIndividualIds);
size_t newSamples = individualIds.size() * ploidy;
if (newSamples < effectiveSamples) {
effectiveSamples = newSamples;
}
}

std::ofstream outFile(outFilename, std::ios::binary);
picovcf::IGDWriter writer(ploidy, effectiveSamples / ploidy, isPhased);
Expand All @@ -134,6 +187,7 @@ static void mutationIteratorToIGD(const std::string& inFilename,
missingCount += mutAndSamples.samples.size();
}
trim(mutAndSamples.samples, trimToSamples);
mutAndSamples.samples = trimIndividuals(std::move(mutAndSamples.samples), ploidy, keepIndividuals);
if (fpPerVariant + fnPerVariant > 0) {
const double fp = fpPerVariant + fpLeftovers;
const size_t fpThisVariant = (size_t)fp;
Expand All @@ -156,7 +210,7 @@ static void mutationIteratorToIGD(const std::string& inFilename,
}
writer.writeIndex(outFile);
writer.writeVariantInfo(outFile);
writer.writeIndividualIds(outFile, {});
writer.writeIndividualIds(outFile, individualIds);
outFile.seekp(0);
writer.writeHeader(outFile, inFilename, "");
}
Expand All @@ -179,6 +233,11 @@ int main(int argc, char* argv[]) {
"genomeRange",
"Only convert for the given genome range: 'x:y' means [x, y) (x inclusive, y exclusive)",
{'r', "range"});
args::ValueFlag<std::string> keepIndivs(
parser,
"keepIndivs",
"Only retain the individuals with the IDs given in this file (one ID per line).",
{'i', "keep-indivs"});
try {
parser.ParseCLI(argc, argv);
} catch (args::Help&) {
Expand Down Expand Up @@ -231,6 +290,7 @@ int main(int argc, char* argv[]) {
restrictRange,
falseNegPerVariant ? *falseNegPerVariant : 0,
falsePosPerVariant ? *falsePosPerVariant : 0,
trimTo ? *trimTo : 0);
trimTo ? *trimTo : 0,
keepIndivs ? *keepIndivs : "");
return 0;
}
19 changes: 18 additions & 1 deletion src/grgl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ int main(int argc, char** argv) {
"ts-node-times",
"When converting tree-seq, use node times instead of mutation times",
{"ts-node-times"});
args::ValueFlag<std::string> populationIds(parser,
"population-ids",
"Format: \"filename:fieldname\". Read population ids from the given "
"tab-separate file, using the given fieldname.",
{"population-ids"});
try {
parser.ParseCLI(argc, argv);
} catch (args::Help&) {
Expand Down Expand Up @@ -155,6 +160,17 @@ int main(int argc, char** argv) {
restrictRange = grgl::FloatRange(gStart, gEnd);
}

std::map<std::string, std::string> indivIdToPop;
if (populationIds) {
std::vector<std::string> parts = split(*populationIds, ':');
if (parts.size() != 2) {
std::cerr << "Must specify \"filename:fieldname\" for --population-ids" << std::endl;
return 1;
}
indivIdToPop = loadMapFromTSV(parts[0], "sample", parts[1]);
}
std::cout << "loaded " << indivIdToPop.size() << " id->pops\n";

grgl::GRGPtr theGRG;
START_TIMING_OPERATION();
if (ends_with(*infile, ".trees")) {
Expand Down Expand Up @@ -185,7 +201,8 @@ int main(int argc, char** argv) {
binaryMutations,
missingDataHandling == MDH_ADD_TO_GRG,
!noMAFFlip,
lfFilter ? *lfFilter : 0.0);
lfFilter ? *lfFilter : 0.0,
indivIdToPop);
dumpStats(theGRG);
} else {
std::cerr << "Unsupported/undetected filetype for " << *infile << std::endl;
Expand Down
20 changes: 20 additions & 0 deletions src/mut_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ size_t VCFMutationIterator::countMutations() const {
return mutations;
}

std::vector<std::string> VCFMutationIterator::getIndividualIds() { return m_vcf->getIndividualLabels(); }

void VCFMutationIterator::buffer_next(size_t& totalSamples) {
bool foundMutations = false;
while (!foundMutations && m_vcf->hasNextVariant() && m_alreadyLoaded.empty()) {
Expand Down Expand Up @@ -265,6 +267,8 @@ size_t IGDMutationIterator::countMutations() const {
return mutations;
}

std::vector<std::string> IGDMutationIterator::getIndividualIds() { return m_igd->getIndividualIds(); }

void IGDMutationIterator::buffer_next(size_t& totalSamples) {
totalSamples = m_igd->numSamples();

Expand Down Expand Up @@ -398,6 +402,22 @@ size_t BGENMutationIterator::countMutations() const {
return mutations;
}

std::vector<std::string> BGENMutationIterator::getIndividualIds() {
if (bgen_file_contain_samples(m_file)) {
std::vector<std::string> result;
struct bgen_samples* samples = bgen_file_read_samples(m_file);
release_assert(samples != nullptr);
for (size_t i = 0; i < bgen_file_nsamples(m_file); i++) {
const bgen_string* bgStr = bgen_samples_get(samples, i);
release_assert(bgStr != nullptr);
result.emplace_back(bgen_string_data(bgStr));
}
bgen_samples_destroy(samples);
return std::move(result);
}
return {};
}

void BGENMutationIterator::buffer_next(size_t& totalSamples) {
bool foundMutations = false;
while (m_currentVariant < bgen_partition_nvariants(m_partition) && m_alreadyLoaded.empty()) {
Expand Down
Loading
Loading