From 0098a8b52e51afc855ac2e3e75ab12efd9208796 Mon Sep 17 00:00:00 2001 From: Drew DeHaas Date: Wed, 3 Jul 2024 15:19:17 -0400 Subject: [PATCH] Add population support to GRG samples * Add new argument to "grg construct", add unit tests * "grg construct --population-ids " now attaches populations to the data. * Add population ID map support to grgl executable * Set populationID on the sample nodes based on the individual IDs. * Unit tests for the population ID handling in GRG construction and the associated helper functions. * Add individual ID support to MutationIterator * Add individual filtering to gconverter --- CMakeLists.txt | 4 +- include/grgl/mut_iterator.h | 7 +++- pygrgl/clicmd/construct.py | 21 ++++++---- src/build_shape.cpp | 37 +++++++++++++++++- src/build_shape.h | 3 +- src/gconverter.cpp | 66 ++++++++++++++++++++++++++++++-- src/grgl.cpp | 19 ++++++++- src/mut_iterator.cpp | 20 ++++++++++ src/util.h | 54 +++++++++++++++++++++++++- test/unit/test_construct.cpp | 72 +++++++++++++++++++++++++++++++++++ test/unit/test_util.cpp | 38 ++++++++++++++++++ test/unit/testing_utilities.h | 31 +++++++++++++++ 12 files changed, 354 insertions(+), 18 deletions(-) create mode 100644 test/unit/test_construct.cpp create mode 100644 test/unit/test_util.cpp create mode 100644 test/unit/testing_utilities.h diff --git a/CMakeLists.txt b/CMakeLists.txt index d0d5c0e..f5db4a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,11 +64,13 @@ endif() set(GRGL_TEST_SOURCES ${CMAKE_CURRENT_LIST_DIR}/test/unit/test_common.cpp + ${CMAKE_CURRENT_LIST_DIR}/test/unit/test_construct.cpp ${CMAKE_CURRENT_LIST_DIR}/test/unit/test_bloom_filter.cpp ${CMAKE_CURRENT_LIST_DIR}/test/unit/test_grg.cpp ${CMAKE_CURRENT_LIST_DIR}/test/unit/test_main.cpp ${CMAKE_CURRENT_LIST_DIR}/test/unit/test_mutation.cpp ${CMAKE_CURRENT_LIST_DIR}/test/unit/test_serialize.cpp + ${CMAKE_CURRENT_LIST_DIR}/test/unit/test_util.cpp ${CMAKE_CURRENT_LIST_DIR}/test/unit/test_visitor.cpp ) @@ -201,6 +203,6 @@ if(ENABLE_TESTS) # Now simply link against gtest or gtest_main as needed. Eg add_executable(grgl_test ${GRGL_TEST_SOURCES}) - target_link_libraries(grgl_test gtest_main tskit grgl) + target_link_libraries(grgl_test gtest_main tskit grgl z ${BGEN_LIBS}) add_test(NAME grgl_test COMMAND grgl_test) endif() diff --git a/include/grgl/mut_iterator.h b/include/grgl/mut_iterator.h index 801bc6c..2ec2699 100644 --- a/include/grgl/mut_iterator.h +++ b/include/grgl/mut_iterator.h @@ -75,6 +75,8 @@ class MutationIterator { virtual void getMetadata(size_t& ploidy, size_t& numIndividuals, bool& isPhased) = 0; + virtual std::vector getIndividualIds() = 0; + virtual size_t countMutations() const = 0; bool next(MutationAndSamples& mutAndSamples, size_t& totalSamples); @@ -114,6 +116,7 @@ class VCFMutationIterator : public MutationIterator { void getMetadata(size_t& ploidy, size_t& numIndividuals, bool& isPhased) override; size_t countMutations() const override; + std::vector getIndividualIds() override; protected: void buffer_next(size_t& totalSamples) override; @@ -132,8 +135,8 @@ class IGDMutationIterator : public MutationIterator { const char* filename, FloatRange genomeRange, bool binaryMutations, bool emitMissingData, bool flipRefMajor); void getMetadata(size_t& ploidy, size_t& numIndividuals, bool& isPhased) override; - size_t countMutations() const override; + std::vector getIndividualIds() override; protected: void buffer_next(size_t& totalSamples) override; @@ -157,8 +160,8 @@ class BGENMutationIterator : public MutationIterator { ~BGENMutationIterator() override; void getMetadata(size_t& ploidy, size_t& numIndividuals, bool& isPhased) override; - size_t countMutations() const override; + std::vector getIndividualIds() override; protected: void buffer_next(size_t& totalSamples) override; diff --git a/pygrgl/clicmd/construct.py b/pygrgl/clicmd/construct.py index a567efc..15f3c3e 100755 --- a/pygrgl/clicmd/construct.py +++ b/pygrgl/clicmd/construct.py @@ -15,20 +15,23 @@ def add_options(subparser): subparser.add_argument("--parts", "-p", type=int, default=8, help="The number of parts to split the sequence into; defaults to 8") subparser.add_argument("--jobs", "-j", type=int, default=1, - help="Number of jobs (threads/cores) to use. Defaults to 1.") + help="Number of jobs (threads/cores) to use. Defaults to 1.") subparser.add_argument("--trees", "-t", type=int, default=1, - help="Number of trees to use during shape construction. Defaults to 1.") + help="Number of trees to use during shape construction. Defaults to 1.") subparser.add_argument("--binary-muts", "-b", action="store_true", - help="Use binary mutations (don't track specific alternate alleles).") + help="Use binary mutations (don't track specific alternate alleles).") subparser.add_argument("--no-file-cleanup", "-c", action="store_true", - help="Do not cleanup intermediate files (for debugging, e.g.).") + help="Do not cleanup intermediate files (for debugging, e.g.).") subparser.add_argument("--no-maf-flip", action="store_true", - help="Do not switch the reference allele with the major allele") + help="Do not switch the reference allele with the major allele") subparser.add_argument("--shape-lf-filter", "-f", type=float, default=10.0, - help="During shape construction ignore mutations with counts less than this." - "If value is <1.0 then it is treated as a frequency. Defaults to 10 (count).") + help="During shape construction ignore mutations with counts less than this." + "If value is <1.0 then it is treated as a frequency. Defaults to 10 (count).") + subparser.add_argument("--population-ids", default=None, + help="Format: \"filename:fieldname\". Read population ids from the given " + "tab-separate file, using the given fieldname.") subparser.add_argument("--verbose", "-v", action="store_true", - help="Verbose output, including timing information.") + help="Verbose output, including timing information.") grgl_exe = which("grgl") grg_merge_exe = which("grg-merge") @@ -56,6 +59,8 @@ def build_shape(range_triple, args, input_file): command = [grgl_exe, input_file] if args.no_maf_flip: command.append("--no-maf-flip") + if args.population_ids: + command.extend(["--population-ids", args.population_ids]) command.extend(["--lf-filter", str(args.shape_lf_filter)]) command.extend(["-l", "-s", "-r", f"{base}:{base+pspans}", "-o", out_filename_tree(input_file, part, tnum)]) diff --git a/src/build_shape.cpp b/src/build_shape.cpp index 9afe0fc..85ffb60 100644 --- a/src/build_shape.cpp +++ b/src/build_shape.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "grgl/grg.h" @@ -232,7 +233,8 @@ MutableGRGPtr createEmptyGRGFromSamples(const std::string& sampleFile, const bool useBinaryMuts, const bool emitMissingData, const bool flipRefMajor, - const double dropBelowThreshold) { + const double dropBelowThreshold, + const std::map& indivIdToPop) { MutableGRGPtr result; NodeToHapVect hashIndex; std::cout << "Building genotype hash index..." << std::endl; @@ -252,6 +254,39 @@ MutableGRGPtr createEmptyGRGFromSamples(const std::string& sampleFile, std::cout << "Done" << std::endl; result = std::make_shared(hashIndex.size()); + if (!indivIdToPop.empty()) { + size_t ploidy = 0; + size_t numIndividuals = 0; + bool isPhased = false; + mutationIterator->getMetadata(ploidy, numIndividuals, isPhased); + std::vector indivIds = mutationIterator->getIndividualIds(); + release_assert(indivIds.size() == numIndividuals); + std::map popDescriptionMap; + for (NodeID individual = 0; individual < indivIds.size(); individual++) { + const auto& stringId = indivIds[individual]; + const auto& findIt = indivIdToPop.find(stringId); + if (findIt == indivIdToPop.end()) { + std::stringstream ssErr; + ssErr << "Could not find population mapping for individual " << stringId; + throw std::runtime_error(ssErr.str()); + } + const auto& popDescription = findIt->second; + const size_t nextPopId = popDescriptionMap.size(); + const auto& findPopIt = popDescriptionMap.emplace(popDescription, nextPopId); + const auto popId = findPopIt.first->second; + if (findPopIt.second) { + release_assert(popId == nextPopId); + result->addPopulation(popDescription); + } else { + release_assert(popId != nextPopId); + } + for (NodeID offset = 0; offset < ploidy; offset++) { + const NodeID sampleId = (individual * ploidy) + offset; + release_assert(sampleId < result->numSamples()); + result->getNodeData(sampleId).populationId = popId; + } + } + } std::cout << "Adding GRG shape from genotype hashes..." << std::endl; const bool buildBinaryTrees = true; addGrgShapeFromHashing(result, hashIndex, result->getSampleNodes(), buildBinaryTrees); diff --git a/src/build_shape.h b/src/build_shape.h index 94d9900..ef4436c 100644 --- a/src/build_shape.h +++ b/src/build_shape.h @@ -23,7 +23,8 @@ MutableGRGPtr createEmptyGRGFromSamples(const std::string& sampleFile, bool useBinaryMuts, bool emitMissingData, bool flipRefMajor, - double dropBelowThreshold); + double dropBelowThreshold, + const std::map& indivIdToPop); } diff --git a/src/gconverter.cpp b/src/gconverter.cpp index dc1f689..238ef39 100644 --- a/src/gconverter.cpp +++ b/src/gconverter.cpp @@ -32,6 +32,26 @@ #include "picovcf.hpp" #include "util.h" +static grgl::NodeIDList trimIndividuals(grgl::NodeIDList sampleList, + const size_t ploidy, + const std::unordered_map& keepIndividuals) { + if (!keepIndividuals.empty()) { + grgl::NodeIDList newList; + newList.reserve(keepIndividuals.size() * ploidy); + for (auto sampleId : sampleList) { + const auto extra = sampleId % ploidy; + const auto indivId = sampleId / ploidy; + const auto& findIt = keepIndividuals.find(indivId); + if (findIt != keepIndividuals.end()) { + grgl::NodeID newID = (findIt->second * ploidy) + extra; + newList.emplace_back(newID); + } + } + return std::move(newList); + } + return std::move(sampleList); +} + static void trim(grgl::NodeIDList& sampleList, size_t newNumSamples) { if (0 != newNumSamples) { size_t trimTo = sampleList.size(); @@ -90,6 +110,7 @@ static void mutationIteratorToIGD(const std::string& inFilename, const double fpPerVariant = 0, const double fnPerVariant = 0, const size_t trimToSamples = 0, + std::string keepIndividualFile = "", bool verbose = true) { constexpr size_t EMIT_EVERY = 10000; @@ -114,7 +135,39 @@ static void mutationIteratorToIGD(const std::string& inFilename, const size_t numSamples = numIndividuals * ploidy; release_assert(trimToSamples == 0 || (trimToSamples % ploidy == 0)); - const size_t effectiveSamples = (0 == trimToSamples) ? numSamples : trimToSamples; + size_t effectiveSamples = (0 == trimToSamples) ? numSamples : trimToSamples; + + std::unordered_map keepIndividuals; + auto individualIds = iterator->getIndividualIds(); + if (!keepIndividualFile.empty()) { + if (individualIds.empty()) { + throw grgl::BadInputFileFailure("Individual filtering requires the input file to contain individual IDs"); + } + std::map idToNodeId; + for (size_t i = 0; i < individualIds.size(); i++) { + idToNodeId.emplace(individualIds[i], i); + } + std::vector newIndividualIds; + std::ifstream filterFile(keepIndividualFile); + std::string indivId; + size_t count = 0; + while (std::getline(filterFile, indivId)) { + release_assert(!indivId.empty()); + const auto& findIt = idToNodeId.find(indivId); + if (findIt == idToNodeId.end()) { + std::stringstream ssErr; + ssErr << "Could not find individual with id " << indivId; + throw grgl::BadInputFileFailure(ssErr.str().c_str()); + } + keepIndividuals.emplace(findIt->second, count++); + newIndividualIds.push_back(indivId); + } + individualIds = std::move(newIndividualIds); + size_t newSamples = individualIds.size() * ploidy; + if (newSamples < effectiveSamples) { + effectiveSamples = newSamples; + } + } std::ofstream outFile(outFilename, std::ios::binary); picovcf::IGDWriter writer(ploidy, effectiveSamples / ploidy, isPhased); @@ -134,6 +187,7 @@ static void mutationIteratorToIGD(const std::string& inFilename, missingCount += mutAndSamples.samples.size(); } trim(mutAndSamples.samples, trimToSamples); + mutAndSamples.samples = trimIndividuals(std::move(mutAndSamples.samples), ploidy, keepIndividuals); if (fpPerVariant + fnPerVariant > 0) { const double fp = fpPerVariant + fpLeftovers; const size_t fpThisVariant = (size_t)fp; @@ -156,7 +210,7 @@ static void mutationIteratorToIGD(const std::string& inFilename, } writer.writeIndex(outFile); writer.writeVariantInfo(outFile); - writer.writeIndividualIds(outFile, {}); + writer.writeIndividualIds(outFile, individualIds); outFile.seekp(0); writer.writeHeader(outFile, inFilename, ""); } @@ -179,6 +233,11 @@ int main(int argc, char* argv[]) { "genomeRange", "Only convert for the given genome range: 'x:y' means [x, y) (x inclusive, y exclusive)", {'r', "range"}); + args::ValueFlag keepIndivs( + parser, + "keepIndivs", + "Only retain the individuals with the IDs given in this file (one ID per line).", + {'i', "keep-indivs"}); try { parser.ParseCLI(argc, argv); } catch (args::Help&) { @@ -231,6 +290,7 @@ int main(int argc, char* argv[]) { restrictRange, falseNegPerVariant ? *falseNegPerVariant : 0, falsePosPerVariant ? *falsePosPerVariant : 0, - trimTo ? *trimTo : 0); + trimTo ? *trimTo : 0, + keepIndivs ? *keepIndivs : ""); return 0; } diff --git a/src/grgl.cpp b/src/grgl.cpp index af0cf16..85c0c5e 100644 --- a/src/grgl.cpp +++ b/src/grgl.cpp @@ -99,6 +99,11 @@ int main(int argc, char** argv) { "ts-node-times", "When converting tree-seq, use node times instead of mutation times", {"ts-node-times"}); + args::ValueFlag populationIds(parser, + "population-ids", + "Format: \"filename:fieldname\". Read population ids from the given " + "tab-separate file, using the given fieldname.", + {"population-ids"}); try { parser.ParseCLI(argc, argv); } catch (args::Help&) { @@ -155,6 +160,17 @@ int main(int argc, char** argv) { restrictRange = grgl::FloatRange(gStart, gEnd); } + std::map indivIdToPop; + if (populationIds) { + std::vector parts = split(*populationIds, ':'); + if (parts.size() != 2) { + std::cerr << "Must specify \"filename:fieldname\" for --population-ids" << std::endl; + return 1; + } + indivIdToPop = loadMapFromTSV(parts[0], "sample", parts[1]); + } + std::cout << "loaded " << indivIdToPop.size() << " id->pops\n"; + grgl::GRGPtr theGRG; START_TIMING_OPERATION(); if (ends_with(*infile, ".trees")) { @@ -185,7 +201,8 @@ int main(int argc, char** argv) { binaryMutations, missingDataHandling == MDH_ADD_TO_GRG, !noMAFFlip, - lfFilter ? *lfFilter : 0.0); + lfFilter ? *lfFilter : 0.0, + indivIdToPop); dumpStats(theGRG); } else { std::cerr << "Unsupported/undetected filetype for " << *infile << std::endl; diff --git a/src/mut_iterator.cpp b/src/mut_iterator.cpp index 06ccfef..4e12007 100644 --- a/src/mut_iterator.cpp +++ b/src/mut_iterator.cpp @@ -174,6 +174,8 @@ size_t VCFMutationIterator::countMutations() const { return mutations; } +std::vector VCFMutationIterator::getIndividualIds() { return m_vcf->getIndividualLabels(); } + void VCFMutationIterator::buffer_next(size_t& totalSamples) { bool foundMutations = false; while (!foundMutations && m_vcf->hasNextVariant() && m_alreadyLoaded.empty()) { @@ -265,6 +267,8 @@ size_t IGDMutationIterator::countMutations() const { return mutations; } +std::vector IGDMutationIterator::getIndividualIds() { return m_igd->getIndividualIds(); } + void IGDMutationIterator::buffer_next(size_t& totalSamples) { totalSamples = m_igd->numSamples(); @@ -398,6 +402,22 @@ size_t BGENMutationIterator::countMutations() const { return mutations; } +std::vector BGENMutationIterator::getIndividualIds() { + if (bgen_file_contain_samples(m_file)) { + std::vector result; + struct bgen_samples* samples = bgen_file_read_samples(m_file); + release_assert(samples != nullptr); + for (size_t i = 0; i < bgen_file_nsamples(m_file); i++) { + const bgen_string* bgStr = bgen_samples_get(samples, i); + release_assert(bgStr != nullptr); + result.emplace_back(bgen_string_data(bgStr)); + } + bgen_samples_destroy(samples); + return std::move(result); + } + return {}; +} + void BGENMutationIterator::buffer_next(size_t& totalSamples) { bool foundMutations = false; while (m_currentVariant < bgen_partition_nvariants(m_partition) && m_alreadyLoaded.empty()) { diff --git a/src/util.h b/src/util.h index f775a42..b705000 100644 --- a/src/util.h +++ b/src/util.h @@ -11,6 +11,8 @@ #include #include #include +#include +#include #define release_assert(condition) do { \ if (!(condition)) { \ @@ -38,7 +40,7 @@ inline void split(const std::string &s, char delim, Out result) { inline std::vector split(const std::string &s, char delim) { std::vector elems; split(s, delim, std::back_inserter(elems)); - return elems; + return std::move(elems); } template @@ -130,5 +132,55 @@ inline grgl::NodeIDList loadNodeIDs(const std::string& filename) { return std::move(result); } +/** + * Helper to convert data from a tab-separate file into a string->string map. + */ +inline std::map loadMapFromTSV( + const std::string& filename, + const std::string& lhsField, + const std::string& rhsField) { + release_assert(lhsField != rhsField); + std::ifstream infile(filename); + std::string line; + size_t numCols = 0; + size_t lineNum = 0; + size_t lhsIndex = std::numeric_limits::max(); + size_t rhsIndex = std::numeric_limits::max(); + std::map result; + while (std::getline(infile, line)) { + auto tokens = split(line, '\t'); + if (lineNum == 0) { + numCols = tokens.size(); + for (size_t i = 0; i < tokens.size(); i++) { + if (tokens[i] == lhsField) { + lhsIndex = i; + } + if (tokens[i] == rhsField) { + rhsIndex = i; + } + } + std::stringstream ssErr; + if (lhsIndex >= numCols) { + ssErr << "Could not find TSV header named \"" << lhsField << "\". "; + } + if (rhsIndex >= numCols) { + ssErr << "Could not find TSV header named \"" << rhsField << "\". "; + } + auto errorString = ssErr.str(); + if (!errorString.empty()) { + throw std::runtime_error(errorString); + } + } else { + if (numCols != tokens.size()) { + std::stringstream ssErr; + ssErr << "Malformed TSV file: wrong number of columns at line " << lineNum; + throw std::runtime_error(ssErr.str()); + } + result.emplace(tokens.at(lhsIndex), tokens.at(rhsIndex)); + } + lineNum++; + } + return std::move(result); +} #endif /* GRGL_UTIL_H */ diff --git a/test/unit/test_construct.cpp b/test/unit/test_construct.cpp new file mode 100644 index 0000000..4747e10 --- /dev/null +++ b/test/unit/test_construct.cpp @@ -0,0 +1,72 @@ +#include +#include +#include + +#include "grgl/grg.h" +#include "build_shape.h" +#include "testing_utilities.h" + +using namespace grgl; + +inline std::string randomVcfLine(size_t position, size_t numIndivs) { + static std::random_device randDevice; + static std::mt19937 generator(randDevice()); + + std::uniform_int_distribution fnSampler(0, 3); + std::stringstream ss; + ss << "1\t" << position << "\tV1\tA\tC\t.\tPASS\t.\tGT"; + for (size_t i = 0; i < numIndivs; i++) { + size_t allele = fnSampler(generator); + ss << "\t" << (allele & 0x1U) << "|" << ((allele & 0x2U) >> 1U); + } + ss << "\n"; + return ss.str(); +} + +TEST(Construct, WithPopIds) { + std::string vcfHeaderString = + "##fileformat=VCFv4.2\n" + "##source=TEST\n" + "##FORMAT=\n" + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tX1\tX2\tX3\tA1\tA2\tB4\tZ1\n" + ; + const size_t numIndividuals = 7; + std::stringstream vcfFileSS; + vcfFileSS << vcfHeaderString; + vcfFileSS << randomVcfLine(60000, numIndividuals); + vcfFileSS << randomVcfLine(60001, numIndividuals); + vcfFileSS << randomVcfLine(60002, numIndividuals); + vcfFileSS << randomVcfLine(60003, numIndividuals); + vcfFileSS << randomVcfLine(60004, numIndividuals); + vcfFileSS << randomVcfLine(60005, numIndividuals); + vcfFileSS << randomVcfLine(60006, numIndividuals); + auto filename = writeTempFile(vcfFileSS.str(), ".vcf"); + std::map indivIdToPop; + FloatRange fullRange; + + // Test1: Incomplete individual -> population map + indivIdToPop.emplace("Z1", "Population2"); + indivIdToPop.emplace("X1", "Population4"); + EXPECT_THROW(createEmptyGRGFromSamples(filename, fullRange, 8, false, false, false, 0.0, + indivIdToPop), std::runtime_error); + + // Test2: Complete individual -> population map + indivIdToPop.emplace("A2", "Population3"); + indivIdToPop.emplace("X2", "Population1"); + indivIdToPop.emplace("B4", "Population3"); + indivIdToPop.emplace("A1", "Population1"); + indivIdToPop.emplace("X3", "Population4"); + auto grg = createEmptyGRGFromSamples(filename, fullRange, 8, false, false, false, 0.0, + indivIdToPop); + ASSERT_EQ(grg->numSamples(), numIndividuals*2); + auto popDescriptions = grg->getPopulations(); + ASSERT_EQ(popDescriptions.size(), 4); + ASSERT_EQ(popDescriptions[0], "Population4"); // Based on order of individuals in file + // Test a bunch of sample nodes + ASSERT_EQ(popDescriptions.at(grg->getNodeData(0).populationId), "Population4"); + ASSERT_EQ(popDescriptions.at(grg->getNodeData(12).populationId), "Population2"); + ASSERT_EQ(popDescriptions.at(grg->getNodeData(13).populationId), "Population2"); + ASSERT_EQ(popDescriptions.at(grg->getNodeData(8).populationId), "Population3"); + + remove_file(filename); +} diff --git a/test/unit/test_util.cpp b/test/unit/test_util.cpp new file mode 100644 index 0000000..b770e0b --- /dev/null +++ b/test/unit/test_util.cpp @@ -0,0 +1,38 @@ +#include +#include + +#include "util.h" +#include "testing_utilities.h" + +TEST(Util, MapFromTSVGood) { + std::string testString = + "sample\tpop\tsuper_pop\trandom\n" + "A\tPOP1\tSPOP5\t2342830492\n" + "B\tPOP2\tSPOP3\tksdjflksjdf\n" + ; + auto filename = writeTempFile(testString); + auto resultMap = loadMapFromTSV(filename, "sample", "pop"); + ASSERT_EQ(resultMap.size(), 2); + ASSERT_NE(resultMap.find("A"), resultMap.end()); + ASSERT_NE(resultMap.find("B"), resultMap.end()); + ASSERT_EQ(resultMap.find("A")->second, "POP1"); + ASSERT_EQ(resultMap.find("B")->second, "POP2"); + + auto otherResultMap = loadMapFromTSV(filename, "random", "super_pop"); + ASSERT_EQ(otherResultMap.size(), 2); + ASSERT_EQ(otherResultMap.find("ksdjflksjdf")->second, "SPOP3"); + ASSERT_EQ(otherResultMap.find("2342830492")->second, "SPOP5"); + + remove_file(filename); +} + +TEST(Util, MapFromTSVBad) { + std::string testString = + "sample\tpop\tsuper_pop\trandom\n" + "A\tPOP1\tSPOP5\t2342830492\n" + "B\tPOP2\tksdjflksjdf\n" + ; + auto filename = writeTempFile(testString); + EXPECT_THROW(loadMapFromTSV(filename, "sample", "pop"), std::runtime_error); + remove_file(filename); +} diff --git a/test/unit/testing_utilities.h b/test/unit/testing_utilities.h new file mode 100644 index 0000000..74a3422 --- /dev/null +++ b/test/unit/testing_utilities.h @@ -0,0 +1,31 @@ +#ifndef GRGL_TESTING_UTILITIES_H +#define GRGL_TESTING_UTILITIES_H + +#include +#include +#include + +static inline std::string writeTempFile(const std::string& contents, std::string ext = "") { + constexpr size_t MAX_PATH = 256; + char filename[MAX_PATH]; + strcpy(filename, "grgl_test_XXXXXX"); + int fd = mkstemp(filename); + release_assert(fd > 0); + release_assert(contents.size() == write(fd, contents.c_str(), contents.size())); + fsync(fd); + close(fd); + + if (!ext.empty()) { + std::stringstream ss; + ss << filename << ext; + rename(filename, ss.str().c_str()); + return ss.str(); + } + return std::string(filename); +} + +static inline void remove_file(const std::string& filename) { + unlink(filename.c_str()); +} + +#endif /* GRGL_TESTING_UTILITIES_H */ \ No newline at end of file