diff --git a/.gitignore b/.gitignore index 0a0ce28..75c46b9 100644 --- a/.gitignore +++ b/.gitignore @@ -7,8 +7,8 @@ __pycache__ build.log *.so *.grg -gconverter -gindexer -grg-merge -grgl -grgp +/gconverter +/gindexer +/grg-merge +/grgl +/grgp diff --git a/include/grgl/common.h b/include/grgl/common.h new file mode 100644 index 0000000..df9bfe5 --- /dev/null +++ b/include/grgl/common.h @@ -0,0 +1,119 @@ +/* Genotype Representation Graph Library (GRGL) + * Copyright (C) 2024 April Wei + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * should have received a copy of the GNU General Public License + * with this program. If not, see . + */ +#ifndef GRGL_COMMON_H +#define GRGL_COMMON_H + +#include +#include +#include +#include +#include + +namespace grgl { + +/** + * Exception thrown when the GRG API is used incorrectly (bad parameters, incorrect preconditions, etc.). + */ +class ApiMisuseFailure : public std::runtime_error { +public: + explicit ApiMisuseFailure(char const* const message) + : std::runtime_error(message) {} +}; + +/** + * Exception thrown when the input file does not meet conditions needed for GRG. + */ +class BadInputFileFailure : public std::runtime_error { +public: + explicit BadInputFileFailure(char const* const message) + : std::runtime_error(message) {} +}; + +class FloatRange { +public: + static constexpr double RANGE_UNSPECIFIED = std::numeric_limits::quiet_NaN(); + + FloatRange() + : m_start(RANGE_UNSPECIFIED), + m_end(RANGE_UNSPECIFIED) {} + + FloatRange(const double start, const double end) + : m_start(start), + m_end(end) {} + + ~FloatRange() = default; + FloatRange(const FloatRange&) = default; + FloatRange(FloatRange&&) = default; + FloatRange& operator=(const FloatRange&) = default; + FloatRange& operator=(FloatRange&&) = default; + + double start() const { return m_start; } + + double end() const { return m_end; } + + inline bool isUnspecified() const { return std::isnan(m_start) || std::isnan(m_end); } + + bool isNormalized() const { return !isUnspecified() && m_end <= 1.0; } + + FloatRange normalized(size_t absStart, size_t absEnd) const { + if (isUnspecified()) { + return {}; + } + // The +1 is because our range checking is all (inclusive, exclusive], but + // we need our upper range to be 1.0, so we make sure that the max value is + // <1.0 + const size_t span = (absEnd - absStart) + 1; + if (!isNormalized() && span > 0) { + return {(m_start - (double)absStart) / (double)span, (m_end - (double)absStart) / (double)span}; + } + return *this; + } + + FloatRange denormalized(size_t absStart, size_t absEnd) const { + if (isUnspecified()) { + return {}; + } + if (isNormalized()) { + const size_t span = (absEnd - absStart) + 1; + return {(m_start * (double)span) + (double)absStart, (m_end * (double)span) + (double)absStart}; + } + return *this; + } + + bool contains(double position) const { + assert(isUnspecified() || m_end > 1.0 || position <= 1.0); + // An unspecified range includes everything. + return isUnspecified() || (m_start <= position && m_end > position); + } + +private: + double m_start; + double m_end; +}; + +inline bool operator==(const FloatRange& a, const FloatRange& b) { + return (a.isUnspecified() && b.isUnspecified()) || (a.start() == b.start() && a.end() == b.end()); +} + +// Ah, magic: https://www.boost.org/doc/libs/1_55_0/doc/html/hash/reference.html#header.boost.functional.hash_hpp +inline std::size_t hash_combine(std::size_t hash1, std::size_t hash2) { + return hash1 ^ (hash2 + 0x9e3779b9 + (hash1 << 6U) + (hash1 >> 2U)); +} + +} // namespace grgl + +#endif /* GRGL_COMMON_H */ diff --git a/include/grgl/grg.h b/include/grgl/grg.h new file mode 100644 index 0000000..06e22d0 --- /dev/null +++ b/include/grgl/grg.h @@ -0,0 +1,402 @@ +/* Genotype Representation Graph Library (GRGL) + * Copyright (C) 2024 April Wei + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * should have received a copy of the GNU General Public License + * with this program. If not, see . + */ +#ifndef GRG_H +#define GRG_H + +#include +#include +#include +#include + +#include "grgnode.h" +#include "mutation.h" +#include "visitor.h" + +namespace grgl { + +/** + * Abstract GRG base class. + */ +class GRG : public std::enable_shared_from_this { +public: + class NodeListIterator { + public: + using VectIter = std::vector::const_iterator; + + NodeListIterator(VectIter begin, VectIter end) + : m_begin(begin), + m_end(end) {} + + VectIter begin() const { return m_begin; } + VectIter end() const { return m_end; } + + private: + VectIter m_begin; + VectIter m_end; + }; + + enum { + DEFAULT_NODE_CAPACITY = 1024, + }; + + explicit GRG(size_t numSamples) + : m_numSamples(numSamples) {} + + virtual ~GRG() = default; + GRG(const GRG&) = delete; + GRG(GRG&&) = delete; + GRG& operator=(const GRG&) = delete; + GRG& operator=(GRG&&) = delete; + + // Don't compare GRGs. Use equivalentGRGs if needed. + bool operator==(const GRG& rhs) const = delete; + + /** + * Is the given nodeID associated with a sample node? + * + * The first numSamples node IDs are reserved for samples, so you can also just use + * `nodeId < grg.numSamples()`. + * + * @param[in] nodeId The node ID to check. + */ + bool isSample(const NodeID nodeId) const { return nodeId < this->m_numSamples; } + + size_t numSamples() const { return m_numSamples; } + + virtual bool nodesAreOrdered() const = 0; + virtual size_t numNodes() const = 0; + virtual size_t numEdges() const = 0; + virtual size_t numUpEdges(NodeID nodeId) const = 0; + virtual size_t numDownEdges(NodeID nodeId) const = 0; + virtual NodeListIterator getDownEdges(NodeID nodeId) const = 0; + virtual NodeListIterator getUpEdges(NodeID nodeId) const = 0; + virtual NodeData& getNodeData(NodeID nodeId) = 0; + + NodeIDList getSampleNodes() const { + NodeIDList result; + for (grgl::NodeID i = 0; i < this->numSamples(); i++) { + result.push_back(i); + } + return result; + } + + NodeIDList getRootNodes() const { + NodeIDList result; + for (grgl::NodeID nodeId = 0; nodeId < this->numNodes(); nodeId++) { + if (this->numUpEdges(nodeId) == 0) { + result.push_back(nodeId); + } + } + return result; + } + + const std::vector& getMutations() const { return m_mutations; } + + size_t numMutations() const { return m_mutations.size(); } + + const std::multimap& getNodeMutationPairs() const { return m_nodeToMutations; } + + const Mutation& getMutationById(MutationId mutId) const { return m_mutations.at(mutId); } + + void setMutationById(MutationId mutId, Mutation mutation) { m_mutations.at(mutId) = std::move(mutation); } + + std::vector getUnmappedMutations() const { return getMutationsForNode(INVALID_NODE_ID); } + + std::vector getMutationsForNode(const NodeID nodeId) const { + std::vector result; + auto findRange = m_nodeToMutations.equal_range(nodeId); + for (auto it = findRange.first; it != findRange.second; it++) { + result.emplace_back(it->second); + } + return result; + } + + bool nodeHasMutations(const NodeID nodeId) const { + auto findIt = m_nodeToMutations.find(nodeId); + return findIt != m_nodeToMutations.end(); + } + + /** + * Visit nodes breadth-first, starting at the given nodes and following up or + * down edges. + * + * @param[in] visitor The visitor that will be called back. Typically owns any state + * that is being computed. + * @param[in] direction Whether to go up or down the graph. + * @param[in] seedList The nodes to start traversal from. + * @param[in] maxQueueWidth The maximum width of the queue; restricts the number of + * end-to-end paths that will be visited. + */ + void visitBfs(GRGVisitor& visitor, + TraversalDirection direction, + const NodeIDList& seedList, + ssize_t maxQueueWidth = -1) const; + + /** + * Visit nodes depth-first, starting at the given nodes and following up or + * down edges. + * + * @param[in] visitor The visitor that will be called back. Typically owns any state + * that is being computed. + * @param[in] direction Whether to go up or down the graph. + * @param[in] seedList The nodes to start traversal from. + * @param[in] maxPathDepth The maximum depth of any path that will be explored. + * Traversal stops when this is reached. Typically only useful when forwardOnly + * is true. + * @param[in] forwardOnly Typically depth-first search reaches a given node, computes + * (recursively) the values for all successors, and then computes the current + * node value. This causes each node to be visited twice. However, setting + * forwardOnly will only visit nodes in the forward direction. It also causes + * nodes to be visited an arbitrary number of times. + */ + void visitDfs(GRGVisitor& visitor, + TraversalDirection direction, + const NodeIDList& seedList, + bool forwardOnly = false) const; + + virtual std::vector topologicalSort(TraversalDirection direction) const = 0; + + virtual void visitTopo(GRGVisitor& visitor, + TraversalDirection direction, + const NodeIDList& seedList, + const std::vector* sortOrder = nullptr) const = 0; + + /** + * Add a population to the GRG. + */ + size_t addPopulation(std::string populationDescription) { + const auto popId = m_populations.size(); + m_populations.push_back(std::move(populationDescription)); + return popId; + } + + /** + * The populations (their descriptions) represented by this GRG. + */ + const std::vector& getPopulations() const { return m_populations; } + + MutationId addMutation(const Mutation& mutation, NodeID nodeId); + +protected: + // The position is the mutation ID. + std::vector m_mutations; + // Each node can have multiple mutations. To get the nodes for a particular mutation you have + // to iterate the entire list. Client code should construct its own reverse map if needed. + std::multimap m_nodeToMutations; + + // (Optional) list of population descriptions. The position corresponds to the population + // ID, which can be used to tag nodes. + std::vector m_populations; + + const size_t m_numSamples; +}; + +using GRGPtr = std::shared_ptr; +using ConstGRGPtr = std::shared_ptr; + +class MutableGRG : public GRG { +public: + /** + * Construct a GRG for a given number of samples. + * + * @param[in] The number of samples that will be used to construct the graph. + * @param[in] (Optional) the initial capacity of the node vector. If you know in advance roughly + * how many nodes will be created this can improve performance. + */ + explicit MutableGRG(size_t numSamples, size_t initialNodeCapacity = DEFAULT_NODE_CAPACITY) + : GRG(numSamples) { + if (initialNodeCapacity < numSamples) { + initialNodeCapacity = numSamples * 2; + } + this->m_nodes.reserve(initialNodeCapacity); + this->makeNode(numSamples); + } + + bool nodesAreOrdered() const override { return false; } + + size_t numNodes() const override { return m_nodes.size(); } + + size_t numEdges() const override { + size_t edgeCount = 0; + for (const auto& node : m_nodes) { + edgeCount += node->getDownEdges().size(); + } + return edgeCount; + } + + size_t numDownEdges(NodeID nodeId) const override { return m_nodes.at(nodeId)->getDownEdges().size(); } + + size_t numUpEdges(NodeID nodeId) const override { return m_nodes.at(nodeId)->getUpEdges().size(); } + + NodeListIterator getDownEdges(NodeID nodeId) const override { + const NodeIDList& edges = m_nodes.at(nodeId)->getDownEdges(); + return {edges.begin(), edges.end()}; + } + + NodeListIterator getUpEdges(NodeID nodeId) const override { + const NodeIDList& edges = m_nodes.at(nodeId)->getUpEdges(); + return {edges.begin(), edges.end()}; + } + + NodeData& getNodeData(NodeID nodeId) override { return m_nodes.at(nodeId)->getNodeData(); } + + /** + * Create a new node in the GRG. + * + * This is the only valid way to construct a GRG node. When you create a GRG you specify the number + * of sample nodes, and those nodes are created right away. Each call to `makeNode()` after that will + * generate sequential-ID nodes. Use `connect()` to connect the newly created node to other nodes. + */ + NodeID makeNode(const size_t count = 1) { + const auto nextId = this->m_nodes.size(); + for (size_t i = 0; i < count; i++) { + this->m_nodes.push_back(std::make_shared()); + } + return nextId; + } + + /** + * Create a graph edge from one node (the source) to another node (the target). + * + * In GRG, "source node" always refers to the node "above" the other node. I.e., this function constructs + * a down edge from source to target, and an up edge from target to source. In ARG parlance, the source + * node is the parent and the target is the child. + * + * @param[in] srcId The ID of the source node. + * @param[in] tgtId The ID of the target node. + */ + void connect(NodeID srcId, NodeID tgtId); + + /** + * Remove a graph edge between two nodes. + * + * The source and target nodes have the same meaning as in connect(). + * + * @param[in] srcId The ID of the source node. + * @param[in] tgtId The ID of the target node. + */ + void disconnect(NodeID srcId, NodeID tgtId); + + /** + * Merge another GRG into this one. Only succeeds if both GRGs have the same number of + * samples. + * + * This assumes that the GRGs were constructed from the same sampleset -- e.g., they + * could be constructed from two subsets of the same sampleset (as long as both were + * constructed with the same sample node numbering) or from a subset of mutations against + * the same sampleset. + * + * @param otherGrgFiles The list of GRG filenames to load and merge. + */ + void merge(const std::list& otherGrgFiles, bool combineNodes = true); + + /** + * Retrieve a node by ID. + * + * @param[in] nodeId The ID of the node in question. + */ + GRGNodePtr getNode(const NodeID nodeId) const { + const NodeID checkId = removeMarks(nodeId); + return this->m_nodes.at(checkId); + } + + std::vector topologicalSort(TraversalDirection direction) const override; + + void visitTopo(GRGVisitor& visitor, + TraversalDirection direction, + const NodeIDList& seedList, + const std::vector* sortOrder = nullptr) const override; + + /** + * Compact the edge vectors in the GRG. If done infrequently won't affect the amortized edge addition + * cost but will reduce overall RAM usage by a non-trivial amount. + */ + void compact(NodeID nodeId = INVALID_NODE_ID); + +private: + // The list of nodes. The node's position in this vector must match its ID. + std::vector m_nodes; +}; + +using MutableGRGPtr = std::shared_ptr; +using ConstMutableGRGPtr = std::shared_ptr; + +class CSRGRG : public GRG { +public: + explicit CSRGRG(size_t numSamples, size_t edgeCount, size_t nodeCount) + : GRG(numSamples), + m_downEdges(edgeCount), + m_upEdges(edgeCount), + m_downPositions(nodeCount + 2), + m_upPositions(nodeCount + 2), + m_nodeData(nodeCount) {} + + void finalize() { + // Just a convenience: the last slot points to the end of the edge vector. + m_downPositions[m_downPositions.size() - 1] = m_downEdges.size(); + m_upPositions[m_upPositions.size() - 1] = m_upEdges.size(); + } + + bool nodesAreOrdered() const override { return true; } + + size_t numNodes() const override { return m_downPositions.size() - 2; } + + size_t numEdges() const override { return m_downEdges.size(); } + + size_t numDownEdges(NodeID nodeId) const override { + return m_downPositions.at(nodeId + 2) - m_downPositions[nodeId + 1]; + } + + size_t numUpEdges(NodeID nodeId) const override { return m_upPositions.at(nodeId + 2) - m_upPositions[nodeId + 1]; } + + NodeListIterator getDownEdges(NodeID nodeId) const override { + const NodeIDSizeT end = m_downPositions.at(nodeId + 2); + const NodeIDSizeT start = m_downPositions[nodeId + 1]; + return {m_downEdges.begin() + start, m_downEdges.begin() + end}; + } + + NodeListIterator getUpEdges(NodeID nodeId) const override { + const NodeIDSizeT end = m_upPositions.at(nodeId + 2); + const NodeIDSizeT start = m_upPositions[nodeId + 1]; + return {m_upEdges.begin() + start, m_upEdges.begin() + end}; + } + + NodeData& getNodeData(NodeID nodeId) override { return m_nodeData.at(nodeId); } + + std::vector topologicalSort(TraversalDirection direction) const override; + + void visitTopo(GRGVisitor& visitor, + TraversalDirection direction, + const NodeIDList& seedList, + const std::vector* sortOrder = nullptr) const override; + +private: + std::vector m_downPositions; + NodeIDList m_downEdges; + std::vector m_upPositions; + NodeIDList m_upEdges; + std::vector m_nodeData; + + friend GRGPtr readImmutableGrg(std::istream& inStream, bool loadUpEdges); +}; + +using CSRGRGPtr = std::shared_ptr; +using ConstCSRGRGPtr = std::shared_ptr; + +} // namespace grgl + +#endif /* GRG_H */ diff --git a/include/grgl/grgnode.h b/include/grgl/grgnode.h new file mode 100644 index 0000000..0e997ab --- /dev/null +++ b/include/grgl/grgnode.h @@ -0,0 +1,184 @@ +/* Genotype Representation Graph Library (GRGL) + * Copyright (C) 2024 April Wei + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * should have received a copy of the GNU General Public License + * with this program. If not, see . + */ +#ifndef GRG_NODE_H +#define GRG_NODE_H + +#include +#include +#include +#include +#include +#include + +#include "mutation.h" + +#define PERFORM_DUP_EDGE_CHECKS 0 + +#if PERFORM_DUP_EDGE_CHECKS +#include "util.h" +#include + +#define CHECK_DUP_EDGES(edge_list) \ + do { \ + std::sort((edge_list).begin(), (edge_list.end())); \ + const size_t origSize = (edge_list).size(); \ + (edge_list).erase(std::unique((edge_list).begin(), (edge_list).end()), (edge_list).end()); \ + release_assert(origSize == (edge_list).size()); \ + } while (0) +#else +#define CHECK_DUP_EDGES(edge_list) +#endif + +namespace grgl { + +using PopulationID = uint16_t; +constexpr PopulationID POPULATION_UNSPECIFIED = 0xffff; + +#ifdef COMPACT_NODE_IDS +using NodeID = uint32_t; +using NodeIDSizeT = uint32_t; + +// We support about 250 million nodes. +#define MAX_GRG_NODES (0x3fffffffU - 1) +#define INVALID_NODE_ID (0x3fffffffU) +// The upper 4 bits are available for flags. +#define GRG_NODE_FLAG_MASK (0xc0000000U) + +enum NodeMark { + NODE_MARK_1 = 0x40000000U, + NODE_MARK_2 = 0x80000000U, +}; +#else +using NodeID = uint64_t; +using NodeIDSizeT = uint64_t; + +// We support about trillion nodes. +#define MAX_GRG_NODES (0x000000ffffffffff - 1) +#define INVALID_NODE_ID (0x000000ffffffffff) +// The upper 24 bits are available for flags. +#define GRG_NODE_FLAG_MASK (0xffffff0000000000) + +enum NodeMark { + NODE_MARK_1 = 0x0000010000000000, + NODE_MARK_2 = 0x0000020000000000, + NODE_MARK_3 = 0x0000040000000000, + NODE_MARK_4 = 0x0000080000000000, +}; +#endif + +static_assert((INVALID_NODE_ID & GRG_NODE_FLAG_MASK) == 0, "Invalid masks"); + +using NodeIDList = std::vector; +using NodeIDSetOrdered = std::set; +#ifdef USE_TREE_SETS +using NodeIDSet = NodeIDSetOrdered; +#else +using NodeIDSet = std::unordered_set; +#endif + +inline NodeID markNodeId(const NodeID nodeId, NodeMark markNum, bool value) { + if (value) { + return nodeId | markNum; + } + return nodeId & ~static_cast(markNum); +} + +inline bool hasMark(const NodeID nodeId, NodeMark markNum) { return (nodeId & markNum) > 0; } + +inline NodeID removeMarks(const NodeID nodeId) { return nodeId & (~GRG_NODE_FLAG_MASK); } + +class MutableGRG; + +class GRGNode; +using GRGNodePtr = std::shared_ptr; + +/** + * Container for information that can be attached to graph nodes. + */ +struct NodeData { + NodeData() = default; + explicit NodeData(const PopulationID popId) + : populationId(popId) {} + + PopulationID populationId{POPULATION_UNSPECIFIED}; +}; + +/** + * A node in the Genomic Representation Graph (GRG). Typically only accessed via the GRG + * class, and not independently. + */ +class GRGNode { +public: + explicit GRGNode(const PopulationID populationId = POPULATION_UNSPECIFIED) + : m_nodeData(populationId) {} + + virtual ~GRGNode() = default; + GRGNode(const GRGNode&) = delete; + GRGNode(GRGNode&&) = delete; + GRGNode& operator=(const GRGNode&) = delete; + GRGNode& operator=(GRGNode&&) = delete; + + const NodeIDList& getDownEdges() const { return this->m_downEdges; } + + const NodeIDList& getUpEdges() const { return this->m_upEdges; } + + bool operator==(const GRGNode& rhs) const = delete; + + NodeData& getNodeData() { return m_nodeData; } + + const NodeData& getNodeData() const { return m_nodeData; } + +protected: + static bool edgeDelete(NodeIDList& edges, const NodeID nodeId) { + size_t laggingCounter = 0; + size_t leadingCounter = 0; + for (leadingCounter = 0; leadingCounter < edges.size(); leadingCounter++) { + if (edges[leadingCounter] != nodeId) { + if (laggingCounter < leadingCounter) { + edges[laggingCounter] = edges[leadingCounter]; + } + laggingCounter++; + } + } + if (leadingCounter != laggingCounter) { + edges.resize(laggingCounter); + return true; + } + return false; + } + + void addDownEdge(const NodeID target) { + this->m_downEdges.push_back(target); + CHECK_DUP_EDGES(this->m_downEdges); + } + + bool deleteDownEdge(const NodeID target) { return edgeDelete(this->m_downEdges, target); } + + void addUpEdge(const NodeID source) { this->m_upEdges.push_back(source); } + + bool deleteUpEdge(const NodeID source) { return edgeDelete(this->m_upEdges, source); } + + NodeIDList m_downEdges; + NodeIDList m_upEdges; + NodeData m_nodeData; + + friend MutableGRG; +}; + +} // namespace grgl + +#endif /* GRG_NODE_H */ diff --git a/include/grgl/map_mutations.h b/include/grgl/map_mutations.h new file mode 100644 index 0000000..43758d5 --- /dev/null +++ b/include/grgl/map_mutations.h @@ -0,0 +1,55 @@ +/* Genotype Representation Graph Library (GRGL) + * Copyright (C) 2024 April Wei + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * should have received a copy of the GNU General Public License + * with this program. If not, see . + */ +#ifndef MAP_MUTATIONS_H +#define MAP_MUTATIONS_H + +#include "grgl/common.h" +#include "grgl/mut_iterator.h" + +#include + +namespace grgl { + +class MutableGRG; +using MutableGRGPtr = std::shared_ptr; + +struct MutationMappingStats { + size_t totalMutations{}; + size_t emptyMutations{}; + size_t mutationsWithOneSample{}; + size_t mutationsWithNoCandidates{}; + size_t reusedNodes{}; + size_t reusedNodeCoverage{}; + size_t reusedExactly{}; + size_t singletonSampleEdges{}; // Direct edge from Mutation to sample + size_t newTreeNodes{}; + size_t samplesProcessed{}; + size_t numCandidates{}; + size_t reuseSizeBiggerThanHistMax{}; + size_t numWithSingletons{}; + size_t maxSingletons{}; + size_t reusedMutNodes{}; + std::vector reuseSizeHist; +}; + +class GenotypeHashIndex; + +MutationMappingStats mapMutations(const MutableGRGPtr& grg, MutationIterator& mutations); + +}; // namespace grgl + +#endif /* MAP_MUTATIONS_H */ diff --git a/include/grgl/mut_iterator.h b/include/grgl/mut_iterator.h new file mode 100644 index 0000000..6394186 --- /dev/null +++ b/include/grgl/mut_iterator.h @@ -0,0 +1,176 @@ +/* Genotype Representation Graph Library (GRGL) + * Copyright (C) 2024 April Wei + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * should have received a copy of the GNU General Public License + * with this program. If not, see . + */ +#ifndef GRGL_MUT_ITERATOR_H +#define GRGL_MUT_ITERATOR_H + +#include "grgl/common.h" +#include "grgl/grgnode.h" +#include "util.h" +#include + +// Forward declarations +namespace picovcf { +class VCFFile; +class IGDData; +} // namespace picovcf + +#if BGEN_ENABLED +struct bgen_file; +struct bgen_partition; +#endif + +namespace grgl { + +class Mutation; + +using MutationAndSamples = std::pair; + +/** + * A MutationIterator provides sequential access to the _mutations_ (not variants) in an input + * file. The difference between "mutation" and "variant" here is that the former only has one + * alternate allele, and the latter can have many -- mutation is like a normalization. + * + * @param[in] binaryMutations If true, all alternate alleles will be combined into a single mutation + * with allele value Mutation::ALLELE_1. + * @param[in] emitMissingData If true, an "extra" Mutation will be emitted for each variant that has + * missing data, with allele value Mutation::ALLELE_MISSING. Otherwise missing data will be + * omitted entirely (i.e., indistinguishable from the reference allele). + * @param[in] flipRefMajor If true, flip the major allele to become the reference allele. I.e., if + * an alternate allele affects more than 50% of the samples then make that the reference allele + * and make the old reference allele an alternate. + */ +class MutationIterator { +public: + explicit MutationIterator(FloatRange genomeRange, + bool binaryMutations, + bool emitMissingData, + bool flipRefMajor = false) + : m_genomeRange(genomeRange), + m_binaryMutations(binaryMutations), + m_emitMissingData(emitMissingData), + m_flipRefMajor(flipRefMajor) {} + virtual ~MutationIterator() = default; + + MutationIterator(const MutationIterator& rhs) = delete; + MutationIterator& operator=(const MutationIterator& rhs) = delete; + MutationIterator(const MutationIterator&& rhs) = delete; + MutationIterator& operator=(const MutationIterator&& rhs) = delete; + + virtual void getMetadata(size_t& ploidy, size_t& numIndividuals, bool& isPhased) = 0; + + virtual size_t countMutations() const = 0; + + bool next(MutationAndSamples& mutAndSamples, size_t& totalSamples); + void reset(); + + size_t numFlippedAlleles() const { return m_flippedAlleles; } + +protected: + virtual void buffer_next(size_t& totalSamples) = 0; + virtual void reset_specific() = 0; + + // A buffer of loaded mutations from the current position/variant. + std::list m_alreadyLoaded; + + // Range to use. + FloatRange m_genomeRange; + + // How many alleles were flipped due to m_flipRefMajor? + size_t m_flippedAlleles{}; + // Remember how many samples we saw last "row" + size_t m_totalSamples{}; + bool m_binaryMutations; + bool m_emitMissingData; + bool m_flipRefMajor; +}; + +/** + * Iterate the mutations in a VCF file. + * + * TODO: if we keep this around, should consider adding an index file that maps genome position to + * file position. + */ +class VCFMutationIterator : public MutationIterator { +public: + explicit VCFMutationIterator( + const char* filename, FloatRange genomeRange, bool binaryMutations, bool emitMissingData, bool flipRefMajor); + + void getMetadata(size_t& ploidy, size_t& numIndividuals, bool& isPhased) override; + size_t countMutations() const override; + +protected: + void buffer_next(size_t& totalSamples) override; + void reset_specific() override; + +private: + std::unique_ptr m_vcf; +}; + +/** + * Iterate the mutations in an IGD file. + */ +class IGDMutationIterator : public MutationIterator { +public: + explicit IGDMutationIterator( + const char* filename, FloatRange genomeRange, bool binaryMutations, bool emitMissingData, bool flipRefMajor); + + void getMetadata(size_t& ploidy, size_t& numIndividuals, bool& isPhased) override; + + size_t countMutations() const override; + +protected: + void buffer_next(size_t& totalSamples) override; + void reset_specific() override; + +private: + std::unique_ptr m_igd; + size_t m_currentVariant; +}; + +#if BGEN_ENABLED +/** + * Iterate the mutations in an BGEN (v1.2 or v1.3) file. + * + * This assumes that the reference allele is always the 0th allele. + */ +class BGENMutationIterator : public MutationIterator { +public: + explicit BGENMutationIterator( + const char* filename, FloatRange genomeRange, bool binaryMutations, bool emitMissingData, bool flipRefMajor); + ~BGENMutationIterator() override; + + void getMetadata(size_t& ploidy, size_t& numIndividuals, bool& isPhased) override; + + size_t countMutations() const override; + +protected: + void buffer_next(size_t& totalSamples) override; + void reset_specific() override; + +private: + bgen_file* m_file; + const bgen_partition* m_partition; + size_t m_currentVariant{}; +}; +#endif + +std::shared_ptr makeMutationIterator( + const std::string& filename, FloatRange genomeRange, bool binaryMutations, bool emitMissingData, bool flipRefMajor); + +} // namespace grgl + +#endif /* GRGL_MUT_ITERATOR_H */ diff --git a/include/grgl/mutation.h b/include/grgl/mutation.h new file mode 100644 index 0000000..5aa626b --- /dev/null +++ b/include/grgl/mutation.h @@ -0,0 +1,258 @@ +/* Genotype Representation Graph Library (GRGL) + * Copyright (C) 2024 April Wei + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * should have received a copy of the GNU General Public License + * with this program. If not, see . + */ +#ifndef GRG_MUTATION_H +#define GRG_MUTATION_H + +#include "grgl/common.h" + +#include +#include +#include +#include +#include + +namespace grgl { + +using EXTERNAL_ID = int32_t; +constexpr EXTERNAL_ID NO_ORIGINAL_ID = -1; + +class Mutation; + +using MutationId = uint32_t; + +/** + * A mutation in the GRG. + */ +class Mutation { +public: + static constexpr char const* ALLELE_A = "A"; + static constexpr char const* ALLELE_C = "C"; + static constexpr char const* ALLELE_T = "T"; + static constexpr char const* ALLELE_G = "G"; + static constexpr char const* ALLELE_MISSING = "."; + static constexpr char const* ALLELE_0 = "0"; + static constexpr char const* ALLELE_1 = "1"; + + Mutation() + : m_alleleStorage({}), + m_position(std::numeric_limits::max()), + m_time(-1.0), + m_originalId(NO_ORIGINAL_ID), + m_refPosition(0) {} + + Mutation(uint64_t position, std::string allele) + : m_alleleStorage({}), + m_position(position), + m_time(-1.0), + m_originalId(NO_ORIGINAL_ID), + m_refPosition(0) { + if (allele.size() > std::numeric_limits::max()) { + throw ApiMisuseFailure("Allele length cannot exceed (2^16)-1"); + } + setAlleleValues(std::move(allele), ""); + } + + Mutation(uint64_t position, + std::string allele, + const std::string& refAllele, + float time = -1.0, + EXTERNAL_ID originalId = NO_ORIGINAL_ID) + : m_alleleStorage({}), + m_position(position), + m_time(time), + m_originalId(originalId), + m_refPosition(0) { + if (allele.size() > std::numeric_limits::max()) { + throw ApiMisuseFailure("Allele length cannot exceed (2^16)-1"); + } + setAlleleValues(std::move(allele), refAllele); + } + + ~Mutation() { cleanup(); } + + void cleanup() { + if (m_alleleStorage._str & PTR_STOLEN_BIT) { + char* ptr = ((char*)(m_alleleStorage._str & ~PTR_STOLEN_BIT)); + delete[] ptr; + } + m_alleleStorage._str = 0; + } + + Mutation(const Mutation& rhs) + : m_alleleStorage({}), + m_position(rhs.m_position), + m_time(rhs.m_time), + m_originalId(rhs.m_originalId), + m_refPosition(0) { + setAlleleValues(std::move(rhs.getAllele()), rhs.getRefAllele()); + } + + Mutation& operator=(const Mutation& rhs) { + if (this != &rhs) { + this->cleanup(); + this->m_position = rhs.m_position; + this->m_time = rhs.m_time; + this->m_originalId = rhs.m_originalId; + this->m_refPosition = 0; + setAlleleValues(std::move(rhs.getAllele()), rhs.getRefAllele()); + } + return *this; + } + + Mutation& operator=(Mutation&& rhs) noexcept { + if (this != &rhs) { + this->cleanup(); + this->m_position = rhs.m_position; + this->m_time = rhs.m_time; + this->m_originalId = rhs.m_originalId; + this->m_refPosition = 0; + setAlleleValues(std::move(rhs.getAllele()), rhs.getRefAllele()); + } + return *this; + } + + Mutation(Mutation&& rhs) noexcept + : m_alleleStorage({}), + m_position(rhs.m_position), + m_time(rhs.m_time), + m_originalId(rhs.m_originalId), + m_refPosition(0) { + setAlleleValues(std::move(rhs.getAllele()), rhs.getRefAllele()); + } + + bool isEmpty() const { return m_position == std::numeric_limits::max(); } + + uint64_t getPosition() const { return m_position; } + + float getTime() const { return m_time; } + + void setTime(float time) { m_time = time; } + + std::string getAllele() const { + std::string storage = alleleStorageString(); + return storage.substr(0, m_refPosition); + } + + std::string getRefAllele() const { + std::string storage = alleleStorageString(); + return storage.substr(m_refPosition); + } + + EXTERNAL_ID getOriginalId() const { return m_originalId; } + + bool operator==(const Mutation& rhs) const { + if (this->m_position != rhs.m_position) { + return false; + } + std::string storage = alleleStorageString(); + std::string storageRhs = rhs.alleleStorageString(); + return storage == storageRhs && this->m_originalId == rhs.m_originalId; + } + + bool operator<(const Mutation& rhs) const { + if (this->m_position == rhs.m_position) { + std::string storage = alleleStorageString(); + std::string storageRhs = rhs.alleleStorageString(); + if (storage == storageRhs) { + return this->m_originalId < rhs.m_originalId; + } + return storage < storageRhs; + } + return this->m_position < rhs.m_position; + } + +protected: + void setAlleleValues(std::string altAllele, std::string refAllele) { + const size_t totalBytes = altAllele.size() + refAllele.size(); + m_refPosition = altAllele.size(); + if (totalBytes > 7) { + m_alleleStorage._str = (size_t) new char[totalBytes + 1]; + std::memcpy((void*)m_alleleStorage._str, altAllele.c_str(), altAllele.size()); + std::memcpy((void*)(m_alleleStorage._str + m_refPosition), refAllele.c_str(), refAllele.size()); + *(char*)(m_alleleStorage._str + totalBytes) = 0; // null term + m_alleleStorage._str |= PTR_STOLEN_BIT; + } else { + for (size_t i = 0; i < altAllele.size(); i++) { + m_alleleStorage._b[i + 1] = altAllele[i]; + } + for (size_t i = altAllele.size(); i < totalBytes; i++) { + m_alleleStorage._b[i + 1] = refAllele[i - altAllele.size()]; + } + } + } + + std::string alleleStorageString() const { + std::string result; + if (m_alleleStorage._str & PTR_STOLEN_BIT) { + result = std::string((char*)(m_alleleStorage._str & ~PTR_STOLEN_BIT)); + } else { + for (size_t i = 1; i < sizeof(m_alleleStorage); i++) { + if (0 == m_alleleStorage._b[i]) { + break; + } + result.push_back(m_alleleStorage._b[i]); + } + } + return std::move(result); + } + + // It is safe to steal the least-significant bit from a heap-allocated pointer + // since those will always be at least 4-byte aligned. Technically we could steal 2 bits... + static constexpr size_t PTR_STOLEN_BIT = 0x1; + // Long alleles use _str, short alleles (<=7 characters) use _b. We never use _b[0], because + // the bit we can steal from the pointer is 0x1 which would conflict with character values. + union { + size_t _str; + char _b[8]; + } m_alleleStorage; // 8 + static_assert(sizeof(m_alleleStorage) == 8, "Allele storage size changed"); + uint64_t m_position; // 8 + float m_time; // 4 + EXTERNAL_ID m_originalId; // 4 + uint16_t m_refPosition; // 2 + // Have 6 spare bits here due to alignment +}; +static_assert(sizeof(Mutation) == 32, "Mutation size changed"); + +/** + * std::less-style comparison for Mutations that only looks at position and allele (ignores + * all other fields). + * Typically used for correctness comparisons. + */ +struct MutationLtPosAllele { + bool operator()(const Mutation& lhs, const Mutation& rhs) const { + if (lhs.getPosition() == rhs.getPosition()) { + return lhs.getAllele() < rhs.getAllele(); + } + return lhs.getPosition() < rhs.getPosition(); + } +}; + +} // namespace grgl + +/** + * Implement std::hash that uses only position/allele + */ +template <> struct std::hash { + std::size_t operator()(const grgl::Mutation& mut) const noexcept { + std::size_t h1 = std::hash{}(mut.getPosition()); + std::size_t h2 = std::hash{}(mut.getAllele()); + return grgl::hash_combine(h1, h2); + } +}; + +#endif /* GRG_MUTATION_H */ diff --git a/include/grgl/serialize.h b/include/grgl/serialize.h new file mode 100644 index 0000000..9d608ce --- /dev/null +++ b/include/grgl/serialize.h @@ -0,0 +1,61 @@ +/* Genotype Representation Graph Library (GRGL) + * Copyright (C) 2024 April Wei + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * should have received a copy of the GNU General Public License + * with this program. If not, see . + */ +#ifndef GRG_SERIALIZE_H +#define GRG_SERIALIZE_H + +#include +#include +#include + +namespace grgl { + +/** + * Exception thrown when a GRG fails to serialize/deserialize. + */ +class SerializationFailure : public std::runtime_error { +public: + explicit SerializationFailure(char const* const message) + : std::runtime_error(message) {} +}; + +class GRG; +class MutableGRG; +using GRGPtr = std::shared_ptr; +using MutableGRGPtr = std::shared_ptr; + +/** + * Serialize the GRG to the given outstream. + * + * @param[in] grg The GRG to be serialized. + * @param[in] out The (binary) output stream. + * @param[in] useVarInt (optional) `true` by default. Set to `false` to avoid using variable-sized + * integer encoding in the serialization. The only reason to do this is if you are using another + * library/program to read GRG files and it does not support variable integer encoding. + */ +void writeGrg(const GRGPtr& grg, std::ostream& out, bool useVarInt = true, bool allowSimplify = true); + +/** + * Deserialize the GRG from the given input stream. + * + * @param[in] inStream The (binary) input stream. + */ +MutableGRGPtr readMutableGrg(std::istream& inStream); +GRGPtr readImmutableGrg(std::istream& inStream, bool loadUpEdges = true); + +}; // namespace grgl + +#endif /* GRG_SERIALIZE_H */ diff --git a/include/grgl/ts2grg.h b/include/grgl/ts2grg.h new file mode 100644 index 0000000..8a91f3c --- /dev/null +++ b/include/grgl/ts2grg.h @@ -0,0 +1,49 @@ +/* Genotype Representation Graph Library (GRGL) + * Copyright (C) 2024 April Wei + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * should have received a copy of the GNU General Public License + * with this program. If not, see . + */ +#ifndef TS2GRG_H +#define TS2GRG_H + +#include +#include +#include +#include +#include + +namespace grgl { + +/** + * Exception thrown when a call to Tskit fails unexpectedly. + */ +class TskitApiFailure : public std::runtime_error { +public: + explicit TskitApiFailure(char const* const message) + : std::runtime_error(message) {} +}; + +class Mutation; +class MutableGRG; +using MutableGRGPtr = std::shared_ptr; + +/** + * Convert a tskit tree-sequence object into a GRG. + */ +MutableGRGPtr +convertTreeSeqToGRG(const tsk_treeseq_t* treeSeq, bool binaryMutations = false, bool useNodeTimes = false); + +} // namespace grgl + +#endif /* TS2GRG_H */ diff --git a/include/grgl/version.h b/include/grgl/version.h new file mode 100644 index 0000000..bc303e1 --- /dev/null +++ b/include/grgl/version.h @@ -0,0 +1,2 @@ +#define GRGL_MAJOR_VERSION 1 +#define GRGL_MINOR_VERSION 0 \ No newline at end of file diff --git a/include/grgl/visitor.h b/include/grgl/visitor.h new file mode 100644 index 0000000..538aecf --- /dev/null +++ b/include/grgl/visitor.h @@ -0,0 +1,67 @@ +/* Genotype Representation Graph Library (GRGL) + * Copyright (C) 2024 April Wei + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * should have received a copy of the GNU General Public License + * with this program. If not, see . + */ +#ifndef GRG_VISITOR_H +#define GRG_VISITOR_H + +#include + +#include "grgnode.h" + +namespace grgl { + +enum TraversalDirection { DIRECTION_DOWN = 1, DIRECTION_UP = 2 }; + +enum DfsPass { DFS_PASS_NONE = 0, DFS_PASS_THERE = 1, DFS_PASS_BACK_AGAIN = 2 }; + +class GRG; +using GRGPtr = std::shared_ptr; +using ConstGRGPtr = std::shared_ptr; + +/** + * Visitor abstract base class for traversing the nodes of a GRG. + * + * To traverse a GRG, create a class that inherits from `GRGVisitor` and overrides the `visit()` + * function. Then use the GRG::visitDfs(), GRG::visitBfs(), or GRG::visitTopo() functions and + * pass your visitor as an argument. + */ +class GRGVisitor { +public: + GRGVisitor() = default; + virtual ~GRGVisitor() = default; + + GRGVisitor(const GRGVisitor&) = delete; + GRGVisitor(GRGVisitor&&) = delete; + GRGVisitor& operator=(const GRGVisitor&) = delete; + GRGVisitor& operator=(GRGVisitor&&) = delete; + + /** + * Visit a node in the GRG. + * @param[in] grg The graph. + * @param[in] node The current node we are visiting. + * @param[in] direction Whether we're traversing bottom-up (DIRECTION_UP) or + * top-down (DIRECTION_DOWN). + * @param[in] dfsPass For DFS traversals, is this the forward (DFS_PASS_THERE) + * pass or the backward (DFS_PASS_BACK_AGAIN) pass after we have computed + * values for all successors. + */ + virtual bool + visit(const ConstGRGPtr& grg, NodeID node, TraversalDirection direction, DfsPass dfsPass = DFS_PASS_NONE) = 0; +}; + +} // namespace grgl + +#endif /* GRG_VISITOR_H */ diff --git a/third-party/args b/third-party/args new file mode 160000 index 0000000..cc2368c --- /dev/null +++ b/third-party/args @@ -0,0 +1 @@ +Subproject commit cc2368ca0d8a962862c96c00fe919e1480050f51 diff --git a/third-party/bgen b/third-party/bgen new file mode 160000 index 0000000..a94ddfa --- /dev/null +++ b/third-party/bgen @@ -0,0 +1 @@ +Subproject commit a94ddfafd62c21a729abdec4636fea0ca5f11b4d diff --git a/third-party/pybind11 b/third-party/pybind11 new file mode 160000 index 0000000..57287b5 --- /dev/null +++ b/third-party/pybind11 @@ -0,0 +1 @@ +Subproject commit 57287b578d58af016cd571f6eacee19300d37f2d diff --git a/third-party/tskit b/third-party/tskit new file mode 160000 index 0000000..beafeba --- /dev/null +++ b/third-party/tskit @@ -0,0 +1 @@ +Subproject commit beafeba3f576da4524e24b00aa184e608f6de4c4