Update to Version 0.5.0

Version 0.5.0
anergictcell · May 23, 2022 · ead4a47 · ead4a47
2 parents 94e96e6 + 1a9ebdb
commit ead4a47
Show file tree

Hide file tree

Showing 32 changed files with 1,091 additions and 279 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,15 @@
 # Changelog
 
+## 0.5
+- Add exon numbers in GTF output
+- Refactor GTF parsing to improve performance
+- Add GenePredExt Reader and Writer
+- Add GenePred Writer
+- Remove chr prefixing during parsing of chromosomes/contigs
+- Improve docs
+- Add more integration tests
+
+## 0.4
 - Support writing Fasta files
 - Support sequence output per feature
 - Improved error handling

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "atg"
-version = "0.4.0"
+version = "0.5.0"
 authors = ["Jonas Marcello <[email protected]>"]
 edition = "2018"
 description = "A utility to handle transcripts for genomics and transcriptomics"
@@ -16,4 +16,7 @@ clap = "2.33.3"
 log = "0.4"
 loggerv = "0.7"
 bincode = "1.3.3"
-serde = { version = "1.0", features = ["derive"] }
+serde = { version = "1.0", features = ["derive"] }
+
+[profile.release]
+lto = true
diff --git a/JUSTFILE b/JUSTFILE
@@ -43,17 +43,28 @@ new version:
 
 test:
     #!/usr/bin/env zsh
-    cargo clippy && cargo fmt && cargo test -q && cargo doc
+    echo -ne "Checking formatting and doc generation"
+    (cargo clippy && cargo fmt --check && cargo test -q && cargo doc && \
+    echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m"
     echo -ne "Checking GTF to RefGene"
-    diff <( cargo run -- -f gtf -i tests/data/example.gtf -t refgene -o /dev/stdout 2> /dev/null | sort ) tests/data/example.refgene
-    echo " \e[32m\e[1mOK\e[0m"
+    (diff <( cargo run -- -f gtf -i tests/data/example.gtf -t refgene -o /dev/stdout 2> /dev/null | sort ) tests/data/example.refgene && \
+    echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m"
     echo -ne "Checking RefGene to GTF"
-    diff <( cargo run -- -f refgene -i tests/data/example.refgene -t gtf -g ncbiRefSeq.2021-05-17 -o /dev/stdout 2> /dev/null | cut -f1-5,7-8 | sort ) <( cut -f1-5,7-8 tests/data/example.gtf)
-    echo " \e[32m\e[1mOK\e[0m"
-
+    (diff <( cargo run -- -f refgene -i tests/data/example.refgene -t gtf -g ncbiRefSeq.2021-05-17 -o /dev/stdout 2> /dev/null | cut -f1-5,7-8 | sort ) <( cut -f1-5,7-8 tests/data/example.gtf) && \
+    echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m"
+    echo -ne "Checking uniqness of GTF attribute column across exons"
+    (diff <( cargo run -- -f refgene -i tests/data/example.refgene -t gtf -g ncbiRefSeq.2021-05-17 -o /dev/stdout 2> /dev/null | grep "\texon\t" | cut -f9 | uniq -c | awk '{print $1}' | sort | uniq -c) <(echo " 695 1") && \
+    echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m"
+    echo -ne "Checking exon numbering in exon, CDS, and UTR"
+    (diff <( cargo run -- -f refgene -i tests/data/example.refgene -t gtf -g ncbiRefSeq.2021-05-17 -o /dev/stdout 2> /dev/null | grep "\texon\t" | grep "NM_004015.3.18" | wc -l | sed "s/ //g") <(echo "1") && \
+    diff <( cargo run -- -f refgene -i tests/data/example.refgene -t gtf -g ncbiRefSeq.2021-05-17 -o /dev/stdout 2> /dev/null | grep "\tCDS\t" | grep "NM_004015.3.18" | wc -l | sed "s/ //g") <(echo "1") && \
+    diff <( cargo run -- -f refgene -i tests/data/example.refgene -t gtf -g ncbiRefSeq.2021-05-17 -o /dev/stdout 2> /dev/null | grep "\t5UTR\t" | grep "NM_004015.3.18" | wc -l | sed "s/ //g") <(echo "1") && \
+    diff <( cargo run -- -f refgene -i tests/data/example.refgene -t gtf -g ncbiRefSeq.2021-05-17 -o /dev/stdout 2> /dev/null | grep "NM_004015.3.18" | wc -l | sed "s/ //g") <(echo "3") && \
+    echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m"
 
 benchmark name:
     cargo build --release --target=aarch64-apple-darwin
+    rm target/atg-{{name}}-aarch64-apple-darwin
     cp target/aarch64-apple-darwin/release/atg target/atg-{{name}}-aarch64-apple-darwin
     time target/atg-{{name}}-aarch64-apple-darwin -f gtf -i tests/data/hg19.ncbiRefSeq.gtf -o /dev/null -t none
     flamegraph --root -- target/atg-{{name}}-aarch64-apple-darwin -f gtf -i tests/data/hg19.ncbiRefSeq.gtf -o /dev/null -t none
diff --git a/README.md b/README.md
diff --git a/src/fasta/writer.rs b/src/fasta/writer.rs
@@ -1,4 +1,3 @@
-use std::convert::TryInto;
 use std::fs::File;
 use std::io::{BufWriter, Write};
 use std::path::Path;
@@ -344,6 +343,13 @@ enum SequenceBuilder {
 
 impl SequenceBuilder {
     /// Builds the actual Sequence
+    ///
+    /// # Panics
+    ///
+    /// This method panics if the transcript cannot be converted to a Sequence.
+    /// This could happen when the transcript's location is out of bounds of the Fasta file,
+    /// the Fasta file becomes unavaible during the reading
+    /// or if the Fasta file contains invalid Nucleotides
     pub fn build(&self, transcript: &Transcript, fasta_reader: &mut FastaReader<File>) -> Sequence {
         let segments = match self {
             SequenceBuilder::Cds => transcript.cds_coordinates(),
@@ -356,13 +362,13 @@ impl SequenceBuilder {
         };
 
         let capacity: u32 = segments.iter().map(|x| x.2 - x.1 + 1).sum();
-        let mut seq = Sequence::with_capacity(capacity.try_into().unwrap()); // unwrap can't fail
+        let mut seq = Sequence::with_capacity(capacity as usize);
 
         for segment in segments {
             seq.append(
                 fasta_reader
                     .read_sequence(segment.0, segment.1.into(), segment.2.into())
-                    .unwrap(),
+                    .unwrap(), // possible panic documented in signature
             )
         }
         if !transcript.forward() {

diff --git a/src/genepred/mod.rs b/src/genepred/mod.rs
@@ -0,0 +1,29 @@
+//! Convert from/to GenePredExt
+//!
+//!
+//! The GenePred format is described by [UCSC](http://genome.ucsc.edu/FAQ/FAQformat#format9)
+//!
+//!  # Schema for NCBI RefSeq - RefSeq genes from NCBI
+//! | Column | Type | Example | Description |
+//! | --- | --- | --- | --- |
+//! | name | str |  NR_046018.2 | Name of gene (usually transcript_id from GTF) |
+//! | chrom | str | chr1 | Reference sequence chromosome or scaffold |
+//! | strand | enum("+", "-") | + | + or - for strand |
+//! | txStart | int | 11873 | Transcription start position (or end position for minus strand item) |
+//! | txEnd | int | 14409 | Transcription end position (or start position for minus strand item) |
+//! | cdsStart | int | 14409 | Coding region start (or end position for minus strand item) |
+//! | cdsEnd | int | 4409 | Coding region end (or start position for minus strand item) |
+//! | exonCount | int | 3 | Number of exons |
+//! | exonStarts | List of int | 1873,12612,13220, | Exon start positions (or end positions for minus strand item) (with trailing comma) |
+//! | exonEnds | List of int | 12227,12721,14409, | Exon end positions (or start positions for minus strand item) (with trailing comma) |
+//!
+//! The format is almost identical to RefGene, it's only missing some column. So, instead of reinventing the wheel,
+//! we copied most of refgene Writer code and just removed the extra columns.
+//!
+//! At the moment, there is only a GenePred `Writer`. `Reader` is not yet implemented.
+//! Parsing GeneProd is not yet possible due to the missing exonFrames columns. This
+//! could be calculated during parsing, but this is not yet done.
+
+mod writer;
+
+pub use crate::genepred::writer::Writer;
diff --git a/src/genepred/writer.rs b/src/genepred/writer.rs
@@ -0,0 +1,146 @@
+use std::fs::File;
+use std::io::{BufWriter, Write};
+use std::path::Path;
+
+use crate::models::{Transcript, TranscriptWrite};
+use crate::utils::errors::ReadWriteError;
+
+/// Writes [`Transcript`]s into a `BufWriter`
+///
+/// # Examples
+///
+/// ```rust
+/// use std::io;
+/// use atg::tests;;
+/// use atg::genepred::Writer;
+/// use atg::models::TranscriptWrite;
+///
+/// let transcripts = vec![tests::transcripts::standard_transcript()];
+///
+/// let output = Vec::new(); // substitute this with proper IO (io::stdout())
+/// let mut writer = Writer::new(output);
+/// writer.write_transcript_vec(&transcripts);
+///
+/// let written_output = String::from_utf8(writer.into_inner().unwrap()).unwrap();
+/// assert_eq!(written_output.starts_with("Test-Transcript\tchr1\t"), true);
+/// ```
+pub struct Writer<W: std::io::Write> {
+    inner: BufWriter<W>,
+}
+
+impl Writer<File> {
+    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self, ReadWriteError> {
+        match File::create(path.as_ref()) {
+            Ok(file) => Ok(Self::new(file)),
+            Err(err) => Err(ReadWriteError::new(err)),
+        }
+    }
+}
+
+impl<W: std::io::Write> Writer<W> {
+    /// Creates a new generic Writer for any `std::io::Read`` object
+    ///
+    /// Use this method when you want to write to stdout or
+    /// a remote source, e.g. via HTTP
+    pub fn new(writer: W) -> Self {
+        Writer {
+            inner: BufWriter::new(writer),
+        }
+    }
+
+    pub fn with_capacity(capacity: usize, writer: W) -> Self {
+        Writer {
+            inner: BufWriter::with_capacity(capacity, writer),
+        }
+    }
+
+    pub fn flush(&mut self) -> Result<(), std::io::Error> {
+        self.inner.flush()
+    }
+
+    pub fn into_inner(self) -> Result<W, std::io::IntoInnerError<BufWriter<W>>> {
+        self.inner.into_inner()
+    }
+}
+
+impl<W: std::io::Write> TranscriptWrite for Writer<W> {
+    /// Writes a single transcript formatted as GenePred with an extra newline
+    ///
+    /// This method adds an extra newline at the end of the row
+    /// to allow writing multiple transcripts continuosly
+    fn writeln_single_transcript(&mut self, transcript: &Transcript) -> Result<(), std::io::Error> {
+        self.write_single_transcript(transcript)?;
+        self.inner.write_all("\n".as_bytes())
+    }
+
+    /// Writes a single transcript formatted as GenePred
+    ///
+    /// Consider [`writeln_single_transcript`](Writer::writeln_single_transcript)
+    /// to ensure that an extra newline is added to the output
+    fn write_single_transcript(&mut self, transcript: &Transcript) -> Result<(), std::io::Error> {
+        let columns: Vec<String> = Vec::from(transcript);
+        // GenePred is similar to RefGene, but without the first `bin` column
+        // and some other columns at the end
+        self.inner.write_all((columns[1..11].join("\t")).as_bytes())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::Writer;
+    use crate::models::TranscriptWrite;
+    use crate::tests::transcripts;
+
+    #[test]
+    fn test_nm_001365057() {
+        let transcripts = vec![transcripts::nm_001365057()];
+        let mut writer = Writer::new(Vec::new());
+        let _ = writer.write_transcript_vec(&transcripts);
+
+        let output = writer.into_inner().unwrap();
+        assert_eq!(
+            std::str::from_utf8(&output).unwrap(),
+            "NM_001365057.2\tchr9\t+\t74526554\t74600974\t74526650\t74597573\t3\t74526554,74561921,74597572,\t74526752,74562028,74600974,\n"
+        );
+    }
+
+    #[test]
+    fn test_nm_001365408() {
+        let transcripts = vec![transcripts::nm_001365408()];
+        let mut writer = Writer::new(Vec::new());
+        let _ = writer.write_transcript_vec(&transcripts);
+
+        let output = writer.into_inner().unwrap();
+
+        assert_eq!(
+            std::str::from_utf8(&output).unwrap(),
+            "NM_001365408.1\tchr16\t+\t66969418\t66978999\t66972142\t66977928\t12\t66969418,66971939,66973119,66974124,66974339,66975026,66975408,66975670,66976007,66976550,66977201,66977741,\t66969778,66972144,66973261,66974258,66974598,66975125,66975549,66975751,66976152,66976640,66977274,66978999,\n"
+        );
+    }
+
+    #[test]
+    fn test_nm_001371720() {
+        let transcripts = vec![transcripts::nm_001371720(false)];
+        let mut writer = Writer::new(Vec::new());
+        let _ = writer.write_transcript_vec(&transcripts);
+
+        let output = writer.into_inner().unwrap();
+        assert_eq!(
+            std::str::from_utf8(&output).unwrap(),
+            "NM_001371720.1\tchr1\t-\t155158299\t155162700\t155158610\t155162634\t8\t155158299,155159700,155159930,155160197,155160483,155160638,155161619,155162576,\t155158685,155159850,155160052,155160334,155160539,155161619,155162101,155162700,\n"
+        );
+    }
+
+    #[test]
+    fn test_nm_201550() {
+        let transcripts = vec![transcripts::nm_201550()];
+        let mut writer = Writer::new(Vec::new());
+        let _ = writer.write_transcript_vec(&transcripts);
+
+        let output = writer.into_inner().unwrap();
+        assert_eq!(
+            std::str::from_utf8(&output).unwrap(),
+            "NM_201550.4\tchr12\t-\t70002343\t70004687\t70003784\t70004618\t1\t70002343,\t70004687,\n"
+        );
+    }
+}
diff --git a/src/genepredext/mod.rs b/src/genepredext/mod.rs
@@ -0,0 +1,32 @@
+//! Convert from/to GenePredExt
+//!
+//!
+//! The GenePred(ext) format is described by [UCSC](http://genome.ucsc.edu/FAQ/FAQformat#format9)
+//!
+//!  # Schema for NCBI RefSeq - RefSeq genes from NCBI
+//! | Column | Type | Example | Description |
+//! | --- | --- | --- | --- |
+//! | name | str |  NR_046018.2 | Name of gene (usually transcript_id from GTF) |
+//! | chrom | str | chr1 | Reference sequence chromosome or scaffold |
+//! | strand | enum("+", "-") | + | + or - for strand |
+//! | txStart | int | 11873 | Transcription start position (or end position for minus strand item) |
+//! | txEnd | int | 14409 | Transcription end position (or start position for minus strand item) |
+//! | cdsStart | int | 14409 | Coding region start (or end position for minus strand item) |
+//! | cdsEnd | int | 4409 | Coding region end (or start position for minus strand item) |
+//! | exonCount | int | 3 | Number of exons |
+//! | exonStarts | List of int | 1873,12612,13220, | Exon start positions (or end positions for minus strand item) (with trailing comma) |
+//! | exonEnds | List of int | 12227,12721,14409, | Exon end positions (or start positions for minus strand item) (with trailing comma) |
+//! | score | int | 0 | The score field indicates a degree of confidence in the feature's existence and coordinates |
+//! | name2 | str | DDX11L1 | Alternate name (e.g. gene_id from GTF) |
+//! | cdsStartStat | enum("none", "unk", "incmpl", "cmpl") | compl | Status of CDS start annotation (none, unknown, incomplete, or complete) |
+//! | cdsEndStat | enum("none", "unk", "incmpl", "cmpl") | compl | Status of CDS end annotation (none, unknown, incomplete, or complete) |
+//! | exonFrames | List of enum(-1, 0, 1, 2) | -1,0,2,1 | Exon frame {0,1,2}, or -1 if no frame for exon |
+//!
+//! The format is almost identical to RefGene, it's only missing the `bin` column. So, instead of reinventing the wheel,
+//! we copied most of refgene Reader and Writer code and just added/removed the `bin` column.
+
+mod reader;
+mod writer;
+
+pub use crate::genepredext::reader::Reader;
+pub use crate::genepredext::writer::Writer;