Skip to content

Commit

Permalink
Update to Version 0.5.0
Browse files Browse the repository at this point in the history
Version 0.5.0
  • Loading branch information
anergictcell authored May 23, 2022
2 parents 94e96e6 + 1a9ebdb commit ead4a47
Show file tree
Hide file tree
Showing 32 changed files with 1,091 additions and 279 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# Changelog

## 0.5
- Add exon numbers in GTF output
- Refactor GTF parsing to improve performance
- Add GenePredExt Reader and Writer
- Add GenePred Writer
- Remove chr prefixing during parsing of chromosomes/contigs
- Improve docs
- Add more integration tests

## 0.4
- Support writing Fasta files
- Support sequence output per feature
- Improved error handling
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 5 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "atg"
version = "0.4.0"
version = "0.5.0"
authors = ["Jonas Marcello <[email protected]>"]
edition = "2018"
description = "A utility to handle transcripts for genomics and transcriptomics"
Expand All @@ -16,4 +16,7 @@ clap = "2.33.3"
log = "0.4"
loggerv = "0.7"
bincode = "1.3.3"
serde = { version = "1.0", features = ["derive"] }
serde = { version = "1.0", features = ["derive"] }

[profile.release]
lto = true
23 changes: 17 additions & 6 deletions JUSTFILE
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,28 @@ new version:

test:
#!/usr/bin/env zsh
cargo clippy && cargo fmt && cargo test -q && cargo doc
echo -ne "Checking formatting and doc generation"
(cargo clippy && cargo fmt --check && cargo test -q && cargo doc && \
echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m"
echo -ne "Checking GTF to RefGene"
diff <( cargo run -- -f gtf -i tests/data/example.gtf -t refgene -o /dev/stdout 2> /dev/null | sort ) tests/data/example.refgene
echo " \e[32m\e[1mOK\e[0m"
(diff <( cargo run -- -f gtf -i tests/data/example.gtf -t refgene -o /dev/stdout 2> /dev/null | sort ) tests/data/example.refgene && \
echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m"
echo -ne "Checking RefGene to GTF"
diff <( cargo run -- -f refgene -i tests/data/example.refgene -t gtf -g ncbiRefSeq.2021-05-17 -o /dev/stdout 2> /dev/null | cut -f1-5,7-8 | sort ) <( cut -f1-5,7-8 tests/data/example.gtf)
echo " \e[32m\e[1mOK\e[0m"
(diff <( cargo run -- -f refgene -i tests/data/example.refgene -t gtf -g ncbiRefSeq.2021-05-17 -o /dev/stdout 2> /dev/null | cut -f1-5,7-8 | sort ) <( cut -f1-5,7-8 tests/data/example.gtf) && \
echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m"
echo -ne "Checking uniqness of GTF attribute column across exons"
(diff <( cargo run -- -f refgene -i tests/data/example.refgene -t gtf -g ncbiRefSeq.2021-05-17 -o /dev/stdout 2> /dev/null | grep "\texon\t" | cut -f9 | uniq -c | awk '{print $1}' | sort | uniq -c) <(echo " 695 1") && \
echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m"
echo -ne "Checking exon numbering in exon, CDS, and UTR"
(diff <( cargo run -- -f refgene -i tests/data/example.refgene -t gtf -g ncbiRefSeq.2021-05-17 -o /dev/stdout 2> /dev/null | grep "\texon\t" | grep "NM_004015.3.18" | wc -l | sed "s/ //g") <(echo "1") && \
diff <( cargo run -- -f refgene -i tests/data/example.refgene -t gtf -g ncbiRefSeq.2021-05-17 -o /dev/stdout 2> /dev/null | grep "\tCDS\t" | grep "NM_004015.3.18" | wc -l | sed "s/ //g") <(echo "1") && \
diff <( cargo run -- -f refgene -i tests/data/example.refgene -t gtf -g ncbiRefSeq.2021-05-17 -o /dev/stdout 2> /dev/null | grep "\t5UTR\t" | grep "NM_004015.3.18" | wc -l | sed "s/ //g") <(echo "1") && \
diff <( cargo run -- -f refgene -i tests/data/example.refgene -t gtf -g ncbiRefSeq.2021-05-17 -o /dev/stdout 2> /dev/null | grep "NM_004015.3.18" | wc -l | sed "s/ //g") <(echo "3") && \
echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m"
benchmark name:
cargo build --release --target=aarch64-apple-darwin
rm target/atg-{{name}}-aarch64-apple-darwin
cp target/aarch64-apple-darwin/release/atg target/atg-{{name}}-aarch64-apple-darwin
time target/atg-{{name}}-aarch64-apple-darwin -f gtf -i tests/data/hg19.ncbiRefSeq.gtf -o /dev/null -t none
flamegraph --root -- target/atg-{{name}}-aarch64-apple-darwin -f gtf -i tests/data/hg19.ncbiRefSeq.gtf -o /dev/null -t none
185 changes: 116 additions & 69 deletions README.md

Large diffs are not rendered by default.

12 changes: 9 additions & 3 deletions src/fasta/writer.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use std::convert::TryInto;
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::Path;
Expand Down Expand Up @@ -344,6 +343,13 @@ enum SequenceBuilder {

impl SequenceBuilder {
/// Builds the actual Sequence
///
/// # Panics
///
/// This method panics if the transcript cannot be converted to a Sequence.
/// This could happen when the transcript's location is out of bounds of the Fasta file,
/// the Fasta file becomes unavaible during the reading
/// or if the Fasta file contains invalid Nucleotides
pub fn build(&self, transcript: &Transcript, fasta_reader: &mut FastaReader<File>) -> Sequence {
let segments = match self {
SequenceBuilder::Cds => transcript.cds_coordinates(),
Expand All @@ -356,13 +362,13 @@ impl SequenceBuilder {
};

let capacity: u32 = segments.iter().map(|x| x.2 - x.1 + 1).sum();
let mut seq = Sequence::with_capacity(capacity.try_into().unwrap()); // unwrap can't fail
let mut seq = Sequence::with_capacity(capacity as usize);

for segment in segments {
seq.append(
fasta_reader
.read_sequence(segment.0, segment.1.into(), segment.2.into())
.unwrap(),
.unwrap(), // possible panic documented in signature
)
}
if !transcript.forward() {
Expand Down
29 changes: 29 additions & 0 deletions src/genepred/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
//! Convert from/to GenePredExt
//!
//!
//! The GenePred format is described by [UCSC](http://genome.ucsc.edu/FAQ/FAQformat#format9)
//!
//! # Schema for NCBI RefSeq - RefSeq genes from NCBI
//! | Column | Type | Example | Description |
//! | --- | --- | --- | --- |
//! | name | str | NR_046018.2 | Name of gene (usually transcript_id from GTF) |
//! | chrom | str | chr1 | Reference sequence chromosome or scaffold |
//! | strand | enum("+", "-") | + | + or - for strand |
//! | txStart | int | 11873 | Transcription start position (or end position for minus strand item) |
//! | txEnd | int | 14409 | Transcription end position (or start position for minus strand item) |
//! | cdsStart | int | 14409 | Coding region start (or end position for minus strand item) |
//! | cdsEnd | int | 4409 | Coding region end (or start position for minus strand item) |
//! | exonCount | int | 3 | Number of exons |
//! | exonStarts | List of int | 1873,12612,13220, | Exon start positions (or end positions for minus strand item) (with trailing comma) |
//! | exonEnds | List of int | 12227,12721,14409, | Exon end positions (or start positions for minus strand item) (with trailing comma) |
//!
//! The format is almost identical to RefGene, it's only missing some column. So, instead of reinventing the wheel,
//! we copied most of refgene Writer code and just removed the extra columns.
//!
//! At the moment, there is only a GenePred `Writer`. `Reader` is not yet implemented.
//! Parsing GeneProd is not yet possible due to the missing exonFrames columns. This
//! could be calculated during parsing, but this is not yet done.
mod writer;

pub use crate::genepred::writer::Writer;
146 changes: 146 additions & 0 deletions src/genepred/writer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::Path;

use crate::models::{Transcript, TranscriptWrite};
use crate::utils::errors::ReadWriteError;

/// Writes [`Transcript`]s into a `BufWriter`
///
/// # Examples
///
/// ```rust
/// use std::io;
/// use atg::tests;;
/// use atg::genepred::Writer;
/// use atg::models::TranscriptWrite;
///
/// let transcripts = vec![tests::transcripts::standard_transcript()];
///
/// let output = Vec::new(); // substitute this with proper IO (io::stdout())
/// let mut writer = Writer::new(output);
/// writer.write_transcript_vec(&transcripts);
///
/// let written_output = String::from_utf8(writer.into_inner().unwrap()).unwrap();
/// assert_eq!(written_output.starts_with("Test-Transcript\tchr1\t"), true);
/// ```
pub struct Writer<W: std::io::Write> {
inner: BufWriter<W>,
}

impl Writer<File> {
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self, ReadWriteError> {
match File::create(path.as_ref()) {
Ok(file) => Ok(Self::new(file)),
Err(err) => Err(ReadWriteError::new(err)),
}
}
}

impl<W: std::io::Write> Writer<W> {
/// Creates a new generic Writer for any `std::io::Read`` object
///
/// Use this method when you want to write to stdout or
/// a remote source, e.g. via HTTP
pub fn new(writer: W) -> Self {
Writer {
inner: BufWriter::new(writer),
}
}

pub fn with_capacity(capacity: usize, writer: W) -> Self {
Writer {
inner: BufWriter::with_capacity(capacity, writer),
}
}

pub fn flush(&mut self) -> Result<(), std::io::Error> {
self.inner.flush()
}

pub fn into_inner(self) -> Result<W, std::io::IntoInnerError<BufWriter<W>>> {
self.inner.into_inner()
}
}

impl<W: std::io::Write> TranscriptWrite for Writer<W> {
/// Writes a single transcript formatted as GenePred with an extra newline
///
/// This method adds an extra newline at the end of the row
/// to allow writing multiple transcripts continuosly
fn writeln_single_transcript(&mut self, transcript: &Transcript) -> Result<(), std::io::Error> {
self.write_single_transcript(transcript)?;
self.inner.write_all("\n".as_bytes())
}

/// Writes a single transcript formatted as GenePred
///
/// Consider [`writeln_single_transcript`](Writer::writeln_single_transcript)
/// to ensure that an extra newline is added to the output
fn write_single_transcript(&mut self, transcript: &Transcript) -> Result<(), std::io::Error> {
let columns: Vec<String> = Vec::from(transcript);
// GenePred is similar to RefGene, but without the first `bin` column
// and some other columns at the end
self.inner.write_all((columns[1..11].join("\t")).as_bytes())
}
}

#[cfg(test)]
mod tests {
use super::Writer;
use crate::models::TranscriptWrite;
use crate::tests::transcripts;

#[test]
fn test_nm_001365057() {
let transcripts = vec![transcripts::nm_001365057()];
let mut writer = Writer::new(Vec::new());
let _ = writer.write_transcript_vec(&transcripts);

let output = writer.into_inner().unwrap();
assert_eq!(
std::str::from_utf8(&output).unwrap(),
"NM_001365057.2\tchr9\t+\t74526554\t74600974\t74526650\t74597573\t3\t74526554,74561921,74597572,\t74526752,74562028,74600974,\n"
);
}

#[test]
fn test_nm_001365408() {
let transcripts = vec![transcripts::nm_001365408()];
let mut writer = Writer::new(Vec::new());
let _ = writer.write_transcript_vec(&transcripts);

let output = writer.into_inner().unwrap();

assert_eq!(
std::str::from_utf8(&output).unwrap(),
"NM_001365408.1\tchr16\t+\t66969418\t66978999\t66972142\t66977928\t12\t66969418,66971939,66973119,66974124,66974339,66975026,66975408,66975670,66976007,66976550,66977201,66977741,\t66969778,66972144,66973261,66974258,66974598,66975125,66975549,66975751,66976152,66976640,66977274,66978999,\n"
);
}

#[test]
fn test_nm_001371720() {
let transcripts = vec![transcripts::nm_001371720(false)];
let mut writer = Writer::new(Vec::new());
let _ = writer.write_transcript_vec(&transcripts);

let output = writer.into_inner().unwrap();
assert_eq!(
std::str::from_utf8(&output).unwrap(),
"NM_001371720.1\tchr1\t-\t155158299\t155162700\t155158610\t155162634\t8\t155158299,155159700,155159930,155160197,155160483,155160638,155161619,155162576,\t155158685,155159850,155160052,155160334,155160539,155161619,155162101,155162700,\n"
);
}

#[test]
fn test_nm_201550() {
let transcripts = vec![transcripts::nm_201550()];
let mut writer = Writer::new(Vec::new());
let _ = writer.write_transcript_vec(&transcripts);

let output = writer.into_inner().unwrap();
assert_eq!(
std::str::from_utf8(&output).unwrap(),
"NM_201550.4\tchr12\t-\t70002343\t70004687\t70003784\t70004618\t1\t70002343,\t70004687,\n"
);
}
}
32 changes: 32 additions & 0 deletions src/genepredext/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
//! Convert from/to GenePredExt
//!
//!
//! The GenePred(ext) format is described by [UCSC](http://genome.ucsc.edu/FAQ/FAQformat#format9)
//!
//! # Schema for NCBI RefSeq - RefSeq genes from NCBI
//! | Column | Type | Example | Description |
//! | --- | --- | --- | --- |
//! | name | str | NR_046018.2 | Name of gene (usually transcript_id from GTF) |
//! | chrom | str | chr1 | Reference sequence chromosome or scaffold |
//! | strand | enum("+", "-") | + | + or - for strand |
//! | txStart | int | 11873 | Transcription start position (or end position for minus strand item) |
//! | txEnd | int | 14409 | Transcription end position (or start position for minus strand item) |
//! | cdsStart | int | 14409 | Coding region start (or end position for minus strand item) |
//! | cdsEnd | int | 4409 | Coding region end (or start position for minus strand item) |
//! | exonCount | int | 3 | Number of exons |
//! | exonStarts | List of int | 1873,12612,13220, | Exon start positions (or end positions for minus strand item) (with trailing comma) |
//! | exonEnds | List of int | 12227,12721,14409, | Exon end positions (or start positions for minus strand item) (with trailing comma) |
//! | score | int | 0 | The score field indicates a degree of confidence in the feature's existence and coordinates |
//! | name2 | str | DDX11L1 | Alternate name (e.g. gene_id from GTF) |
//! | cdsStartStat | enum("none", "unk", "incmpl", "cmpl") | compl | Status of CDS start annotation (none, unknown, incomplete, or complete) |
//! | cdsEndStat | enum("none", "unk", "incmpl", "cmpl") | compl | Status of CDS end annotation (none, unknown, incomplete, or complete) |
//! | exonFrames | List of enum(-1, 0, 1, 2) | -1,0,2,1 | Exon frame {0,1,2}, or -1 if no frame for exon |
//!
//! The format is almost identical to RefGene, it's only missing the `bin` column. So, instead of reinventing the wheel,
//! we copied most of refgene Reader and Writer code and just added/removed the `bin` column.
mod reader;
mod writer;

pub use crate::genepredext::reader::Reader;
pub use crate::genepredext::writer::Writer;
Loading

0 comments on commit ead4a47

Please sign in to comment.