Merge pull request #25 from anergictcell/feature/qc-filter

Add CLI option to filter transcripts by QC checks Refactor clap to use Derive
anergictcell · Nov 5, 2022 · bdfcc9b · bdfcc9b
2 parents 3b38891 + 3ad2e79
commit bdfcc9b
Show file tree

Hide file tree

Showing 5 changed files with 396 additions and 261 deletions.
diff --git a/JUSTFILE b/JUSTFILE
@@ -108,6 +108,12 @@ test:
         (diff <( cargo run -q -- -f gtf -i tests/data/example.gtf -r tests/data/hg19.fasta -t qc | tail -n +2  | cut -f3-9 | sort |  uniq -c | sed "s/ //g") <(echo -ne "14OK\tOK\tOK\tOK\tNOK\tOK\tOK\n13OK\tOK\tOK\tOK\tOK\tOK\tOK\n") && \
         echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m"
 
+        echo -ne "Test Filter"
+        (diff <( cargo run -q -- -f gtf -i tests/data/example.gtf -t refgene  -r tests/data/hg19.fasta -q exon -q start -q stop -q exon -q cds-length -q upstream-start -q upstream-stop -q coordinates | wc -l | sed "s/ //g") <(echo -ne "13\n") && \
+        diff <( cargo run -q -- -f gtf -i tests/data/example.gtf -t refgene  -r tests/data/hg19.fasta -q exon -q start -q stop -q exon -q cds-length -q upstream-stop -q coordinates | wc -l | sed "s/ //g") <(echo -ne "27\n") && \
+        diff <( cargo run -q -- -f gtf -i tests/data/example.gtf -t refgene  -r tests/data/hg19.fasta -q upstream-stop -c "chrY:vertebrate mitochondrial" | wc -l | sed "s/ //g") <(echo -ne "26\n") && \
+        echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m"
+
     fi
 
 benchmark name:

diff --git a/README.md b/README.md
@@ -61,6 +61,7 @@ Additional, optional arguments:
 - `-g`, `--gtf-source`: Specify the source for GTF output files. Defaults to `atg`
 - `-r`, `--reference`: Path of a reference genome fasta file. Required for fasta output
 - `-c`, `--genetic-code`: Specify which genetic code to use for translating the transcripts. Genetic codes can be specified per chromosome by specifying the chromsome and the code, separated by `:` (e.g. `-c chrM:vertebrate mitochondrial`). They can also be specified for all chromsomes by omitting the chromosome (e.g. `-c vertebrate mitochondrial`). The argument can be specified multiple times (e.g: `-c "standard" -c "chrM:vertebrate mitochondrial" -c "chrAYN:alternative yeast nuclear"`). The code names are based on the `name` field from the [NCBI specs](https://www.ncbi.nlm.nih.gov/IEB/ToolBox/C_DOC/lxr/source/data/gc.prt) but all lowercase characters. Alternatively, you can also specify the amino acid lookup table directly: `-c "chrM:FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG"`. Defaults to `standard`.
+- `-q`, `--qc-check`: Specify QC-checks for removing transcripts from the output
 
 #### Examples:
 ```bash
@@ -78,6 +79,10 @@ atg --from refgene --to gtf --input /path/to/input.refgene --output /path/to/out
 
 ## Convert RefGene to bed
 atg --from refgene --to bed --input /path/to/input.refgene --output /path/to/output.bed
+
+
+## Convert a GTF file to a RefGene file, remove all transcript without proper start and stop codons
+atg --from gtf --to refgene --input /path/to/input.gtf --output /path/to/output.refgene --qc-check start --qc-check stop --reference /path/to/fasta.fa
 ```
 
 ### Supported `--output` formats

diff --git a/src/cli.rs b/src/cli.rs
@@ -0,0 +1,180 @@
+use atglib::qc::QcCheck;
+use atglib::qc::QcResult;
+use clap::{Parser, ValueEnum};
+
+/// Convert transcript data from and to different file formats
+///
+/// More detailed usage instructions on Github: <https://github.com/anergictcell/atg>
+#[derive(Parser, Debug)]
+#[command(author, version, about)]
+pub struct Args {
+    /// Format of input file
+    #[arg(short, long, value_name = "FORMAT")]
+    pub from: InputFormat,
+
+    /// Output format
+    #[arg(short, long, value_name = "FORMAT")]
+    pub to: OutputFormat,
+
+    /// Path to input file
+    #[arg(short, long, default_value = "/dev/stdin", value_name = "FILE")]
+    pub input: String,
+
+    /// Path to output file
+    #[arg(short, long, default_value = "/dev/stdout", value_name = "FILE")]
+    pub output: String,
+
+    /// The feature source to indicate in GTF files (optional with `--output gtf`)
+    #[arg(short, long, default_value = env!("CARGO_PKG_NAME"), value_name = "FILE")]
+    pub gtf_source: String,
+
+    /// Path to reference genome fasta file. (required with `--output [fasta | fasta-split | feature-sequence | qc]`)
+    ///
+    /// You can also specify an S3 Uri (s3://mybucket/myfile.fasta), but reading from S3 is currently quite slow
+    #[arg(short, long, value_name = "FASTA_FILE", required_if_eq_any([("to", "fasta"),("to", "fasta-split"),("to", "feature-sequence"),("to", "qc")]))]
+    pub reference: Option<String>,
+
+    /// Which part of the transcript to transcribe
+    ///
+    /// This option is only needed when generating fasta output.
+    #[arg(long, default_value = "cds")]
+    pub fasta_format: FastaFormat,
+
+    /// Sets the level of verbosity
+    #[arg(short, action = clap::ArgAction::Count)]
+    pub verbose: u8,
+
+    /// Specify which genetic code to use for translating the transcripts
+    ///
+    /// Genetic codes can be specified globally (e.g. `-c standard`)
+    ///
+    /// or chromosome specific (e..g `-c "chrM:vertebrate mitochondrial"`).
+    ///
+    /// Specify by name or amino acid lookup table (e.g. `FFLLSSSSYY**CC*....`)
+    ///
+    /// Defaults to the standard genetic code for all transcripts. Suggested use for vertebrates:
+    ///
+    /// `-c standard -c "chrM:vertebrate mitochondrial"`)
+    ///
+    /// (optional with `--output qc`)
+    #[arg(short = 'c', long, action = clap::ArgAction::Append, value_name = "GENETIC CODE")]
+    pub genetic_code: Vec<String>,
+
+    /// Remove all variants from the output that fail QC-checks
+    ///
+    /// You can specify one or multiple QC-checks. Only `NOK` results will be removed. `OK` and `NA` will remain.
+    #[arg(short = 'q', long = "qc-check", action = clap::ArgAction::Append, value_name = "QC CHECKS", requires = "reference")]
+    pub qc_check: Vec<QcFilter>,
+}
+
+#[derive(Clone, Debug, ValueEnum)]
+pub enum FastaFormat {
+    /// The full genomic sequence of the transcript, including introns. (similar to pre-processed mRNA)
+    Transcript,
+    /// All exons of the transcript. (similar to processed mRNA)
+    Exons,
+    /// The coding sequence of the transcript
+    Cds,
+}
+
+impl FastaFormat {
+    pub fn as_str(&self) -> &str {
+        match self {
+            FastaFormat::Transcript => "transcript",
+            FastaFormat::Exons => "exons",
+            FastaFormat::Cds => "cds",
+        }
+    }
+}
+
+#[derive(Clone, Debug, ValueEnum)]
+pub enum InputFormat {
+    /// GTF2.2 format
+    Gtf,
+    /// RefGene format (one transcript per line)
+    Refgene,
+    /// GenePredExt format (one transcript per line)
+    Genepredext,
+    /// ATG-specific binary format
+    Bin,
+}
+
+impl std::fmt::Display for InputFormat {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}
+
+#[derive(Clone, Debug, ValueEnum)]
+pub enum OutputFormat {
+    /// GTF2.2 format
+    Gtf,
+    /// RefGene format (one transcript per line)
+    Refgene,
+    /// GenePred format (one transcript per line)
+    Genepred,
+    /// GenePredExt format (one transcript per line)
+    Genepredext,
+    /// Bedfile (one transcript per line)
+    Bed,
+    /// Nucleotide sequence. There are multiple formatting options available, see --fasta-format
+    Fasta,
+    /// Like 'fasta', but every transcript is written to its own file. (--output must be the path to a folder)
+    FastaSplit,
+    /// Nucleotide sequence for every 'feature' (UTR, CDS or non-coding exons)
+    FeatureSequence,
+    /// Custom format, as needed for SpliceAI
+    Spliceai,
+    /// ATG-specific binary format
+    Bin,
+    /// Performs QC checks on all Transcripts
+    Qc,
+    /// No output
+    None,
+    /// This only makes sense for debugging purposes
+    Raw,
+}
+
+impl std::fmt::Display for OutputFormat {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}
+
+#[derive(Clone, Debug, Eq, PartialEq, ValueEnum)]
+pub enum QcFilter {
+    /// Transcript contains at least one exon
+    Exon,
+    /// The length of the CDS is divisible by 3
+    CdsLength,
+    /// The CDS starts with ATG
+    Start,
+    /// The CDS ends with a Stop codon
+    Stop,
+    /// TThe 5’UTR does not contain another start codon ATG
+    UpstreamStart,
+    /// The CDS does not contain another in-frame stop-codon
+    UpstreamStop,
+    /// The transcript is within the coordinates of the reference genome
+    Coordinates,
+}
+
+impl QcFilter {
+    pub fn remove(&self, qc: &QcCheck) -> bool {
+        match self {
+            QcFilter::Exon => qc.contains_exon() == QcResult::NOK,
+            QcFilter::CdsLength => qc.correct_cds_length() == QcResult::NOK,
+            QcFilter::Start => qc.correct_start_codon() == QcResult::NOK,
+            QcFilter::Stop => qc.correct_stop_codon() == QcResult::NOK,
+            QcFilter::UpstreamStart => qc.no_upstream_start_codon() == QcResult::NOK,
+            QcFilter::UpstreamStop => qc.no_upstream_stop_codon() == QcResult::NOK,
+            QcFilter::Coordinates => qc.correct_coordinates() == QcResult::NOK,
+        }
+    }
+}
+
+impl std::fmt::Display for QcFilter {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}