Skip to content

Commit

Permalink
Merge pull request #25 from anergictcell/feature/qc-filter
Browse files Browse the repository at this point in the history
Add CLI option to filter transcripts by QC checks
Refactor clap to use Derive
  • Loading branch information
anergictcell authored Nov 5, 2022
2 parents 3b38891 + 3ad2e79 commit bdfcc9b
Show file tree
Hide file tree
Showing 5 changed files with 396 additions and 261 deletions.
6 changes: 6 additions & 0 deletions JUSTFILE
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,12 @@ test:
(diff <( cargo run -q -- -f gtf -i tests/data/example.gtf -r tests/data/hg19.fasta -t qc | tail -n +2 | cut -f3-9 | sort | uniq -c | sed "s/ //g") <(echo -ne "14OK\tOK\tOK\tOK\tNOK\tOK\tOK\n13OK\tOK\tOK\tOK\tOK\tOK\tOK\n") && \
echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m"

echo -ne "Test Filter"
(diff <( cargo run -q -- -f gtf -i tests/data/example.gtf -t refgene -r tests/data/hg19.fasta -q exon -q start -q stop -q exon -q cds-length -q upstream-start -q upstream-stop -q coordinates | wc -l | sed "s/ //g") <(echo -ne "13\n") && \
diff <( cargo run -q -- -f gtf -i tests/data/example.gtf -t refgene -r tests/data/hg19.fasta -q exon -q start -q stop -q exon -q cds-length -q upstream-stop -q coordinates | wc -l | sed "s/ //g") <(echo -ne "27\n") && \
diff <( cargo run -q -- -f gtf -i tests/data/example.gtf -t refgene -r tests/data/hg19.fasta -q upstream-stop -c "chrY:vertebrate mitochondrial" | wc -l | sed "s/ //g") <(echo -ne "26\n") && \
echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m"

fi

benchmark name:
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ Additional, optional arguments:
- `-g`, `--gtf-source`: Specify the source for GTF output files. Defaults to `atg`
- `-r`, `--reference`: Path of a reference genome fasta file. Required for fasta output
- `-c`, `--genetic-code`: Specify which genetic code to use for translating the transcripts. Genetic codes can be specified per chromosome by specifying the chromsome and the code, separated by `:` (e.g. `-c chrM:vertebrate mitochondrial`). They can also be specified for all chromsomes by omitting the chromosome (e.g. `-c vertebrate mitochondrial`). The argument can be specified multiple times (e.g: `-c "standard" -c "chrM:vertebrate mitochondrial" -c "chrAYN:alternative yeast nuclear"`). The code names are based on the `name` field from the [NCBI specs](https://www.ncbi.nlm.nih.gov/IEB/ToolBox/C_DOC/lxr/source/data/gc.prt) but all lowercase characters. Alternatively, you can also specify the amino acid lookup table directly: `-c "chrM:FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG"`. Defaults to `standard`.
- `-q`, `--qc-check`: Specify QC-checks for removing transcripts from the output

#### Examples:
```bash
Expand All @@ -78,6 +79,10 @@ atg --from refgene --to gtf --input /path/to/input.refgene --output /path/to/out

## Convert RefGene to bed
atg --from refgene --to bed --input /path/to/input.refgene --output /path/to/output.bed


## Convert a GTF file to a RefGene file, remove all transcript without proper start and stop codons
atg --from gtf --to refgene --input /path/to/input.gtf --output /path/to/output.refgene --qc-check start --qc-check stop --reference /path/to/fasta.fa
```

### Supported `--output` formats
Expand Down
180 changes: 180 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
use atglib::qc::QcCheck;
use atglib::qc::QcResult;
use clap::{Parser, ValueEnum};

/// Convert transcript data from and to different file formats
///
/// More detailed usage instructions on Github: <https://github.com/anergictcell/atg>
#[derive(Parser, Debug)]
#[command(author, version, about)]
pub struct Args {
/// Format of input file
#[arg(short, long, value_name = "FORMAT")]
pub from: InputFormat,

/// Output format
#[arg(short, long, value_name = "FORMAT")]
pub to: OutputFormat,

/// Path to input file
#[arg(short, long, default_value = "/dev/stdin", value_name = "FILE")]
pub input: String,

/// Path to output file
#[arg(short, long, default_value = "/dev/stdout", value_name = "FILE")]
pub output: String,

/// The feature source to indicate in GTF files (optional with `--output gtf`)
#[arg(short, long, default_value = env!("CARGO_PKG_NAME"), value_name = "FILE")]
pub gtf_source: String,

/// Path to reference genome fasta file. (required with `--output [fasta | fasta-split | feature-sequence | qc]`)
///
/// You can also specify an S3 Uri (s3://mybucket/myfile.fasta), but reading from S3 is currently quite slow
#[arg(short, long, value_name = "FASTA_FILE", required_if_eq_any([("to", "fasta"),("to", "fasta-split"),("to", "feature-sequence"),("to", "qc")]))]
pub reference: Option<String>,

/// Which part of the transcript to transcribe
///
/// This option is only needed when generating fasta output.
#[arg(long, default_value = "cds")]
pub fasta_format: FastaFormat,

/// Sets the level of verbosity
#[arg(short, action = clap::ArgAction::Count)]
pub verbose: u8,

/// Specify which genetic code to use for translating the transcripts
///
/// Genetic codes can be specified globally (e.g. `-c standard`)
///
/// or chromosome specific (e..g `-c "chrM:vertebrate mitochondrial"`).
///
/// Specify by name or amino acid lookup table (e.g. `FFLLSSSSYY**CC*....`)
///
/// Defaults to the standard genetic code for all transcripts. Suggested use for vertebrates:
///
/// `-c standard -c "chrM:vertebrate mitochondrial"`)
///
/// (optional with `--output qc`)
#[arg(short = 'c', long, action = clap::ArgAction::Append, value_name = "GENETIC CODE")]
pub genetic_code: Vec<String>,

/// Remove all variants from the output that fail QC-checks
///
/// You can specify one or multiple QC-checks. Only `NOK` results will be removed. `OK` and `NA` will remain.
#[arg(short = 'q', long = "qc-check", action = clap::ArgAction::Append, value_name = "QC CHECKS", requires = "reference")]
pub qc_check: Vec<QcFilter>,
}

#[derive(Clone, Debug, ValueEnum)]
pub enum FastaFormat {
/// The full genomic sequence of the transcript, including introns. (similar to pre-processed mRNA)
Transcript,
/// All exons of the transcript. (similar to processed mRNA)
Exons,
/// The coding sequence of the transcript
Cds,
}

impl FastaFormat {
pub fn as_str(&self) -> &str {
match self {
FastaFormat::Transcript => "transcript",
FastaFormat::Exons => "exons",
FastaFormat::Cds => "cds",
}
}
}

#[derive(Clone, Debug, ValueEnum)]
pub enum InputFormat {
/// GTF2.2 format
Gtf,
/// RefGene format (one transcript per line)
Refgene,
/// GenePredExt format (one transcript per line)
Genepredext,
/// ATG-specific binary format
Bin,
}

impl std::fmt::Display for InputFormat {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{:?}", self)
}
}

#[derive(Clone, Debug, ValueEnum)]
pub enum OutputFormat {
/// GTF2.2 format
Gtf,
/// RefGene format (one transcript per line)
Refgene,
/// GenePred format (one transcript per line)
Genepred,
/// GenePredExt format (one transcript per line)
Genepredext,
/// Bedfile (one transcript per line)
Bed,
/// Nucleotide sequence. There are multiple formatting options available, see --fasta-format
Fasta,
/// Like 'fasta', but every transcript is written to its own file. (--output must be the path to a folder)
FastaSplit,
/// Nucleotide sequence for every 'feature' (UTR, CDS or non-coding exons)
FeatureSequence,
/// Custom format, as needed for SpliceAI
Spliceai,
/// ATG-specific binary format
Bin,
/// Performs QC checks on all Transcripts
Qc,
/// No output
None,
/// This only makes sense for debugging purposes
Raw,
}

impl std::fmt::Display for OutputFormat {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{:?}", self)
}
}

#[derive(Clone, Debug, Eq, PartialEq, ValueEnum)]
pub enum QcFilter {
/// Transcript contains at least one exon
Exon,
/// The length of the CDS is divisible by 3
CdsLength,
/// The CDS starts with ATG
Start,
/// The CDS ends with a Stop codon
Stop,
/// TThe 5’UTR does not contain another start codon ATG
UpstreamStart,
/// The CDS does not contain another in-frame stop-codon
UpstreamStop,
/// The transcript is within the coordinates of the reference genome
Coordinates,
}

impl QcFilter {
pub fn remove(&self, qc: &QcCheck) -> bool {
match self {
QcFilter::Exon => qc.contains_exon() == QcResult::NOK,
QcFilter::CdsLength => qc.correct_cds_length() == QcResult::NOK,
QcFilter::Start => qc.correct_start_codon() == QcResult::NOK,
QcFilter::Stop => qc.correct_stop_codon() == QcResult::NOK,
QcFilter::UpstreamStart => qc.no_upstream_start_codon() == QcResult::NOK,
QcFilter::UpstreamStop => qc.no_upstream_stop_codon() == QcResult::NOK,
QcFilter::Coordinates => qc.correct_coordinates() == QcResult::NOK,
}
}
}

impl std::fmt::Display for QcFilter {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{:?}", self)
}
}
Loading

0 comments on commit bdfcc9b

Please sign in to comment.