From 054f9ffcb38f7c66df6130a56488c62eaf8cb588 Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Sat, 5 Nov 2022 15:21:33 +0100 Subject: [PATCH 1/3] Add option to filter output by QC stats --- src/cli.rs | 180 +++++++++++++++++++ src/main.rs | 391 ++++++++++++++---------------------------- src/reader_wrapper.rs | 75 ++++++++ 3 files changed, 385 insertions(+), 261 deletions(-) create mode 100644 src/cli.rs create mode 100644 src/reader_wrapper.rs diff --git a/src/cli.rs b/src/cli.rs new file mode 100644 index 0000000..55cb22d --- /dev/null +++ b/src/cli.rs @@ -0,0 +1,180 @@ +use atglib::qc::QcCheck; +use atglib::qc::QcResult; +use clap::{Parser, ValueEnum}; + +/// Convert transcript data from and to different file formats +/// +/// More detailed usage instructions on Github: +#[derive(Parser, Debug)] +#[command(author, version, about)] +pub struct Args { + /// Format of input file + #[arg(short, long, value_name = "FORMAT")] + pub from: InputFormat, + + /// Output format + #[arg(short, long, value_name = "FORMAT")] + pub to: OutputFormat, + + /// Path to input file + #[arg(short, long, default_value = "/dev/stdin", value_name = "FILE")] + pub input: String, + + /// Path to output file + #[arg(short, long, default_value = "/dev/stdout", value_name = "FILE")] + pub output: String, + + /// The feature source to indicate in GTF files (optional with `--output gtf`) + #[arg(short, long, default_value = env!("CARGO_PKG_NAME"), value_name = "FILE")] + pub gtf_source: String, + + /// Path to reference genome fasta file. (required with `--output [fasta | fasta-split | feature-sequence | qc]`) + /// + /// You can also specify an S3 Uri (s3://mybucket/myfile.fasta), but reading from S3 is currently quite slow + #[arg(short, long, value_name = "FASTA_FILE", required_if_eq_any([("to", "fasta"),("to", "fasta-split"),("to", "feature-sequence"),("to", "qc")]))] + pub reference: Option, + + /// Which part of the transcript to transcribe + /// + /// This option is only needed when generating fasta output. + #[arg(long, default_value = "cds")] + pub fasta_format: FastaFormat, + + /// Sets the level of verbosity + #[arg(short, action = clap::ArgAction::Count)] + pub verbose: u8, + + /// Specify which genetic code to use for translating the transcripts + /// + /// Genetic codes can be specified globally (e.g. `-c standard`) + /// + /// or chromosome specific (e..g `-c "chrM:vertebrate mitochondrial"`). + /// + /// Specify by name or amino acid lookup table (e.g. `FFLLSSSSYY**CC*....`) + /// + /// Defaults to the standard genetic code for all transcripts. Suggested use for vertebrates: + /// + /// `-c standard -c "chrM:vertebrate mitochondrial"`) + /// + /// (optional with `--output qc`) + #[arg(short = 'c', long, action = clap::ArgAction::Append, value_name = "GENETIC CODE")] + pub genetic_code: Vec, + + /// Remove all variants from the output that fail QC-checks + /// + /// You can specify one or multiple QC-checks. Only `NOK` results will be removed. `OK` and `NA` will remain. + #[arg(short = 'q', long = "qc-check", action = clap::ArgAction::Append, value_name = "QC CHECKS", requires = "reference")] + pub qc_check: Vec, +} + +#[derive(Clone, Debug, ValueEnum)] +pub enum FastaFormat { + /// The full genomic sequence of the transcript, including introns. (similar to pre-processed mRNA) + Transcript, + /// All exons of the transcript. (similar to processed mRNA) + Exons, + /// The coding sequence of the transcript + Cds, +} + +impl FastaFormat { + pub fn as_str(&self) -> &str { + match self { + FastaFormat::Transcript => "transcript", + FastaFormat::Exons => "exons", + FastaFormat::Cds => "cds", + } + } +} + +#[derive(Clone, Debug, ValueEnum)] +pub enum InputFormat { + /// GTF2.2 format + Gtf, + /// RefGene format (one transcript per line) + Refgene, + /// GenePredExt format (one transcript per line) + Genepredext, + /// ATG-specific binary format + Bin, +} + +impl std::fmt::Display for InputFormat { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{:?}", self) + } +} + +#[derive(Clone, Debug, ValueEnum)] +pub enum OutputFormat { + /// GTF2.2 format + Gtf, + /// RefGene format (one transcript per line) + Refgene, + /// GenePred format (one transcript per line) + Genepred, + /// GenePredExt format (one transcript per line) + Genepredext, + /// Bedfile (one transcript per line) + Bed, + /// Nucleotide sequence. There are multiple formatting options available, see --fasta-format + Fasta, + /// Like 'fasta', but every transcript is written to its own file. (--output must be the path to a folder) + FastaSplit, + /// Nucleotide sequence for every 'feature' (UTR, CDS or non-coding exons) + FeatureSequence, + /// Custom format, as needed for SpliceAI + Spliceai, + /// ATG-specific binary format + Bin, + /// Performs QC checks on all Transcripts + Qc, + /// No output + None, + /// This only makes sense for debugging purposes + Raw, +} + +impl std::fmt::Display for OutputFormat { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{:?}", self) + } +} + +#[derive(Clone, Debug, Eq, PartialEq, ValueEnum)] +pub enum QcFilter { + /// Transcript contains at least one exon + Exon, + /// The length of the CDS is divisible by 3 + CdsLength, + /// The CDS starts with ATG + Start, + /// The CDS ends with a Stop codon + Stop, + /// TThe 5’UTR does not contain another start codon ATG + UpstreamStart, + /// The CDS does not contain another in-frame stop-codon + UpstreamStop, + /// The transcript is within the coordinates of the reference genome + Coordinates, +} + +impl QcFilter { + pub fn remove(&self, qc: &QcCheck) -> bool { + match self { + QcFilter::Exon => qc.contains_exon() == QcResult::NOK, + QcFilter::CdsLength => qc.correct_cds_length() == QcResult::NOK, + QcFilter::Start => qc.correct_start_codon() == QcResult::NOK, + QcFilter::Stop => qc.correct_stop_codon() == QcResult::NOK, + QcFilter::UpstreamStart => qc.no_upstream_start_codon() == QcResult::NOK, + QcFilter::UpstreamStop => qc.no_upstream_stop_codon() == QcResult::NOK, + QcFilter::Coordinates => qc.correct_coordinates() == QcResult::NOK, + } + } +} + +impl std::fmt::Display for QcFilter { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{:?}", self) + } +} diff --git a/src/main.rs b/src/main.rs index 4d749eb..d1642d5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,9 +4,7 @@ use std::fs::File; use std::process; use bincode::{deserialize_from, serialize_into}; -use clap::{Parser, ValueEnum}; - -use s3reader::{S3ObjectUri, S3Reader}; +use clap::Parser; use atglib::bed; use atglib::fasta; @@ -16,224 +14,17 @@ use atglib::genepredext; use atglib::gtf; use atglib::models::{GeneticCode, TranscriptWrite, Transcripts}; use atglib::qc; +use atglib::qc::QcCheck; use atglib::read_transcripts; use atglib::refgene; use atglib::spliceai; use atglib::utils::errors::AtgError; -/// Convert transcript data from and to different file formats -/// -/// More detailed usage instructions on Github: -#[derive(Parser, Debug)] -#[command(author, version, about)] -struct Args { - /// Format of input file - #[arg(short, long, value_name = "FORMAT")] - from: InputFormat, - - /// Output format - #[arg(short, long, value_name = "FORMAT")] - to: OutputFormat, - - /// Path to input file - #[arg(short, long, default_value = "/dev/stdin", value_name = "FILE")] - input: String, - - /// Path to output file - #[arg(short, long, default_value = "/dev/stdout", value_name = "FILE")] - output: String, - - /// The feature source to indicate in GTF files (optional with `--output gtf`) - #[arg(short, long, default_value = env!("CARGO_PKG_NAME"), value_name = "FILE")] - gtf_source: String, - - /// Path to reference genome fasta file. (required with `--output [fasta | fasta-split | feature-sequence | qc]`) - /// - /// You can also specify an S3 Uri (s3://mybucket/myfile.fasta), but reading from S3 is currently quite slow - #[arg(short, long, value_name = "FASTA_FILE", required_if_eq_any([("to", "fasta"),("to", "fasta-split"),("to", "feature-sequence"),("to", "qc")]))] - reference: Option, - - /// Which part of the transcript to transcribe - /// - /// This option is only needed when generating fasta output. - #[arg(long, default_value = "cds")] - fasta_format: FastaFormat, - - /// Sets the level of verbosity - #[arg(short, action = clap::ArgAction::Count)] - verbose: u8, - - /// Specify which genetic code to use for translating the transcripts - /// - /// Genetic codes can be specified globally (e.g. `-c standard`) - /// - /// or chromosome specific (e..g `-c chrM:vertebrate_mitochondria`). - /// - /// Specify by name or amino acid lookup table (e.g. `FFLLSSSSYY**CC*....`) - /// - /// Defaults to the standard genetic code for all transcripts. Suggested use for vertebrates: - /// - /// `-c standard -c chrM:vertebrate_mitochondria`) - /// - /// (optional with `--output qc`) - #[arg(short = 'c', long, action = clap::ArgAction::Append, value_name = "GENETIC CODE")] - genetic_code: Vec, -} - -#[derive(Clone, Debug, ValueEnum)] -enum FastaFormat { - /// The full genomic sequence of the transcript, including introns. (similar to pre-processed mRNA) - Transcript, - /// All exons of the transcript. (similar to processed mRNA) - Exons, - /// The coding sequence of the transcript - Cds, -} - -impl FastaFormat { - fn as_str(&self) -> &str { - match self { - FastaFormat::Transcript => "transcript", - FastaFormat::Exons => "exons", - FastaFormat::Cds => "cds", - } - } -} - -#[derive(Clone, Debug, ValueEnum)] -enum InputFormat { - /// GTF2.2 format - Gtf, - /// RefGene format (one transcript per line) - Refgene, - /// GenePredExt format (one transcript per line) - Genepredext, - /// ATG-specific binary format - Bin, -} - -impl std::fmt::Display for InputFormat { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{:?}", self) - } -} - -#[derive(Clone, Debug, ValueEnum)] -enum OutputFormat { - /// GTF2.2 format - Gtf, - /// RefGene format (one transcript per line) - Refgene, - /// GenePred format (one transcript per line) - Genepred, - /// GenePredExt format (one transcript per line) - Genepredext, - /// Bedfile (one transcript per line) - Bed, - /// Nucleotide sequence. There are multiple formatting options available, see --fasta-format - Fasta, - /// Like 'fasta', but every transcript is written to its own file. (--output must be the path to a folder) - FastaSplit, - /// Nucleotide sequence for every 'feature' (UTR, CDS or non-coding exons) - FeatureSequence, - /// Custom format, as needed for SpliceAI - Spliceai, - /// ATG-specific binary format - Bin, - /// Performs QC checks on all Transcripts - Qc, - /// No output - None, - /// This only makes sense for debugging purposes - Raw, -} - -impl std::fmt::Display for OutputFormat { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{:?}", self) - } -} - - -// There will be only a single instance of this enum -// so we can allow a large variant -#[allow(clippy::large_enum_variant)] -/// ReadSeekWrapper is an enum to allow dynamic assignment of either File or S3 Readers -/// to be used in the Reader objects of Atglib. -enum ReadSeekWrapper { - File(File, String), - S3(S3Reader, String), -} - -impl ReadSeekWrapper { - pub fn from_filename(filename: &str) -> Self { - if filename.starts_with("s3://") { - let uri = S3ObjectUri::new(filename).unwrap(); - let s3obj = S3Reader::open(uri).unwrap(); - Self::S3(s3obj, filename.to_string()) - } else { - Self::File(File::open(filename).unwrap(), filename.to_string()) - } - } - - pub fn from_cli_arg(filename: &Option<&str>) -> Result { - if let Some(filename) = filename { - if filename.starts_with("s3://") { - let uri = S3ObjectUri::new(filename).unwrap(); - let s3obj = S3Reader::open(uri).unwrap(); - Ok(Self::S3(s3obj, filename.to_string())) - } else { - Ok(Self::File( - File::open(filename).unwrap(), - filename.to_string(), - )) - } - } else { - Err(AtgError::new("No file specified")) - } - } - - pub fn filename(&self) -> &str { - match self { - ReadSeekWrapper::File(_, fname) => fname, - ReadSeekWrapper::S3(_, fname) => fname, - } - } -} - -// forward all custom implementations straight to the actual reader -impl std::io::Read for ReadSeekWrapper { - fn read(&mut self, buf: &mut [u8]) -> Result { - match self { - ReadSeekWrapper::S3(r, _) => r.read(buf), - ReadSeekWrapper::File(r, _) => r.read(buf), - } - } - - fn read_to_end(&mut self, buf: &mut Vec) -> Result { - match self { - ReadSeekWrapper::S3(r, _) => r.read_to_end(buf), - ReadSeekWrapper::File(r, _) => r.read_to_end(buf), - } - } - - fn read_to_string(&mut self, buf: &mut String) -> Result { - match self { - ReadSeekWrapper::S3(r, _) => r.read_to_string(buf), - ReadSeekWrapper::File(r, _) => r.read_to_string(buf), - } - } -} - -impl std::io::Seek for ReadSeekWrapper { - fn seek(&mut self, pos: std::io::SeekFrom) -> Result { - match self { - ReadSeekWrapper::S3(r, _) => r.seek(pos), - ReadSeekWrapper::File(r, _) => r.seek(pos), - } - } -} +mod cli; +use cli::{Args, InputFormat, OutputFormat}; +mod reader_wrapper; +use reader_wrapper::ReadSeekWrapper; fn read_input_file(args: &Args) -> Result { let input_format = &args.from; @@ -266,11 +57,10 @@ fn write_output(args: &Args, transcripts: Transcripts) -> Result<(), AtgError> { let fasta_format = &args.fasta_format; let fasta_reference = &args.reference; + let fastareader = get_fasta_reader(&fasta_reference.as_deref()); debug!("Writing transcripts as {} to {}", output_format, output_fd); - let fastareader = get_fasta_reader(&fasta_reference.as_deref()); - match output_format { OutputFormat::Refgene => { let mut writer = refgene::Writer::from_file(output_fd)?; @@ -295,9 +85,7 @@ fn write_output(args: &Args, transcripts: Transcripts) -> Result<(), AtgError> { } OutputFormat::Fasta => { let mut writer = fasta::Writer::from_file(output_fd)?; - writer.fasta_reader( - fastareader.ok_or_else(|| AtgError::new("unable to open fasta file"))?, - ); + writer.fasta_reader(fastareader?); writer.fasta_format(fasta_format.as_str()); writer.write_transcripts(&transcripts)? } @@ -309,9 +97,7 @@ fn write_output(args: &Args, transcripts: Transcripts) -> Result<(), AtgError> { )); } let mut writer = fasta::Writer::from_file("/dev/null")?; - writer.fasta_reader( - fastareader.ok_or_else(|| AtgError::new("unable to open fasta file"))?, - ); + writer.fasta_reader(fastareader?); writer.fasta_format(fasta_format.as_str()); for tx in transcripts { @@ -322,9 +108,7 @@ fn write_output(args: &Args, transcripts: Transcripts) -> Result<(), AtgError> { } OutputFormat::FeatureSequence => { let mut writer = fasta::Writer::from_file(output_fd)?; - writer.fasta_reader( - fastareader.ok_or_else(|| AtgError::new("unable to open fasta file"))?, - ); + writer.fasta_reader(fastareader?); for tx in transcripts { writer.write_features(&tx)? } @@ -335,11 +119,8 @@ fn write_output(args: &Args, transcripts: Transcripts) -> Result<(), AtgError> { } OutputFormat::Qc => { let mut writer = qc::Writer::from_file(output_fd)?; - let genetic_code_arg = &args.genetic_code; - add_genetic_code(genetic_code_arg, &mut writer)?; - writer.fasta_reader( - fastareader.ok_or_else(|| AtgError::new("unable to open fasta file"))?, - ); + add_genetic_code(&args.genetic_code, &mut writer)?; + writer.fasta_reader(fastareader?); writer.write_header()?; writer.write_transcripts(&transcripts)? } @@ -364,47 +145,123 @@ fn write_output(args: &Args, transcripts: Transcripts) -> Result<(), AtgError> { Ok(()) } -fn get_fasta_reader(filename: &Option<&str>) -> Option> { +/// Helper function to get a FastaReader that can read both local files and S3 objects +fn get_fasta_reader(filename: &Option<&str>) -> Result, AtgError> { + if filename.is_none() { + return Err(AtgError::new("no Fasta filename specified")); + } // Both fasta_reader and fai_reader are Result instances - // so they can be accessed via `?` - let fasta_reader = ReadSeekWrapper::from_cli_arg(filename); - let fai_reader = match &fasta_reader { - Ok(r) => Ok(ReadSeekWrapper::from_filename(&format!( - "{}.fai", - r.filename() - ))), - Err(err) => Err(AtgError::new(format!("invalid fasta reader: {}", err))), - }; + let fasta_reader = ReadSeekWrapper::from_cli_arg(filename)?; + let fai_reader = ReadSeekWrapper::from_filename(&format!("{}.fai", fasta_reader.filename()))?; - if let (Ok(fasta_wrapper), Ok(fai_wrapper)) = (fasta_reader, fai_reader) { - Some(FastaReader::from_reader(fasta_wrapper, fai_wrapper).unwrap()) - } else { - None - } + Ok(FastaReader::from_reader(fasta_reader, fai_reader)?) } +/// Attaches the chromosome-specific and default genetic code to the QC-Writer fn add_genetic_code( genetic_code_arg: &Vec, writer: &mut qc::Writer, ) -> Result<(), AtgError> { - for genetic_code_value in genetic_code_arg { - match genetic_code_value.split_once(':') { - // if the value contains a `:`, it is a key:value pair - // for chromosome:genetic_code. - Some((chrom, seq)) => { - let gen_code = GeneticCode::guess(seq)?; - debug!("Adding genetic code {} for {}", gen_code, chrom); - writer.add_genetic_code(chrom.to_string(), gen_code); + let codes = GeneticCodeSelecter::from_cli(genetic_code_arg)?; + + debug!("Setting default genetic code to {}", codes.default); + writer.default_genetic_code(codes.default); + + for (chrom, code) in codes.custom { + debug!("Adding genetic code {} for {}", &code, &chrom); + writer.add_genetic_code(chrom, code); + } + Ok(()) +} + +#[derive(Default)] +/// Helper struct for parsing the genetic-code CLI arguments +/// +/// The CLI argument can specify both one generic/default genetic code +/// and several chromosomse-specific genetic codes +struct GeneticCodeSelecter { + default: GeneticCode, + custom: Vec<(String, GeneticCode)>, +} + +impl GeneticCodeSelecter { + fn from_cli(genetic_code_arg: &Vec) -> Result { + let mut code = GeneticCodeSelecter::default(); + for genetic_code_value in genetic_code_arg { + match genetic_code_value.split_once(':') { + // if the value contains a `:`, it is a key:value pair + // for chromosome:genetic_code. + Some((chrom, seq)) => { + let gen_code = GeneticCode::guess(seq)?; + debug!("Specified custom genetic code {} for {}", gen_code, chrom); + code.custom.push((chrom.to_string(), gen_code)); + } + // Without `:` the genetic code is used as default + None => { + let gen_code = GeneticCode::guess(genetic_code_value)?; + debug!("Specified default genetic code {}", gen_code); + code.default = gen_code; + } + } + } + Ok(code) + } +} + +/// Returns a filtered `Transcript`s object based on CLI-provided filter criteria +/// +/// If a transcript fails one of the QC checks, it is removed from the output +/// +/// Some QC checks might need the fasta file. To keep the logic simple, +/// the filter function will always run all QC checks (using `QcCheck`) +/// and then filter based on only the requested criteria. +/// This might not be the best performance approach, but other approaches +/// would add a lot more logic complexity. +/// The performance hit does not impact the most frequent use cases, where Fasta +/// data is needed anyway +fn filter_transcripts(transcripts: Transcripts, args: &Args) -> Result { + let len_start = transcripts.len(); + + let fasta_reference = &args.reference; + let mut fastareader = get_fasta_reader(&fasta_reference.as_deref())?; + + // To collect all transcripts that pass the filter + let mut filtered_transcripts = Transcripts::new(); + + let codes = GeneticCodeSelecter::from_cli(&args.genetic_code)?; + let mut custom_code: Option<&GeneticCode>; + + 'tx_loop: for tx in transcripts.to_vec() { + let qc = match codes.custom.is_empty() { + true => QcCheck::new(&tx, &mut fastareader, &codes.default), + false => { + custom_code = None; + for cc in &codes.custom { + if cc.0 == tx.chrom() { + custom_code = Some(&cc.1); + break; + } + } + QcCheck::new(&tx, &mut fastareader, custom_code.unwrap_or(&codes.default)) } - // Without `:` the genetic code is used as default - None => { - let gen_code = GeneticCode::guess(genetic_code_value)?; - debug!("Setting default genetic code to {}", gen_code); - writer.default_genetic_code(gen_code); + }; + + for check in &args.qc_check { + if check.remove(&qc) { + debug!("Removing {} for failing QC filter {}", tx.name(), check); + // Transcript fails the QC check, move on to the next transcript + continue 'tx_loop; } } + + // only keep transcripts that did not fail any QC test + filtered_transcripts.push(tx) } - Ok(()) + info!( + "Filtered out {} transcripts.", + len_start - filtered_transcripts.len() + ); + Ok(filtered_transcripts) } fn main() { @@ -412,7 +269,7 @@ fn main() { loggerv::init_with_verbosity(cli_commands.verbose.into()).unwrap(); - let transcripts = match read_input_file(&cli_commands) { + let mut transcripts = match read_input_file(&cli_commands) { Ok(x) => x, Err(err) => { println!("\x1b[1;31mError:\x1b[0m {}", err); @@ -421,6 +278,18 @@ fn main() { } }; + if !cli_commands.qc_check.is_empty() { + debug!("Filtering transcripts"); + transcripts = match filter_transcripts(transcripts, &cli_commands) { + Ok(t) => t, + Err(err) => { + println!("\x1b[1;31mError:\x1b[0m {}", err); + println!("\nPlease check `atg --help` for more options\n"); + process::exit(1); + } + }; + } + match write_output(&cli_commands, transcripts) { Ok(_) => debug!("All done here."), Err(err) => { diff --git a/src/reader_wrapper.rs b/src/reader_wrapper.rs new file mode 100644 index 0000000..4353ba0 --- /dev/null +++ b/src/reader_wrapper.rs @@ -0,0 +1,75 @@ +use std::fs::File; + +use s3reader::{S3ObjectUri, S3Reader}; + +use atglib::utils::errors::AtgError; + +// There will be only a single instance of this enum +// so we can allow a large variant +#[allow(clippy::large_enum_variant)] +/// ReadSeekWrapper is an enum to allow dynamic assignment of either File or S3 Readers +/// to be used in the Reader objects of Atglib. +pub enum ReadSeekWrapper { + File(File, String), + S3(S3Reader, String), +} + +impl ReadSeekWrapper { + pub fn from_filename(filename: &str) -> Result { + if filename.starts_with("s3://") { + let uri = S3ObjectUri::new(filename).map_err(AtgError::new)?; + let s3obj = S3Reader::open(uri).map_err(AtgError::new)?; + Ok(Self::S3(s3obj, filename.to_string())) + } else { + Ok(Self::File(File::open(filename)?, filename.to_string())) + } + } + + pub fn from_cli_arg(filename: &Option<&str>) -> Result { + if let Some(filename) = filename { + Ok(ReadSeekWrapper::from_filename(filename)?) + } else { + Err(AtgError::new("No file specified")) + } + } + + pub fn filename(&self) -> &str { + match self { + ReadSeekWrapper::File(_, fname) => fname, + ReadSeekWrapper::S3(_, fname) => fname, + } + } +} + +// forward all custom implementations straight to the actual reader +impl std::io::Read for ReadSeekWrapper { + fn read(&mut self, buf: &mut [u8]) -> Result { + match self { + ReadSeekWrapper::S3(r, _) => r.read(buf), + ReadSeekWrapper::File(r, _) => r.read(buf), + } + } + + fn read_to_end(&mut self, buf: &mut Vec) -> Result { + match self { + ReadSeekWrapper::S3(r, _) => r.read_to_end(buf), + ReadSeekWrapper::File(r, _) => r.read_to_end(buf), + } + } + + fn read_to_string(&mut self, buf: &mut String) -> Result { + match self { + ReadSeekWrapper::S3(r, _) => r.read_to_string(buf), + ReadSeekWrapper::File(r, _) => r.read_to_string(buf), + } + } +} + +impl std::io::Seek for ReadSeekWrapper { + fn seek(&mut self, pos: std::io::SeekFrom) -> Result { + match self { + ReadSeekWrapper::S3(r, _) => r.seek(pos), + ReadSeekWrapper::File(r, _) => r.seek(pos), + } + } +} From 50a27c63a0e1353ed5f4e7e48f88591270965716 Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Sat, 5 Nov 2022 19:49:25 +0100 Subject: [PATCH 2/3] Add tests for QC-filtering --- JUSTFILE | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/JUSTFILE b/JUSTFILE index f4c72d3..72677df 100644 --- a/JUSTFILE +++ b/JUSTFILE @@ -108,6 +108,12 @@ test: (diff <( cargo run -q -- -f gtf -i tests/data/example.gtf -r tests/data/hg19.fasta -t qc | tail -n +2 | cut -f3-9 | sort | uniq -c | sed "s/ //g") <(echo -ne "14OK\tOK\tOK\tOK\tNOK\tOK\tOK\n13OK\tOK\tOK\tOK\tOK\tOK\tOK\n") && \ echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m" + echo -ne "Test Filter" + (diff <( cargo run -q -- -f gtf -i tests/data/example.gtf -t refgene -r tests/data/hg19.fasta -q exon -q start -q stop -q exon -q cds-length -q upstream-start -q upstream-stop -q coordinates | wc -l | sed "s/ //g") <(echo -ne "13\n") && \ + diff <( cargo run -q -- -f gtf -i tests/data/example.gtf -t refgene -r tests/data/hg19.fasta -q exon -q start -q stop -q exon -q cds-length -q upstream-stop -q coordinates | wc -l | sed "s/ //g") <(echo -ne "27\n") && \ + diff <( cargo run -q -- -f gtf -i tests/data/example.gtf -t refgene -r tests/data/hg19.fasta -q upstream-stop -c "chrY:vertebrate mitochondrial" | wc -l | sed "s/ //g") <(echo -ne "26\n") && \ + echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m" + fi benchmark name: From 3ad2e792bf9aff99e4d06323394e9c742e793525 Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Sat, 5 Nov 2022 19:49:40 +0100 Subject: [PATCH 3/3] Document Qc-filtering in README --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 8618eed..6282b66 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,7 @@ Additional, optional arguments: - `-g`, `--gtf-source`: Specify the source for GTF output files. Defaults to `atg` - `-r`, `--reference`: Path of a reference genome fasta file. Required for fasta output - `-c`, `--genetic-code`: Specify which genetic code to use for translating the transcripts. Genetic codes can be specified per chromosome by specifying the chromsome and the code, separated by `:` (e.g. `-c chrM:vertebrate mitochondrial`). They can also be specified for all chromsomes by omitting the chromosome (e.g. `-c vertebrate mitochondrial`). The argument can be specified multiple times (e.g: `-c "standard" -c "chrM:vertebrate mitochondrial" -c "chrAYN:alternative yeast nuclear"`). The code names are based on the `name` field from the [NCBI specs](https://www.ncbi.nlm.nih.gov/IEB/ToolBox/C_DOC/lxr/source/data/gc.prt) but all lowercase characters. Alternatively, you can also specify the amino acid lookup table directly: `-c "chrM:FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG"`. Defaults to `standard`. +- `-q`, `--qc-check`: Specify QC-checks for removing transcripts from the output #### Examples: ```bash @@ -78,6 +79,10 @@ atg --from refgene --to gtf --input /path/to/input.refgene --output /path/to/out ## Convert RefGene to bed atg --from refgene --to bed --input /path/to/input.refgene --output /path/to/output.bed + + +## Convert a GTF file to a RefGene file, remove all transcript without proper start and stop codons +atg --from gtf --to refgene --input /path/to/input.gtf --output /path/to/output.refgene --qc-check start --qc-check stop --reference /path/to/fasta.fa ``` ### Supported `--output` formats