From 054f9ffcb38f7c66df6130a56488c62eaf8cb588 Mon Sep 17 00:00:00 2001
From: Jonas Marcello <jonas.marcello@esbme.com>
Date: Sat, 5 Nov 2022 15:21:33 +0100
Subject: [PATCH 1/3] Add option to filter output by QC stats

---
 src/cli.rs            | 180 +++++++++++++++++++
 src/main.rs           | 391 ++++++++++++++----------------------------
 src/reader_wrapper.rs |  75 ++++++++
 3 files changed, 385 insertions(+), 261 deletions(-)
 create mode 100644 src/cli.rs
 create mode 100644 src/reader_wrapper.rs
diff --git a/src/cli.rs b/src/cli.rs
new file mode 100644
index 0000000..55cb22d
--- /dev/null
+++ b/src/cli.rs
@@ -0,0 +1,180 @@
+use atglib::qc::QcCheck;
+use atglib::qc::QcResult;
+use clap::{Parser, ValueEnum};
+
+/// Convert transcript data from and to different file formats
+///
+/// More detailed usage instructions on Github: <https://github.com/anergictcell/atg>
+#[derive(Parser, Debug)]
+#[command(author, version, about)]
+pub struct Args {
+    /// Format of input file
+    #[arg(short, long, value_name = "FORMAT")]
+    pub from: InputFormat,
+
+    /// Output format
+    #[arg(short, long, value_name = "FORMAT")]
+    pub to: OutputFormat,
+
+    /// Path to input file
+    #[arg(short, long, default_value = "/dev/stdin", value_name = "FILE")]
+    pub input: String,
+
+    /// Path to output file
+    #[arg(short, long, default_value = "/dev/stdout", value_name = "FILE")]
+    pub output: String,
+
+    /// The feature source to indicate in GTF files (optional with `--output gtf`)
+    #[arg(short, long, default_value = env!("CARGO_PKG_NAME"), value_name = "FILE")]
+    pub gtf_source: String,
+
+    /// Path to reference genome fasta file. (required with `--output [fasta | fasta-split | feature-sequence | qc]`)
+    ///
+    /// You can also specify an S3 Uri (s3://mybucket/myfile.fasta), but reading from S3 is currently quite slow
+    #[arg(short, long, value_name = "FASTA_FILE", required_if_eq_any([("to", "fasta"),("to", "fasta-split"),("to", "feature-sequence"),("to", "qc")]))]
+    pub reference: Option<String>,
+
+    /// Which part of the transcript to transcribe
+    ///
+    /// This option is only needed when generating fasta output.
+    #[arg(long, default_value = "cds")]
+    pub fasta_format: FastaFormat,
+
+    /// Sets the level of verbosity
+    #[arg(short, action = clap::ArgAction::Count)]
+    pub verbose: u8,
+
+    /// Specify which genetic code to use for translating the transcripts
+    ///
+    /// Genetic codes can be specified globally (e.g. `-c standard`)
+    ///
+    /// or chromosome specific (e..g `-c "chrM:vertebrate mitochondrial"`).
+    ///
+    /// Specify by name or amino acid lookup table (e.g. `FFLLSSSSYY**CC*....`)
+    ///
+    /// Defaults to the standard genetic code for all transcripts. Suggested use for vertebrates:
+    ///
+    /// `-c standard -c "chrM:vertebrate mitochondrial"`)
+    ///
+    /// (optional with `--output qc`)
+    #[arg(short = 'c', long, action = clap::ArgAction::Append, value_name = "GENETIC CODE")]
+    pub genetic_code: Vec<String>,
+
+    /// Remove all variants from the output that fail QC-checks
+    ///
+    /// You can specify one or multiple QC-checks. Only `NOK` results will be removed. `OK` and `NA` will remain.
+    #[arg(short = 'q', long = "qc-check", action = clap::ArgAction::Append, value_name = "QC CHECKS", requires = "reference")]
+    pub qc_check: Vec<QcFilter>,
+}
+
+#[derive(Clone, Debug, ValueEnum)]
+pub enum FastaFormat {
+    /// The full genomic sequence of the transcript, including introns. (similar to pre-processed mRNA)
+    Transcript,
+    /// All exons of the transcript. (similar to processed mRNA)
+    Exons,
+    /// The coding sequence of the transcript
+    Cds,
+}
+
+impl FastaFormat {
+    pub fn as_str(&self) -> &str {
+        match self {
+            FastaFormat::Transcript => "transcript",
+            FastaFormat::Exons => "exons",
+            FastaFormat::Cds => "cds",
+        }
+    }
+}
+
+#[derive(Clone, Debug, ValueEnum)]
+pub enum InputFormat {
+    /// GTF2.2 format
+    Gtf,
+    /// RefGene format (one transcript per line)
+    Refgene,
+    /// GenePredExt format (one transcript per line)
+    Genepredext,
+    /// ATG-specific binary format
+    Bin,
+}
+
+impl std::fmt::Display for InputFormat {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}
+
+#[derive(Clone, Debug, ValueEnum)]
+pub enum OutputFormat {
+    /// GTF2.2 format
+    Gtf,
+    /// RefGene format (one transcript per line)
+    Refgene,
+    /// GenePred format (one transcript per line)
+    Genepred,
+    /// GenePredExt format (one transcript per line)
+    Genepredext,
+    /// Bedfile (one transcript per line)
+    Bed,
+    /// Nucleotide sequence. There are multiple formatting options available, see --fasta-format
+    Fasta,
+    /// Like 'fasta', but every transcript is written to its own file. (--output must be the path to a folder)
+    FastaSplit,
+    /// Nucleotide sequence for every 'feature' (UTR, CDS or non-coding exons)
+    FeatureSequence,
+    /// Custom format, as needed for SpliceAI
+    Spliceai,
+    /// ATG-specific binary format
+    Bin,
+    /// Performs QC checks on all Transcripts
+    Qc,
+    /// No output
+    None,
+    /// This only makes sense for debugging purposes
+    Raw,
+}
+
+impl std::fmt::Display for OutputFormat {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}
+
+#[derive(Clone, Debug, Eq, PartialEq, ValueEnum)]
+pub enum QcFilter {
+    /// Transcript contains at least one exon
+    Exon,
+    /// The length of the CDS is divisible by 3
+    CdsLength,
+    /// The CDS starts with ATG
+    Start,
+    /// The CDS ends with a Stop codon
+    Stop,
+    /// TThe 5’UTR does not contain another start codon ATG
+    UpstreamStart,
+    /// The CDS does not contain another in-frame stop-codon
+    UpstreamStop,
+    /// The transcript is within the coordinates of the reference genome
+    Coordinates,
+}
+
+impl QcFilter {
+    pub fn remove(&self, qc: &QcCheck) -> bool {
+        match self {
+            QcFilter::Exon => qc.contains_exon() == QcResult::NOK,
+            QcFilter::CdsLength => qc.correct_cds_length() == QcResult::NOK,
+            QcFilter::Start => qc.correct_start_codon() == QcResult::NOK,
+            QcFilter::Stop => qc.correct_stop_codon() == QcResult::NOK,
+            QcFilter::UpstreamStart => qc.no_upstream_start_codon() == QcResult::NOK,
+            QcFilter::UpstreamStop => qc.no_upstream_stop_codon() == QcResult::NOK,
+            QcFilter::Coordinates => qc.correct_coordinates() == QcResult::NOK,
+        }
+    }
+}
+
+impl std::fmt::Display for QcFilter {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 4d749eb..d1642d5 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -4,9 +4,7 @@ use std::fs::File;
 use std::process;
 
 use bincode::{deserialize_from, serialize_into};
-use clap::{Parser, ValueEnum};
-
-use s3reader::{S3ObjectUri, S3Reader};
+use clap::Parser;
 
 use atglib::bed;
 use atglib::fasta;
@@ -16,224 +14,17 @@ use atglib::genepredext;
 use atglib::gtf;
 use atglib::models::{GeneticCode, TranscriptWrite, Transcripts};
 use atglib::qc;
+use atglib::qc::QcCheck;
 use atglib::read_transcripts;
 use atglib::refgene;
 use atglib::spliceai;
 use atglib::utils::errors::AtgError;
 
-/// Convert transcript data from and to different file formats
-///
-/// More detailed usage instructions on Github: <https://github.com/anergictcell/atg>
-#[derive(Parser, Debug)]
-#[command(author, version, about)]
-struct Args {
-    /// Format of input file
-    #[arg(short, long, value_name = "FORMAT")]
-    from: InputFormat,
-
-    /// Output format
-    #[arg(short, long, value_name = "FORMAT")]
-    to: OutputFormat,
-
-    /// Path to input file
-    #[arg(short, long, default_value = "/dev/stdin", value_name = "FILE")]
-    input: String,
-
-    /// Path to output file
-    #[arg(short, long, default_value = "/dev/stdout", value_name = "FILE")]
-    output: String,
-
-    /// The feature source to indicate in GTF files (optional with `--output gtf`)
-    #[arg(short, long, default_value = env!("CARGO_PKG_NAME"), value_name = "FILE")]
-    gtf_source: String,
-
-    /// Path to reference genome fasta file. (required with `--output [fasta | fasta-split | feature-sequence | qc]`)
-    ///
-    /// You can also specify an S3 Uri (s3://mybucket/myfile.fasta), but reading from S3 is currently quite slow
-    #[arg(short, long, value_name = "FASTA_FILE", required_if_eq_any([("to", "fasta"),("to", "fasta-split"),("to", "feature-sequence"),("to", "qc")]))]
-    reference: Option<String>,
-
-    /// Which part of the transcript to transcribe
-    ///
-    /// This option is only needed when generating fasta output.
-    #[arg(long, default_value = "cds")]
-    fasta_format: FastaFormat,
-
-    /// Sets the level of verbosity
-    #[arg(short, action = clap::ArgAction::Count)]
-    verbose: u8,
-
-    /// Specify which genetic code to use for translating the transcripts
-    ///
-    /// Genetic codes can be specified globally (e.g. `-c standard`)
-    ///
-    /// or chromosome specific (e..g `-c chrM:vertebrate_mitochondria`).
-    ///
-    /// Specify by name or amino acid lookup table (e.g. `FFLLSSSSYY**CC*....`)
-    ///
-    /// Defaults to the standard genetic code for all transcripts. Suggested use for vertebrates:
-    ///
-    /// `-c standard -c chrM:vertebrate_mitochondria`)
-    ///
-    /// (optional with `--output qc`)
-    #[arg(short = 'c', long, action = clap::ArgAction::Append, value_name = "GENETIC CODE")]
-    genetic_code: Vec<String>,
-}
-
-#[derive(Clone, Debug, ValueEnum)]
-enum FastaFormat {
-    /// The full genomic sequence of the transcript, including introns. (similar to pre-processed mRNA)
-    Transcript,
-    /// All exons of the transcript. (similar to processed mRNA)
-    Exons,
-    /// The coding sequence of the transcript
-    Cds,
-}
-
-impl FastaFormat {
-    fn as_str(&self) -> &str {
-        match self {
-            FastaFormat::Transcript => "transcript",
-            FastaFormat::Exons => "exons",
-            FastaFormat::Cds => "cds",
-        }
-    }
-}
-
-#[derive(Clone, Debug, ValueEnum)]
-enum InputFormat {
-    /// GTF2.2 format
-    Gtf,
-    /// RefGene format (one transcript per line)
-    Refgene,
-    /// GenePredExt format (one transcript per line)
-    Genepredext,
-    /// ATG-specific binary format
-    Bin,
-}
-
-impl std::fmt::Display for InputFormat {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        write!(f, "{:?}", self)
-    }
-}
-
-#[derive(Clone, Debug, ValueEnum)]
-enum OutputFormat {
-    /// GTF2.2 format
-    Gtf,
-    /// RefGene format (one transcript per line)
-    Refgene,
-    /// GenePred format (one transcript per line)
-    Genepred,
-    /// GenePredExt format (one transcript per line)
-    Genepredext,
-    /// Bedfile (one transcript per line)
-    Bed,
-    /// Nucleotide sequence. There are multiple formatting options available, see --fasta-format
-    Fasta,
-    /// Like 'fasta', but every transcript is written to its own file. (--output must be the path to a folder)
-    FastaSplit,
-    /// Nucleotide sequence for every 'feature' (UTR, CDS or non-coding exons)
-    FeatureSequence,
-    /// Custom format, as needed for SpliceAI
-    Spliceai,
-    /// ATG-specific binary format
-    Bin,
-    /// Performs QC checks on all Transcripts
-    Qc,
-    /// No output
-    None,
-    /// This only makes sense for debugging purposes
-    Raw,
-}
-
-impl std::fmt::Display for OutputFormat {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        write!(f, "{:?}", self)
-    }
-}
-
-
-// There will be only a single instance of this enum
-// so we can allow a large variant
-#[allow(clippy::large_enum_variant)]
-/// ReadSeekWrapper is an enum to allow dynamic assignment of either File or S3 Readers
-/// to be used in the Reader objects of Atglib.
-enum ReadSeekWrapper {
-    File(File, String),
-    S3(S3Reader, String),
-}
-
-impl ReadSeekWrapper {
-    pub fn from_filename(filename: &str) -> Self {
-        if filename.starts_with("s3://") {
-            let uri = S3ObjectUri::new(filename).unwrap();
-            let s3obj = S3Reader::open(uri).unwrap();
-            Self::S3(s3obj, filename.to_string())
-        } else {
-            Self::File(File::open(filename).unwrap(), filename.to_string())
-        }
-    }
-
-    pub fn from_cli_arg(filename: &Option<&str>) -> Result<ReadSeekWrapper, AtgError> {
-        if let Some(filename) = filename {
-            if filename.starts_with("s3://") {
-                let uri = S3ObjectUri::new(filename).unwrap();
-                let s3obj = S3Reader::open(uri).unwrap();
-                Ok(Self::S3(s3obj, filename.to_string()))
-            } else {
-                Ok(Self::File(
-                    File::open(filename).unwrap(),
-                    filename.to_string(),
-                ))
-            }
-        } else {
-            Err(AtgError::new("No file specified"))
-        }
-    }
-
-    pub fn filename(&self) -> &str {
-        match self {
-            ReadSeekWrapper::File(_, fname) => fname,
-            ReadSeekWrapper::S3(_, fname) => fname,
-        }
-    }
-}
-
-// forward all custom implementations straight to the actual reader
-impl std::io::Read for ReadSeekWrapper {
-    fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
-        match self {
-            ReadSeekWrapper::S3(r, _) => r.read(buf),
-            ReadSeekWrapper::File(r, _) => r.read(buf),
-        }
-    }
-
-    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<usize, std::io::Error> {
-        match self {
-            ReadSeekWrapper::S3(r, _) => r.read_to_end(buf),
-            ReadSeekWrapper::File(r, _) => r.read_to_end(buf),
-        }
-    }
-
-    fn read_to_string(&mut self, buf: &mut String) -> Result<usize, std::io::Error> {
-        match self {
-            ReadSeekWrapper::S3(r, _) => r.read_to_string(buf),
-            ReadSeekWrapper::File(r, _) => r.read_to_string(buf),
-        }
-    }
-}
-
-impl std::io::Seek for ReadSeekWrapper {
-    fn seek(&mut self, pos: std::io::SeekFrom) -> Result<u64, std::io::Error> {
-        match self {
-            ReadSeekWrapper::S3(r, _) => r.seek(pos),
-            ReadSeekWrapper::File(r, _) => r.seek(pos),
-        }
-    }
-}
+mod cli;
+use cli::{Args, InputFormat, OutputFormat};
 
+mod reader_wrapper;
+use reader_wrapper::ReadSeekWrapper;
 
 fn read_input_file(args: &Args) -> Result<Transcripts, AtgError> {
     let input_format = &args.from;
@@ -266,11 +57,10 @@ fn write_output(args: &Args, transcripts: Transcripts) -> Result<(), AtgError> {
 
     let fasta_format = &args.fasta_format;
     let fasta_reference = &args.reference;
+    let fastareader = get_fasta_reader(&fasta_reference.as_deref());
 
     debug!("Writing transcripts as {} to {}", output_format, output_fd);
 
-    let fastareader = get_fasta_reader(&fasta_reference.as_deref());
-
     match output_format {
         OutputFormat::Refgene => {
             let mut writer = refgene::Writer::from_file(output_fd)?;
@@ -295,9 +85,7 @@ fn write_output(args: &Args, transcripts: Transcripts) -> Result<(), AtgError> {
         }
         OutputFormat::Fasta => {
             let mut writer = fasta::Writer::from_file(output_fd)?;
-            writer.fasta_reader(
-                fastareader.ok_or_else(|| AtgError::new("unable to open fasta file"))?,
-            );
+            writer.fasta_reader(fastareader?);
             writer.fasta_format(fasta_format.as_str());
             writer.write_transcripts(&transcripts)?
         }
@@ -309,9 +97,7 @@ fn write_output(args: &Args, transcripts: Transcripts) -> Result<(), AtgError> {
                 ));
             }
             let mut writer = fasta::Writer::from_file("/dev/null")?;
-            writer.fasta_reader(
-                fastareader.ok_or_else(|| AtgError::new("unable to open fasta file"))?,
-            );
+            writer.fasta_reader(fastareader?);
             writer.fasta_format(fasta_format.as_str());
 
             for tx in transcripts {
@@ -322,9 +108,7 @@ fn write_output(args: &Args, transcripts: Transcripts) -> Result<(), AtgError> {
         }
         OutputFormat::FeatureSequence => {
             let mut writer = fasta::Writer::from_file(output_fd)?;
-            writer.fasta_reader(
-                fastareader.ok_or_else(|| AtgError::new("unable to open fasta file"))?,
-            );
+            writer.fasta_reader(fastareader?);
             for tx in transcripts {
                 writer.write_features(&tx)?
             }
@@ -335,11 +119,8 @@ fn write_output(args: &Args, transcripts: Transcripts) -> Result<(), AtgError> {
         }
         OutputFormat::Qc => {
             let mut writer = qc::Writer::from_file(output_fd)?;
-            let genetic_code_arg = &args.genetic_code;
-            add_genetic_code(genetic_code_arg, &mut writer)?;
-            writer.fasta_reader(
-                fastareader.ok_or_else(|| AtgError::new("unable to open fasta file"))?,
-            );
+            add_genetic_code(&args.genetic_code, &mut writer)?;
+            writer.fasta_reader(fastareader?);
             writer.write_header()?;
             writer.write_transcripts(&transcripts)?
         }
@@ -364,47 +145,123 @@ fn write_output(args: &Args, transcripts: Transcripts) -> Result<(), AtgError> {
     Ok(())
 }
 
-fn get_fasta_reader(filename: &Option<&str>) -> Option<FastaReader<ReadSeekWrapper>> {
+/// Helper function to get a FastaReader that can read both local files and S3 objects
+fn get_fasta_reader(filename: &Option<&str>) -> Result<FastaReader<ReadSeekWrapper>, AtgError> {
+    if filename.is_none() {
+        return Err(AtgError::new("no Fasta filename specified"));
+    }
     // Both fasta_reader and fai_reader are Result<ReadSeekWrapper> instances
-    // so they can be accessed via `?`
-    let fasta_reader = ReadSeekWrapper::from_cli_arg(filename);
-    let fai_reader = match &fasta_reader {
-        Ok(r) => Ok(ReadSeekWrapper::from_filename(&format!(
-            "{}.fai",
-            r.filename()
-        ))),
-        Err(err) => Err(AtgError::new(format!("invalid fasta reader: {}", err))),
-    };
+    let fasta_reader = ReadSeekWrapper::from_cli_arg(filename)?;
+    let fai_reader = ReadSeekWrapper::from_filename(&format!("{}.fai", fasta_reader.filename()))?;
 
-    if let (Ok(fasta_wrapper), Ok(fai_wrapper)) = (fasta_reader, fai_reader) {
-        Some(FastaReader::from_reader(fasta_wrapper, fai_wrapper).unwrap())
-    } else {
-        None
-    }
+    Ok(FastaReader::from_reader(fasta_reader, fai_reader)?)
 }
 
+/// Attaches the chromosome-specific and default genetic code to the QC-Writer
 fn add_genetic_code<W: std::io::Write, R: std::io::Read + std::io::Seek>(
     genetic_code_arg: &Vec<String>,
     writer: &mut qc::Writer<W, R>,
 ) -> Result<(), AtgError> {
-    for genetic_code_value in genetic_code_arg {
-        match genetic_code_value.split_once(':') {
-            // if the value contains a `:`, it is a key:value pair
-            // for chromosome:genetic_code.
-            Some((chrom, seq)) => {
-                let gen_code = GeneticCode::guess(seq)?;
-                debug!("Adding genetic code {} for {}", gen_code, chrom);
-                writer.add_genetic_code(chrom.to_string(), gen_code);
+    let codes = GeneticCodeSelecter::from_cli(genetic_code_arg)?;
+
+    debug!("Setting default genetic code to {}", codes.default);
+    writer.default_genetic_code(codes.default);
+
+    for (chrom, code) in codes.custom {
+        debug!("Adding genetic code {} for {}", &code, &chrom);
+        writer.add_genetic_code(chrom, code);
+    }
+    Ok(())
+}
+
+#[derive(Default)]
+/// Helper struct for parsing the genetic-code CLI arguments
+///
+/// The CLI argument can specify both one generic/default genetic code
+/// and several chromosomse-specific genetic codes
+struct GeneticCodeSelecter {
+    default: GeneticCode,
+    custom: Vec<(String, GeneticCode)>,
+}
+
+impl GeneticCodeSelecter {
+    fn from_cli(genetic_code_arg: &Vec<String>) -> Result<Self, AtgError> {
+        let mut code = GeneticCodeSelecter::default();
+        for genetic_code_value in genetic_code_arg {
+            match genetic_code_value.split_once(':') {
+                // if the value contains a `:`, it is a key:value pair
+                // for chromosome:genetic_code.
+                Some((chrom, seq)) => {
+                    let gen_code = GeneticCode::guess(seq)?;
+                    debug!("Specified custom genetic code {} for {}", gen_code, chrom);
+                    code.custom.push((chrom.to_string(), gen_code));
+                }
+                // Without `:` the genetic code is used as default
+                None => {
+                    let gen_code = GeneticCode::guess(genetic_code_value)?;
+                    debug!("Specified default genetic code {}", gen_code);
+                    code.default = gen_code;
+                }
+            }
+        }
+        Ok(code)
+    }
+}
+
+/// Returns a filtered `Transcript`s object based on CLI-provided filter criteria
+///
+/// If a transcript fails one of the QC checks, it is removed from the output
+///
+/// Some QC checks might need the fasta file. To keep the logic simple,
+/// the filter function will always run all QC checks (using `QcCheck`)
+/// and then filter based on only the requested criteria.
+/// This might not be the best performance approach, but other approaches
+/// would add a lot more logic complexity.
+/// The performance hit does not impact the most frequent use cases, where Fasta
+/// data is needed anyway
+fn filter_transcripts(transcripts: Transcripts, args: &Args) -> Result<Transcripts, AtgError> {
+    let len_start = transcripts.len();
+
+    let fasta_reference = &args.reference;
+    let mut fastareader = get_fasta_reader(&fasta_reference.as_deref())?;
+
+    // To collect all transcripts that pass the filter
+    let mut filtered_transcripts = Transcripts::new();
+
+    let codes = GeneticCodeSelecter::from_cli(&args.genetic_code)?;
+    let mut custom_code: Option<&GeneticCode>;
+
+    'tx_loop: for tx in transcripts.to_vec() {
+        let qc = match codes.custom.is_empty() {
+            true => QcCheck::new(&tx, &mut fastareader, &codes.default),
+            false => {
+                custom_code = None;
+                for cc in &codes.custom {
+                    if cc.0 == tx.chrom() {
+                        custom_code = Some(&cc.1);
+                        break;
+                    }
+                }
+                QcCheck::new(&tx, &mut fastareader, custom_code.unwrap_or(&codes.default))
             }
-            // Without `:` the genetic code is used as default
-            None => {
-                let gen_code = GeneticCode::guess(genetic_code_value)?;
-                debug!("Setting default genetic code to {}", gen_code);
-                writer.default_genetic_code(gen_code);
+        };
+
+        for check in &args.qc_check {
+            if check.remove(&qc) {
+                debug!("Removing {} for failing QC filter {}", tx.name(), check);
+                // Transcript fails the QC check, move on to the next transcript
+                continue 'tx_loop;
             }
         }
+
+        // only keep transcripts that did not fail any QC test
+        filtered_transcripts.push(tx)
     }
-    Ok(())
+    info!(
+        "Filtered out {} transcripts.",
+        len_start - filtered_transcripts.len()
+    );
+    Ok(filtered_transcripts)
 }
 
 fn main() {
@@ -412,7 +269,7 @@ fn main() {
 
     loggerv::init_with_verbosity(cli_commands.verbose.into()).unwrap();
 
-    let transcripts = match read_input_file(&cli_commands) {
+    let mut transcripts = match read_input_file(&cli_commands) {
         Ok(x) => x,
         Err(err) => {
             println!("\x1b[1;31mError:\x1b[0m {}", err);
@@ -421,6 +278,18 @@ fn main() {
         }
     };
 
+    if !cli_commands.qc_check.is_empty() {
+        debug!("Filtering transcripts");
+        transcripts = match filter_transcripts(transcripts, &cli_commands) {
+            Ok(t) => t,
+            Err(err) => {
+                println!("\x1b[1;31mError:\x1b[0m {}", err);
+                println!("\nPlease check `atg --help` for more options\n");
+                process::exit(1);
+            }
+        };
+    }
+
     match write_output(&cli_commands, transcripts) {
         Ok(_) => debug!("All done here."),
         Err(err) => {
diff --git a/src/reader_wrapper.rs b/src/reader_wrapper.rs
new file mode 100644
index 0000000..4353ba0
--- /dev/null
+++ b/src/reader_wrapper.rs
@@ -0,0 +1,75 @@
+use std::fs::File;
+
+use s3reader::{S3ObjectUri, S3Reader};
+
+use atglib::utils::errors::AtgError;
+
+// There will be only a single instance of this enum
+// so we can allow a large variant
+#[allow(clippy::large_enum_variant)]
+/// ReadSeekWrapper is an enum to allow dynamic assignment of either File or S3 Readers
+/// to be used in the Reader objects of Atglib.
+pub enum ReadSeekWrapper {
+    File(File, String),
+    S3(S3Reader, String),
+}
+
+impl ReadSeekWrapper {
+    pub fn from_filename(filename: &str) -> Result<Self, AtgError> {
+        if filename.starts_with("s3://") {
+            let uri = S3ObjectUri::new(filename).map_err(AtgError::new)?;
+            let s3obj = S3Reader::open(uri).map_err(AtgError::new)?;
+            Ok(Self::S3(s3obj, filename.to_string()))
+        } else {
+            Ok(Self::File(File::open(filename)?, filename.to_string()))
+        }
+    }
+
+    pub fn from_cli_arg(filename: &Option<&str>) -> Result<ReadSeekWrapper, AtgError> {
+        if let Some(filename) = filename {
+            Ok(ReadSeekWrapper::from_filename(filename)?)
+        } else {
+            Err(AtgError::new("No file specified"))
+        }
+    }
+
+    pub fn filename(&self) -> &str {
+        match self {
+            ReadSeekWrapper::File(_, fname) => fname,
+            ReadSeekWrapper::S3(_, fname) => fname,
+        }
+    }
+}
+
+// forward all custom implementations straight to the actual reader
+impl std::io::Read for ReadSeekWrapper {
+    fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
+        match self {
+            ReadSeekWrapper::S3(r, _) => r.read(buf),
+            ReadSeekWrapper::File(r, _) => r.read(buf),
+        }
+    }
+
+    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<usize, std::io::Error> {
+        match self {
+            ReadSeekWrapper::S3(r, _) => r.read_to_end(buf),
+            ReadSeekWrapper::File(r, _) => r.read_to_end(buf),
+        }
+    }
+
+    fn read_to_string(&mut self, buf: &mut String) -> Result<usize, std::io::Error> {
+        match self {
+            ReadSeekWrapper::S3(r, _) => r.read_to_string(buf),
+            ReadSeekWrapper::File(r, _) => r.read_to_string(buf),
+        }
+    }
+}
+
+impl std::io::Seek for ReadSeekWrapper {
+    fn seek(&mut self, pos: std::io::SeekFrom) -> Result<u64, std::io::Error> {
+        match self {
+            ReadSeekWrapper::S3(r, _) => r.seek(pos),
+            ReadSeekWrapper::File(r, _) => r.seek(pos),
+        }
+    }
+}

From 50a27c63a0e1353ed5f4e7e48f88591270965716 Mon Sep 17 00:00:00 2001
From: Jonas Marcello <jonas.marcello@esbme.com>
Date: Sat, 5 Nov 2022 19:49:25 +0100
Subject: [PATCH 2/3] Add tests for QC-filtering

---
 JUSTFILE | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/JUSTFILE b/JUSTFILE
index f4c72d3..72677df 100644
--- a/JUSTFILE
+++ b/JUSTFILE
@@ -108,6 +108,12 @@ test:
         (diff <( cargo run -q -- -f gtf -i tests/data/example.gtf -r tests/data/hg19.fasta -t qc | tail -n +2  | cut -f3-9 | sort |  uniq -c | sed "s/ //g") <(echo -ne "14OK\tOK\tOK\tOK\tNOK\tOK\tOK\n13OK\tOK\tOK\tOK\tOK\tOK\tOK\n") && \
         echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m"
 
+        echo -ne "Test Filter"
+        (diff <( cargo run -q -- -f gtf -i tests/data/example.gtf -t refgene  -r tests/data/hg19.fasta -q exon -q start -q stop -q exon -q cds-length -q upstream-start -q upstream-stop -q coordinates | wc -l | sed "s/ //g") <(echo -ne "13\n") && \
+        diff <( cargo run -q -- -f gtf -i tests/data/example.gtf -t refgene  -r tests/data/hg19.fasta -q exon -q start -q stop -q exon -q cds-length -q upstream-stop -q coordinates | wc -l | sed "s/ //g") <(echo -ne "27\n") && \
+        diff <( cargo run -q -- -f gtf -i tests/data/example.gtf -t refgene  -r tests/data/hg19.fasta -q upstream-stop -c "chrY:vertebrate mitochondrial" | wc -l | sed "s/ //g") <(echo -ne "26\n") && \
+        echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m"
+
     fi
 
 benchmark name:

From 3ad2e792bf9aff99e4d06323394e9c742e793525 Mon Sep 17 00:00:00 2001
From: Jonas Marcello <jonas.marcello@esbme.com>
Date: Sat, 5 Nov 2022 19:49:40 +0100
Subject: [PATCH 3/3] Document Qc-filtering in README

---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index 8618eed..6282b66 100644
--- a/README.md
+++ b/README.md
@@ -61,6 +61,7 @@ Additional, optional arguments:
 - `-g`, `--gtf-source`: Specify the source for GTF output files. Defaults to `atg`
 - `-r`, `--reference`: Path of a reference genome fasta file. Required for fasta output
 - `-c`, `--genetic-code`: Specify which genetic code to use for translating the transcripts. Genetic codes can be specified per chromosome by specifying the chromsome and the code, separated by `:` (e.g. `-c chrM:vertebrate mitochondrial`). They can also be specified for all chromsomes by omitting the chromosome (e.g. `-c vertebrate mitochondrial`). The argument can be specified multiple times (e.g: `-c "standard" -c "chrM:vertebrate mitochondrial" -c "chrAYN:alternative yeast nuclear"`). The code names are based on the `name` field from the [NCBI specs](https://www.ncbi.nlm.nih.gov/IEB/ToolBox/C_DOC/lxr/source/data/gc.prt) but all lowercase characters. Alternatively, you can also specify the amino acid lookup table directly: `-c "chrM:FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG"`. Defaults to `standard`.
+- `-q`, `--qc-check`: Specify QC-checks for removing transcripts from the output
 
 #### Examples:
 ```bash
@@ -78,6 +79,10 @@ atg --from refgene --to gtf --input /path/to/input.refgene --output /path/to/out
 
 ## Convert RefGene to bed
 atg --from refgene --to bed --input /path/to/input.refgene --output /path/to/output.bed
+
+
+## Convert a GTF file to a RefGene file, remove all transcript without proper start and stop codons
+atg --from gtf --to refgene --input /path/to/input.gtf --output /path/to/output.refgene --qc-check start --qc-check stop --reference /path/to/fasta.fa
 ```
 
 ### Supported `--output` formats