From 14bc060a3f71a92c3c95efccb3e8898dc5c599b6 Mon Sep 17 00:00:00 2001 From: Ryan Lim Date: Fri, 3 Jan 2025 09:40:38 -0800 Subject: [PATCH] fix pipeline visualization descriptions --- lib/idseq-dag/idseq_dag/steps/run_subsample.py | 2 +- workflows/short-read-mngs/host_filter.wdl | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/lib/idseq-dag/idseq_dag/steps/run_subsample.py b/lib/idseq-dag/idseq_dag/steps/run_subsample.py index 17a0f3568..a172ba75e 100644 --- a/lib/idseq-dag/idseq_dag/steps/run_subsample.py +++ b/lib/idseq-dag/idseq_dag/steps/run_subsample.py @@ -14,7 +14,7 @@ class PipelineStepRunSubsample(PipelineCountingStep): For samples with a high fraction of non-host reads (ie stool samples), the .fasta outputs following bowtie alignment may contain large numbers of sequences. - GSNAP alignment to NT and NR databases is a resource-intensive step. + Alignment to NT and NR databases is a resource-intensive step. To reduce computational time, the reads are randomly sub-sampled to 1 million total fragments (1 million single-end reads or 2 million paired-end reads). """ diff --git a/workflows/short-read-mngs/host_filter.wdl b/workflows/short-read-mngs/host_filter.wdl index 38c30dada..4205eda38 100644 --- a/workflows/short-read-mngs/host_filter.wdl +++ b/workflows/short-read-mngs/host_filter.wdl @@ -315,7 +315,7 @@ task ercc_bowtie2_filter { `bowtie2 ~{bowtie2_options}` using a precomputed index, then uses [samtools](http://www.htslib.org/) to keep reads *not* mapping to the ercc genome. - Bowtie2 is run on the fastp-filtered FASTQ(s): + Bowtie2 is run on validated FASTQ files: ``` ~{bowtie2_invocation} @@ -393,7 +393,7 @@ task fastp_qc { 5. 
Complexity filter ([custom feature](https://github.com/mlin/fastp/tree/mlin/sdust) using the [SDUST algorithm](https://pubmed.ncbi.nlm.nih.gov/16796549/)) - fastp is run on the FASTQ file(s) from input validation: + fastp is run on ERCC-filtered FASTQ files: ``` ~{fastp_invocation} ``` @@ -504,10 +504,8 @@ task kallisto { **kallisto RNA quantification** Quantifies host transcripts using [kallisto](https://pachterlab.github.io/kallisto/about). - The host transcript sequences are sourced from GENCODE, along with - [ERCC control sequences](https://www.nist.gov/programs-projects/external-rna-controls-consortium). - Not all CZ ID host species have transcripts indexed; for those without, kallisto is run using ERCC - sequences only. + The host transcript sequences are sourced from GENCODE. + Not all CZ ID host species have transcripts indexed, so transcript abundances are quantified only for hosts that have an index. kallisto is run on the fastp-filtered FASTQ(s):