From 14bc060a3f71a92c3c95efccb3e8898dc5c599b6 Mon Sep 17 00:00:00 2001
From: Ryan Lim <rzlim08@gmail.com>
Date: Fri, 3 Jan 2025 09:40:38 -0800
Subject: [PATCH] fix pipeline visualization descriptions

---
 lib/idseq-dag/idseq_dag/steps/run_subsample.py |  2 +-
 workflows/short-read-mngs/host_filter.wdl      | 10 ++++------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/lib/idseq-dag/idseq_dag/steps/run_subsample.py b/lib/idseq-dag/idseq_dag/steps/run_subsample.py
index 17a0f3568..a172ba75e 100644
--- a/lib/idseq-dag/idseq_dag/steps/run_subsample.py
+++ b/lib/idseq-dag/idseq_dag/steps/run_subsample.py
@@ -14,7 +14,7 @@ class PipelineStepRunSubsample(PipelineCountingStep):
 
     For samples with a high fraction of non-host reads (ie stool samples), the .fasta outputs
     following bowtie alignment may contain large numbers of sequences.
-    GSNAP alignment to NT and NR databases is a resource-intensive step.
+    Alignment to NT and NR databases is a resource-intensive step.
     To reduce computational time, the reads are randomly sub-sampled to
     1 million total fragments (1 million single-end reads or 2 million paired-end reads).
     """
diff --git a/workflows/short-read-mngs/host_filter.wdl b/workflows/short-read-mngs/host_filter.wdl
index 38c30dada..4205eda38 100644
--- a/workflows/short-read-mngs/host_filter.wdl
+++ b/workflows/short-read-mngs/host_filter.wdl
@@ -315,7 +315,7 @@ task ercc_bowtie2_filter {
       `bowtie2 ~{bowtie2_options}` using a precomputed index, then uses
       [samtools](http://www.htslib.org/) to keep reads *not* mapping to the ercc genome.
 
-      Bowtie2 is run on the fastp-filtered FASTQ(s):
+      Bowtie2 is run on validated FASTQ files:
 
       ```
       ~{bowtie2_invocation}
@@ -393,7 +393,7 @@ task fastp_qc {
       5. Complexity filter ([custom feature](https://github.com/mlin/fastp/tree/mlin/sdust)
          using the [SDUST algorithm](https://pubmed.ncbi.nlm.nih.gov/16796549/))
 
-      fastp is run on the FASTQ file(s) from input validation:
+      fastp is run on ERCC-filtered FASTQ files:
       ```
       ~{fastp_invocation}
       ```
@@ -504,10 +504,8 @@ task kallisto {
       **kallisto RNA quantification**
 
       Quantifies host transcripts using [kallisto](https://pachterlab.github.io/kallisto/about).
-      The host transcript sequences are sourced from GENCODE, along with
-      [ERCC control sequences](https://www.nist.gov/programs-projects/external-rna-controls-consortium).
-      Not all CZ ID host species have transcripts indexed; for those without, kallisto is run using ERCC
-      sequences only.
+      The host transcript sequences are sourced from GENCODE.
+      Not all CZ ID host species have transcripts indexed, so host transcripts are not quantified for every host.
 
       kallisto is run on the fastp-filtered FASTQ(s):