Merge pull request #1 from iaradsouza1/feature-counts

feat: added featureCounts module from subread. Tested with the default test.config.
iaradsouza1 · Jun 8, 2023 · 9c77d75 · 9c77d75
2 parents f1c2e1a + f2bf007
commit 9c77d75
Show file tree

Hide file tree

Showing 9 changed files with 201 additions and 7 deletions.
diff --git a/bin/get_counts.r b/bin/get_counts.r
@@ -0,0 +1,68 @@
+#!/usr/bin/env Rscript
+
+# Merge featureCounts tables into one count table.
+#
+# Usage: get_counts.r <feature_counts_files>
+#
+# Each argument names one or more space-separated featureCounts output files.
+# Columns 1 ("Geneid") and 7 (the counts) of every file are kept and the
+# tables are inner-joined on "Geneid".
+#
+# Outputs, written to the working directory:
+#   count_table.txt - merged table with the column names as read from the files
+#   count_table.rds - matrix with gene ids as row names and shortened column
+#                     names
+
+library(dplyr)
+library(purrr)
+
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args) < 1) {
+    stop("Usage: get_counts.r <feature_counts_files>", call. = FALSE)
+}
+
+# Repeated spaces would otherwise yield empty "file names".
+files <- unlist(strsplit(args, split = " "))
+files <- files[nzchar(files)]
+if (length(files) < 1) {
+    stop("Usage: get_counts.r <feature_counts_files>", call. = FALSE)
+}
+
+# Read one featureCounts table and keep the gene id and count columns.
+read_counts <- function(path) {
+  # featureCounts tables are tab-delimited. Splitting on tabs only and
+  # disabling quote handling keeps identifiers that contain spaces or
+  # quote characters from shifting or merging fields.
+  counts <- read.table(path, header = TRUE, sep = "\t", quote = "")
+  if (ncol(counts) < 7) {
+    stop(
+      "Expected at least 7 columns in ", path, ", found ", ncol(counts),
+      call. = FALSE
+    )
+  }
+  select(counts, 1, 7)
+}
+
+count_table <- files %>%
+  map(read_counts) %>%
+  reduce(inner_join, by = "Geneid")
+
+write.table(count_table, file = "count_table.txt", row.names = FALSE, quote = FALSE)
+
+# Shorten every column name to the text before its first underscore. When that
+# would make two columns indistinguishable, keep the full names instead.
+short_names <- sub("_.*$", "", colnames(count_table))
+if (anyDuplicated(short_names) > 0) {
+  warning(
+    "Shortened column names are not unique; keeping full column names.",
+    call. = FALSE
+  )
+} else {
+  colnames(count_table) <- short_names
+}
+rownames(count_table) <- count_table$Geneid
+count_table$Geneid <- NULL
+
+count_table <- as.matrix(count_table)
+
+saveRDS(count_table, file = "count_table.rds")
diff --git a/conf/test.config b/conf/test.config
@@ -11,6 +11,7 @@
 */
 
 params {
+
     config_profile_name        = 'Test profile'
     config_profile_description = 'Minimal test dataset to check pipeline function'
 
@@ -20,7 +21,14 @@ params {
     max_time   = '6.h'
 
     // Input data
-    input = "https://raw.githubusercontent.com/iaradsouza1/test-dataset/main/samplesheet.csv"
+    // input = "https://raw.githubusercontent.com/iaradsouza1/test-dataset/main/samplesheet.csv"
+    input = "file:///home/iaradsouza/samplesheet.csv"
+
     fasta_filter = "https://github.com/iaradsouza1/test-dataset/raw/main/genome/sub_csabeus_26.fasta.gz"
-    fasta_align = "https://github.com/iaradsouza1/test-dataset/raw/main/genome/sub_tcruzi_1_4.fasta.gz"
+
+    // fasta_align = "https://github.com/iaradsouza1/test-dataset/raw/main/genome/sub_tcruzi_1_4.fasta.gz"
+    fasta_align = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genome.fasta"
+    gtf_align = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genes.gtf"
+    attribute = 'gene_id'
+
 }
diff --git a/modules.json b/modules.json
@@ -29,6 +29,11 @@
                         "branch": "master",
                         "git_sha": "f2d63bd5b68925f98f572eed70993d205cc694b7",
                         "installed_by": ["modules"]
+                    },
+                    "subread/featurecounts": {
+                        "branch": "master",
+                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "installed_by": ["modules"]
                     }
                 }
             }

diff --git a/modules/local/gather_counts.nf b/modules/local/gather_counts.nf
@@ -0,0 +1,32 @@
+// Merge the featureCounts tables passed in `feature_counts` into a single
+// count table by running bin/get_counts.r on all of them at once.
+process GATHER_COUNTS {
+    label "process_medium"
+
+    container "biocontainers/r-tidyverse:1.2.1"
+
+    input:
+    path feature_counts  // featureCounts output file(s) handed to get_counts.r
+
+    output:
+    path "count_table.txt", emit: txt
+    path "count_table.rds", emit: rds
+    // versions.yml is written by the script block below; declaring it lets
+    // the workflow collect it instead of leaving it in the work directory.
+    path "versions.yml"   , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script: // This script is bundled with the pipeline, in iaradsouza1/tab-projeto-final/bin
+    """
+    get_counts.r \\
+        $feature_counts
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        r-base: \$(echo \$(R --version 2>&1) | sed 's/^.*R version //; s/ .*\$//')
+    END_VERSIONS
+    """
+
+}
diff --git a/modules/nf-core/subread/featurecounts/main.nf b/modules/nf-core/subread/featurecounts/main.nf
diff --git a/modules/nf-core/subread/featurecounts/meta.yml b/modules/nf-core/subread/featurecounts/meta.yml
diff --git a/nextflow.config b/nextflow.config
@@ -13,8 +13,9 @@ params {
     // Input options
     input                      = null
     fasta_filter               = null
-    fasta_align               = null
-
+    fasta_align                = null
+    gtf_align                  = null
+    attribute                  = null
 
     // References
     // genome                     = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -57,6 +57,17 @@
                     "type": "string",
                     "description": "Fasta file from the target genome",
                     "format": "file-path"
+                },
+                "gtf_align": {
+                    "type": "string",
+                    "description": "GTF file for the target organism.",
+                    "format": "file-path"
+                },
+                "attribute": {
+                    "type": "string",
+                    "default": "gene_id",
+                    "description": "Attribute to count reads",
+                    "enum": ["gene_id", "gene", "transcript", "exon"]
                 }
             },
             "required": ["fasta_filter", "fasta_align"]

diff --git a/workflows/finalproject.nf b/workflows/finalproject.nf
@@ -9,14 +9,13 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params)
 // Validate input parameters
 WorkflowFinalproject.initialise(params, log)
 
-// TODO nf-core: Add all file path parameters for the pipeline to the list below
 // Check input path parameters to see if they exist
-def checkPathParamList = [ params.input, params.multiqc_config, params.fasta_filter, params.fasta_align ]
+def checkPathParamList = [ params.input, params.multiqc_config, params.fasta_filter, params.fasta_align, params.gtf_align ]
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
 // Check mandatory parameters
 if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
-
+if (params.attribute == null ) { exit 1, 'You must specify a feature type (e.g., gene_id or exon) to count reads.' }
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     CONFIG FILES
@@ -55,6 +54,8 @@ include { BOWTIE2_BUILD as BOWTIE2_BUILD_HOST } from '../modules/nf-core/bowtie2
 include { BOWTIE2_ALIGN as BOWTIE2_ALIGN_HOST } from '../modules/nf-core/bowtie2/align/main'
 include { BOWTIE2_BUILD as BOWTIE2_BUILD_ORG  } from '../modules/nf-core/bowtie2/build/main'
 include { BOWTIE2_ALIGN as BOWTIE2_ALIGN_ORG  } from '../modules/nf-core/bowtie2/align/main'
+include { SUBREAD_FEATURECOUNTS               } from '../modules/nf-core/subread/featurecounts/main'
+include { GATHER_COUNTS                       } from '../modules/local/gather_counts'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -116,10 +117,23 @@ workflow FINALPROJECT {
     )
     ch_versions = ch_versions.mix(BOWTIE2_ALIGN_ORG.out.versions)
 
+    //
+    // MODULE: Run featureCounts to obtain the table of gene counts
+    //
+    SUBREAD_FEATURECOUNTS (
+        BOWTIE2_ALIGN_ORG.out.aligned.map{ [ it[0], it[1], params.gtf_align ] }, params.attribute
+    )
+    ch_versions = ch_versions.mix(SUBREAD_FEATURECOUNTS.out.versions)
+
+    GATHER_COUNTS(
+        SUBREAD_FEATURECOUNTS.out.counts.collect{it[1]}
+    )
+
     // Dump software versions
     CUSTOM_DUMPSOFTWAREVERSIONS (
         ch_versions.unique().collectFile(name: 'collated_versions.yml')
     )
+
     //
     // MODULE: MultiQC
     //