Update modules required for rnaseq pipeline (#449)

* Update HISAT2 build module
* Bump preseq version
* Fix tests
* Add meta.yml for preseq to fix linting
* Auto-detect --genomeSAindexNbases for smaller genomes
* Add placeholder to use human data for the tests
* Add CSI output option to samtools/index
* Fix samtools/index tests
nf-core · Apr 16, 2021 · d1c6082 · d1c6082
1 parent defaca4
commit d1c6082
Show file tree

Hide file tree

Showing 10 changed files with 132 additions and 39 deletions.
diff --git a/software/hisat2/build/main.nf b/software/hisat2/build/main.nf
@@ -9,6 +9,7 @@ def VERSION = '2.2.0'
 process HISAT2_BUILD {
     tag "$fasta"
     label 'process_high'
+    label 'process_high_memory'
     publishDir "${params.outdir}",
         mode: params.publish_dir_mode,
         saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'index', meta:[:], publish_by_meta:[]) }
@@ -26,7 +27,7 @@ process HISAT2_BUILD {
     path splicesites
 
     output:
-    path "hisat2",        emit: index
+    path "hisat2"       , emit: index
     path "*.version.txt", emit: version
 
     script:
@@ -38,17 +39,18 @@ process HISAT2_BUILD {
         avail_mem = task.memory.toGiga()
     }
 
-    def extract_exons = ''
     def ss = ''
     def exon = ''
-    if (avail_mem > params.hisat_build_memory) {
-        log.info "[HISAT2 index build] Over ${params.hisat_build_memory} GB available, so using splice sites and exons in HISAT2 index"
+    def extract_exons = ''
+    def hisat2_build_memory = params.hisat2_build_memory ? (params.hisat2_build_memory as nextflow.util.MemoryUnit).toGiga() : 0
+    if (avail_mem >= hisat2_build_memory) {
+        log.info "[HISAT2 index build] At least ${hisat2_build_memory} GB available, so using splice sites and exons to build HISAT2 index"
         extract_exons = "hisat2_extract_exons.py $gtf > ${gtf.baseName}.exons.txt"
         ss = "--ss $splicesites"
         exon = "--exon ${gtf.baseName}.exons.txt"
     } else {
-        log.info "[HISAT2 index build] Less than ${params.hisat_build_memory} GB available, so NOT using splice sites and exons in HISAT2 index."
-        log.info "[HISAT2 index build] Use --hisat_build_memory [small number] to skip this check."
+        log.info "[HISAT2 index build] Less than ${hisat2_build_memory} GB available, so NOT using splice sites and exons to build HISAT2 index."
+        log.info "[HISAT2 index build] Use --hisat2_build_memory [small number] to skip this check."
     }
 
     def software = getSoftwareName(task.process)

diff --git a/software/preseq/lcextrap/main.nf b/software/preseq/lcextrap/main.nf
@@ -12,11 +12,11 @@ process PRESEQ_LCEXTRAP {
         mode: params.publish_dir_mode,
         saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) }
 
-    conda (params.enable_conda ? "bioconda::preseq=2.0.3" : null)
+    conda (params.enable_conda ? "bioconda::preseq=3.1.2" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/preseq:2.0.3--hf53bd2b_3"
+        container "https://depot.galaxyproject.org/singularity/preseq:3.1.2--h06ef8b0_1"
     } else {
-        container "quay.io/biocontainers/preseq:2.0.3--hf53bd2b_3"
+        container "quay.io/biocontainers/preseq:3.1.2--h06ef8b0_1"
     }
 
     input:

diff --git a/software/preseq/lcextrap/meta.yml b/software/preseq/lcextrap/meta.yml
@@ -0,0 +1,47 @@
+name: preseq_lcextrap
+description: Software for predicting library complexity and genome coverage in high-throughput sequencing
+keywords:
+  - preseq
+  - library
+  - complexity
+tools:
+  - preseq:
+      description: Software for predicting library complexity and genome coverage in high-throughput sequencing
+      homepage: http://smithlabresearch.org/software/preseq/
+      documentation: None
+      tool_dev_url: None
+      doi: ""
+      licence: ['GPL']
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - version:
+      type: file
+      description: File containing software version
+      pattern: "*.{version.txt}"
+  - ccurve:
+      type: file
+      description: File containing output of Preseq lcextrap
+      pattern: "*.{ccurve.txt}"
+  - log:
+      type: file
+      description: Log file containing stderr produced by Preseq
+      pattern: "*.{log}"
+
+authors:
+  - "@drpatelh"
diff --git a/software/samtools/index/main.nf b/software/samtools/index/main.nf
@@ -22,13 +22,14 @@ process SAMTOOLS_INDEX {
     tuple val(meta), path(bam)
 
     output:
-    tuple val(meta), path("*.bai"), emit: bai
+    tuple val(meta), path("*.bai"), optional:true, emit: bai
+    tuple val(meta), path("*.csi"), optional:true, emit: csi
     path  "*.version.txt"         , emit: version
 
     script:
     def software = getSoftwareName(task.process)
     """
-    samtools index $bam
+    samtools index $options.args $bam
     echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' > ${software}.version.txt
     """
 }
diff --git a/software/samtools/index/meta.yml b/software/samtools/index/meta.yml
@@ -34,6 +34,10 @@ output:
         type: file
         description: BAM/CRAM/SAM index file
         pattern: "*.{bai,crai,sai}"
+    - csi:
+        type: file
+        description: CSI index file
+        pattern: "*.{csi}"
     - version:
         type: file
         description: File containing software version

diff --git a/software/star/genomegenerate/main.nf b/software/star/genomegenerate/main.nf
@@ -12,11 +12,11 @@ process STAR_GENOMEGENERATE {
         saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'index', meta:[:], publish_by_meta:[]) }
 
     // Note: 2.7X indices incompatible with AWS iGenomes.
-    conda (params.enable_conda ? "bioconda::star=2.6.1d" : null)
+    conda (params.enable_conda ? "bioconda::star=2.6.1d bioconda::samtools=1.10 conda-forge::gawk=5.1.0" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/star:2.6.1d--0"
+        container "https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:59cdd445419f14abac76b31dd0d71217994cbcc9-0"
     } else {
-        container "quay.io/biocontainers/star:2.6.1d--0"
+        container "quay.io/biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:59cdd445419f14abac76b31dd0d71217994cbcc9-0"
     }
 
     input:
@@ -28,19 +28,40 @@ process STAR_GENOMEGENERATE {
     path "*.version.txt", emit: version
 
     script:
-    def software  = getSoftwareName(task.process)
-    def memory    = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : ''
-    """
-    mkdir star
-    STAR \\
-        --runMode genomeGenerate \\
-        --genomeDir star/ \\
-        --genomeFastaFiles $fasta \\
-        --sjdbGTFfile $gtf \\
-        --runThreadN $task.cpus \\
-        $memory \\
-        $options.args
-
-    STAR --version | sed -e "s/STAR_//g" > ${software}.version.txt
-    """
+    def software = getSoftwareName(task.process)
+    def memory   = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : ''
+    def args     = options.args.tokenize()
+    if (args.contains('--genomeSAindexNbases')) {
+        """
+        mkdir star
+        STAR \\
+            --runMode genomeGenerate \\
+            --genomeDir star/ \\
+            --genomeFastaFiles $fasta \\
+            --sjdbGTFfile $gtf \\
+            --runThreadN $task.cpus \\
+            $memory \\
+            $options.args
+
+        STAR --version | sed -e "s/STAR_//g" > ${software}.version.txt
+        """
+    } else {
+        """
+        samtools faidx $fasta
+        NUM_BASES=`gawk '{sum = sum + \$2}END{if ((log(sum)/log(2))/2 - 1 > 14) {printf "%.0f", 14} else {printf "%.0f", (log(sum)/log(2))/2 - 1}}' ${fasta}.fai`
+
+        mkdir star
+        STAR \\
+            --runMode genomeGenerate \\
+            --genomeDir star/ \\
+            --genomeFastaFiles $fasta \\
+            --sjdbGTFfile $gtf \\
+            --runThreadN $task.cpus \\
+            --genomeSAindexNbases \$NUM_BASES \\
+            $memory \\
+            $options.args
+
+        STAR --version | sed -e "s/STAR_//g" > ${software}.version.txt
+        """
+    }
 }
diff --git a/tests/software/preseq/lcextrap/test.yml b/tests/software/preseq/lcextrap/test.yml
@@ -5,7 +5,7 @@
     - preseq/lcextrap
   files:
     - path: output/preseq/test.ccurve.txt
-      md5sum: 76ae04c8eaf19c94e3210bb69da38498
+      md5sum: 1fa5cdd601079329618f61660bee00de
     - path: output/preseq/test.command.log
 
 - name: preseq lcextrap paired-end
@@ -15,5 +15,5 @@
     - preseq/lcextrap
   files:
     - path: output/preseq/test.ccurve.txt
-      md5sum: 2836d2fabd2213f097fd7063db550276
+      md5sum: 10e5ea860e87fb6f5dc10f4f20c62040
     - path: output/preseq/test.command.log
diff --git a/tests/software/samtools/index/main.nf b/tests/software/samtools/index/main.nf
@@ -2,12 +2,21 @@
 
 nextflow.enable.dsl = 2
 
-include { SAMTOOLS_INDEX } from '../../../../software/samtools/index/main.nf' addParams( options: [:] )
+include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_BAI } from '../../../../software/samtools/index/main.nf' addParams( options: [:] )
+include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_CSI } from '../../../../software/samtools/index/main.nf' addParams( options: [args:'-c'] )
 
-workflow test_samtools_index {
+workflow test_samtools_index_bai {
     input = [ [ id:'test', single_end:false ], // meta map
                 file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true)
             ]
 
-    SAMTOOLS_INDEX ( input )
+    SAMTOOLS_INDEX_BAI ( input )
+}
+
+workflow test_samtools_index_csi {
+    input = [ [ id:'test', single_end:false ], // meta map
+                file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true)
+            ]
+
+    SAMTOOLS_INDEX_CSI ( input )
 }
diff --git a/tests/software/samtools/index/test.yml b/tests/software/samtools/index/test.yml
@@ -1,8 +1,17 @@
-- name: samtools index
-  command: nextflow run tests/software/samtools/index -entry test_samtools_index -c tests/config/nextflow.config
+- name: samtools index bai
+  command: nextflow run tests/software/samtools/index -entry test_samtools_index_bai -c tests/config/nextflow.config
   tags:
     - samtools
     - samtools/index
   files:
     - path: output/samtools/test_paired_end.sorted.bam.bai
       md5sum: 704c10dd1326482448ca3073fdebc2f4
+
+- name: samtools index csi
+  command: nextflow run tests/software/samtools/index -entry test_samtools_index_csi -c tests/config/nextflow.config
+  tags:
+    - samtools
+    - samtools/index
+  files:
+    - path: output/samtools/test_paired_end.sorted.bam.csi
+      md5sum: 3dd9e3ed959fca075b88bb8dc3cf7dbd
diff --git a/tests/software/star/genomegenerate/main.nf b/tests/software/star/genomegenerate/main.nf
@@ -2,11 +2,11 @@
 
 nextflow.enable.dsl = 2
 
-include { STAR_GENOMEGENERATE } from '../../../../software/star/genomegenerate/main.nf' addParams( options: [args: '--genomeSAindexNbases 9'] )
+include { STAR_GENOMEGENERATE } from '../../../../software/star/genomegenerate/main.nf' addParams( options: [:] )
 
 workflow test_star_genomegenerate {
-    fasta = file("${launchDir}/tests/data/generic/fasta/GCF_000019425.1_ASM1942v1_genomic.fna", checkIfExists: true)
-    gtf   = file("${launchDir}/tests/data/generic/gtf/GCF_000019425.1_ASM1942v1_genomic.gtf", checkIfExists: true)
+    fasta = file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)
+    gtf   = file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)
 
     STAR_GENOMEGENERATE ( fasta, gtf )
 }