nf-core · olgabot · Jun 3, 2019 · Jun 3, 2019 · Jun 3, 2019 · Jun 3, 2019
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 ## Version 1.4dev
 
+#### Pipeline updates
+
+* Add Salmon index for transcriptome
+
 #### Dependency Updates
 * Added htseq=0.11.2
 

diff --git a/conf/base.config b/conf/base.config
@@ -34,6 +34,10 @@ process {
     memory = { check_max( 200.GB * task.attempt, 'memory' ) }
     time = { check_max( 5.h * task.attempt, 'time' ) }
   }
+  withName: salmon_index {
+    cpus = { check_max( 8, 'cpus' ) }
+    memory = { check_max( 16.GB * task.attempt, 'memory' ) }
+  }
   withLabel: low_memory {
     memory = { check_max( 16.GB * task.attempt, 'memory' ) }
   }

diff --git a/conf/test.config b/conf/test.config
@@ -26,4 +26,5 @@ params {
   // Genome references
   fasta = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genome.fa'
   gtf = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genes.gtf'
+  transcriptome = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/transcriptome.fasta'
 }
diff --git a/docs/output.md b/docs/output.md
@@ -23,6 +23,7 @@ and processes data using the following steps:
 * [dupRadar](#dupradar) - technical / biological read duplication
 * [Preseq](#preseq) - library complexity
 * [featureCounts](#featurecounts) - gene counts, biotype counts, rRNA estimation.
+* [Salmon](#salmon) - gene counts, biotype counts, rRNA estimation.
 * [StringTie](#stringtie) - FPKMs for genes and transcripts
 * [Sample_correlation](#Sample_correlation) - create MDS plot and sample pairwise distance heatmap / dendrogram
 * [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline
@@ -301,6 +302,32 @@ We also use featureCounts to count overlaps with different classes of features.
 
 **Output directory: `results/featureCounts`**
 
+* `Sample.bam_biotype_counts.txt`
+  * Read counts for the different gene biotypes that featureCounts distinguishes.
+* `Sample.featureCounts.txt`
+  * Read the counts for each gene provided in the reference `gtf` file
+* `Sample.featureCounts.txt.summary`
+  * Summary file, containing statistics about the counts
+
+## Salmon
+[Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) from [Ocean Genomics](https://oceangenomics.com/) quasi-maps reads to a transcriptome and counts gene expression per transcript.
+
+### Salmon Index
+
+**Output directory: `results/reference_transcriptome`**
+
+* `Sample.bam_biotype_counts.txt`
+  * Read counts for the different gene biotypes that featureCounts distinguishes.
+* `Sample.featureCounts.txt`
+  * Read the counts for each gene provided in the reference `gtf` file
+* `Sample.featureCounts.txt.summary`
+  * Summary file, containing statistics about the counts
+
+
+### Salmon quant
+
+**Output directory: `results/salmon`**
+
 * `Sample.bam_biotype_counts.txt`
   * Read counts for the different gene biotypes that featureCounts distinguishes.
 * `Sample.featureCounts.txt`

diff --git a/docs/usage.md b/docs/usage.md
@@ -17,6 +17,8 @@
 * [FeatureCounts Extra Gene Names](#featurecounts-extra-gene-names)
   * [Default Attribute Type](#default-attribute-type)
   * [Extra Gene Names](#extra-gene-names)
+* [Transcriptome mapping with Salmon](#transcriptome-mapping-with-salmon)
+  * [Indexing the transcriptome](#indexing-the-transcriptome)
 * [Alignment tool](#alignment-tool)
 * [Reference genomes](#reference-genomes)
   * [`--genome` (using iGenomes)](#--genome-using-igenomes)
@@ -196,6 +198,15 @@ Note that you can also specify more than one desired value, separated by a comma
 ``--fcExtraAttributes gene_id,...``
 
 
+## Transcriptome mapping with Salmon
+
+If the option `--transcriptome` is provided to a fasta file of cDNA sequences, the pipeline will also run transcriptome quantification using [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html).
+
+### Indexing the transcriptome
+
+The transcriptome is indexed using the default parameters of Salmon, using the default k-mer size of 31. As [discussed](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode), the a k-mer size off 31 works well with reads that are length 75bp or longer.
+
+
 ## Alignment tool
 By default, the pipeline uses [STAR](https://github.com/alexdobin/STAR) to align the raw FastQ reads to the reference genome. STAR is fast and common, but requires a lot of memory to run, typically around 38GB for the Human GRCh37 reference genome.
 

diff --git a/main.nf b/main.nf
@@ -36,6 +36,7 @@ def helpMessage() {
       --star_index                  Path to STAR index
       --hisat2_index                Path to HiSAT2 index
       --fasta                       Path to Fasta reference
+      --transcriptome               Path to Fasta transcriptome
       --gtf                         Path to GTF file
       --gff                         Path to GFF3 file
       --bed12                       Path to bed12 file
@@ -168,6 +169,15 @@ if( params.gtf ){
     exit 1, "No GTF or GFF3 annotation specified!"
 }
 
+if (params.transcriptome){
+  Channel
+      .fromPath(params.transcriptome)
+      .ifEmpty { exit 1, "Transcript fasta file is unreachable: ${params.transcriptome}"  }
+      .set { tx_fasta_ch  }
+}
+
+
+
 if( params.bed12 ){
     bed12 = Channel
         .fromPath(params.bed12)
@@ -251,6 +261,7 @@ if(params.aligner == 'star'){
     else if(params.fasta)          summary['Fasta Ref']    = params.fasta
     if(params.splicesites)         summary['Splice Sites'] = params.splicesites
 }
+if(params.transcriptome)       summary['Transcriptome']  = params.transcriptome
 if(params.gtf)                 summary['GTF Annotation']  = params.gtf
 if(params.gff)                 summary['GFF3 Annotation']  = params.gff
 if(params.bed12)               summary['BED Annotation']  = params.bed12
@@ -431,6 +442,31 @@ if(params.aligner == 'hisat2' && !params.hisat2_index && params.fasta){
         """
     }
 }
+
+
+/*
+ * PREPROCESSING - Create Salmon transcriptome index
+ */
+if(params.transcriptome){
+  process salmon_index {
+      tag "$transcriptome.simpleName"
+      publishDir path: { "${params.outdir}/reference_transcriptome" },
+                 mode: 'copy'
+
+      input:
+      file transcriptome from tx_fasta_ch
+
+      output:
+      file 'salmon_index' into index_ch
+
+      script:
+      """
+      salmon index --threads $task.cpus -t $transcriptome -i salmon_index
+      """
+  }
+}
+
+
 /*
  * PREPROCESSING - Convert GFF3 to GTF
  */
@@ -968,6 +1004,31 @@ process dupradar {
     """
 }
 
+if (params.transcriptome){
+    process salmon_quant {
+        tag "$sample"
+        publishDir "${params.outdir}/Salmon", mode: 'copy'
+
+        input:
+        file index from index_ch.collect()
+        set sample, file(reads) from raw_salmon
+
+        output:
+        file(sample) into quant_ch
+
+        script:
+        if (params.singleEnd){
+            """
+            salmon quant --validateMappings --threads $task.cpus --libType=A -i $index -r ${reads[0]} -o $sample
+            """
+        }else{
+            """
+            salmon quant --validateMappings --threads $task.cpus --libType=A -i $index -1 ${reads[0]} -2 ${reads[1]} -o $sample
+            """
+        }
+    }
+}
+
 
 /*
  * STEP 9 - Feature counts

diff --git a/nextflow.config b/nextflow.config
@@ -18,14 +18,15 @@ params {
   fcExtraAttributes = 'gene_name'
   fcGroupFeatures = 'gene_id'
   fcGroupFeaturesType = 'gene_biotype'
-  markdup_java_options = '"-Xms4000m -Xmx7g"' //Established values for markDuplicate memory consumption, see issue PR #689 (in Sarek) for details  
+  markdup_java_options = '"-Xms4000m -Xmx7g"' //Established values for markDuplicate memory consumption, see issue PR #689 (in Sarek) for details
   splicesites = false
   saveReference = false
   saveTrimmed = false
   saveAlignedIntermediates = false
   singleEnd = false
   reads = "data/*{1,2}.fastq.gz"
   outdir = './results'
+  transcriptome = false
   seqCenter = false
   skip_qc = false
   skip_fastqc = false