diff --git a/CHANGELOG.md b/CHANGELOG.md index dd3214094..ed7ed0dec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## Version 1.4dev +#### Pipeline updates + +* Add Salmon index for transcriptome + #### Dependency Updates * Added htseq=0.11.2 diff --git a/conf/base.config b/conf/base.config index 3b087a99d..3c3a9be2c 100644 --- a/conf/base.config +++ b/conf/base.config @@ -34,6 +34,10 @@ process { memory = { check_max( 200.GB * task.attempt, 'memory' ) } time = { check_max( 5.h * task.attempt, 'time' ) } } + withName: salmon_index { + cpus = { check_max( 8, 'cpus' ) } + memory = { check_max( 16.GB * task.attempt, 'memory' ) } + } withLabel: low_memory { memory = { check_max( 16.GB * task.attempt, 'memory' ) } } diff --git a/conf/test.config b/conf/test.config index 14d3aedb9..a3deac2f3 100644 --- a/conf/test.config +++ b/conf/test.config @@ -26,4 +26,5 @@ params { // Genome references fasta = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genome.fa' gtf = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genes.gtf' + transcriptome = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/transcriptome.fasta' } diff --git a/docs/output.md b/docs/output.md index 0f3a4c8ff..3bb9d2aea 100644 --- a/docs/output.md +++ b/docs/output.md @@ -23,6 +23,7 @@ and processes data using the following steps: * [dupRadar](#dupradar) - technical / biological read duplication * [Preseq](#preseq) - library complexity * [featureCounts](#featurecounts) - gene counts, biotype counts, rRNA estimation. +* [Salmon](#salmon) - gene counts, biotype counts, rRNA estimation. * [StringTie](#stringtie) - FPKMs for genes and transcripts * [Sample_correlation](#Sample_correlation) - create MDS plot and sample pairwise distance heatmap / dendrogram * [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline @@ -301,6 +302,32 @@ We also use featureCounts to count overlaps with different classes of features. **Output directory: `results/featureCounts`** +* `Sample.bam_biotype_counts.txt` + * Read counts for the different gene biotypes that featureCounts distinguishes. +* `Sample.featureCounts.txt` + * Read the counts for each gene provided in the reference `gtf` file +* `Sample.featureCounts.txt.summary` + * Summary file, containing statistics about the counts + +## Salmon +[Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) from [Ocean Genomics](https://oceangenomics.com/) quasi-maps reads to a transcriptome and counts gene expression per transcript. + +### Salmon Index + +**Output directory: `results/reference_transcriptome`** + +* `Sample.bam_biotype_counts.txt` + * Read counts for the different gene biotypes that featureCounts distinguishes. +* `Sample.featureCounts.txt` + * Read the counts for each gene provided in the reference `gtf` file +* `Sample.featureCounts.txt.summary` + * Summary file, containing statistics about the counts + + +### Salmon quant + +**Output directory: `results/salmon`** + * `Sample.bam_biotype_counts.txt` * Read counts for the different gene biotypes that featureCounts distinguishes. * `Sample.featureCounts.txt` diff --git a/docs/usage.md b/docs/usage.md index 517108025..664f61dad 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -17,6 +17,8 @@ * [FeatureCounts Extra Gene Names](#featurecounts-extra-gene-names) * [Default Attribute Type](#default-attribute-type) * [Extra Gene Names](#extra-gene-names) +* [Transcriptome mapping with Salmon](#transcriptome-mapping-with-salmon) + * [Indexing the transcriptome](#indexing-the-transcriptome) * [Alignment tool](#alignment-tool) * [Reference genomes](#reference-genomes) * [`--genome` (using iGenomes)](#--genome-using-igenomes) @@ -196,6 +198,15 @@ Note that you can also specify more than one desired value, separated by a comma ``--fcExtraAttributes gene_id,...`` +## Transcriptome mapping with Salmon + +If the option `--transcriptome` is provided to a fasta file of cDNA sequences, the pipeline will also run transcriptome quantification using [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html). + +### Indexing the transcriptome + +The transcriptome is indexed using the default parameters of Salmon, using the default k-mer size of 31. As [discussed](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode), the a k-mer size off 31 works well with reads that are length 75bp or longer. + + ## Alignment tool By default, the pipeline uses [STAR](https://github.com/alexdobin/STAR) to align the raw FastQ reads to the reference genome. STAR is fast and common, but requires a lot of memory to run, typically around 38GB for the Human GRCh37 reference genome. diff --git a/main.nf b/main.nf index c3dc2ec49..ef1e5cadd 100644 --- a/main.nf +++ b/main.nf @@ -36,6 +36,7 @@ def helpMessage() { --star_index Path to STAR index --hisat2_index Path to HiSAT2 index --fasta Path to Fasta reference + --transcriptome Path to Fasta transcriptome --gtf Path to GTF file --gff Path to GFF3 file --bed12 Path to bed12 file @@ -168,6 +169,15 @@ if( params.gtf ){ exit 1, "No GTF or GFF3 annotation specified!" } +if (params.transcriptome){ + Channel + .fromPath(params.transcriptome) + .ifEmpty { exit 1, "Transcript fasta file is unreachable: ${params.transcriptome}" } + .set { tx_fasta_ch } +} + + + if( params.bed12 ){ bed12 = Channel .fromPath(params.bed12) @@ -251,6 +261,7 @@ if(params.aligner == 'star'){ else if(params.fasta) summary['Fasta Ref'] = params.fasta if(params.splicesites) summary['Splice Sites'] = params.splicesites } +if(params.transcriptome) summary['Transcriptome'] = params.transcriptome if(params.gtf) summary['GTF Annotation'] = params.gtf if(params.gff) summary['GFF3 Annotation'] = params.gff if(params.bed12) summary['BED Annotation'] = params.bed12 @@ -431,6 +442,31 @@ if(params.aligner == 'hisat2' && !params.hisat2_index && params.fasta){ """ } } + + +/* + * PREPROCESSING - Create Salmon transcriptome index + */ +if(params.transcriptome){ + process salmon_index { + tag "$transcriptome.simpleName" + publishDir path: { "${params.outdir}/reference_transcriptome" }, + mode: 'copy' + + input: + file transcriptome from tx_fasta_ch + + output: + file 'salmon_index' into index_ch + + script: + """ + salmon index --threads $task.cpus -t $transcriptome -i salmon_index + """ + } +} + + /* * PREPROCESSING - Convert GFF3 to GTF */ @@ -968,6 +1004,31 @@ process dupradar { """ } +if (params.transcriptome){ + process salmon_quant { + tag "$sample" + publishDir "${params.outdir}/Salmon", mode: 'copy' + + input: + file index from index_ch.collect() + set sample, file(reads) from raw_salmon + + output: + file(sample) into quant_ch + + script: + if (params.singleEnd){ + """ + salmon quant --validateMappings --threads $task.cpus --libType=A -i $index -r ${reads[0]} -o $sample + """ + }else{ + """ + salmon quant --validateMappings --threads $task.cpus --libType=A -i $index -1 ${reads[0]} -2 ${reads[1]} -o $sample + """ + } + } +} + /* * STEP 9 - Feature counts diff --git a/nextflow.config b/nextflow.config index 900f25292..5924aba23 100644 --- a/nextflow.config +++ b/nextflow.config @@ -18,7 +18,7 @@ params { fcExtraAttributes = 'gene_name' fcGroupFeatures = 'gene_id' fcGroupFeaturesType = 'gene_biotype' - markdup_java_options = '"-Xms4000m -Xmx7g"' //Established values for markDuplicate memory consumption, see issue PR #689 (in Sarek) for details + markdup_java_options = '"-Xms4000m -Xmx7g"' //Established values for markDuplicate memory consumption, see issue PR #689 (in Sarek) for details splicesites = false saveReference = false saveTrimmed = false @@ -26,6 +26,7 @@ params { singleEnd = false reads = "data/*{1,2}.fastq.gz" outdir = './results' + transcriptome = false seqCenter = false skip_qc = false skip_fastqc = false