diff --git a/conf/uppmax.config b/conf/uppmax.config index 7ca7181c9..98493d498 100644 --- a/conf/uppmax.config +++ b/conf/uppmax.config @@ -86,7 +86,7 @@ process { } $featureCounts { module = ['bioinfo-tools', 'subread/1.5.1'] - } + } $stringtieFPKM { module = ['bioinfo-tools', 'StringTie/1.3.3'] } @@ -108,7 +108,7 @@ params { clusterOptions = false rlocation = "$HOME/R/nxtflow_libs/" saveReference = true - reverse_stranded=true + reverse_stranded = true // illumina iGenomes reference file paths on UPPMAX genomes { 'GRCh37' { diff --git a/docs/usage.md b/docs/usage.md index 48a7769a5..8902263da 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -30,6 +30,29 @@ all files as single end. The file path should be in quotation marks to prevent s If left unspecified, the pipeline will assume that the data is in a directory called `data` in the working directory. +### Library strandedness +Three command line flags / config parameters set the library strandedness for a run: + +* `--forward_stranded` +* `--reverse_stranded` +* `--unstranded` + +If not set, the pipeline will be run as unstranded. The UPPMAX configuration file sets `reverse_stranded` to true by default. +Use `--unstranded` or `--forward_stranded` to overwrite this. + +These flags affect the commands used for several steps in the pipeline - namely HISAT2, featureCounts, RSeQC (`RPKM_saturation.py`) +and StringTie: +* `--forward_stranded` + * HISAT2: `--rna-strandness F` / `--rna-strandness FR` + * featureCounts: `-s 1` + * RSeQC: `-d ++,--` / `-d 1++,1--,2+-,2-+` + * StringTie: `--fr` +* `--reverse_stranded` + * HISAT2: `--rna-strandness R` / `--rna-strandness RF` + * featureCounts: `-s 2` + * RSeQC: `-d +-,-+` / `-d 1+-,1-+,2++,2--` + * StringTie: `--rf` + ## Alignment tool By default, the pipeline uses [STAR](https://github.com/alexdobin/STAR) to align the raw FastQ reads to the reference genome. STAR is fast and common, but requires a great deal of RAM to run, typically @@ -123,16 +146,6 @@ for common RNA-seq library preparation kits. | `--pico` | SMARTer Stranded Total RNA-Seq Kit - Pico Input | 3 | 0 | 0 | 3 | -### strand direction for StringTie and feturecounts -The strandedness of the library can be set py using the `--forward_stranded` and `--reverse_stranded` flags. Both are set as -`false` by default but reversed is set to true in the Uppmax config file. If you library instead is forward oriented simply specify the`--forward_stranded` flag. - -e.g. -```groovy -`--forward_stranded` -``` - - ## Job Resources ### Automatic resubmission Each step in the pipeline has a default set of requirements for number of CPUs, @@ -170,24 +183,6 @@ The output directory where the results will be saved. ### `--sampleLevel` Used to turn of the edgeR MDS and heatmap. Set automatically when running on fewer than 3 samples. -### `--strandRule` -Some RSeQC jobs need to know the stranded nature of the library. By default, the pipeline will use -`++,--` for single end libraries and `1+-,1-+,2++,2--` for paired end libraries. These codes are for -strand specific libraries (antisense). `1+-,1-+,2++,2--` decodes as: - -* Reads 1 mapped to `+` => parental gene on `+` -* Reads 1 mapped to `-` => parental gene on `-` -* Reads 2 mapped to `+` => parental gene on `-` -* Reads 2 mapped to `-` => parental gene on `+` - -Use this parameter to override these defaults. For example, if your data is paired end and strand specific, -but same-sense to the reference, you could run: - -```bash -nextflow run NGI-RNAseq/main.nf --strandRule '1++,1--,2+-,2-+' -``` -Use `--strandRule 'none'` if your data is not strand specific. - ### `--rlocation` Some steps in the pipeline run R with required modules. By default, the pipeline will install these modules to `~/R/nxtflow_libs/` if not present. You can specify what path to use with this diff --git a/main.nf b/main.nf index c309aff1e..728314f01 100644 --- a/main.nf +++ b/main.nf @@ -11,8 +11,8 @@ vim: syntax=groovy #### Authors Phil Ewels @ewels Rickard Hammarén @Hammarn - Docker and AWS integration by - Denis Moreno @Galithil + Docker and AWS integration by + Denis Moreno @Galithil ---------------------------------------------------------------------------------------- */ @@ -29,6 +29,7 @@ params.project = false params.genome = false params.forward_stranded = false params.reverse_stranded = false +params.unstranded = false params.star_index = params.genome ? params.genomes[ params.genome ].star ?: false : false params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false params.gtf = params.genome ? params.genomes[ params.genome ].gtf ?: false : false @@ -53,7 +54,6 @@ if (params.rlocation){ def single params.sampleLevel = false -params.strandRule = false // Custom trimming options params.clip_r1 = 0 @@ -140,6 +140,7 @@ if(params.aligner == 'star'){ if(params.gtf) log.info "GTF Annotation : ${params.gtf}" else if(params.download_gtf) log.info "GTF URL : ${params.download_gtf}" if(params.bed12) log.info "BED Annotation : ${params.bed12}" +log.info "Strandedness : ${params.unstranded ? 'None' : params.forward_stranded ? 'Forward' : params.reverse_stranded ? 'Reverse' : 'None'}" log.info "Current home : $HOME" log.info "Current user : $USER" log.info "Current path : $PWD" @@ -489,11 +490,18 @@ if(params.aligner == 'hisat2'){ script: index_base = hs2_indices[0].toString() - ~/.\d.ht2/ prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/ + def rnastrandness = '' + if (params.forward_stranded && !params.unstranded){ + rnastrandness = single ? '--rna-strandness F' : '--rna-strandness FR' + } else if (params.reverse_stranded && !params.unstranded){ + rnastrandness = single ? '--rna-strandness R' : '--rna-strandness RF' + } if (single) { """ set -o pipefail # Capture exit codes from HISAT2, not samtools hisat2 -x $index_base \\ -U $reads \\ + $rnastrandness \\ --known-splicesite-infile $alignment_splicesites \\ -p ${task.cpus} \\ --met-stderr \\ @@ -506,6 +514,7 @@ if(params.aligner == 'hisat2'){ hisat2 -x $index_base \\ -1 ${reads[0]} \\ -2 ${reads[1]} \\ + $rnastrandness \\ --known-splicesite-infile $alignment_splicesites \\ --no-mixed \\ --no-discordant \\ @@ -582,12 +591,12 @@ process rseqc { script: def strandRule = '' - if (params.forward_stranded){ - strandRule = params.strandRule ?: (single ? '-d +-,-+' : '-d 1+-,1-+,2++,2–') - } else if (params.reverse_stranded){ - strandRule = params.strandRule ?: (single ? '-d ++,--' : '-d 1+-,1-+,2++,2--') + if (params.forward_stranded && !params.unstranded){ + strandRule = single ? '-d ++,--' : '-d 1++,1--,2+-,2-+' + } else if (params.reverse_stranded && !params.unstranded){ + strandRule = single ? '-d +-,-+' : '-d 1+-,1-+,2++,2--' } - """ + """ samtools index $bam_rseqc infer_experiment.py -i $bam_rseqc -r $bed12 > ${bam_rseqc.baseName}.infer_experiment.txt RPKM_saturation.py -i $bam_rseqc -r $bed12 $strandRule -o ${bam_rseqc.baseName}.RPKM_saturation @@ -718,10 +727,10 @@ process featureCounts { script: def featureCounts_direction = 0 - if (params.reverse_stranded){ - featureCounts_direction = 2 - } else if (params.forward_stranded) { + if (params.forward_stranded && !params.unstranded) { featureCounts_direction = 1 + } else if (params.reverse_stranded && !params.unstranded){ + featureCounts_direction = 2 } """ featureCounts -a $gtf -g gene_id -o ${bam_featurecounts.baseName}_gene.featureCounts.txt -p -s $featureCounts_direction $bam_featurecounts @@ -775,9 +784,9 @@ process stringtieFPKM { script: def StringTie_direction = '' - if (params.forward_stranded){ + if (params.forward_stranded && !params.unstranded){ StringTie_direction = "--fr" - } else if (!params.reverse_stranded){ + } else if (!params.reverse_stranded && !params.unstranded){ StringTie_direction = "--rf" } """