diff --git a/CHANGELOG.md b/CHANGELOG.md index c0d9d0548..f8b9ec7da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Bump minimum Nextflow version from `21.10.3` -> `22.10.1` - Updated pipeline template to [nf-core/tools 2.7.1](https://github.com/nf-core/tools/releases/tag/2.7.1) +- [[#891](https://github.com/nf-core/rnaseq/issues/891)] - Skip MarkDuplicates when UMIs are used - [[#896](https://github.com/nf-core/rnaseq/issues/896)] - Remove `copyTo` call for iGenomes README - [[#897](https://github.com/nf-core/rnaseq/issues/897)] - Use `--skip_preseq` by default - [[#900](https://github.com/nf-core/rnaseq/issues/900)] - Add `--recursive` option to `fastq_dir_to_samplesheet.py` script diff --git a/conf/modules.config b/conf/modules.config index 9d4d444fa..b80f9c8fb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -331,7 +331,7 @@ if (!params.skip_alignment) { } } - if (!params.skip_markduplicates) { + if (!params.skip_markduplicates && !params.with_umi) { process { withName: '.*:BAM_MARKDUPLICATES_PICARD:PICARD_MARKDUPLICATES' { ext.args = '--ASSUME_SORTED true --REMOVE_DUPLICATES false --VALIDATION_STRINGENCY LENIENT --TMP_DIR tmp' @@ -392,7 +392,7 @@ if (!params.skip_alignment) { pattern: '*.bam', enabled: ( params.save_align_intermeds || - params.skip_markduplicates || + params.with_umi || params.save_umi_intermeds ) ] @@ -408,7 +408,7 @@ if (!params.skip_alignment) { pattern: '*.{bai,csi}', enabled: ( params.save_align_intermeds || - params.skip_markduplicates || + params.with_umi || params.save_umi_intermeds ) ] diff --git a/docs/output.md b/docs/output.md index 167fc4c51..80b41edf0 100644 --- a/docs/output.md +++ b/docs/output.md @@ -292,7 +292,7 @@ After extracting the UMI information from the read sequence (see [UMI-tools extr -Unless you are using [UMIs](https://emea.illumina.com/science/sequencing-method-explorer/kits-and-arrays/umi.html) it is not 
possible to establish whether the fragments you have sequenced from your sample were derived via true biological duplication (i.e. sequencing independent template fragments) or as a result of PCR biases introduced during the library preparation. By default, the pipeline uses [picard MarkDuplicates](https://broadinstitute.github.io/picard/command-line-overview.html#MarkDuplicates) to _mark_ the duplicate reads identified amongst the alignments to allow you to guage the overall level of duplication in your samples. However, for RNA-seq data it is not recommended to physically remove duplicate reads from the alignments (unless you are using UMIs) because you expect a significant level of true biological duplication that arises from the same fragments being sequenced from for example highly expressed genes. You can skip this step via the `--skip_markduplicates` parameter. +Unless you are using [UMIs](https://emea.illumina.com/science/sequencing-method-explorer/kits-and-arrays/umi.html) it is not possible to establish whether the fragments you have sequenced from your sample were derived via true biological duplication (i.e. sequencing independent template fragments) or as a result of PCR biases introduced during the library preparation. By default, the pipeline uses [picard MarkDuplicates](https://broadinstitute.github.io/picard/command-line-overview.html#MarkDuplicates) to _mark_ the duplicate reads identified amongst the alignments to allow you to gauge the overall level of duplication in your samples. However, for RNA-seq data it is not recommended to physically remove duplicate reads from the alignments (unless you are using UMIs) because you expect a significant level of true biological duplication that arises from the same fragments being sequenced from, for example, highly expressed genes. This step is skipped automatically when using the `--with_umi` option, and can also be skipped explicitly via the `--skip_markduplicates` parameter. 
![MultiQC - Picard MarkDuplicates metrics plot](images/mqc_picard_markduplicates.png) diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf index d8df957ba..b4f6a7f5f 100755 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -562,7 +562,7 @@ workflow RNASEQ { // SUBWORKFLOW: Mark duplicate reads // ch_markduplicates_multiqc = Channel.empty() - if (!params.skip_alignment && !params.skip_markduplicates) { + if (!params.skip_alignment && !params.skip_markduplicates && !params.with_umi) { BAM_MARKDUPLICATES_PICARD ( ch_genome_bam, PREPARE_GENOME.out.fasta,