Fix #891

nf-core · Dec 19, 2022 · cf4f463 · cf4f463
1 parent eb529de
commit cf4f463
Show file tree

Hide file tree

Showing 4 changed files with 6 additions and 5 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Bump minimum Nextflow version from `21.10.3` -> `22.10.1`
 - Updated pipeline template to [nf-core/tools 2.7.1](https://github.com/nf-core/tools/releases/tag/2.7.1)
+- [[#891](https://github.com/nf-core/rnaseq/issues/891)] - Skip MarkDuplicates when UMIs are used
 - [[#896](https://github.com/nf-core/rnaseq/issues/896)] - Remove `copyTo` call for iGenomes README
 - [[#897](https://github.com/nf-core/rnaseq/issues/897)] - Use `--skip_preseq` by default
 - [[#900](https://github.com/nf-core/rnaseq/issues/900)] - Add `--recursive` option to `fastq_dir_to_samplesheet.py` script

diff --git a/conf/modules.config b/conf/modules.config
@@ -331,7 +331,7 @@ if (!params.skip_alignment) {
         }
     }
 
-    if (!params.skip_markduplicates) {
+    if (!params.skip_markduplicates && !params.with_umi) {
         process {
             withName: '.*:BAM_MARKDUPLICATES_PICARD:PICARD_MARKDUPLICATES' {
                 ext.args   = '--ASSUME_SORTED true --REMOVE_DUPLICATES false --VALIDATION_STRINGENCY LENIENT --TMP_DIR tmp'
@@ -392,7 +392,7 @@ if (!params.skip_alignment) {
                         pattern: '*.bam',
                         enabled: (
                             params.save_align_intermeds ||
-                            params.skip_markduplicates ||
+                            params.with_umi ||
                             params.save_umi_intermeds
                         )
                     ]
@@ -408,7 +408,7 @@ if (!params.skip_alignment) {
                     pattern: '*.{bai,csi}',
                     enabled: (
                         params.save_align_intermeds ||
-                        params.skip_markduplicates ||
+                        params.with_umi ||
                         params.save_umi_intermeds
                     )
                 ]

diff --git a/docs/output.md b/docs/output.md
@@ -292,7 +292,7 @@ After extracting the UMI information from the read sequence (see [UMI-tools extr
 
 </details>
 
-Unless you are using [UMIs](https://emea.illumina.com/science/sequencing-method-explorer/kits-and-arrays/umi.html) it is not possible to establish whether the fragments you have sequenced from your sample were derived via true biological duplication (i.e. sequencing independent template fragments) or as a result of PCR biases introduced during the library preparation. By default, the pipeline uses [picard MarkDuplicates](https://broadinstitute.github.io/picard/command-line-overview.html#MarkDuplicates) to _mark_ the duplicate reads identified amongst the alignments to allow you to guage the overall level of duplication in your samples. However, for RNA-seq data it is not recommended to physically remove duplicate reads from the alignments (unless you are using UMIs) because you expect a significant level of true biological duplication that arises from the same fragments being sequenced from for example highly expressed genes. You can skip this step via the `--skip_markduplicates` parameter.
+Unless you are using [UMIs](https://emea.illumina.com/science/sequencing-method-explorer/kits-and-arrays/umi.html) it is not possible to establish whether the fragments you have sequenced from your sample were derived via true biological duplication (i.e. sequencing independent template fragments) or as a result of PCR biases introduced during the library preparation. By default, the pipeline uses [picard MarkDuplicates](https://broadinstitute.github.io/picard/command-line-overview.html#MarkDuplicates) to _mark_ the duplicate reads identified amongst the alignments to allow you to guage the overall level of duplication in your samples. However, for RNA-seq data it is not recommended to physically remove duplicate reads from the alignments (unless you are using UMIs) because you expect a significant level of true biological duplication that arises from the same fragments being sequenced from for example highly expressed genes. This step will be skipped automatically when using the `--with_umi` option or explicitly via the `--skip_markduplicates` parameter.
 
 ![MultiQC - Picard MarkDuplicates metrics plot](images/mqc_picard_markduplicates.png)
 

diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf
@@ -562,7 +562,7 @@ workflow RNASEQ {
     // SUBWORKFLOW: Mark duplicate reads
     //
     ch_markduplicates_multiqc = Channel.empty()
-    if (!params.skip_alignment && !params.skip_markduplicates) {
+    if (!params.skip_alignment && !params.skip_markduplicates && !params.with_umi) {
         BAM_MARKDUPLICATES_PICARD (
             ch_genome_bam,
             PREPARE_GENOME.out.fasta,