Skip to content

Commit

Permalink
Starting complement for umi factor-out
Browse files Browse the repository at this point in the history
  • Loading branch information
pinin4fjords committed Dec 10, 2024
1 parent 674645e commit f4b76b1
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 83 deletions.
118 changes: 118 additions & 0 deletions subworkflows/local/bam_dedup_umi/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
//
// BAM deduplication with UMI processing
//

include { BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE } from '../../../subworkflows/nf-core/bam_dedup_stats_samtools_umicollapse'
include { BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS } from '../../../subworkflows/nf-core/bam_dedup_stats_samtools_umitools'
include { BAM_SORT_STATS_SAMTOOLS } from '../../../subworkflows/nf-core/bam_sort_stats_samtools'
include { UMITOOLS_PREPAREFORRSEM } from '../../../modules/nf-core/umitools/prepareforrsem'
include { SAMTOOLS_SORT } from '../../../modules/nf-core/samtools/sort/main'

workflow BAM_DEDUP_UMI {
take:
ch_genome_bam // channel: [ val(meta), path(bam), path(bai) ]
ch_fasta // channel: [ path(fasta) ]
umi_dedup_tool // string: 'umicollapse' or 'umitools'
umitools_dedup_stats // boolean: whether to generate UMI-tools dedup stats
bam_csi_index // boolean: whether to generate CSI index
ch_transcriptome_bam //
ch_transcript_fasta

main:
ch_versions = Channel.empty()

if (umi_dedup_tool == "umicollapse" && umi_dedup_tool != "umitools"){
error("Unknown umi_dedup_tool '${umi_dedup_tool}'")
}

// Genome BAM deduplication
if (umi_dedup_tool == "umicollapse") {
BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE (
ch_genome_bam
)
UMI_DEDUP_GENOME = BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE
ch_dedup_log = UMI_DEDUP_GENOME.out.dedup_stats

} else if (umi_dedup_tool == "umitools") {
BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS (
ch_genome_bam,
umitools_dedup_stats
)
UMI_DEDUP_GENOME = BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS
ch_dedup_log = UMI_DEDUP_GENOME.out.deduplog
}

// Co-ordinate sort, index and run stats on transcriptome BAM. This takes
// some preparation- we have to coordinate sort the BAM, run the
// deduplication, then restore name sorting and run a script from umitools
// to prepare for rsem or salmon

// 1. Coordinate sort

BAM_SORT_STATS_SAMTOOLS (
ch_transcriptome_bam,
ch_transcript_fasta.map { [ [:], it ] }
)
ch_sorted_transcriptome_bam = BAM_SORT_STATS_SAMTOOLS.out.bam
.join(BAM_SORT_STATS_SAMTOOLS.out.bai)

// 2. Transcriptome BAM deduplication
if (umi_dedup_tool == "umicollapse") {
BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE (
ch_sorted_transcriptome_bam
)
UMI_DEDUP_GENOME = BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE
ch_dedup_log = UMI_DEDUP_GENOME.out.dedup_stats

} else if (umi_dedup_tool == "umitools") {
BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS (
ch_sorted_transcriptome_bam,
umitools_dedup_stats
)
UMI_DEDUP_GENOME = BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS
ch_dedup_log = UMI_DEDUP_GENOME.out.deduplog
}

// 3. Restore name sorting
SAMTOOLS_SORT (
UMI_DEDUP_TRANSCRIPTOME.out.bam,
ch_fasta.map { [ [:], it ] }
)

// 4. Run prepare_for_rsem.py on paired-end BAM files
// This fixes paired-end reads in name sorted BAM files
// See: https://github.com/nf-core/rnaseq/issues/828
ended_transcriptome_dedup_bam = SAMTOOLS_SORT.out.bam
.branch {
meta, bam ->
single_end: meta.single_end
return [ meta, bam ]
paired_end: !meta.single_end
return [ meta, bam ]
}

UMITOOLS_PREPAREFORSALMON (
ended_transcriptome_dedup_bam.paired_end
.map { meta, bam -> [ meta, bam, [] ] }
)

ch_dedup_transcriptome_bam = ch_transcriptome_bam
.single_end
.mix(UMITOOLS_PREPAREFORSALMON.out.bam)

// Record versions

ch_versions = UMI_DEDUP_GENOME.out.versions
.mix(BAM_SORT_STATS_SAMTOOLS.out.versions)
.mix(UMITOOLS_PREPAREFORSALMON.out.versions)

emit:
bam = UMI_DEDUP_GENOME.out.bam // channel: [ val(meta), path(bam) ]
bam_index = bam_csi_index ? UMI_DEDUP_GENOME.out.csi : UMI_DEDUP_GENOME.out.bai // channel: [ val(meta), path(bai) ]
dedup_log = ch_dedup_log // channel: [ val(meta), path(log) ]
stats = UMI_DEDUP_GENOME.out.stats
flagstat = UMI_DEDUP_GENOME.out.flagstat
idxstats = UMI_DEDUP_GENOME.out.idxstats
transcriptome_bam = ch_dedup_transcriptome_bam // channel: [ val(meta), path(bam) ]
versions = ch_versions // channel: [ path(versions.yml) ]
}
97 changes: 19 additions & 78 deletions workflows/rnaseq/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ include { MULTIQC_CUSTOM_BIOTYPE } from '../../modules/local/multiqc
//
include { ALIGN_STAR } from '../../subworkflows/local/align_star'
include { QUANTIFY_RSEM } from '../../subworkflows/local/quantify_rsem'
include { BAM_DEDUP_UMI } from '../../subworkflows/local/bam_dedup_umi'
include { checkSamplesAfterGrouping } from '../../subworkflows/local/utils_nfcore_rnaseq_pipeline'
include { multiqcTsvFromList } from '../../subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness'
include { getStarPercentMapped } from '../../subworkflows/local/utils_nfcore_rnaseq_pipeline'
Expand Down Expand Up @@ -218,88 +219,28 @@ workflow RNASEQ {
// SUBWORKFLOW: Remove duplicate reads from BAM file based on UMIs
//
if (params.with_umi) {
// Deduplicate genome BAM file before downstream analysis
if (params.umi_dedup_tool == "umicollapse") {
BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_GENOME (
ch_genome_bam.join(ch_genome_bam_index, by: [0])
)
UMI_DEDUP_GENOME = BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_GENOME
ch_multiqc_files = ch_multiqc_files.mix(UMI_DEDUP_GENOME.out.dedup_stats.collect{it[1]}.ifEmpty([]))
} else if (params.umi_dedup_tool == "umitools") {
BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME (
ch_genome_bam.join(ch_genome_bam_index, by: [0]),
params.umitools_dedup_stats
)
UMI_DEDUP_GENOME = BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME
ch_multiqc_files = ch_multiqc_files.mix(UMI_DEDUP_GENOME.out.deduplog.collect{it[1]})
} else {
error("Unknown umi_dedup_tool '${params.umi_dedup_tool}'")
}
ch_genome_bam = UMI_DEDUP_GENOME.out.bam
ch_genome_bam_index = UMI_DEDUP_GENOME.out.bai
ch_multiqc_files = ch_multiqc_files.mix(UMI_DEDUP_GENOME.out.stats.collect{it[1]})
ch_multiqc_files = ch_multiqc_files.mix(UMI_DEDUP_GENOME.out.flagstat.collect{it[1]})
ch_multiqc_files = ch_multiqc_files.mix(UMI_DEDUP_GENOME.out.idxstats.collect{it[1]})

if (params.bam_csi_index) {
ch_genome_bam_index = UMI_DEDUP_GENOME.out.csi
}
ch_versions = ch_versions.mix(UMI_DEDUP_GENOME.out.versions)

// Co-ordinate sort, index and run stats on transcriptome BAM
BAM_SORT_STATS_SAMTOOLS (
ch_transcriptome_bam,
ch_transcript_fasta.map { [ [:], it ] }
BAM_DEDUP_UMI(
ch_genome_bam.join(ch_genome_bam_index, by: [0]),
ch_fasta,
params.umi_dedup_tool,
params.umitools_dedup_stats,
params.bam_csi_index,
BAM_SORT_STATS_SAMTOOLS.out.bam.join(BAM_SORT_STATS_SAMTOOLS.out.bai, by: [0])
)
ch_transcriptome_sorted_bam = BAM_SORT_STATS_SAMTOOLS.out.bam
ch_transcriptome_sorted_bai = BAM_SORT_STATS_SAMTOOLS.out.bai

// Deduplicate transcriptome BAM file before read counting with Salmon
if (params.umi_dedup_tool == "umicollapse") {
BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_TRANSCRIPTOME (
ch_transcriptome_sorted_bam.join(ch_transcriptome_sorted_bai, by: [0])
ch_genome_bam = BAM_DEDUP_UMI.out.bam
ch_transcriptome_bam = BAM_DEDUP_UMI.out.bam
ch_genome_bam_index = BAM_DEDUP_UMI.out.bai
ch_versions = BAM_DEDUP_UMI.out.versions

ch_multiqc_files = ch_multiqc_files
.mix(
BAM_DEDUP_UMI.dedup_log
.concat(BAM_DEDUP_UMI.out.stats)
.concat(BAM_DEDUP_UMI.out.flagstat)
.concat(BAM_DEDUP_UMI.out.idxstats)
)
UMI_DEDUP_TRANSCRIPTOME = BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_TRANSCRIPTOME
} else if (params.umi_dedup_tool == "umitools") {
BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME (
ch_transcriptome_sorted_bam.join(ch_transcriptome_sorted_bai, by: [0]),
params.umitools_dedup_stats
)
UMI_DEDUP_TRANSCRIPTOME = BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME
} else {
error("Unknown umi_dedup_tool '${params.umi_dedup_tool}'")
}

// Name sort BAM before passing to Salmon
SAMTOOLS_SORT (
UMI_DEDUP_TRANSCRIPTOME.out.bam,
ch_fasta.map { [ [:], it ] }
)

// Only run prepare_for_rsem.py on paired-end BAM files
SAMTOOLS_SORT
.out
.bam
.branch {
meta, bam ->
single_end: meta.single_end
return [ meta, bam ]
paired_end: !meta.single_end
return [ meta, bam ]
}
.set { ch_dedup_bam }

// Fix paired-end reads in name sorted BAM file
// See: https://github.com/nf-core/rnaseq/issues/828
UMITOOLS_PREPAREFORSALMON (
ch_dedup_bam.paired_end.map { meta, bam -> [ meta, bam, [] ] }
)
ch_versions = ch_versions.mix(UMITOOLS_PREPAREFORSALMON.out.versions.first())

ch_dedup_bam
.single_end
.mix(UMITOOLS_PREPAREFORSALMON.out.bam)
.set { ch_transcriptome_bam }
}

//
Expand Down
10 changes: 5 additions & 5 deletions workflows/rnaseq/nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') {

if (params.with_umi) {
process {
withName: 'NFCORE_RNASEQ:RNASEQ:SAMTOOLS_SORT' {
withName: 'NFCORE_RNASEQ:RNASEQ:BAM_DEDUP_UMI:SAMTOOLS_SORT' {
ext.args = '-n'
ext.prefix = { "${meta.id}.umi_dedup.transcriptome" }
publishDir = [
Expand All @@ -113,7 +113,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') {
]
}

withName: 'NFCORE_RNASEQ:RNASEQ:UMITOOLS_PREPAREFORSALMON' {
withName: 'NFCORE_RNASEQ:RNASEQ:BAM_DEDUP_UMI:UMITOOLS_PREPAREFORSALMON' {
ext.prefix = { "${meta.id}.umi_dedup.transcriptome.filtered" }
publishDir = [
[
Expand All @@ -130,7 +130,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') {
]
}

withName: 'NFCORE_RNASEQ:RNASEQ:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_SORT' {
withName: 'NFCORE_RNASEQ:RNASEQ:BAM_DEDUP_UMI:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_SORT' {
ext.prefix = { "${meta.id}.transcriptome.sorted" }
publishDir = [
path: { params.save_align_intermeds || params.save_umi_intermeds ? "${params.outdir}/${params.aligner}" : params.outdir },
Expand All @@ -140,7 +140,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') {
]
}

withName: 'NFCORE_RNASEQ:RNASEQ:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_INDEX' {
withName: 'NFCORE_RNASEQ:RNASEQ:BAM_DEDUP_UMI:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_INDEX' {
publishDir = [
path: { params.save_align_intermeds || params.save_umi_intermeds ? "${params.outdir}/${params.aligner}" : params.outdir },
mode: params.publish_dir_mode,
Expand All @@ -149,7 +149,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') {
]
}

withName: 'NFCORE_RNASEQ:RNASEQ:BAM_SORT_STATS_SAMTOOLS:BAM_STATS_SAMTOOLS:.*' {
withName: 'NFCORE_RNASEQ:RNASEQ:BAM_DEDUP_UMI:BAM_SORT_STATS_SAMTOOLS:BAM_STATS_SAMTOOLS:.*' {
ext.prefix = { "${meta.id}.transcriptome.sorted.bam" }
publishDir = [
path: { params.save_align_intermeds || params.save_umi_intermeds ? "${params.outdir}/${params.aligner}/samtools_stats" : params.outdir },
Expand Down

0 comments on commit f4b76b1

Please sign in to comment.