diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 4ecfbfe..4a9bc5c 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -18,11 +18,11 @@ "python.linting.flake8Path": "/opt/conda/bin/flake8", "python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle", "python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle", - "python.linting.pylintPath": "/opt/conda/bin/pylint" + "python.linting.pylintPath": "/opt/conda/bin/pylint", }, // Add the IDs of extensions you want installed when the container is created. - "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] - } - } + "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"], + }, + }, } diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index a28853e..5a2a6ee 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,6 +1,6 @@ report_comment: > - This report has been generated by the nf-core/riboseq - analysis pipeline. For information about how to interpret these results, please see the + + This report has been generated by the nf-core/riboseq analysis pipeline. For information about how to interpret these results, please see the documentation. report_section_order: "nf-core-riboseq-methods-description": @@ -11,3 +11,169 @@ report_section_order: order: -1002 export_plots: true + +# Run only these modules +run_modules: + - custom_content + - fastqc + - cutadapt + - fastp + - sortmerna + - star + - hisat2 + - rsem + - salmon + - kallisto + - samtools + - picard + - preseq + - rseqc + - qualimap + +# Order of modules +top_modules: + - "fail_trimmed_samples" + - "fail_mapped_samples" + - "fail_strand_check" + - "star_rsem_deseq2_pca" + - "star_rsem_deseq2_clustering" + - "star_salmon_deseq2_pca" + - "star_salmon_deseq2_clustering" + - "salmon_deseq2_pca" + - "salmon_deseq2_clustering" + - "biotype_counts" + - "dupradar" + +module_order: + - fastqc: + name: "FastQC (raw)" + info: "This section of the report shows FastQC results before adapter trimming." + path_filters: + - "./fastqc/raw/*.zip" + - cutadapt + - fastp + - fastqc: + name: "FastQC (trimmed)" + info: "This section of the report shows FastQC results after adapter trimming." + path_filters: + - "./fastqc/trim/*.zip" + +# Don't show % Dups in the General Stats table (we have this from Picard) +table_columns_visible: + fastqc: + percent_duplicates: False + +extra_fn_clean_exts: + - ".umi_dedup" + - "_val" + - ".markdup" + - "_primary" + +# Customise the module search patterns to speed up execution time +# - Skip module sub-tools that we are not interested in +# - Replace file-content searching with filename pattern searching +# - Don't add anything that is the same as the MultiQC default +# See https://multiqc.info/docs/#optimise-file-search-patterns for details +sp: + cutadapt: + fn: "*trimming_report.txt" + + fastp: + fn: "*.fastp.json" + + sortmerna: + fn: "*.sortmerna.log" + + hisat2: + fn: "*.hisat2.summary.log" + + salmon/meta: + fn: "meta_info.json" + + preseq: + fn: "*.lc_extrap.txt" + + samtools/stats: + fn: "*.stats" + samtools/flagstat: + fn: "*.flagstat" + samtools/idxstats: + fn: "*.idxstats*" + + rseqc/bam_stat: + fn: "*.bam_stat.txt" + rseqc/gene_body_coverage: + skip: true + rseqc/junction_annotation: + fn: "*.junction_annotation.log" + rseqc/read_gc: + skip: true + rseqc/read_distribution: + fn: "*.read_distribution.txt" + rseqc/infer_experiment: + fn: "*.infer_experiment.txt" + rseqc/tin: + fn: "*.summary.txt" + + picard/markdups: + fn: "*.MarkDuplicates.metrics.txt" + picard/alignment_metrics: + skip: true + picard/basedistributionbycycle: + skip: true + picard/gcbias: + skip: true + picard/hsmetrics: + skip: true + picard/insertsize: + skip: true + picard/oxogmetrics: + skip: true + picard/pcr_metrics: + skip: true + picard/quality_by_cycle: + skip: true + picard/quality_score_distribution: + skip: true + picard/quality_yield_metrics: + skip: true + picard/rnaseqmetrics: + skip: true + picard/rrbs_metrics: + skip: true + picard/sam_file_validation: + skip: true + picard/variant_calling_metrics: + skip: true + picard/wgs_metrics: + skip: true + +# See https://github.com/ewels/MultiQC_TestData/blob/master/data/custom_content/with_config/table_headerconfig/multiqc_config.yaml +custom_data: + fail_trimmed_samples: + section_name: "WARNING: Fail Trimming Check" + description: "List of samples that failed the minimum trimmed reads threshold specified via the '--min_trimmed_reads' parameter, and hence were ignored for the downstream processing steps." + plot_type: "table" + pconfig: + id: "fail_trimmed_samples_table" + table_title: "Samples failed trimming threshold" + namespace: "Samples failed trimming threshold" + format: "{:.0f}" + fail_mapped_samples: + section_name: "WARNING: Fail Alignment Check" + description: "List of samples that failed the STAR minimum mapped reads threshold specified via the '--min_mapped_reads' parameter, and hence were ignored for the downstream processing steps." + plot_type: "table" + pconfig: + id: "fail_mapped_samples_table" + table_title: "Samples failed mapping threshold" + namespace: "Samples failed mapping threshold" + format: "{:.2f}" + fail_strand_check: + section_name: "WARNING: Fail Strand Check" + description: "List of samples that failed the strandedness check between that provided in the samplesheet and calculated by the RSeQC infer_experiment.py tool." + plot_type: "table" + pconfig: + id: "fail_strand_check_table" + table_title: "Samples failed strandedness check" + namespace: "Samples failed strandedness check" + format: "{:.2f}" diff --git a/assets/rrna-db-defaults.txt b/assets/rrna-db-defaults.txt new file mode 100644 index 0000000..4223356 --- /dev/null +++ b/assets/rrna-db-defaults.txt @@ -0,0 +1,8 @@ +https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta +https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5s-database-id98.fasta +https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-arc-16s-id95.fasta +https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-arc-23s-id98.fasta +https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-bac-16s-id90.fasta +https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-bac-23s-id98.fasta +https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-euk-18s-id95.fasta +https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-euk-28s-id98.fasta diff --git a/assets/schema_input.json b/assets/schema_input.json index 9f610bd..da69e6a 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -10,27 +10,37 @@ "sample": { "type": "string", "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces" + "errorMessage": "Sample name must be provided and cannot contain spaces", + "meta": ["id"] }, "fastq_1": { "type": "string", + "format": "file-path", + "exists": true, "pattern": "^\\S+\\.f(ast)?q\\.gz$", "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" }, "fastq_2": { "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", + "type": "string", + "format": "file-path", + "exists": true, "anyOf": [ { - "type": "string", "pattern": "^\\S+\\.f(ast)?q\\.gz$" }, { - "type": "string", "maxLength": 0 } ] + }, + "strandedness": { + "type": "string", + "errorMessage": "Strandedness must be provided and be one of 'auto', 'forward', 'reverse' or 'unstranded'", + "enum": ["forward", "reverse", "unstranded", "auto"], + "meta": ["strandedness"] } }, - "required": ["sample", "fastq_1"] + "required": ["sample", "fastq_1", "strandedness"] } } diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py new file mode 100755 index 0000000..b2215fd --- /dev/null +++ b/bin/filter_gtf.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python + +# Written by Olga Botvinnik with subsequent reworking by Jonathan Manning. Released under the MIT license. + +import logging +import argparse +import re +import statistics +from typing import Set + +# Create a logger +logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s") +logger = logging.getLogger("fasta_gtf_filter") +logger.setLevel(logging.INFO) + + +def extract_fasta_seq_names(fasta_name: str) -> Set[str]: + """Extracts the sequence names from a FASTA file.""" + with open(fasta_name) as fasta: + return {line[1:].split(None, 1)[0] for line in fasta if line.startswith(">")} + + +def tab_delimited(file: str) -> float: + """Check if file is tab-delimited and return median number of tabs.""" + with open(file, "r") as f: + data = f.read(102400) + return statistics.median(line.count("\t") for line in data.split("\n")) + + +def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None: + """Filter GTF file based on FASTA sequence names.""" + if tab_delimited(gtf_in) != 8: + raise ValueError("Invalid GTF file: Expected 9 tab-separated columns.") + + seq_names_in_genome = extract_fasta_seq_names(fasta) + logger.info(f"Extracted chromosome sequence names from {fasta}") + logger.debug("All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome))) + + seq_names_in_gtf = set() + try: + with open(gtf_in) as gtf, open(filtered_gtf_out, "w") as out: + line_count = 0 + for line in gtf: + seq_name = line.split("\t")[0] + seq_names_in_gtf.add(seq_name) # Add sequence name to the set + + if seq_name in seq_names_in_genome: + if skip_transcript_id_check or re.search(r'transcript_id "([^"]+)"', line): + out.write(line) + line_count += 1 + + if line_count == 0: + raise ValueError("All GTF lines removed by filters") + + except IOError as e: + logger.error(f"File operation failed: {e}") + return + + logger.debug("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf))) + logger.info(f"Extracted {line_count} matching sequences from {gtf_in} into {filtered_gtf_out}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Filters a GTF file based on sequence names in a FASTA file.") + parser.add_argument("--gtf", type=str, required=True, help="GTF file") + parser.add_argument("--fasta", type=str, required=True, help="Genome fasta file") + parser.add_argument("--prefix", dest="prefix", default="genes", type=str, help="Prefix for output GTF files") + parser.add_argument( + "--skip_transcript_id_check", action="store_true", help="Skip checking for transcript IDs in the GTF file" + ) + + args = parser.parse_args() + filter_gtf(args.fasta, args.gtf, args.prefix + ".filtered.gtf", args.skip_transcript_id_check) diff --git a/bin/gtf2bed b/bin/gtf2bed new file mode 100755 index 0000000..4da5c04 --- /dev/null +++ b/bin/gtf2bed @@ -0,0 +1,122 @@ +#!/usr/bin/env perl + +# Copyright (c) 2011 Erik Aronesty (erik@q32.com) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ALSO, IT WOULD BE NICE IF YOU LET ME KNOW YOU USED IT. + +use Getopt::Long; + +my $extended; +GetOptions("x"=>\$extended); + +$in = shift @ARGV; + +my $in_cmd =($in =~ /\.gz$/ ? "gunzip -c $in|" : $in =~ /\.zip$/ ? "unzip -p $in|" : "$in") || die "Can't open $in: $!\n"; +open IN, $in_cmd; + +while () { + $gff = 2 if /^##gff-version 2/; + $gff = 3 if /^##gff-version 3/; + next if /^#/ && $gff; + + s/\s+$//; + # 0-chr 1-src 2-feat 3-beg 4-end 5-scor 6-dir 7-fram 8-attr + my @f = split /\t/; + if ($gff) { + # most ver 2's stick gene names in the id field + ($id) = $f[8]=~ /\bID="([^"]+)"/; + # most ver 3's stick unquoted names in the name field + ($id) = $f[8]=~ /\bName=([^";]+)/ if !$id && $gff == 3; + } else { + ($id) = $f[8]=~ /transcript_id "([^"]+)"/; + } + + next unless $id && $f[0]; + + if ($f[2] eq 'exon') { + die "no position at exon on line $." if ! $f[3]; + # gff3 puts :\d in exons sometimes + $id =~ s/:\d+$// if $gff == 3; + push @{$exons{$id}}, \@f; + # save lowest start + $trans{$id} = \@f if !$trans{$id}; + } elsif ($f[2] eq 'start_codon') { + #optional, output codon start/stop as "thick" region in bed + $sc{$id}->[0] = $f[3]; + } elsif ($f[2] eq 'stop_codon') { + $sc{$id}->[1] = $f[4]; + } elsif ($f[2] eq 'miRNA' ) { + $trans{$id} = \@f if !$trans{$id}; + push @{$exons{$id}}, \@f; + } +} + +for $id ( + # sort by chr then pos + sort { + $trans{$a}->[0] eq $trans{$b}->[0] ? + $trans{$a}->[3] <=> $trans{$b}->[3] : + $trans{$a}->[0] cmp $trans{$b}->[0] + } (keys(%trans)) ) { + my ($chr, undef, undef, undef, undef, undef, $dir, undef, $attr, undef, $cds, $cde) = @{$trans{$id}}; + my ($cds, $cde); + ($cds, $cde) = @{$sc{$id}} if $sc{$id}; + + # sort by pos + my @ex = sort { + $a->[3] <=> $b->[3] + } @{$exons{$id}}; + + my $beg = $ex[0][3]; + my $end = $ex[-1][4]; + + if ($dir eq '-') { + # swap + $tmp=$cds; + $cds=$cde; + $cde=$tmp; + $cds -= 2 if $cds; + $cde += 2 if $cde; + } + + # not specified, just use exons + $cds = $beg if !$cds; + $cde = $end if !$cde; + + # adjust start for bed + --$beg; --$cds; + + my $exn = @ex; # exon count + my $exst = join ",", map {$_->[3]-$beg-1} @ex; # exon start + my $exsz = join ",", map {$_->[4]-$_->[3]+1} @ex; # exon size + + my $gene_id; + my $extend = ""; + if ($extended) { + ($gene_id) = $attr =~ /gene_name "([^"]+)"/; + ($gene_id) = $attr =~ /gene_id "([^"]+)"/ unless $gene_id; + $extend="\t$gene_id"; + } + # added an extra comma to make it look exactly like ucsc's beds + print "$chr\t$beg\t$end\t$id\t0\t$dir\t$cds\t$cde\t0\t$exn\t$exsz,\t$exst,$extend\n"; +} + +close IN; diff --git a/conf/modules.config b/conf/modules.config index d91c6ab..4665219 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -10,41 +10,1154 @@ ---------------------------------------------------------------------------------------- */ -process { +def rseqc_modules = params.rseqc_modules ? params.rseqc_modules.split(',').collect{ it.trim().toLowerCase() } : [] + +// +// General configuration options +// +process { publishDir = [ path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - withName: SAMPLESHEET_CHECK { + withName: 'CUSTOM_DUMPSOFTWAREVERSIONS' { publishDir = [ path: { "${params.outdir}/pipeline_info" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + pattern: '*_versions.yml' + ] + } +} + +// +// Genome preparation options +// + +process { + withName: 'GUNZIP_.*|MAKE_TRANSCRIPTS_FASTA' { + publishDir = [ + path: { "${params.outdir}/genome" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null } ] } - withName: FASTQC { - ext.args = '--quiet' + withName: 'UNTAR_.*' { + ext.args2 = '--no-same-owner' } - withName: CUSTOM_DUMPSOFTWAREVERSIONS { + withName: 'UNTAR_.*|STAR_GENOMEGENERATE|STAR_GENOMEGENERATE_IGENOMES|HISAT2_BUILD' { publishDir = [ - path: { "${params.outdir}/pipeline_info" }, + path: { "${params.outdir}/genome/index" }, mode: params.publish_dir_mode, - pattern: '*_versions.yml' + saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null } + ] + } + + withName: 'GFFREAD' { + ext.args = '--keep-exon-attrs -F -T' + publishDir = [ + path: { "${params.outdir}/genome" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null } + ] + } + + withName: 'HISAT2_EXTRACTSPLICESITES' { + publishDir = [ + path: { "${params.outdir}/genome/index" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null } ] } - withName: 'MULTIQC' { - ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } + withName: 'SALMON_INDEX' { + ext.args = { [ + params.gencode ? '--gencode' : '', + params.pseudo_aligner_kmer_size ? "-k ${params.pseudo_aligner_kmer_size}": '' + ].join(' ').trim() } publishDir = [ - path: { "${params.outdir}/multiqc" }, + path: { "${params.outdir}/genome/index" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null } ] } + withName: 'KALLISTO_INDEX' { + ext.args = params.pseudo_aligner_kmer_size ? "-k ${params.pseudo_aligner_kmer_size}" : '' + publishDir = [ + path: { "${params.outdir}/genome/index" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null } + ] + } + + withName: 'RSEM_PREPAREREFERENCE_GENOME' { + ext.args = '--star' + publishDir = [ + path: { "${params.outdir}/genome/index" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null } + ] + } + + withName: 'GTF2BED' { + publishDir = [ + path: { "${params.outdir}/genome" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null } + ] + } + + withName: 'CAT_ADDITIONAL_FASTA|PREPROCESS_TRANSCRIPTS_FASTA_GENCODE' { + publishDir = [ + path: { "${params.outdir}/genome" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null } + ] + } + + withName: 'GTF_FILTER' { + ext.args = { params.skip_gtf_transcript_filter ?: '--skip_transcript_id_check' } + publishDir = [ + path: { "${params.outdir}/genome" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null } + ] + } + + withName: 'CUSTOM_GETCHROMSIZES' { + publishDir = [ + path: { "${params.outdir}/genome" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null } + ] + } + + withName: 'CAT_FASTQ' { + publishDir = [ + path: { "${params.outdir}/fastq" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_merged_fastq ? filename : null } + ] + } +} + +if (!params.skip_bbsplit && params.bbsplit_fasta_list) { + process { + withName: '.*:PREPARE_GENOME:BBMAP_BBSPLIT' { + ext.args = 'build=1' + publishDir = [ + path: { "${params.outdir}/genome/index" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null } + ] + } + } +} + +// +// Read subsampling and strand inferring options +// + +process { + withName: 'FQ_SUBSAMPLE' { + ext.args = '--record-count 1000000 --seed 1' + ext.prefix = { "${meta.id}.subsampled" } + publishDir = [ + enabled: false + ] + } + + withName: '.*:FASTQ_SUBSAMPLE_FQ_SALMON:SALMON_QUANT' { + ext.args = '--skipQuant' + publishDir = [ + enabled: false + ] + } +} + +// +// Read QC and trimming options +// + +if (!(params.skip_fastqc || params.skip_qc)) { + if (params.trimmer == 'trimgalore') { + process { + withName: '.*:FASTQ_FASTQC_UMITOOLS_TRIMGALORE:FASTQC' { + ext.args = '--quiet' + } + } + } + + if (params.trimmer == 'fastp') { + process { + withName: '.*:FASTQ_FASTQC_UMITOOLS_FASTP:FASTQC_RAW' { + ext.args = '--quiet' + } + + withName: '.*:FASTQ_FASTQC_UMITOOLS_FASTP:FASTQC_TRIM' { + ext.args = '--quiet' + publishDir = [ + path: { "${params.outdir}/${params.trimmer}/fastqc" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + } +} + +if (!params.skip_trimming) { + if (params.trimmer == 'trimgalore') { + process { + withName: '.*:FASTQ_FASTQC_UMITOOLS_TRIMGALORE:TRIMGALORE' { + ext.args = { + [ + "--fastqc_args '-t ${task.cpus}'", + params.extra_trimgalore_args ? params.extra_trimgalore_args.split("\\s(?=--)") : '' + ].flatten().unique(false).join(' ').trim() + } + publishDir = [ + [ + path: { "${params.outdir}/${params.trimmer}/fastqc" }, + mode: params.publish_dir_mode, + pattern: "*.{html,zip}" + ], + [ + path: { "${params.outdir}/${params.trimmer}" }, + mode: params.publish_dir_mode, + pattern: "*.fq.gz", + saveAs: { params.save_trimmed ? it : null } + ], + [ + path: { "${params.outdir}/${params.trimmer}" }, + mode: params.publish_dir_mode, + pattern: "*.txt" + ] + ] + } + } + } + + if (params.trimmer == 'fastp') { + process { + withName: '.*:FASTQ_FASTQC_UMITOOLS_FASTP:FASTP' { + ext.args = { params.extra_fastp_args ?: '' } + publishDir = [ + [ + path: { "${params.outdir}/${params.trimmer}" }, + mode: params.publish_dir_mode, + pattern: "*.{json,html}" + ], + [ + path: { "${params.outdir}/${params.trimmer}/log" }, + mode: params.publish_dir_mode, + pattern: "*.log" + ], + [ + path: { "${params.outdir}/${params.trimmer}" }, + mode: params.publish_dir_mode, + pattern: "*.fastq.gz", + saveAs: { params.save_trimmed ? it : null } + ] + ] + } + } + } +} + +if (params.with_umi && !params.skip_umi_extract) { + process { + withName: 'UMITOOLS_EXTRACT' { + ext.args = { [ + params.umitools_extract_method ? "--extract-method=${params.umitools_extract_method}" : '', + params.umitools_bc_pattern ? "--bc-pattern='${params.umitools_bc_pattern}'" : '', + params.umitools_bc_pattern2 ? "--bc-pattern2='${params.umitools_bc_pattern2}'" : '', + params.umitools_umi_separator ? "--umi-separator='${params.umitools_umi_separator}'" : '' + ].join(' ').trim() } + publishDir = [ + [ + path: { "${params.outdir}/umitools" }, + mode: params.publish_dir_mode, + pattern: "*.log" + ], + [ + path: { "${params.outdir}/umitools" }, + mode: params.publish_dir_mode, + pattern: "*.fastq.gz", + saveAs: { params.save_umi_intermeds ? it : null } + ] + ] + } + } +} + +// +// Contaminant removal options +// + +if (!params.skip_bbsplit) { + process { + withName: 'BBMAP_BBSPLIT' { + ext.args = 'build=1 ambiguous2=all maxindel=150000' + publishDir = [ + [ + path: { "${params.outdir}/bbsplit" }, + mode: params.publish_dir_mode, + pattern: '*.txt' + ], + [ + path: { "${params.outdir}/bbsplit" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + saveAs: { params.save_bbsplit_reads ? it : null } + ] + ] + } + } +} + +if (params.remove_ribo_rna) { + process { + withName: 'SORTMERNA' { + ext.args = '--num_alignments 1 -v' + publishDir = [ + [ + path: { "${params.outdir}/sortmerna" }, + mode: params.publish_dir_mode, + pattern: "*.log" + ], + [ + path: { "${params.outdir}/sortmerna" }, + mode: params.publish_dir_mode, + pattern: "*.fastq.gz", + saveAs: { params.save_non_ribo_reads ? it : null } + ] + ] + } + } +} + +// +// General alignment options +// + +if (!params.skip_alignment) { + process { + withName: 'NFCORE_RNASEQ:RNASEQ:.*:BAM_SORT_STATS_SAMTOOLS:BAM_STATS_SAMTOOLS:.*' { + ext.prefix = { "${meta.id}.sorted.bam" } + publishDir = [ + path: { "${params.outdir}/${params.aligner}/samtools_stats" }, + mode: params.publish_dir_mode, + pattern: "*.{stats,flagstat,idxstats}" + ] + } + + withName: 'NFCORE_RNASEQ:RNASEQ:.*:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_SORT' { + ext.prefix = { "${meta.id}.sorted" } + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: "*.bam", + saveAs: { ( ['star_salmon','hisat2'].contains(params.aligner) && + ( params.save_align_intermeds || ( !params.with_umi && params.skip_markduplicates ) ) + ) || params.save_align_intermeds || params.skip_markduplicates ? it : null } + ] + } + + withName: 'NFCORE_RNASEQ:RNASEQ:.*:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_INDEX' { + ext.args = { params.bam_csi_index ? '-c' : '' } + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: "*.{bai,csi}", + saveAs: { ( ['star_salmon','hisat2'].contains(params.aligner) && + ( params.save_align_intermeds || ( !params.with_umi && params.skip_markduplicates ) ) + ) || params.save_align_intermeds || params.skip_markduplicates ? it : null } + ] + } + } + + if (!params.skip_markduplicates && !params.with_umi) { + process { + withName: '.*:BAM_MARKDUPLICATES_PICARD:PICARD_MARKDUPLICATES' { + ext.args = '--ASSUME_SORTED true --REMOVE_DUPLICATES false --VALIDATION_STRINGENCY LENIENT --TMP_DIR tmp' + ext.prefix = { "${meta.id}.markdup.sorted" } + publishDir = [ + [ + path: { "${params.outdir}/${params.aligner}/picard_metrics" }, + mode: params.publish_dir_mode, + pattern: '*metrics.txt' + ], + [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: '*.bam' + ] + ] + } + + withName: '.*:BAM_MARKDUPLICATES_PICARD:SAMTOOLS_INDEX' { + ext.args = { params.bam_csi_index ? '-c' : '' } + ext.prefix = { "${meta.id}.markdup.sorted" } + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: '*.{bai,csi}' + ] + } + + withName: '.*:BAM_MARKDUPLICATES_PICARD:BAM_STATS_SAMTOOLS:.*' { + ext.prefix = { "${meta.id}.markdup.sorted.bam" } + publishDir = [ + path: { "${params.outdir}/${params.aligner}/samtools_stats" }, + mode: params.publish_dir_mode, + pattern: '*.{stats,flagstat,idxstats}' + ] + } + } + } + + if (params.with_umi && ['star_salmon','hisat2'].contains(params.aligner)) { + process { + withName: '.*:BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME:UMITOOLS_DEDUP' { + ext.args = { [ + meta.single_end ? '' : '--unpaired-reads=discard --chimeric-pairs=discard', + params.umitools_grouping_method ? "--method='${params.umitools_grouping_method}'" : '', + params.umitools_umi_separator ? "--umi-separator='${params.umitools_umi_separator}'" : '' + ].join(' ').trim() } + ext.prefix = { "${meta.id}.umi_dedup.sorted" } + publishDir = [ + [ + path: { "${params.outdir}/${params.aligner}/umitools" }, + mode: params.publish_dir_mode, + pattern: '*.tsv' + ], + [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: '*.bam', + saveAs: { params.save_align_intermeds || params.with_umi || params.save_umi_intermeds ? it : null } + ] + ] + } + + withName: '.*:BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME:SAMTOOLS_INDEX' { + ext.args = { params.bam_csi_index ? '-c' : '' } + ext.prefix = { "${meta.id}.umi_dedup.sorted" } + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: '*.{bai,csi}', + saveAs: { params.save_align_intermeds || params.with_umi || params.save_umi_intermeds ? it : null } + ] + } + + withName: '.*:BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME:BAM_STATS_SAMTOOLS:.*' { + ext.prefix = { "${meta.id}.umi_dedup.sorted.bam" } + publishDir = [ + path: { "${params.outdir}/${params.aligner}/samtools_stats" }, + mode: params.publish_dir_mode, + pattern: '*.{stats,flagstat,idxstats}' + ] + } + } + } + + if (!params.skip_bigwig) { + process { + withName: 'BEDTOOLS_GENOMECOV' { + ext.args = '-split -du' + publishDir = [ + enabled: false + ] + } + + withName: '.*:BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG_FORWARD:UCSC_BEDCLIP' { + ext.prefix = { "${meta.id}.clip.forward" } + publishDir = [ + enabled: false + ] + } + + withName: '.*:BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG_FORWARD:UCSC_BEDGRAPHTOBIGWIG' { + ext.prefix = { "${meta.id}.forward" } + publishDir = [ + path: { "${params.outdir}/${params.aligner}/bigwig" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG_REVERSE:UCSC_BEDCLIP' { + ext.prefix = { "${meta.id}.clip.reverse" } + publishDir = [ + enabled: false + ] + } + + withName: '.*:BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG_REVERSE:UCSC_BEDGRAPHTOBIGWIG' { + ext.prefix = { "${meta.id}.reverse" } + publishDir = [ + path: { "${params.outdir}/${params.aligner}/bigwig" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + } + + if (!params.skip_stringtie) { + process { + withName: 'STRINGTIE_STRINGTIE' { + ext.args = { [ + '-v', + params.stringtie_ignore_gtf ? '' : '-e' + ].join(' ').trim() } + publishDir = [ + path: { "${params.outdir}/${params.aligner}/stringtie" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + } +} + +// +// STAR Salmon alignment options +// + +if (!params.skip_alignment && params.aligner == 'star_salmon') { + process { + withName: '.*:ALIGN_STAR:STAR_ALIGN|.*:ALIGN_STAR:STAR_ALIGN_IGENOMES' { + ext.args = { [ + '--quantMode TranscriptomeSAM', + '--twopassMode Basic', + '--outSAMtype BAM Unsorted', + '--readFilesCommand zcat', + '--runRNGseed 0', + '--outFilterMultimapNmax 20', + '--alignSJDBoverhangMin 1', + '--outSAMattributes NH HI AS NM MD', + '--quantTranscriptomeBan Singleend', + '--outSAMstrandField intronMotif', + params.save_unaligned ? '--outReadsUnmapped Fastx' : '', + params.extra_star_align_args ? params.extra_star_align_args.split("\\s(?=--)") : '' + ].flatten().unique(false).join(' ').trim() } + publishDir = [ + [ + path: { "${params.outdir}/${params.aligner}/log" }, + mode: params.publish_dir_mode, + pattern: '*.{out,tab}' + ], + [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: '*.bam', + saveAs: { params.save_align_intermeds ? it : null } + ], + [ + path: { "${params.outdir}/${params.aligner}/unmapped" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + saveAs: { params.save_unaligned ? it : null } + ] + ] + } + + withName: '.*:QUANTIFY_STAR_SALMON:SALMON_QUANT' { + ext.args = { params.extra_salmon_quant_args ?: '' } + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') || filename.endsWith('_meta_info.json') ? null : filename } + ] + } + + withName: '.*:QUANTIFY_STAR_SALMON:TX2GENE' { + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:QUANTIFY_STAR_SALMON:TXIMPORT' { + ext.prefix = { "${quant_type}.merged" } + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:QUANTIFY_STAR_SALMON:SE_.*' { + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + + if (params.with_umi) { + process { + withName: 'NFCORE_RNASEQ:RNASEQ:SAMTOOLS_SORT' { + ext.args = '-n' + ext.prefix = { "${meta.id}.umi_dedup.transcriptome" } + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: '*.bam', + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? it : null } + ] + } + + withName: 'NFCORE_RNASEQ:RNASEQ:UMITOOLS_PREPAREFORSALMON' { + ext.prefix = { "${meta.id}.umi_dedup.transcriptome.filtered" } + publishDir = [ + [ + path: { "${params.outdir}/${params.aligner}/umitools/log" }, + mode: params.publish_dir_mode, + pattern: '*.log' + ], + [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: '*.bam', + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? it : null } + ] + ] + } + + withName: 'NFCORE_RNASEQ:RNASEQ:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_SORT' { + ext.prefix = { "${meta.id}.transcriptome.sorted" } + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: '*.bam', + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? it : null } + ] + } + + withName: 'NFCORE_RNASEQ:RNASEQ:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_INDEX' { + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: '*.bai', + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? it : null } + ] + } + + withName: 'NFCORE_RNASEQ:RNASEQ:BAM_SORT_STATS_SAMTOOLS:BAM_STATS_SAMTOOLS:.*' { + ext.prefix = { "${meta.id}.transcriptome.sorted.bam" } + publishDir = [ + path: { "${params.outdir}/${params.aligner}/samtools_stats" }, + mode: params.publish_dir_mode, + pattern: '*.{stats,flagstat,idxstats}', + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? it : null } + ] + } + + withName: '.*:BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME:UMITOOLS_DEDUP' { + ext.args = { [ + meta.single_end ? '' : '--unpaired-reads=discard --chimeric-pairs=discard', + params.umitools_grouping_method ? "--method='${params.umitools_grouping_method}'" : '', + params.umitools_umi_separator ? "--umi-separator='${params.umitools_umi_separator}'" : '' + ].join(' ').trim() } + ext.prefix = { "${meta.id}.umi_dedup.transcriptome.sorted" } + publishDir = [ + [ + path: { "${params.outdir}/${params.aligner}/umitools" }, + mode: params.publish_dir_mode, + pattern: '*.tsv' + ], + [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: '*.bam', + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? it : null } + ] + ] + } + + withName: '.*:BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME:SAMTOOLS_INDEX' { + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: '*.bai', + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? it : null } + ] + } + + withName: '.*:BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME:BAM_STATS_SAMTOOLS:.*' { + ext.prefix = { "${meta.id}.umi_dedup.transcriptome.sorted.bam" } + publishDir = [ + path: { "${params.outdir}/${params.aligner}/samtools_stats" }, + mode: params.publish_dir_mode, + pattern: '*.{stats,flagstat,idxstats}' + ] + } + } + } + + if (!params.skip_qc & !params.skip_deseq2_qc) { + process { + withName: 'DESEQ2_QC_STAR_SALMON' { + ext.args = { [ + "--id_col 1", + "--sample_suffix ''", + "--count_col 3", + params.deseq2_vst ? '--vst TRUE' : '' + ].join(' ').trim() } + ext.args2 = 'star_salmon' + publishDir = [ + path: { "${params.outdir}/${params.aligner}/deseq2_qc" }, + mode: params.publish_dir_mode, + pattern: "*{RData,pca.vals.txt,plots.pdf,sample.dists.txt,size_factors,log}" + ] + } + } + } +} + +// +// STAR RSEM alignment options +// + +if (!params.skip_alignment && params.aligner == 'star_rsem') { + process { + withName: '.*:QUANTIFY_RSEM:RSEM_CALCULATEEXPRESSION' { + ext.args = [ + '--star', + '--star-output-genome-bam', + '--star-gzipped-read-file', + '--estimate-rspd', + '--seed 1' + ].join(' ').trim() + publishDir = [ + [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: "*.{stat,results}" + ], + [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: "*.bam", + saveAs: { params.save_align_intermeds ? it : null } + ], + [ + path: { "${params.outdir}/${params.aligner}/log" }, + mode: params.publish_dir_mode, + pattern: "*.log" + ] + ] + } + + withName: '.*:QUANTIFY_RSEM:RSEM_MERGE_COUNTS' { + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + + if (!params.skip_qc & !params.skip_deseq2_qc) { + process { + withName: 'DESEQ2_QC_RSEM' { + ext.args = { [ + "--id_col 1", + "--sample_suffix ''", + "--count_col 3", + params.deseq2_vst ? '--vst TRUE' : '' + ].join(' ').trim() } + ext.args2 = 'star_rsem' + publishDir = [ + path: { "${params.outdir}/${params.aligner}/deseq2_qc" }, + mode: params.publish_dir_mode, + pattern: "*{RData,pca.vals.txt,plots.pdf,sample.dists.txt,size_factors,log}" + ] + } + } + } +} + +// +// HISAT2 alignment options +// + +if (!params.skip_alignment && params.aligner == 'hisat2') { + process { + withName: '.*:FASTQ_ALIGN_HISAT2:HISAT2_ALIGN' { + ext.args = '--met-stderr --new-summary --dta' + publishDir = [ + [ + path: { "${params.outdir}/${params.aligner}/log" }, + mode: params.publish_dir_mode, + pattern: '*.log' + ], + [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: '*.bam', + saveAs: { params.save_align_intermeds ? it : null } + ], + [ + path: { "${params.outdir}/${params.aligner}/unmapped" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + saveAs: { params.save_unaligned ? it : null } + ] + ] + } + } +} + +// +// Post-alignment QC options +// + +if (!params.skip_alignment && !params.skip_qc) { + if (!params.skip_preseq) { + process { + withName: 'PRESEQ_LCEXTRAP' { + ext.args = '-verbose -bam -seed 1 -seg_len 100000000' + publishDir = [ + [ + path: { "${params.outdir}/${params.aligner}/preseq" }, + mode: params.publish_dir_mode, + pattern: "*.txt" + ], + [ + path: { "${params.outdir}/${params.aligner}/preseq/log" }, + mode: params.publish_dir_mode, + pattern: "*.log" + ] + ] + } + } + } + + if (!params.skip_qualimap) { + process { + withName: 'QUALIMAP_RNASEQ' { + publishDir = [ + path: { "${params.outdir}/${params.aligner}/qualimap" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + } + + if (!params.skip_dupradar) { + process { + withName: 'DUPRADAR' { + publishDir = [ + [ + path: { "${params.outdir}/${params.aligner}/dupradar/scatter_plot" }, + mode: params.publish_dir_mode, + pattern: "*Dens.pdf" + ], + [ + path: { "${params.outdir}/${params.aligner}/dupradar/box_plot" }, + mode: params.publish_dir_mode, + pattern: "*Boxplot.pdf" + ], + [ + path: { "${params.outdir}/${params.aligner}/dupradar/histogram" }, + mode: params.publish_dir_mode, + pattern: "*Hist.pdf" + ], + [ + path: { "${params.outdir}/${params.aligner}/dupradar/gene_data" }, + mode: params.publish_dir_mode, + pattern: "*Matrix.txt" + ], + [ + path: { "${params.outdir}/${params.aligner}/dupradar/intercepts_slope" }, + mode: params.publish_dir_mode, + pattern: "*slope.txt" + ] + ] + } + } + } + + if (!params.skip_biotype_qc && params.featurecounts_group_type) { + process { + withName: 'SUBREAD_FEATURECOUNTS' { + ext.args = { [ + '-B -C', + params.gencode ? "-g gene_type" : "-g $params.featurecounts_group_type", + "-t $params.featurecounts_feature_type" + ].join(' ').trim() } + publishDir = [ + path: { "${params.outdir}/${params.aligner}/featurecounts" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'MULTIQC_CUSTOM_BIOTYPE' { + publishDir = [ + path: { "${params.outdir}/${params.aligner}/featurecounts" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + } + + if (!params.skip_rseqc && 'bam_stat' in rseqc_modules) { + process { + withName: '.*:BAM_RSEQC:RSEQC_BAMSTAT' { + publishDir = [ + path: { "${params.outdir}/${params.aligner}/rseqc/bam_stat" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + } + + if (!params.skip_rseqc && 'infer_experiment' in rseqc_modules) { + process { + withName: '.*:BAM_RSEQC:RSEQC_INFEREXPERIMENT' { + publishDir = [ + path: { "${params.outdir}/${params.aligner}/rseqc/infer_experiment" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + } + + if (!params.skip_rseqc && 'junction_annotation' in rseqc_modules) { + process { + withName: '.*:BAM_RSEQC:RSEQC_JUNCTIONANNOTATION' { + publishDir = [ + [ + path: { "${params.outdir}/${params.aligner}/rseqc/junction_annotation/pdf" }, + mode: params.publish_dir_mode, + pattern: '*.pdf' + ], + [ + path: { "${params.outdir}/${params.aligner}/rseqc/junction_annotation/bed" }, + mode: params.publish_dir_mode, + pattern: '*.bed' + ], + [ + path: { "${params.outdir}/${params.aligner}/rseqc/junction_annotation/xls" }, + mode: params.publish_dir_mode, + pattern: '*.xls' + ], + [ + path: { "${params.outdir}/${params.aligner}/rseqc/junction_annotation/log" }, + mode: params.publish_dir_mode, + pattern: '*.log' + ], + [ + path: { "${params.outdir}/${params.aligner}/rseqc/junction_annotation/rscript" }, + mode: params.publish_dir_mode, + pattern: '*.r' + ] + ] + } + } + } + + if (!params.skip_rseqc && 'junction_saturation' in rseqc_modules) { + process { + withName: '.*:BAM_RSEQC:RSEQC_JUNCTIONSATURATION' { + publishDir = [ + [ + path: { "${params.outdir}/${params.aligner}/rseqc/junction_saturation/pdf" }, + mode: params.publish_dir_mode, + pattern: '*.pdf' + ], + [ + path: { "${params.outdir}/${params.aligner}/rseqc/junction_saturation/rscript" }, + mode: params.publish_dir_mode, + pattern: '*.r' + ] + ] + } + } + } + + if (!params.skip_rseqc && 'read_duplication' in rseqc_modules) { + process { + withName: '.*:BAM_RSEQC:RSEQC_READDUPLICATION' { + publishDir = [ + [ + path: { "${params.outdir}/${params.aligner}/rseqc/read_duplication/pdf" }, + mode: params.publish_dir_mode, + pattern: '*.pdf' + ], + [ + path: { "${params.outdir}/${params.aligner}/rseqc/read_duplication/xls" }, + mode: params.publish_dir_mode, + pattern: '*.xls' + ], + [ + path: { "${params.outdir}/${params.aligner}/rseqc/read_duplication/rscript" }, + mode: params.publish_dir_mode, + pattern: '*.r' + ] + ] + } + } + } + + if (!params.skip_rseqc && 'read_distribution' in rseqc_modules && !params.bam_csi_index) { + process { + withName: '.*:BAM_RSEQC:RSEQC_READDISTRIBUTION' { + publishDir = [ + path: { "${params.outdir}/${params.aligner}/rseqc/read_distribution" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + } + + if (!params.skip_rseqc && 'inner_distance' in rseqc_modules && !params.bam_csi_index) { + process { + withName: '.*:BAM_RSEQC:RSEQC_INNERDISTANCE' { + publishDir = [ + [ + path: { "${params.outdir}/${params.aligner}/rseqc/inner_distance/txt" }, + mode: params.publish_dir_mode, + pattern: '*.txt', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ], + [ + path: { "${params.outdir}/${params.aligner}/rseqc/inner_distance/pdf" }, + mode: params.publish_dir_mode, + pattern: '*.pdf' + ], + [ + path: { "${params.outdir}/${params.aligner}/rseqc/inner_distance/rscript" }, + mode: params.publish_dir_mode, + pattern: '*.r' + ] + ] + } + } + } + + if (!params.skip_rseqc && 'tin' in rseqc_modules && !params.bam_csi_index) { + process { + withName: '.*:BAM_RSEQC:RSEQC_TIN' { + publishDir = [ + path: { "${params.outdir}/${params.aligner}/rseqc/tin" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + } +} + +if (!params.skip_multiqc) { + process { + withName: 'MULTIQC' { + ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } + ext.prefix = "multiqc_report" + publishDir = [ + path: { [ + "${params.outdir}/multiqc", + params.skip_alignment? '' : "/${params.aligner}" + ].join('') }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } +} + +// +// Salmon/ Kallisto pseudoalignment options +// + +if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'salmon') { + process { + withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:SALMON_QUANT' { + ext.args = { params.extra_salmon_quant_args ?: '' } + publishDir = [ + path: { "${params.outdir}/${params.pseudo_aligner}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') || filename.endsWith('_meta_info.json') ? null : filename } + ] + } + } +} + +if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'kallisto') { + process { + withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:KALLISTO_QUANT' { + ext.args = params.extra_kallisto_quant_args ?: '' + publishDir = [ + path: { "${params.outdir}/${params.pseudo_aligner}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') || filename.endsWith('.run_info.json') || filename.endsWith('.log') ? null : filename } + ] + } + } +} + +if (!params.skip_pseudo_alignment && params.pseudo_aligner) { + process { + withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:TX2GENE' { + publishDir = [ + path: { "${params.outdir}/${params.pseudo_aligner}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:TXIMPORT' { + ext.prefix = { "${quant_type}.merged" } + publishDir = [ + path: { "${params.outdir}/${params.pseudo_aligner}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:SE_.*' { + publishDir = [ + path: { "${params.outdir}/${params.pseudo_aligner}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + + if (!params.skip_qc & !params.skip_deseq2_qc) { + process { + withName: 'DESEQ2_QC_PSEUDO' { + ext.args = { [ + "--id_col 1", + "--sample_suffix ''", + "--count_col 3", + params.deseq2_vst ? '--vst TRUE' : '' + ].join(' ').trim() } + ext.args2 = { params.pseudo_aligner } + publishDir = [ + path: { "${params.outdir}/${params.pseudo_aligner}/deseq2_qc" }, + mode: params.publish_dir_mode, + pattern: "*{RData,pca.vals.txt,plots.pdf,sample.dists.txt,size_factors,log}" + ] + } + } + } } diff --git a/conf/test.config b/conf/test.config index e10ad7e..2884e4d 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,6 +20,9 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/002aa1f1df77b4dbfc816340c8d7d629805b166b/testdata/samplesheet.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/riboseq/testdata/samplesheet.csv' + fasta = 'http://ftp.ensembl.org/pub/release-110/fasta/mus_musculus/dna/Mus_musculus.GRCm39.dna.chromosome.1.fa.gz' + gtf = 'http://ftp.ensembl.org/pub/release-110/gtf/mus_musculus/Mus_musculus.GRCm39.110.gtf.gz' + min_trimmed_reads = 1000 } diff --git a/lib/WorkflowRiboseq.groovy b/lib/WorkflowRiboseq.groovy index a4cd33d..39d4b8d 100755 --- a/lib/WorkflowRiboseq.groovy +++ b/lib/WorkflowRiboseq.groovy @@ -1,8 +1,9 @@ // -// This file holds several functions specific to the workflow/riboseq.nf in the nf-core/riboseq pipeline +// This file holds several functions specific to the workflow/rnaseq.nf in the nf-core/rnaseq pipeline // import nextflow.Nextflow +import groovy.json.JsonSlurper import groovy.text.SimpleTemplateEngine class WorkflowRiboseq { @@ -11,13 +12,237 @@ class WorkflowRiboseq { // Check and validate parameters // public static void initialise(params, log) { - genomeExistsError(params, log) + if (!params.gtf && !params.gff) { + Nextflow.error("No GTF or GFF3 annotation specified! The pipeline requires at least one of these files.") + } + + if (params.gtf) { + if (params.gff) { + gtfGffWarn(log) + } + if (params.genome == 'GRCh38' && params.gtf.contains('Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.gtf')) { + ncbiGenomeWarn(log) + } + if (params.gtf.contains('/UCSC/') && params.gtf.contains('Annotation/Genes/genes.gtf')) { + ucscGenomeWarn(log) + } + } + + if (params.transcript_fasta) { + transcriptsFastaWarn(log) + } + + if (!params.skip_bbsplit && !params.bbsplit_index && !params.bbsplit_fasta_list) { + Nextflow.error("Please provide either --bbsplit_fasta_list / --bbsplit_index to run BBSplit.") + } + + if (params.remove_ribo_rna && !params.ribo_database_manifest) { + Nextflow.error("Please provide --ribo_database_manifest to remove ribosomal RNA with SortMeRNA.") + } + + + if (params.with_umi && !params.skip_umi_extract) { + if (!params.umitools_bc_pattern && !params.umitools_bc_pattern2) { + Nextflow.error("UMI-tools requires a barcode pattern to extract barcodes from the reads.") + } + } + + if (params.skip_alignment) { + skipAlignmentWarn(log) + } + + if (!params.skip_pseudo_alignment && params.pseudo_aligner) { + if (!(params.salmon_index || params.transcript_fasta || (params.fasta && (params.gtf || params.gff)))) { + Nextflow.error("To use `--pseudo_aligner 'salmon'`, you must provide either --salmon_index or --transcript_fasta or both --fasta and --gtf / --gff.") + } + } + + // Checks when running --aligner star_rsem + if (!params.skip_alignment && params.aligner == 'star_rsem') { + if (params.with_umi) { + rsemUmiError(log) + } + if (params.rsem_index && params.star_index) { + rsemStarIndexWarn(log) + } + if (params.aligner == 'star_rsem' && params.extra_star_align_args) { + rsemStarExtraArgumentsWarn(log) + } + } - if (!params.fasta) { - Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." + // Warn if --additional_fasta provided with aligner index + if (!params.skip_alignment && params.additional_fasta) { + def index = '' + if (params.aligner == 'star_salmon' && params.star_index) { + index = 'star' + } + if (params.aligner == 'star_rsem' && params.rsem_index) { + index = 'rsem' + } + if (params.aligner == 'hisat2' && params.hisat2_index) { + index = 'hisat2' + } + if (index) { + additionaFastaIndexWarn(index, log) + } } + + // Check which RSeQC modules we are running + def valid_rseqc_modules = ['bam_stat', 'inner_distance', 'infer_experiment', 'junction_annotation', 'junction_saturation', 'read_distribution', 'read_duplication', 'tin'] + def rseqc_modules = params.rseqc_modules ? params.rseqc_modules.split(',').collect{ it.trim().toLowerCase() } : [] + if ((valid_rseqc_modules + rseqc_modules).unique().size() != valid_rseqc_modules.size()) { + Nextflow.error("Invalid option: ${params.rseqc_modules}. Valid options for '--rseqc_modules': ${valid_rseqc_modules.join(', ')}") + } + } + + // + // Function to validate channels from input samplesheet + // + public static ArrayList validateInput(input) { + def (metas, fastqs) = input[1..2] + + // Check that multiple runs of the same sample are of the same strandedness + def strandedness_ok = metas.collect{ it.strandedness }.unique().size == 1 + if (!strandedness_ok) { + Nextflow.error("Please check input samplesheet -> Multiple runs of a sample must have the same strandedness!: ${metas[0].id}") + } + + // Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end + def endedness_ok = metas.collect{ it.single_end }.unique().size == 1 + if (!endedness_ok) { + Nextflow.error("Please check input samplesheet -> Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end: ${metas[0].id}") + } + + return [ metas[0], fastqs ] + } + + // + // Function to check whether biotype field exists in GTF file + // + public static Boolean biotypeInGtf(gtf_file, biotype, log) { + def hits = 0 + gtf_file.eachLine { line -> + def attributes = line.split('\t')[-1].split() + if (attributes.contains(biotype)) { + hits += 1 + } + } + if (hits) { + return true + } else { + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " Biotype attribute '${biotype}' not found in the last column of the GTF file!\n\n" + + " Biotype QC will be skipped to circumvent the issue below:\n" + + " https://github.com/nf-core/rnaseq/issues/460\n\n" + + " Amend '--featurecounts_group_type' to change this behaviour.\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + return false + } + } + + // + // Function to generate an error if contigs in genome fasta file > 512 Mbp + // + public static void checkMaxContigSize(fai_file, log) { + def max_size = 512000000 + fai_file.eachLine { line -> + def lspl = line.split('\t') + def chrom = lspl[0] + def size = lspl[1] + if (size.toInteger() > max_size) { + def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " Contig longer than ${max_size}bp found in reference genome!\n\n" + + " ${chrom}: ${size}\n\n" + + " Provide the '--bam_csi_index' parameter to use a CSI instead of BAI index.\n\n" + + " Please see:\n" + + " https://github.com/nf-core/rnaseq/issues/744\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + Nextflow.error(error_string) + } + } + } + + // + // Function that parses Salmon quant 'meta_info.json' output file to get inferred strandedness + // + public static String getSalmonInferredStrandedness(json_file) { + def lib_type = new JsonSlurper().parseText(json_file.text).get('library_types')[0] + def strandedness = 'reverse' + if (lib_type) { + if (lib_type in ['U', 'IU']) { + strandedness = 'unstranded' + } else if (lib_type in ['SF', 'ISF']) { + strandedness = 'forward' + } else if (lib_type in ['SR', 'ISR']) { + strandedness = 'reverse' + } + } + return strandedness + } + + // + // Function that parses TrimGalore log output file to get total number of reads after trimming + // + public static Integer getTrimGaloreReadsAfterFiltering(log_file) { + def total_reads = 0 + def filtered_reads = 0 + log_file.eachLine { line -> + def total_reads_matcher = line =~ /([\d\.]+)\ssequences processed in total/ + def filtered_reads_matcher = line =~ /shorter than the length cutoff[^:]+:\s([\d\.]+)/ + if (total_reads_matcher) total_reads = total_reads_matcher[0][1].toFloat() + if (filtered_reads_matcher) filtered_reads = filtered_reads_matcher[0][1].toFloat() + } + return total_reads - filtered_reads + } + + // + // Function that parses and returns the alignment rate from the STAR log output + // + public static ArrayList getStarPercentMapped(params, align_log) { + def percent_aligned = 0 + def pattern = /Uniquely mapped reads %\s*\|\s*([\d\.]+)%/ + align_log.eachLine { line -> + def matcher = line =~ pattern + if (matcher) { + percent_aligned = matcher[0][1].toFloat() + } + } + + def pass = false + if (percent_aligned >= params.min_mapped_reads.toFloat()) { + pass = true + } + return [ percent_aligned, pass ] + } + + // + // Function that parses and returns the predicted strandedness from the RSeQC infer_experiment.py output + // + public static ArrayList getInferexperimentStrandedness(inferexperiment_file, cutoff=30) { + def sense = 0 + def antisense = 0 + def undetermined = 0 + inferexperiment_file.eachLine { line -> + def undetermined_matcher = line =~ /Fraction of reads failed to determine:\s([\d\.]+)/ + def se_sense_matcher = line =~ /Fraction of reads explained by "\++,--":\s([\d\.]+)/ + def se_antisense_matcher = line =~ /Fraction of reads explained by "\+-,-\+":\s([\d\.]+)/ + def pe_sense_matcher = line =~ /Fraction of reads explained by "1\++,1--,2\+-,2-\+":\s([\d\.]+)/ + def pe_antisense_matcher = line =~ /Fraction of reads explained by "1\+-,1-\+,2\+\+,2--":\s([\d\.]+)/ + if (undetermined_matcher) undetermined = undetermined_matcher[0][1].toFloat() * 100 + if (se_sense_matcher) sense = se_sense_matcher[0][1].toFloat() * 100 + if (se_antisense_matcher) antisense = se_antisense_matcher[0][1].toFloat() * 100 + if (pe_sense_matcher) sense = pe_sense_matcher[0][1].toFloat() * 100 + if (pe_antisense_matcher) antisense = pe_antisense_matcher[0][1].toFloat() * 100 + } + def strandedness = 'unstranded' + if (sense >= 100-cutoff) { + strandedness = 'forward' + } else if (antisense >= 100-cutoff) { + strandedness = 'reverse' + } + return [ strandedness, sense, antisense, undetermined ] } // @@ -50,9 +275,7 @@ class WorkflowRiboseq { // // Generate methods description for MultiQC // - public static String toolCitationText(params) { - // TODO nf-core: Optionally add in-text citation tools to this list. // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", // Uncomment function in methodsDescriptionText to render in MultiQC report @@ -67,7 +290,6 @@ class WorkflowRiboseq { } public static String toolBibliographyText(params) { - // TODO Optionally add bibliographic entries to this list. // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
  • Author (2023) Pub name, Journal, DOI
  • " : "", // Uncomment function in methodsDescriptionText to render in MultiQC report @@ -97,7 +319,6 @@ class WorkflowRiboseq { //meta["tool_citations"] = toolCitationText(params).replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") //meta["tool_bibliography"] = toolBibliographyText(params) - def methods_text = mqc_methods_yaml.text def engine = new SimpleTemplateEngine() @@ -106,6 +327,18 @@ class WorkflowRiboseq { return description_html } + // + // Create MultiQC tsv custom content from a list of values + // + public static String multiqcTsvFromList(tsv_data, header) { + def tsv_string = "" + if (tsv_data.size() > 0) { + tsv_string += "${header.join('\t')}\n" + tsv_string += tsv_data.join('\n') + } + return tsv_string + } + // // Exit pipeline if incorrect --genome key provided // @@ -119,4 +352,119 @@ class WorkflowRiboseq { Nextflow.error(error_string) } } + + // + // Print a warning if using GRCh38 assembly from igenomes.config + // + private static void ncbiGenomeWarn(log) { + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " When using '--genome GRCh38' the assembly is from the NCBI and NOT Ensembl.\n" + + " Biotype QC will be skipped to circumvent the issue below:\n" + + " https://github.com/nf-core/rnaseq/issues/460\n\n" + + " If you would like to use the soft-masked Ensembl assembly instead please see:\n" + + " https://github.com/nf-core/rnaseq/issues/159#issuecomment-501184312\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + } + + // + // Print a warning if using a UCSC assembly from igenomes.config + // + private static void ucscGenomeWarn(log) { + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " When using UCSC assemblies the 'gene_biotype' field is absent from the GTF file.\n" + + " Biotype QC will be skipped to circumvent the issue below:\n" + + " https://github.com/nf-core/rnaseq/issues/460\n\n" + + " If you would like to use the soft-masked Ensembl assembly instead please see:\n" + + " https://github.com/nf-core/rnaseq/issues/159#issuecomment-501184312\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + } + + // + // Print a warning if both GTF and GFF have been provided + // + private static void gtfGffWarn(log) { + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " Both '--gtf' and '--gff' parameters have been provided.\n" + + " Using GTF file as priority.\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + } + + // + // Print a warning if using '--transcript_fasta' + // + private static void transcriptsFastaWarn(log) { + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " '--transcript_fasta' parameter has been provided.\n" + + " Make sure transcript names in this file match those in the GFF/GTF file.\n\n" + + " Please see:\n" + + " https://github.com/nf-core/rnaseq/issues/753\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + } + + // + // Print a warning if --skip_alignment has been provided + // + private static void skipAlignmentWarn(log) { + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " '--skip_alignment' parameter has been provided.\n" + + " Skipping alignment, genome-based quantification and all downstream QC processes.\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + } + + // + // Print a warning if using '--aligner star_rsem' and '--with_umi' + // + private static void rsemUmiError(log) { + def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " When using '--aligner star_rsem', STAR is run by RSEM itself and so it is\n" + + " not possible to remove UMIs before the quantification.\n\n" + + " If you would like to remove UMI barcodes using the '--with_umi' option\n" + + " please use either '--aligner star_salmon' or '--aligner hisat2'.\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + Nextflow.error(error_string) + } + + // + // Print a warning if using '--aligner star_rsem' and providing both '--rsem_index' and '--star_index' + // + private static void rsemStarIndexWarn(log) { + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " When using '--aligner star_rsem', both the STAR and RSEM indices should\n" + + " be present in the path specified by '--rsem_index'.\n\n" + + " This warning has been generated because you have provided both\n" + + " '--rsem_index' and '--star_index'. The pipeline will ignore the latter.\n\n" + + " Please see:\n" + + " https://github.com/nf-core/rnaseq/issues/568\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + } + + // + // Print a warning if using '--aligner star_rsem' and providing '--star_extra_alignment_args' + // + private static void rsemStarExtraArgumentsWarn(log) { + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " No additional arguments can be passed to STAR when using RSEM.\n" + + " Because RSEM enforces its own parameters for STAR, any extra arguments\n" + + " to STAR will be ignored. Alternatively, choose the STAR+Salmon route.\n\n" + + " This warning has been generated because you have provided both\n" + + " '--aligner star_rsem' and '--extra_star_align_args'.\n\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + } + + // + // Print a warning if using '--additional_fasta' and '--_index' + // + private static void additionaFastaIndexWarn(index, log) { + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " When using '--additional_fasta ' the aligner index will not\n" + + " be re-built with the transgenes incorporated by default since you have \n" + + " already provided an index via '--${index}_index '.\n\n" + + " Set '--additional_fasta --${index}_index false --gene_bed false --save_reference'\n" + + " to re-build the index with transgenes included and the index and gene BED file will be saved in\n" + + " 'results/genome/index/${index}/' for re-use with '--${index}_index'.\n\n" + + " Ignore this warning if you know that the index already contains transgenes.\n\n" + + " Please see:\n" + + " https://github.com/nf-core/rnaseq/issues/556\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + } } diff --git a/main.nf b/main.nf index 91453db..12ab6bb 100644 --- a/main.nf +++ b/main.nf @@ -17,10 +17,18 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// TODO nf-core: Remove this line if you don't need a FASTA file -// This is an example of how to use getGenomeAttribute() to fetch parameters -// from igenomes.config using `--genome` -params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') +params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') +params.transcript_fasta = WorkflowMain.getGenomeAttribute(params, 'transcript_fasta') +params.additional_fasta = WorkflowMain.getGenomeAttribute(params, 'additional_fasta') +params.gtf = WorkflowMain.getGenomeAttribute(params, 'gtf') +params.gff = WorkflowMain.getGenomeAttribute(params, 'gff') +params.gene_bed = WorkflowMain.getGenomeAttribute(params, 'bed12') +params.bbsplit_index = WorkflowMain.getGenomeAttribute(params, 'bbsplit') +params.star_index = WorkflowMain.getGenomeAttribute(params, 'star') +params.hisat2_index = WorkflowMain.getGenomeAttribute(params, 'hisat2') +params.rsem_index = WorkflowMain.getGenomeAttribute(params, 'rsem') +params.salmon_index = WorkflowMain.getGenomeAttribute(params, 'salmon') +params.kallisto_index = WorkflowMain.getGenomeAttribute(params, 'kallisto') /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/modules.json b/modules.json index 72b4ff3..ffdc080 100644 --- a/modules.json +++ b/modules.json @@ -5,20 +5,124 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "bbmap/bbsplit": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "cat/fastq": { + "branch": "master", + "git_sha": "02fd5bd7275abad27aad32d5c852e0a9b1b98882", + "installed_by": ["modules"] + }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", + "git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93", + "installed_by": ["modules"] + }, + "custom/getchromsizes": { + "branch": "master", + "git_sha": "1b0ffa4e5aed5b7e3cd4311af31bd3b2c8345051", + "installed_by": ["modules"] + }, + "cutadapt": { + "branch": "master", + "git_sha": "07881e42938b4f0070e864b45d424b01745bc3a4", + "installed_by": ["modules"] + }, + "fastp": { + "branch": "master", + "git_sha": "c9488585ce7bd35ccd2a30faa2371454c8112fb9", "installed_by": ["modules"] }, "fastqc": { "branch": "master", - "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", + "git_sha": "c9488585ce7bd35ccd2a30faa2371454c8112fb9", + "installed_by": ["modules"] + }, + "fq/subsample": { + "branch": "master", + "git_sha": "59d1faf07ace4f7a00f7fa7778bce4e1f1dcdd63", + "installed_by": ["fastq_subsample_fq_salmon"] + }, + "gffread": { + "branch": "master", + "git_sha": "9a6dc58f7d65c1e6f0ee0107521aea85bbed83cb", + "installed_by": ["modules"] + }, + "gunzip": { + "branch": "master", + "git_sha": "3a5fef109d113b4997c9822198664ca5f2716208", + "installed_by": ["modules"] + }, + "hisat2/build": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "hisat2/extractsplicesites": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "kallisto/index": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "multiqc": { "branch": "master", - "git_sha": "4ab13872435962dadc239979554d13709e20bf29", + "git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93", + "installed_by": ["modules"] + }, + "rsem/preparereference": { + "branch": "master", + "git_sha": "8ec12d844837b0167540c910afe4d3f920cd7114", + "installed_by": ["modules"] + }, + "salmon/index": { + "branch": "master", + "git_sha": "ce801ced94947b2ff01002b4120eef2f79489b34", + "installed_by": ["fastq_subsample_fq_salmon", "modules"] + }, + "salmon/quant": { + "branch": "master", + "git_sha": "ce801ced94947b2ff01002b4120eef2f79489b34", + "installed_by": ["fastq_subsample_fq_salmon"] + }, + "sortmerna": { + "branch": "master", + "git_sha": "ce558e30784469b88a16923ca96d81899d240b42", + "installed_by": ["modules"] + }, + "star/genomegenerate": { + "branch": "master", + "git_sha": "d87a6e2156c2099c09280fa70776eaf0a824817a", + "installed_by": ["modules"] + }, + "trimgalore": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "umitools/extract": { + "branch": "master", + "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", "installed_by": ["modules"] + }, + "untar": { + "branch": "master", + "git_sha": "e719354ba77df0a1bd310836aa2039b45c29d620", + "installed_by": ["modules"] + } + } + }, + "subworkflows": { + "nf-core": { + "fastq_subsample_fq_salmon": { + "branch": "master", + "git_sha": "a4bceac1aecee5aa0a5dbc601baf0e2e61013fb2", + "installed_by": ["subworkflows"] } } } diff --git a/modules/local/cat_additional_fasta/main.nf b/modules/local/cat_additional_fasta/main.nf new file mode 100644 index 0000000..72af4bc --- /dev/null +++ b/modules/local/cat_additional_fasta/main.nf @@ -0,0 +1,42 @@ +process CAT_ADDITIONAL_FASTA { + tag "$add_fasta" + + conda "conda-forge::python=3.9.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'biocontainers/python:3.9--1' }" + + input: + path fasta + path gtf + path add_fasta + val biotype + + output: + path "${name}.fasta", emit: fasta + path "${name}.gtf" , emit: gtf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def genome_name = params.genome ? params.genome : fasta.getBaseName() + def biotype_name = biotype ? "-b $biotype" : '' + def add_name = add_fasta.getBaseName() + name = "${genome_name}_${add_name}" + """ + fasta2gtf.py \\ + -o ${add_fasta.baseName}.gtf \\ + $biotype_name \\ + $add_fasta + + cat $fasta $add_fasta > ${name}.fasta + cat $gtf ${add_fasta.baseName}.gtf > ${name}.gtf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/gtf2bed/main.nf b/modules/local/gtf2bed/main.nf new file mode 100644 index 0000000..65d3f7a --- /dev/null +++ b/modules/local/gtf2bed/main.nf @@ -0,0 +1,31 @@ +process GTF2BED { + tag "$gtf" + label 'process_low' + + conda "conda-forge::perl=5.26.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/perl:5.26.2' : + 'biocontainers/perl:5.26.2' }" + + input: + path gtf + + output: + path '*.bed' , emit: bed + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: // This script is bundled with the pipeline, in nf-core/riboseq/bin/ + """ + gtf2bed \\ + $gtf \\ + > ${gtf.baseName}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + perl: \$(echo \$(perl --version 2>&1) | sed 's/.*v\\(.*\\)) built.*/\\1/') + END_VERSIONS + """ +} diff --git a/modules/local/gtf_filter/main.nf b/modules/local/gtf_filter/main.nf new file mode 100644 index 0000000..b61d9cd --- /dev/null +++ b/modules/local/gtf_filter/main.nf @@ -0,0 +1,32 @@ +process GTF_FILTER { + tag "$fasta" + + conda "conda-forge::python=3.9.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'biocontainers/python:3.9--1' }" + + input: + path fasta + path gtf + + output: + path "*.filtered.gtf", emit: genome_gtf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: // filter_gtf.py is bundled with the pipeline, in nf-core/riboseq/bin/ + """ + filter_gtf.py \\ + --gtf $gtf \\ + --fasta $fasta \\ + --prefix ${fasta.baseName} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/preprocess_transcripts_fasta_gencode/main.nf b/modules/local/preprocess_transcripts_fasta_gencode/main.nf new file mode 100644 index 0000000..41f1d42 --- /dev/null +++ b/modules/local/preprocess_transcripts_fasta_gencode/main.nf @@ -0,0 +1,31 @@ +process PREPROCESS_TRANSCRIPTS_FASTA_GENCODE { + tag "$fasta" + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + path fasta + + output: + path "*.fa" , emit: fasta + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def gzipped = fasta.toString().endsWith('.gz') + def outfile = gzipped ? file(fasta.baseName).baseName : fasta.baseName + def command = gzipped ? 'zcat' : 'cat' + """ + $command $fasta | cut -d "|" -f1 > ${outfile}.fixed.fa + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/star_genomegenerate_igenomes/main.nf b/modules/local/star_genomegenerate_igenomes/main.nf new file mode 100644 index 0000000..7766faa --- /dev/null +++ b/modules/local/star_genomegenerate_igenomes/main.nf @@ -0,0 +1,68 @@ +process STAR_GENOMEGENERATE_IGENOMES { + tag "$fasta" + label 'process_high' + + conda "bioconda::star=2.6.1d bioconda::samtools=1.10 conda-forge::gawk=5.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:59cdd445419f14abac76b31dd0d71217994cbcc9-0' : + 'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:59cdd445419f14abac76b31dd0d71217994cbcc9-0' }" + + input: + path fasta + path gtf + + output: + path "star" , emit: index + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args_list = args.tokenize() + def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' + if (args_list.contains('--genomeSAindexNbases')) { + """ + mkdir star + STAR \\ + --runMode genomeGenerate \\ + --genomeDir star/ \\ + --genomeFastaFiles $fasta \\ + --sjdbGTFfile $gtf \\ + --runThreadN $task.cpus \\ + $memory \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } else { + """ + samtools faidx $fasta + NUM_BASES=`gawk '{sum = sum + \$2}END{if ((log(sum)/log(2))/2 - 1 > 14) {printf "%.0f", 14} else {printf "%.0f", (log(sum)/log(2))/2 - 1}}' ${fasta}.fai` + + mkdir star + STAR \\ + --runMode genomeGenerate \\ + --genomeDir star/ \\ + --genomeFastaFiles $fasta \\ + --sjdbGTFfile $gtf \\ + --runThreadN $task.cpus \\ + --genomeSAindexNbases \$NUM_BASES \\ + $memory \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/bbmap/bbsplit/environment.yml b/modules/nf-core/bbmap/bbsplit/environment.yml new file mode 100644 index 0000000..4c6db62 --- /dev/null +++ b/modules/nf-core/bbmap/bbsplit/environment.yml @@ -0,0 +1,7 @@ +name: bbmap_bbsplit +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bbmap=39.01 diff --git a/modules/nf-core/bbmap/bbsplit/main.nf b/modules/nf-core/bbmap/bbsplit/main.nf new file mode 100644 index 0000000..b1ae2c8 --- /dev/null +++ b/modules/nf-core/bbmap/bbsplit/main.nf @@ -0,0 +1,89 @@ +process BBMAP_BBSPLIT { + tag "$meta.id" + label 'process_high' + label 'error_retry' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bbmap:39.01--h5c4e2a8_0': + 'biocontainers/bbmap:39.01--h5c4e2a8_0' }" + + input: + tuple val(meta), path(reads) + path index + path primary_ref + tuple val(other_ref_names), path (other_ref_paths) + val only_build_index + + output: + path "bbsplit" , optional:true, emit: index + tuple val(meta), path('*primary*fastq.gz'), optional:true, emit: primary_fastq + tuple val(meta), path('*fastq.gz') , optional:true, emit: all_fastq + tuple val(meta), path('*txt') , optional:true, emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[BBSplit] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + + def other_refs = [] + other_ref_names.eachWithIndex { name, index -> + other_refs << "ref_${name}=${other_ref_paths[index]}" + } + if (only_build_index) { + if (primary_ref && other_ref_names && other_ref_paths) { + """ + bbsplit.sh \\ + -Xmx${avail_mem}M \\ + ref_primary=$primary_ref \\ + ${other_refs.join(' ')} \\ + path=bbsplit \\ + threads=$task.cpus \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bbmap: \$(bbversion.sh | grep -v "Duplicate cpuset") + END_VERSIONS + """ + } else { + log.error 'ERROR: Please specify as input a primary fasta file along with names and paths to non-primary fasta files.' + } + } else { + def index_files = '' + if (index) { + index_files = "path=$index" + } else if (primary_ref && other_ref_names && other_ref_paths) { + index_files = "ref_primary=${primary_ref} ${other_refs.join(' ')}" + } else { + log.error 'ERROR: Please either specify a BBSplit index as input or a primary fasta file along with names and paths to non-primary fasta files.' + } + def fastq_in = meta.single_end ? "in=${reads}" : "in=${reads[0]} in2=${reads[1]}" + def fastq_out = meta.single_end ? "basename=${prefix}_%.fastq.gz" : "basename=${prefix}_%_#.fastq.gz" + """ + bbsplit.sh \\ + -Xmx${avail_mem}M \\ + $index_files \\ + threads=$task.cpus \\ + $fastq_in \\ + $fastq_out \\ + refstats=${prefix}.stats.txt \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bbmap: \$(bbversion.sh | grep -v "Duplicate cpuset") + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/bbmap/bbsplit/meta.yml b/modules/nf-core/bbmap/bbsplit/meta.yml new file mode 100644 index 0000000..f4e0a5e --- /dev/null +++ b/modules/nf-core/bbmap/bbsplit/meta.yml @@ -0,0 +1,74 @@ +name: bbmap_bbsplit +description: Split sequencing reads by mapping them to multiple references simultaneously +keywords: + - align + - map + - fastq + - genome + - reference +tools: + - bbmap: + description: BBMap is a short read aligner, as well as various other bioinformatic tools. + homepage: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + documentation: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + licence: ["UC-LBL license (see package)"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - index: + type: directory + description: Directory to place generated index + pattern: "*" + - primary_ref: + type: file + description: Path to the primary reference + pattern: "*" + - other_ref_names: + type: list + description: List of other reference ids apart from the primary + - other_ref_paths: + type: list + description: Path to other references paths corresponding to "other_ref_names" + - only_build_index: + type: string + description: true = only build index; false = mapping +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - index: + type: directory + description: Directory with index files + pattern: "bbsplit" + - primary_fastq: + type: file + description: Output reads that map to the primary reference + pattern: "*primary*fastq.gz" + - all_fastq: + type: file + description: All reads mapping to any of the references + pattern: "*fastq.gz" + - stats: + type: file + description: Tab-delimited text file containing mapping statistics + pattern: "*.txt" +authors: + - "@joseespinosa" + - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/cat/fastq/environment.yml b/modules/nf-core/cat/fastq/environment.yml new file mode 100644 index 0000000..bff93ad --- /dev/null +++ b/modules/nf-core/cat/fastq/environment.yml @@ -0,0 +1,7 @@ +name: cat_fastq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::sed=4.7 diff --git a/modules/nf-core/cat/fastq/main.nf b/modules/nf-core/cat/fastq/main.nf new file mode 100644 index 0000000..3d96378 --- /dev/null +++ b/modules/nf-core/cat/fastq/main.nf @@ -0,0 +1,80 @@ +process CAT_FASTQ { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(reads, stageAs: "input*/*") + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size >= 1) { + """ + cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size >= 2) { + def read1 = [] + def read2 = [] + readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } + """ + cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz + cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size > 1) { + """ + touch ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size > 2) { + """ + touch ${prefix}_1.merged.fastq.gz + touch ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + +} diff --git a/modules/nf-core/cat/fastq/meta.yml b/modules/nf-core/cat/fastq/meta.yml new file mode 100644 index 0000000..db4ac3c --- /dev/null +++ b/modules/nf-core/cat/fastq/meta.yml @@ -0,0 +1,42 @@ +name: cat_fastq +description: Concatenates fastq files +keywords: + - cat + - fastq + - concatenate +tools: + - cat: + description: | + The cat utility reads files sequentially, writing them to the standard output. + documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files to be concatenated. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: Merged fastq file + pattern: "*.{merged.fastq.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test b/modules/nf-core/cat/fastq/tests/main.nf.test new file mode 100644 index 0000000..dab2e14 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test @@ -0,0 +1,138 @@ +nextflow_process { + + name "Test Process CAT_FASTQ" + script "../main.nf" + process "CAT_FASTQ" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/fastq" + + test("test_cat_fastq_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_cat_fastq_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_cat_fastq_single_end_same_name") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_cat_fastq_paired_end_same_name") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_cat_fastq_single_end_single_file") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test.snap b/modules/nf-core/cat/fastq/tests/main.nf.test.snap new file mode 100644 index 0000000..43dfe28 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test.snap @@ -0,0 +1,169 @@ +{ + "test_cat_fastq_single_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,ee314a9bd568d06617171b0c85f508da" + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,ee314a9bd568d06617171b0c85f508da" + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ] + } + ], + "timestamp": "2024-01-17T17:30:39.816981" + }, + "test_cat_fastq_single_end_same_name": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22" + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22" + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ] + } + ], + "timestamp": "2024-01-17T17:32:35.229332" + }, + "test_cat_fastq_single_end_single_file": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ] + } + ], + "timestamp": "2024-01-17T17:34:00.058829" + }, + "test_cat_fastq_paired_end_same_name": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22", + "test_2.merged.fastq.gz:md5,a52cab0b840c7178b0ea83df1fdbe8d5" + ] + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22", + "test_2.merged.fastq.gz:md5,a52cab0b840c7178b0ea83df1fdbe8d5" + ] + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ] + } + ], + "timestamp": "2024-01-17T17:33:33.031555" + }, + "test_cat_fastq_paired_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22", + "test_2.merged.fastq.gz:md5,a52cab0b840c7178b0ea83df1fdbe8d5" + ] + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22", + "test_2.merged.fastq.gz:md5,a52cab0b840c7178b0ea83df1fdbe8d5" + ] + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ] + } + ], + "timestamp": "2024-01-17T17:32:02.270935" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/fastq/tests/tags.yml b/modules/nf-core/cat/fastq/tests/tags.yml new file mode 100644 index 0000000..6ac4361 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/tags.yml @@ -0,0 +1,2 @@ +cat/fastq: + - modules/nf-core/cat/fastq/** diff --git a/modules/nf-core/custom/dumpsoftwareversions/environment.yml b/modules/nf-core/custom/dumpsoftwareversions/environment.yml index f0c63f6..9b3272b 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/environment.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/environment.yml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - bioconda::multiqc=1.17 + - bioconda::multiqc=1.19 diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index 7685b33..f218761 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -4,8 +4,8 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : - 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test index eec1db1..b1e1630 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test @@ -31,7 +31,12 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(process.out).match() } + { assert snapshot( + process.out.versions, + file(process.out.mqc_yml[0]).readLines()[0..10], + file(process.out.yml[0]).readLines()[0..7] + ).match() + } ) } } diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap index 7fef262..5f59a93 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap @@ -1,27 +1,33 @@ { "Should run without failures": { "content": [ - { - "0": [ - "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" - ], - "1": [ - "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" - ], - "2": [ - "versions.yml:md5,3843ac526e762117eedf8825b40683df" - ], - "mqc_yml": [ - "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" - ], - "versions": [ - "versions.yml:md5,3843ac526e762117eedf8825b40683df" - ], - "yml": [ - "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" - ] - } + [ + "versions.yml:md5,76d454d92244589d32455833f7c1ba6d" + ], + [ + "data: \"\\n\\n \\n \\n \\n \\n \\n \\n \\n\\", + " \\n\\n\\n \\n \\n\\", + " \\ \\n\\n\\n\\n \\n \\", + " \\ \\n \\n\\n\\n\\n\\", + " \\n\\n \\n \\n\\", + " \\ \\n\\n\\n\\n\\n\\n \\n\\", + " \\ \\n \\n\\n\\n\\n\\", + " \\n\\n \\n \\n\\" + ], + [ + "CUSTOM_DUMPSOFTWAREVERSIONS:", + " python: 3.11.7", + " yaml: 5.4.1", + "TOOL1:", + " tool1: 0.11.9", + "TOOL2:", + " tool2: '1.9'", + "Workflow:" + ] ], - "timestamp": "2023-11-03T14:43:22.157011" + "timestamp": "2024-01-09T23:01:18.710682" } } \ No newline at end of file diff --git a/modules/nf-core/custom/getchromsizes/environment.yml b/modules/nf-core/custom/getchromsizes/environment.yml new file mode 100644 index 0000000..2a01695 --- /dev/null +++ b/modules/nf-core/custom/getchromsizes/environment.yml @@ -0,0 +1,7 @@ +name: custom_getchromsizes +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.16.1 diff --git a/modules/nf-core/custom/getchromsizes/main.nf b/modules/nf-core/custom/getchromsizes/main.nf new file mode 100644 index 0000000..e8084ea --- /dev/null +++ b/modules/nf-core/custom/getchromsizes/main.nf @@ -0,0 +1,44 @@ +process CUSTOM_GETCHROMSIZES { + tag "$fasta" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : + 'biocontainers/samtools:1.16.1--h6899075_1' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path ("*.sizes"), emit: sizes + tuple val(meta), path ("*.fai") , emit: fai + tuple val(meta), path ("*.gzi") , emit: gzi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools faidx $fasta + cut -f 1,2 ${fasta}.fai > ${fasta}.sizes + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + getchromsizes: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${fasta}.fai + touch ${fasta}.sizes + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + getchromsizes: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/custom/getchromsizes/meta.yml b/modules/nf-core/custom/getchromsizes/meta.yml new file mode 100644 index 0000000..529be07 --- /dev/null +++ b/modules/nf-core/custom/getchromsizes/meta.yml @@ -0,0 +1,54 @@ +name: custom_getchromsizes +description: Generates a FASTA file of chromosome sizes and a fasta index file +keywords: + - fasta + - chromosome + - indexing +tools: + - samtools: + description: Tools for dealing with SAM, BAM and CRAM files + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + tool_dev_url: https://github.com/samtools/samtools + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: FASTA file + pattern: "*.{fa,fasta,fna,fas}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - sizes: + type: file + description: File containing chromosome lengths + pattern: "*.{sizes}" + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" + - gzi: + type: file + description: Optional gzip index file for compressed inputs + pattern: "*.gzi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@tamara-hodgetts" + - "@chris-cheshire" + - "@muffato" +maintainers: + - "@tamara-hodgetts" + - "@chris-cheshire" + - "@muffato" diff --git a/modules/nf-core/custom/getchromsizes/tests/main.nf.test b/modules/nf-core/custom/getchromsizes/tests/main.nf.test new file mode 100644 index 0000000..9f6b564 --- /dev/null +++ b/modules/nf-core/custom/getchromsizes/tests/main.nf.test @@ -0,0 +1,62 @@ +nextflow_process { + + name "Test Process CUSTOM_GETCHROMSIZES" + script "../main.nf" + process "CUSTOM_GETCHROMSIZES" + + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "custom/getchromsizes" + + test("test_custom_getchromsizes") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("test_custom_getchromsizes_bgzip") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/custom/getchromsizes/tests/main.nf.test.snap b/modules/nf-core/custom/getchromsizes/tests/main.nf.test.snap new file mode 100644 index 0000000..2e560bd --- /dev/null +++ b/modules/nf-core/custom/getchromsizes/tests/main.nf.test.snap @@ -0,0 +1,114 @@ +{ + "test_custom_getchromsizes": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "genome.fasta.sizes:md5,a57c401f27ae5133823fb09fb21c8a3c" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "genome.fasta.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,45a83c5f3dddbc5dcab30035169f7ce8" + ], + "fai": [ + [ + { + "id": "test" + }, + "genome.fasta.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "gzi": [ + + ], + "sizes": [ + [ + { + "id": "test" + }, + "genome.fasta.sizes:md5,a57c401f27ae5133823fb09fb21c8a3c" + ] + ], + "versions": [ + "versions.yml:md5,45a83c5f3dddbc5dcab30035169f7ce8" + ] + } + ], + "timestamp": "2024-01-17T17:48:35.562918" + }, + "test_custom_getchromsizes_bgzip": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "genome.fasta.gz.sizes:md5,a57c401f27ae5133823fb09fb21c8a3c" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "genome.fasta.gz.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "genome.fasta.gz.gzi:md5,7dea362b3fac8e00956a4952a3d4f474" + ] + ], + "3": [ + "versions.yml:md5,45a83c5f3dddbc5dcab30035169f7ce8" + ], + "fai": [ + [ + { + "id": "test" + }, + "genome.fasta.gz.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "gzi": [ + [ + { + "id": "test" + }, + "genome.fasta.gz.gzi:md5,7dea362b3fac8e00956a4952a3d4f474" + ] + ], + "sizes": [ + [ + { + "id": "test" + }, + "genome.fasta.gz.sizes:md5,a57c401f27ae5133823fb09fb21c8a3c" + ] + ], + "versions": [ + "versions.yml:md5,45a83c5f3dddbc5dcab30035169f7ce8" + ] + } + ], + "timestamp": "2024-01-17T17:49:02.562311" + } +} \ No newline at end of file diff --git a/modules/nf-core/custom/getchromsizes/tests/tags.yml b/modules/nf-core/custom/getchromsizes/tests/tags.yml new file mode 100644 index 0000000..d89a805 --- /dev/null +++ b/modules/nf-core/custom/getchromsizes/tests/tags.yml @@ -0,0 +1,2 @@ +custom/getchromsizes: + - modules/nf-core/custom/getchromsizes/** diff --git a/modules/nf-core/cutadapt/environment.yml b/modules/nf-core/cutadapt/environment.yml new file mode 100644 index 0000000..d32a8f9 --- /dev/null +++ b/modules/nf-core/cutadapt/environment.yml @@ -0,0 +1,7 @@ +name: cutadapt +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::cutadapt=3.4 diff --git a/modules/nf-core/cutadapt/main.nf b/modules/nf-core/cutadapt/main.nf new file mode 100644 index 0000000..e232a70 --- /dev/null +++ b/modules/nf-core/cutadapt/main.nf @@ -0,0 +1,50 @@ +process CUTADAPT { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/cutadapt:3.4--py39h38f01e4_1' : + 'biocontainers/cutadapt:3.4--py39h38f01e4_1' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path('*.trim.fastq.gz'), emit: reads + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def trimmed = meta.single_end ? "-o ${prefix}.trim.fastq.gz" : "-o ${prefix}_1.trim.fastq.gz -p ${prefix}_2.trim.fastq.gz" + """ + cutadapt \\ + --cores $task.cpus \\ + $args \\ + $trimmed \\ + $reads \\ + > ${prefix}.cutadapt.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cutadapt: \$(cutadapt --version) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def trimmed = meta.single_end ? "${prefix}.trim.fastq.gz" : "${prefix}_1.trim.fastq.gz ${prefix}_2.trim.fastq.gz" + """ + touch ${prefix}.cutadapt.log + touch ${trimmed} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cutadapt: \$(cutadapt --version) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cutadapt/meta.yml b/modules/nf-core/cutadapt/meta.yml new file mode 100644 index 0000000..5ecfe27 --- /dev/null +++ b/modules/nf-core/cutadapt/meta.yml @@ -0,0 +1,49 @@ +name: cutadapt +description: Perform adapter/quality trimming on sequencing reads +keywords: + - trimming + - adapter trimming + - adapters + - quality trimming +tools: + - cuatadapt: + description: | + Cutadapt finds and removes adapter sequences, primers, poly-A tails and other types of unwanted sequence from your high-throughput sequencing reads. + documentation: https://cutadapt.readthedocs.io/en/stable/index.html + doi: 10.14806/ej.17.1.200 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: The trimmed/modified fastq reads + pattern: "*fastq.gz" + - log: + type: file + description: cuatadapt log file + pattern: "*cutadapt.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@kevinmenden" +maintainers: + - "@drpatelh" + - "@kevinmenden" diff --git a/modules/nf-core/cutadapt/tests/main.nf.test b/modules/nf-core/cutadapt/tests/main.nf.test new file mode 100644 index 0000000..819c8f9 --- /dev/null +++ b/modules/nf-core/cutadapt/tests/main.nf.test @@ -0,0 +1,69 @@ +nextflow_process { + + name "Test Process CUTADAPT" + script "../main.nf" + process "CUTADAPT" + tag "modules" + tag "modules_nfcore" + tag "cutadapt" + + test("sarscov2 Illumina single end [fastq]") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id: 'test', single_end:true ], + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert process.out.reads != null }, + { assert process.out.reads.get(0).get(1) ==~ ".*.trim.fastq.gz" }, + { assert snapshot(process.out.versions).match("versions") }, + { assert snapshot(path(process.out.reads.get(0).get(1)).linesGzip[0]).match() } + ) + } + } + + test("sarscov2 Illumina paired-end [fastq]") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id: 'test', single_end:false ], + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert process.out.reads != null }, + { assert process.out.reads.get(0).get(1).get(0) ==~ ".*.1.trim.fastq.gz" }, + { assert process.out.reads.get(0).get(1).get(1) ==~ ".*.2.trim.fastq.gz" }, + { assert snapshot(path(process.out.reads.get(0).get(1).get(1)).linesGzip[0]).match() }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } +} diff --git a/modules/nf-core/cutadapt/tests/main.nf.test.snap b/modules/nf-core/cutadapt/tests/main.nf.test.snap new file mode 100644 index 0000000..4bdee0b --- /dev/null +++ b/modules/nf-core/cutadapt/tests/main.nf.test.snap @@ -0,0 +1,22 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,d37c5b9e465accf6d836972608795071" + ] + ], + "timestamp": "2023-10-24T11:22:34.352529" + }, + "Single-Read": { + "content": [ + "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1" + ], + "timestamp": "2023-10-24T11:22:34.364556" + }, + "Paired-Reads": { + "content": [ + "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/2" + ], + "timestamp": "2023-10-24T11:22:41.877854" + } +} \ No newline at end of file diff --git a/modules/nf-core/cutadapt/tests/nextflow.config b/modules/nf-core/cutadapt/tests/nextflow.config new file mode 100644 index 0000000..6c3b425 --- /dev/null +++ b/modules/nf-core/cutadapt/tests/nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: CUTADAPT { + ext.args = '-q 25' + } + +} diff --git a/modules/nf-core/cutadapt/tests/tags.yml b/modules/nf-core/cutadapt/tests/tags.yml new file mode 100644 index 0000000..f64f997 --- /dev/null +++ b/modules/nf-core/cutadapt/tests/tags.yml @@ -0,0 +1,2 @@ +cutadapt: + - modules/nf-core/cutadapt/** diff --git a/modules/nf-core/fastp/environment.yml b/modules/nf-core/fastp/environment.yml new file mode 100644 index 0000000..70389e6 --- /dev/null +++ b/modules/nf-core/fastp/environment.yml @@ -0,0 +1,7 @@ +name: fastp +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fastp=0.23.4 diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf new file mode 100644 index 0000000..2a3b679 --- /dev/null +++ b/modules/nf-core/fastp/main.nf @@ -0,0 +1,120 @@ +process FASTP { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' : + 'biocontainers/fastp:0.23.4--h5f740d0_0' }" + + input: + tuple val(meta), path(reads) + path adapter_fasta + val save_trimmed_fail + val save_merged + + output: + tuple val(meta), path('*.fastp.fastq.gz') , optional:true, emit: reads + tuple val(meta), path('*.json') , emit: json + tuple val(meta), path('*.html') , emit: html + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + tuple val(meta), path('*.fail.fastq.gz') , optional:true, emit: reads_fail + tuple val(meta), path('*.merged.fastq.gz'), optional:true, emit: reads_merged + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def adapter_list = adapter_fasta ? "--adapter_fasta ${adapter_fasta}" : "" + def fail_fastq = save_trimmed_fail && meta.single_end ? "--failed_out ${prefix}.fail.fastq.gz" : save_trimmed_fail && !meta.single_end ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : '' + // Added soft-links to original fastqs for consistent naming in MultiQC + // Use single ended for interleaved. Add --interleaved_in in config. + if ( task.ext.args?.contains('--interleaved_in') ) { + """ + [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz + + fastp \\ + --stdout \\ + --in1 ${prefix}.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $args \\ + 2> >(tee ${prefix}.fastp.log >&2) \\ + | gzip -c > ${prefix}.fastp.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else if (meta.single_end) { + """ + [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz + + fastp \\ + --in1 ${prefix}.fastq.gz \\ + --out1 ${prefix}.fastp.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $args \\ + 2> >(tee ${prefix}.fastp.log >&2) + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else { + def merge_fastq = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : '' + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -sf ${reads[0]} ${prefix}_1.fastq.gz + [ ! -f ${prefix}_2.fastq.gz ] && ln -sf ${reads[1]} ${prefix}_2.fastq.gz + fastp \\ + --in1 ${prefix}_1.fastq.gz \\ + --in2 ${prefix}_2.fastq.gz \\ + --out1 ${prefix}_1.fastp.fastq.gz \\ + --out2 ${prefix}_2.fastp.fastq.gz \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $merge_fastq \\ + --thread $task.cpus \\ + --detect_adapter_for_pe \\ + $args \\ + 2> >(tee ${prefix}.fastp.log >&2) + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def is_single_output = task.ext.args?.contains('--interleaved_in') || meta.single_end + def touch_reads = is_single_output ? "${prefix}.fastp.fastq.gz" : "${prefix}_1.fastp.fastq.gz ${prefix}_2.fastp.fastq.gz" + def touch_merged = (!is_single_output && save_merged) ? "touch ${prefix}.merged.fastq.gz" : "" + """ + touch $touch_reads + touch "${prefix}.fastp.json" + touch "${prefix}.fastp.html" + touch "${prefix}.fastp.log" + $touch_merged + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/fastp/meta.yml b/modules/nf-core/fastp/meta.yml new file mode 100644 index 0000000..c22a16a --- /dev/null +++ b/modules/nf-core/fastp/meta.yml @@ -0,0 +1,75 @@ +name: fastp +description: Perform adapter/quality trimming on sequencing reads +keywords: + - trimming + - quality control + - fastq +tools: + - fastp: + description: | + A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance. + documentation: https://github.com/OpenGene/fastp + doi: 10.1093/bioinformatics/bty560 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information. Use 'single_end: true' to specify single ended or interleaved FASTQs. Use 'single_end: false' for paired-end reads. + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. If you wish to run interleaved paired-end data, supply as single-end data + but with `--interleaved_in` in your `modules.conf`'s `ext.args` for the module. + - adapter_fasta: + type: file + description: File in FASTA format containing possible adapters to remove. + pattern: "*.{fasta,fna,fas,fa}" + - save_trimmed_fail: + type: boolean + description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz` + - save_merged: + type: boolean + description: Specify true to save all merged reads to the a file ending in `*.merged.fastq.gz` +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: The trimmed/modified/unmerged fastq reads + pattern: "*fastp.fastq.gz" + - json: + type: file + description: Results in JSON format + pattern: "*.json" + - html: + type: file + description: Results in HTML format + pattern: "*.html" + - log: + type: file + description: fastq log file + pattern: "*.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads_fail: + type: file + description: Reads the failed the preprocessing + pattern: "*fail.fastq.gz" + - reads_merged: + type: file + description: Reads that were successfully merged + pattern: "*.{merged.fastq.gz}" +authors: + - "@drpatelh" + - "@kevinmenden" +maintainers: + - "@drpatelh" + - "@kevinmenden" diff --git a/modules/nf-core/fastp/tests/main.nf.test b/modules/nf-core/fastp/tests/main.nf.test new file mode 100644 index 0000000..fa7e5b4 --- /dev/null +++ b/modules/nf-core/fastp/tests/main.nf.test @@ -0,0 +1,723 @@ +nextflow_process { + + name "Test Process FASTP" + script "../main.nf" + process "FASTP" + tag "modules" + tag "modules_nfcore" + tag "fastp" + + test("test_fastp_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = Channel.of([ + [ id:'test', single_end:true ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ]) + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:") }, - { assert snapshot(process.out.versions).match("versions") }, - { assert process.out.zip.get(0).get(1) ==~ ".*/test_fastqc.zip" } + + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 paired-end [fastq]") { + + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, + { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, + { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, + { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, + { assert path(process.out.html[0][1][0]).text.contains("") }, + { assert path(process.out.html[0][1][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 interleaved [fastq]") { + + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } ) } } -// TODO -// // -// // Test with paired-end data -// // -// workflow test_fastqc_paired_end { -// input = [ -// [id: 'test', single_end: false], // meta map -// [ -// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) -// ] -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with interleaved data -// // -// workflow test_fastqc_interleaved { -// input = [ -// [id: 'test', single_end: false], // meta map -// file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true) -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with bam data -// // -// workflow test_fastqc_bam { -// input = [ -// [id: 'test', single_end: false], // meta map -// file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with multiple samples -// // -// workflow test_fastqc_multiple { -// input = [ -// [id: 'test', single_end: false], // meta map -// [ -// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) -// ] -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with custom prefix -// // -// workflow test_fastqc_custom_prefix { -// input = [ -// [ id:'mysample', single_end:true ], // meta map -// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) -// ] - -// FASTQC ( input ) -// } + + test("sarscov2 paired-end [bam]") { + + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 multiple [fastq]") { + + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, + { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, + { assert process.out.html[0][1][2] ==~ ".*/test_3_fastqc.html" }, + { assert process.out.html[0][1][3] ==~ ".*/test_4_fastqc.html" }, + { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, + { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, + { assert process.out.zip[0][1][2] ==~ ".*/test_3_fastqc.zip" }, + { assert process.out.zip[0][1][3] ==~ ".*/test_4_fastqc.zip" }, + { assert path(process.out.html[0][1][0]).text.contains("") }, + { assert path(process.out.html[0][1][1]).text.contains("") }, + { assert path(process.out.html[0][1][2]).text.contains("") }, + { assert path(process.out.html[0][1][3]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 custom_prefix") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'mysample', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1] ==~ ".*/mysample_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/mysample_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 single-end [fastq] - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of([ + [ id: 'test', single_end:true ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.html.collect { file(it[1]).getName() } + + process.out.zip.collect { file(it[1]).getName() } + + process.out.versions ).match() } + ) + } + } + } diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap index 636a32c..5d624bb 100644 --- a/modules/nf-core/fastqc/tests/main.nf.test.snap +++ b/modules/nf-core/fastqc/tests/main.nf.test.snap @@ -1,10 +1,20 @@ { + "sarscov2 single-end [fastq] - stub": { + "content": [ + [ + "test.html", + "test.zip", + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "timestamp": "2024-01-17T18:40:57.254299" + }, "versions": { "content": [ [ "versions.yml:md5,e1cc25ca8af856014824abd842e93978" ] ], - "timestamp": "2023-10-09T23:40:54+0000" + "timestamp": "2024-01-17T18:36:50.033627" } } \ No newline at end of file diff --git a/modules/nf-core/fq/subsample/environment.yml b/modules/nf-core/fq/subsample/environment.yml new file mode 100644 index 0000000..c588de3 --- /dev/null +++ b/modules/nf-core/fq/subsample/environment.yml @@ -0,0 +1,7 @@ +name: fq_subsample +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fq=0.9.1 diff --git a/modules/nf-core/fq/subsample/main.nf b/modules/nf-core/fq/subsample/main.nf new file mode 100644 index 0000000..f3d8cc7 --- /dev/null +++ b/modules/nf-core/fq/subsample/main.nf @@ -0,0 +1,55 @@ +process FQ_SUBSAMPLE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/fq:0.9.1--h9ee0642_0': + 'biocontainers/fq:0.9.1--h9ee0642_0' }" + + input: + tuple val(meta), path(fastq) + + output: + tuple val(meta), path("*.fastq.gz"), emit: fastq + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + /* args requires: + --probability : Probability read is kept, between 0 and 1. Mutually exclusive with record-count. + --record-count : Number of records to keep. Mutually exclusive with probability + */ + def args = task.ext.args ?: '' + def prob_exists = args =~ /-p|--probability/ + def nrec_exists = args =~ /-n|--record-count/ + if ( !(prob_exists || nrec_exists) ){ + error "FQ/SUBSAMPLE requires --probability (-p) or --record-count (-n) specified in task.ext.args!" + } + def prefix = task.ext.prefix ?: "${meta.id}" + def n_fastq = fastq instanceof List ? fastq.size() : 1 + log.debug "FQ/SUBSAMPLE found ${n_fastq} FASTQ files" + if ( n_fastq == 1 ){ + fastq1_output = "--r1-dst ${prefix}.fastq.gz" + fastq2_output = "" + } else if ( n_fastq == 2 ){ + fastq1_output = "--r1-dst ${prefix}_R1.fastq.gz" + fastq2_output = "--r2-dst ${prefix}_R2.fastq.gz" + } else { + error "FQ/SUBSAMPLE only accepts 1 or 2 FASTQ files!" + } + """ + fq subsample \\ + $args \\ + $fastq \\ + $fastq1_output \\ + $fastq2_output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fq: \$(echo \$(fq subsample --version | sed 's/fq-subsample //g')) + END_VERSIONS + """ +} diff --git a/modules/nf-core/fq/subsample/meta.yml b/modules/nf-core/fq/subsample/meta.yml new file mode 100644 index 0000000..6c5e87f --- /dev/null +++ b/modules/nf-core/fq/subsample/meta.yml @@ -0,0 +1,41 @@ +name: "fq_subsample" +description: fq subsample outputs a subset of records from single or paired FASTQ files. This requires a seed (--seed) to be set in ext.args. +keywords: + - fastq + - fq + - subsample +tools: + - "fq": + description: "fq is a library to generate and validate FASTQ file pairs." + homepage: "https://github.com/stjude-rust-labs/fq" + documentation: "https://github.com/stjude-rust-labs/fq" + tool_dev_url: "https://github.com/stjude-rust-labs/fq" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastq: + type: file + description: FASTQ file + pattern: "*.{fq,fastq}{,.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fastq: + type: file + description: Randomly sampled FASTQ files. + pattern: "*_R[12].fastq.gz" +authors: + - "@adamrtalbot" +maintainers: + - "@adamrtalbot" diff --git a/modules/nf-core/fq/subsample/tests/main.nf.test b/modules/nf-core/fq/subsample/tests/main.nf.test new file mode 100644 index 0000000..285f30c --- /dev/null +++ b/modules/nf-core/fq/subsample/tests/main.nf.test @@ -0,0 +1,140 @@ +nextflow_process { + + name "Test Process FQ_SUBSAMPLE" + script "../main.nf" + process "FQ_SUBSAMPLE" + + tag "modules" + tag "modules_nfcore" + tag "fq" + tag "fq/subsample" + + test("test_fq_subsample_no_args") { + config "./nextflow_no_args.config" + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll ( + { assert !process.success }, + { assert snapshot(process.out).match() }, + ) + } + + } + + test("test_fq_subsample_probability") { + config "./nextflow_probability.config" + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + ) + } + + } + + test("test_fq_subsample_record_count") { + config "./nextflow_record_count.config" + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + ) + } + + } + + test("test_fq_subsample_single") { + config "./nextflow.config" + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + ) + } + + } + + test("test_fq_subsample_no_input") { + config "./nextflow.config" + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ ] + ]) + """ + } + } + + then { + assertAll ( + { assert !process.success }, + { assert snapshot(process.out).match() }, + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/fq/subsample/tests/main.nf.test.snap b/modules/nf-core/fq/subsample/tests/main.nf.test.snap new file mode 100644 index 0000000..1e47918 --- /dev/null +++ b/modules/nf-core/fq/subsample/tests/main.nf.test.snap @@ -0,0 +1,145 @@ +{ + "test_fq_subsample_probability": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_R1.fastq.gz:md5,19326ff922a16c0cb81191f2a0a5c5fc", + "test_R2.fastq.gz:md5,ce7ff46296d89b68521ad55a3588bcfe" + ] + ] + ], + "1": [ + "versions.yml:md5,ffbf935bd0de512fbc9e83b187bf924f" + ], + "fastq": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_R1.fastq.gz:md5,19326ff922a16c0cb81191f2a0a5c5fc", + "test_R2.fastq.gz:md5,ce7ff46296d89b68521ad55a3588bcfe" + ] + ] + ], + "versions": [ + "versions.yml:md5,ffbf935bd0de512fbc9e83b187bf924f" + ] + } + ], + "timestamp": "2024-01-17T17:57:15.446336" + }, + "test_fq_subsample_record_count": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_R1.fastq.gz:md5,394c7a233f1c1c1a167a34cf2895d26d", + "test_R2.fastq.gz:md5,32724cbdb5ab954a0a659ebcd56ca422" + ] + ] + ], + "1": [ + "versions.yml:md5,ffbf935bd0de512fbc9e83b187bf924f" + ], + "fastq": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_R1.fastq.gz:md5,394c7a233f1c1c1a167a34cf2895d26d", + "test_R2.fastq.gz:md5,32724cbdb5ab954a0a659ebcd56ca422" + ] + ] + ], + "versions": [ + "versions.yml:md5,ffbf935bd0de512fbc9e83b187bf924f" + ] + } + ], + "timestamp": "2024-01-17T17:57:23.920058" + }, + "test_fq_subsample_single": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fastq.gz:md5,19326ff922a16c0cb81191f2a0a5c5fc" + ] + ], + "1": [ + "versions.yml:md5,ffbf935bd0de512fbc9e83b187bf924f" + ], + "fastq": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fastq.gz:md5,19326ff922a16c0cb81191f2a0a5c5fc" + ] + ], + "versions": [ + "versions.yml:md5,ffbf935bd0de512fbc9e83b187bf924f" + ] + } + ], + "timestamp": "2024-01-17T17:57:31.908993" + }, + "test_fq_subsample_no_args": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "fastq": [ + + ], + "versions": [ + + ] + } + ], + "timestamp": "2023-10-17T11:17:09.761156" + }, + "test_fq_subsample_no_input": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "fastq": [ + + ], + "versions": [ + + ] + } + ], + "timestamp": "2023-10-17T11:17:37.555824" + } +} \ No newline at end of file diff --git a/modules/nf-core/fq/subsample/tests/nextflow.config b/modules/nf-core/fq/subsample/tests/nextflow.config new file mode 100644 index 0000000..41edd9f --- /dev/null +++ b/modules/nf-core/fq/subsample/tests/nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: FQ_SUBSAMPLE { + ext.args = '--probability 0.1 -s 123' + } + +} diff --git a/modules/nf-core/fq/subsample/tests/nextflow_no_args.config b/modules/nf-core/fq/subsample/tests/nextflow_no_args.config new file mode 100644 index 0000000..88990d2 --- /dev/null +++ b/modules/nf-core/fq/subsample/tests/nextflow_no_args.config @@ -0,0 +1,7 @@ +process { + + withName: FQ_SUBSAMPLE { + ext.args = '' + } + +} diff --git a/modules/nf-core/fq/subsample/tests/nextflow_probability.config b/modules/nf-core/fq/subsample/tests/nextflow_probability.config new file mode 100644 index 0000000..8cde0c7 --- /dev/null +++ b/modules/nf-core/fq/subsample/tests/nextflow_probability.config @@ -0,0 +1,7 @@ +process { + + withName: FQ_SUBSAMPLE { + ext.args = '-p 0.1 -s 123' + } + +} diff --git a/modules/nf-core/fq/subsample/tests/nextflow_record_count.config b/modules/nf-core/fq/subsample/tests/nextflow_record_count.config new file mode 100644 index 0000000..1ea624e --- /dev/null +++ b/modules/nf-core/fq/subsample/tests/nextflow_record_count.config @@ -0,0 +1,7 @@ +process { + + withName: FQ_SUBSAMPLE { + ext.args = '-n 10 -s 123' + } + +} diff --git a/modules/nf-core/fq/subsample/tests/tags.yml b/modules/nf-core/fq/subsample/tests/tags.yml new file mode 100644 index 0000000..5156431 --- /dev/null +++ b/modules/nf-core/fq/subsample/tests/tags.yml @@ -0,0 +1,2 @@ +fq/subsample: + - modules/nf-core/fq/subsample/** diff --git a/modules/nf-core/gffread/environment.yml b/modules/nf-core/gffread/environment.yml new file mode 100644 index 0000000..5398f71 --- /dev/null +++ b/modules/nf-core/gffread/environment.yml @@ -0,0 +1,7 @@ +name: gffread +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gffread=0.12.1 diff --git a/modules/nf-core/gffread/main.nf b/modules/nf-core/gffread/main.nf new file mode 100644 index 0000000..d8a473e --- /dev/null +++ b/modules/nf-core/gffread/main.nf @@ -0,0 +1,35 @@ +process GFFREAD { + tag "$gff" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gffread:0.12.1--h8b12597_0' : + 'biocontainers/gffread:0.12.1--h8b12597_0' }" + + input: + path gff + + output: + path "*.gtf" , emit: gtf , optional: true + path "*.gff3" , emit: gffread_gff , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${gff.baseName}" + def extension = args.contains("-T") ? 'gtf' : 'gffread.gff3' + """ + gffread \\ + $gff \\ + $args \\ + -o ${prefix}.${extension} + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gffread: \$(gffread --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gffread/meta.yml b/modules/nf-core/gffread/meta.yml new file mode 100644 index 0000000..27ac310 --- /dev/null +++ b/modules/nf-core/gffread/meta.yml @@ -0,0 +1,36 @@ +name: gffread +description: Validate, filter, convert and perform various other operations on GFF files +keywords: + - gff + - conversion + - validation +tools: + - gffread: + description: GFF/GTF utility providing format conversions, region filtering, FASTA sequence extraction and more. + homepage: http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread + documentation: http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread + tool_dev_url: https://github.com/gpertea/gffread + doi: 10.12688/f1000research.23297.1 + licence: ["MIT"] +input: + - gff: + type: file + description: A reference file in either the GFF3, GFF2 or GTF format. + pattern: "*.{gff, gtf}" +output: + - gtf: + type: file + description: GTF file resulting from the conversion of the GFF input file if '-T' argument is present + pattern: "*.{gtf}" + - gffread_gff: + type: file + description: GFF3 file resulting from the conversion of the GFF input file if '-T' argument is absent + pattern: "*.{gff3}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@emiller88" +maintainers: + - "@emiller88" diff --git a/modules/nf-core/gffread/tests/main.nf.test b/modules/nf-core/gffread/tests/main.nf.test new file mode 100644 index 0000000..bdbc96a --- /dev/null +++ b/modules/nf-core/gffread/tests/main.nf.test @@ -0,0 +1,65 @@ +nextflow_process { + + name "Test Process GFFREAD" + script "../main.nf" + process "GFFREAD" + + tag "gffread" + tag "modules_nfcore" + tag "modules" + + test("sarscov2-gff3-gtf") { + + config "./nextflow.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + process.out.gtf, + process.out.versions + ).match() }, + { assert process.out.gffread_gff == [] } + ) + } + + } + + test("sarscov2-gff3-gff3") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + process.out.gffread_gff, + process.out.versions + ).match() }, + { assert process.out.gtf == [] }, + ) + } + + } + +} diff --git a/modules/nf-core/gffread/tests/main.nf.test.snap b/modules/nf-core/gffread/tests/main.nf.test.snap new file mode 100644 index 0000000..00a11a4 --- /dev/null +++ b/modules/nf-core/gffread/tests/main.nf.test.snap @@ -0,0 +1,24 @@ +{ + "sarscov2-gff3-gtf": { + "content": [ + [ + "genome.gtf:md5,2394072d7d31530dfd590c4a117bf6e3" + ], + [ + "versions.yml:md5,a71b6cdfa528dd206a238ec64bae13d6" + ] + ], + "timestamp": "2024-01-23T20:00:32.688779117" + }, + "sarscov2-gff3-gff3": { + "content": [ + [ + "genome.gffread.gff3:md5,a7d40d99dcddac23ac673c473279ea2d" + ], + [ + "versions.yml:md5,a71b6cdfa528dd206a238ec64bae13d6" + ] + ], + "timestamp": "2024-01-23T20:07:11.457356625" + } +} \ No newline at end of file diff --git a/modules/nf-core/gffread/tests/nextflow.config b/modules/nf-core/gffread/tests/nextflow.config new file mode 100644 index 0000000..74b2509 --- /dev/null +++ b/modules/nf-core/gffread/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: GFFREAD { + ext.args = '-T' + } +} diff --git a/modules/nf-core/gffread/tests/tags.yml b/modules/nf-core/gffread/tests/tags.yml new file mode 100644 index 0000000..0557606 --- /dev/null +++ b/modules/nf-core/gffread/tests/tags.yml @@ -0,0 +1,2 @@ +gffread: + - modules/nf-core/gffread/** diff --git a/modules/nf-core/gunzip/environment.yml b/modules/nf-core/gunzip/environment.yml new file mode 100644 index 0000000..25910b3 --- /dev/null +++ b/modules/nf-core/gunzip/environment.yml @@ -0,0 +1,7 @@ +name: gunzip +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::sed=4.7 diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf new file mode 100644 index 0000000..468a6f2 --- /dev/null +++ b/modules/nf-core/gunzip/main.nf @@ -0,0 +1,48 @@ +process GUNZIP { + tag "$archive" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$gunzip"), emit: gunzip + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + gunzip = archive.toString() - '.gz' + """ + # Not calling gunzip itself because it creates files + # with the original group ownership rather than the + # default one for that user / the work directory + gzip \\ + -cd \\ + $args \\ + $archive \\ + > $gunzip + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + gunzip = archive.toString() - '.gz' + """ + touch $gunzip + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml new file mode 100644 index 0000000..231034f --- /dev/null +++ b/modules/nf-core/gunzip/meta.yml @@ -0,0 +1,39 @@ +name: gunzip +description: Compresses and decompresses files. +keywords: + - gunzip + - compression + - decompression +tools: + - gunzip: + description: | + gzip is a file format and a software application used for file compression and decompression. + documentation: https://www.gnu.org/software/gzip/manual/gzip.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Optional groovy Map containing meta information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be compressed/uncompressed + pattern: "*.*" +output: + - gunzip: + type: file + description: Compressed/uncompressed file + pattern: "*.*" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/gunzip/tests/main.nf.test b/modules/nf-core/gunzip/tests/main.nf.test new file mode 100644 index 0000000..6406008 --- /dev/null +++ b/modules/nf-core/gunzip/tests/main.nf.test @@ -0,0 +1,36 @@ +nextflow_process { + + name "Test Process GUNZIP" + script "../main.nf" + process "GUNZIP" + tag "gunzip" + tag "modules_nfcore" + tag "modules" + + test("Should run without failures") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + ) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/gunzip/tests/main.nf.test.snap b/modules/nf-core/gunzip/tests/main.nf.test.snap new file mode 100644 index 0000000..720fd9f --- /dev/null +++ b/modules/nf-core/gunzip/tests/main.nf.test.snap @@ -0,0 +1,31 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + [ + + ], + "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "1": [ + "versions.yml:md5,54376d32aca20e937a4ec26dac228e84" + ], + "gunzip": [ + [ + [ + + ], + "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "versions": [ + "versions.yml:md5,54376d32aca20e937a4ec26dac228e84" + ] + } + ], + "timestamp": "2023-10-17T15:35:37.690477896" + } +} \ No newline at end of file diff --git a/modules/nf-core/gunzip/tests/tags.yml b/modules/nf-core/gunzip/tests/tags.yml new file mode 100644 index 0000000..fd3f691 --- /dev/null +++ b/modules/nf-core/gunzip/tests/tags.yml @@ -0,0 +1,2 @@ +gunzip: + - modules/nf-core/gunzip/** diff --git a/modules/nf-core/kallisto/index/environment.yml b/modules/nf-core/kallisto/index/environment.yml new file mode 100644 index 0000000..471b006 --- /dev/null +++ b/modules/nf-core/kallisto/index/environment.yml @@ -0,0 +1,7 @@ +name: kallisto_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::kallisto=0.48.0 diff --git a/modules/nf-core/kallisto/index/main.nf b/modules/nf-core/kallisto/index/main.nf new file mode 100644 index 0000000..28a47db --- /dev/null +++ b/modules/nf-core/kallisto/index/main.nf @@ -0,0 +1,44 @@ +process KALLISTO_INDEX { + tag "$fasta" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kallisto:0.48.0--h15996b6_2': + 'biocontainers/kallisto:0.48.0--h15996b6_2' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("kallisto") , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + kallisto \\ + index \\ + $args \\ + -i kallisto \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kallisto: \$(echo \$(kallisto 2>&1) | sed 's/^kallisto //; s/Usage.*\$//') + END_VERSIONS + """ + + stub: + """ + touch kallisto + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kallisto: \$(echo \$(kallisto 2>&1) | sed 's/^kallisto //; s/Usage.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/kallisto/index/meta.yml b/modules/nf-core/kallisto/index/meta.yml new file mode 100644 index 0000000..d366aeb --- /dev/null +++ b/modules/nf-core/kallisto/index/meta.yml @@ -0,0 +1,41 @@ +name: kallisto_index +description: Create kallisto index +keywords: + - kallisto + - kallisto/index + - index +tools: + - kallisto: + description: Quantifying abundances of transcripts from bulk and single-cell RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads. + homepage: https://pachterlab.github.io/kallisto/ + documentation: https://pachterlab.github.io/kallisto/manual + tool_dev_url: https://github.com/pachterlab/kallisto + licence: ["BSD-2-Clause"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: genome fasta file + pattern: "*.{fasta}" +output: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - index: + type: directory + description: Kallisto genome index + pattern: "*.idx" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@ggabernet" +maintainers: + - "@ggabernet" diff --git a/modules/nf-core/kallisto/index/tests/main.nf.test b/modules/nf-core/kallisto/index/tests/main.nf.test new file mode 100644 index 0000000..97933d6 --- /dev/null +++ b/modules/nf-core/kallisto/index/tests/main.nf.test @@ -0,0 +1,33 @@ +nextflow_process { + + name "Test Process KALLISTO_INDEX" + script "../main.nf" + process "KALLISTO_INDEX" + tag "modules" + tag "modules_nfcore" + tag "kallisto" + tag "kallisto/index" + + test("homo_sapiens genome_fasta") { + + when { + process { + """ + input[0] = [ + [ id:'test_fasta' ], // meta map + [ file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/kallisto/index/tests/main.nf.test.snap b/modules/nf-core/kallisto/index/tests/main.nf.test.snap new file mode 100644 index 0000000..c0f45ac --- /dev/null +++ b/modules/nf-core/kallisto/index/tests/main.nf.test.snap @@ -0,0 +1,31 @@ +{ + "homo_sapiens genome_fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test_fasta" + }, + "kallisto:md5,2dab84e1456201beca5a43f4c514d67c" + ] + ], + "1": [ + "versions.yml:md5,178f9b57d4228edc356911d571b958a4" + ], + "index": [ + [ + { + "id": "test_fasta" + }, + "kallisto:md5,2dab84e1456201beca5a43f4c514d67c" + ] + ], + "versions": [ + "versions.yml:md5,178f9b57d4228edc356911d571b958a4" + ] + } + ], + "timestamp": "2023-11-02T09:58:48.83625986" + } +} \ No newline at end of file diff --git a/modules/nf-core/kallisto/index/tests/tags.yml b/modules/nf-core/kallisto/index/tests/tags.yml new file mode 100644 index 0000000..9f47b88 --- /dev/null +++ b/modules/nf-core/kallisto/index/tests/tags.yml @@ -0,0 +1,2 @@ +kallisto/index: + - modules/nf-core/kallisto/index/** diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml index bc0bdb5..7625b75 100644 --- a/modules/nf-core/multiqc/environment.yml +++ b/modules/nf-core/multiqc/environment.yml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - bioconda::multiqc=1.18 + - bioconda::multiqc=1.19 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 00cc48d..1b9f7c4 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -3,8 +3,8 @@ process MULTIQC { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.18--pyhdfd78af_0' : - 'biocontainers/multiqc:1.18--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" input: path multiqc_files, stageAs: "?/*" @@ -43,7 +43,7 @@ process MULTIQC { stub: """ - touch multiqc_data + mkdir multiqc_data touch multiqc_plots touch multiqc_report.html diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index f1aa660..45a9bc3 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -1,4 +1,3 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: multiqc description: Aggregate results from bioinformatics analyses across many samples into a single report keywords: diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test index c2dad21..d0438ed 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test +++ b/modules/nf-core/multiqc/tests/main.nf.test @@ -7,12 +7,9 @@ nextflow_process { tag "modules_nfcore" tag "multiqc" - test("MULTIQC: FASTQC") { + test("sarscov2 single-end [fastqc]") { when { - params { - outdir = "$outputDir" - } process { """ input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) @@ -26,20 +23,17 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert path(process.out.report.get(0)).exists() }, - { assert path(process.out.data.get(0)).exists() }, - { assert path(process.out.versions.get(0)).getText().contains("multiqc") } + { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, + { assert process.out.data[0] ==~ ".*/multiqc_data" }, + { assert snapshot(process.out.versions).match("versions") } ) } } - test("MULTIQC: FASTQC and a config file") { + test("sarscov2 single-end [fastqc] [config]") { when { - params { - outdir = "$outputDir" - } process { """ input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) @@ -53,9 +47,35 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert path(process.out.report.get(0)).exists() }, - { assert path(process.out.data.get(0)).exists() }, - { assert path(process.out.versions.get(0)).getText().contains("multiqc") } + { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, + { assert process.out.data[0] ==~ ".*/multiqc_data" }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 single-end [fastqc] - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.report.collect { file(it).getName() } + + process.out.data.collect { file(it).getName() } + + process.out.plots.collect { file(it).getName() } + + process.out.versions ).match() } ) } diff --git a/modules/nf-core/rsem/preparereference/environment.yml b/modules/nf-core/rsem/preparereference/environment.yml new file mode 100644 index 0000000..c80e691 --- /dev/null +++ b/modules/nf-core/rsem/preparereference/environment.yml @@ -0,0 +1,8 @@ +name: rsem_preparereference +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::rsem=1.3.3 + - bioconda::star=2.7.10a diff --git a/modules/nf-core/rsem/preparereference/main.nf b/modules/nf-core/rsem/preparereference/main.nf new file mode 100644 index 0000000..44f76eb --- /dev/null +++ b/modules/nf-core/rsem/preparereference/main.nf @@ -0,0 +1,72 @@ +process RSEM_PREPAREREFERENCE { + tag "$fasta" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-cf0123ef83b3c38c13e3b0696a3f285d3f20f15b:64aad4a4e144878400649e71f42105311be7ed87-0' : + 'biocontainers/mulled-v2-cf0123ef83b3c38c13e3b0696a3f285d3f20f15b:64aad4a4e144878400649e71f42105311be7ed87-0' }" + + input: + path fasta, stageAs: "rsem/*" + path gtf + + output: + path "rsem" , emit: index + path "*transcripts.fa", emit: transcript_fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def args_list = args.tokenize() + if (args_list.contains('--star')) { + args_list.removeIf { it.contains('--star') } + def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' + """ + STAR \\ + --runMode genomeGenerate \\ + --genomeDir rsem/ \\ + --genomeFastaFiles $fasta \\ + --sjdbGTFfile $gtf \\ + --runThreadN $task.cpus \\ + $memory \\ + $args2 + + rsem-prepare-reference \\ + --gtf $gtf \\ + --num-threads $task.cpus \\ + ${args_list.join(' ')} \\ + $fasta \\ + rsem/genome + + cp rsem/genome.transcripts.fa . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + rsem: \$(rsem-calculate-expression --version | sed -e "s/Current version: RSEM v//g") + star: \$(STAR --version | sed -e "s/STAR_//g") + END_VERSIONS + """ + } else { + """ + rsem-prepare-reference \\ + --gtf $gtf \\ + --num-threads $task.cpus \\ + $args \\ + $fasta \\ + rsem/genome + + cp rsem/genome.transcripts.fa . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + rsem: \$(rsem-calculate-expression --version | sed -e "s/Current version: RSEM v//g") + star: \$(STAR --version | sed -e "s/STAR_//g") + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/rsem/preparereference/meta.yml b/modules/nf-core/rsem/preparereference/meta.yml new file mode 100644 index 0000000..05aa44f --- /dev/null +++ b/modules/nf-core/rsem/preparereference/meta.yml @@ -0,0 +1,42 @@ +name: rsem_preparereference +description: Prepare a reference genome for RSEM +keywords: + - rsem + - genome + - index +tools: + - rseqc: + description: | + RSEM: accurate transcript quantification from RNA-Seq data with or without a reference genome + homepage: https://github.com/deweylab/RSEM + documentation: https://github.com/deweylab/RSEM + doi: 10.1186/1471-2105-12-323 + licence: ["GPL-3.0-or-later"] +input: + - fasta: + type: file + description: The Fasta file of the reference genome + pattern: "*.{fasta,fa}" + - gtf: + type: file + description: The GTF file of the reference genome + pattern: "*.gtf" +output: + - rsem: + type: directory + description: RSEM index directory + pattern: "rsem" + - transcript_fasta: + type: file + description: Fasta file of transcripts + pattern: "rsem/*transcripts.fa" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@kevinmenden" +maintainers: + - "@drpatelh" + - "@kevinmenden" diff --git a/modules/nf-core/rsem/preparereference/tests/main.nf.test b/modules/nf-core/rsem/preparereference/tests/main.nf.test new file mode 100644 index 0000000..a1d948d --- /dev/null +++ b/modules/nf-core/rsem/preparereference/tests/main.nf.test @@ -0,0 +1,36 @@ +nextflow_process { + + name "Test Process RSEM_PREPAREREFERENCE" + script "../main.nf" + process "RSEM_PREPAREREFERENCE" + tag "modules" + tag "modules_nfcore" + tag "rsem" + tag "rsem/preparereference" + + test("homo_sapiens") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)]) + input[1] = Channel.of([file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.index).match("index")}, + { assert snapshot(process.out.transcript_fasta).match("transcript_fasta")}, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + +} diff --git a/modules/nf-core/rsem/preparereference/tests/main.nf.test.snap b/modules/nf-core/rsem/preparereference/tests/main.nf.test.snap new file mode 100644 index 0000000..0251065 --- /dev/null +++ b/modules/nf-core/rsem/preparereference/tests/main.nf.test.snap @@ -0,0 +1,35 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,517611c42f6354d3609db1b35fffa397" + ] + ], + "timestamp": "2023-11-22T13:16:49.170587" + }, + "index": { + "content": [ + [ + [ + "genome.chrlist:md5,b190587cae0531f3cf25552d8aa674db", + "genome.fasta:md5,f315020d899597c1b57e5fe9f60f4c3e", + "genome.grp:md5,c2848a8b6d495956c11ec53efc1de67e", + "genome.idx.fa:md5,050c521a2719c2ae48267c1e65218f29", + "genome.n2g.idx.fa:md5,050c521a2719c2ae48267c1e65218f29", + "genome.seq:md5,94da0c6b88c33e63c9a052a11f4f57c1", + "genome.ti:md5,c9e4ae8d4d13a504eec2acf1b8589a66", + "genome.transcripts.fa:md5,050c521a2719c2ae48267c1e65218f29" + ] + ] + ], + "timestamp": "2023-11-22T13:16:49.140398" + }, + "transcript_fasta": { + "content": [ + [ + "genome.transcripts.fa:md5,050c521a2719c2ae48267c1e65218f29" + ] + ], + "timestamp": "2023-11-22T13:16:49.159946" + } +} \ No newline at end of file diff --git a/modules/nf-core/rsem/preparereference/tests/tags.yml b/modules/nf-core/rsem/preparereference/tests/tags.yml new file mode 100644 index 0000000..1129045 --- /dev/null +++ b/modules/nf-core/rsem/preparereference/tests/tags.yml @@ -0,0 +1,2 @@ +rsem/preparereference: + - modules/nf-core/rsem/preparereference/** diff --git a/modules/nf-core/salmon/index/environment.yml b/modules/nf-core/salmon/index/environment.yml new file mode 100644 index 0000000..a6607d9 --- /dev/null +++ b/modules/nf-core/salmon/index/environment.yml @@ -0,0 +1,7 @@ +name: salmon_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::salmon=1.10.1 diff --git a/modules/nf-core/salmon/index/main.nf b/modules/nf-core/salmon/index/main.nf new file mode 100644 index 0000000..88d9cf1 --- /dev/null +++ b/modules/nf-core/salmon/index/main.nf @@ -0,0 +1,47 @@ +process SALMON_INDEX { + tag "$transcript_fasta" + label "process_medium" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/salmon:1.10.1--h7e5ed60_0' : + 'biocontainers/salmon:1.10.1--h7e5ed60_0' }" + + input: + path genome_fasta + path transcript_fasta + + output: + path "salmon" , emit: index + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def get_decoy_ids = "grep '^>' $genome_fasta | cut -d ' ' -f 1 | cut -d \$'\\t' -f 1 > decoys.txt" + def gentrome = "gentrome.fa" + if (genome_fasta.endsWith('.gz')) { + get_decoy_ids = "grep '^>' <(gunzip -c $genome_fasta) | cut -d ' ' -f 1 | cut -d \$'\\t' -f 1 > decoys.txt" + gentrome = "gentrome.fa.gz" + } + """ + $get_decoy_ids + sed -i.bak -e 's/>//g' decoys.txt + cat $transcript_fasta $genome_fasta > $gentrome + + salmon \\ + index \\ + --threads $task.cpus \\ + -t $gentrome \\ + -d decoys.txt \\ + $args \\ + -i salmon + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + salmon: \$(echo \$(salmon --version) | sed -e "s/salmon //g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/salmon/index/meta.yml b/modules/nf-core/salmon/index/meta.yml new file mode 100644 index 0000000..fd94dd2 --- /dev/null +++ b/modules/nf-core/salmon/index/meta.yml @@ -0,0 +1,37 @@ +name: salmon_index +description: Create index for salmon +keywords: + - index + - fasta + - genome + - reference +tools: + - salmon: + description: | + Salmon is a tool for wicked-fast transcript quantification from RNA-seq data + homepage: https://salmon.readthedocs.io/en/latest/salmon.html + manual: https://salmon.readthedocs.io/en/latest/salmon.html + doi: 10.1038/nmeth.4197 + licence: ["GPL-3.0-or-later"] +input: + - genome_fasta: + type: file + description: Fasta file of the reference genome + - transcriptome_fasta: + type: file + description: Fasta file of the reference transcriptome +output: + - index: + type: directory + description: Folder containing the star index files + pattern: "salmon" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@kevinmenden" + - "@drpatelh" +maintainers: + - "@kevinmenden" + - "@drpatelh" diff --git a/modules/nf-core/salmon/index/tests/main.nf.test b/modules/nf-core/salmon/index/tests/main.nf.test new file mode 100644 index 0000000..24f95c0 --- /dev/null +++ b/modules/nf-core/salmon/index/tests/main.nf.test @@ -0,0 +1,35 @@ +nextflow_process { + + name "Test Process SALMON_INDEX" + script "../main.nf" + process "SALMON_INDEX" + tag "modules" + tag "modules_nfcore" + tag "salmon" + tag "salmon/index" + + test("sarscov2") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)]) + input[1] = Channel.of([file(params.test_data['sarscov2']['genome']['transcriptome_fasta'], checkIfExists: true)]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.index.get(0)).exists() }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/salmon/index/tests/main.nf.test.snap b/modules/nf-core/salmon/index/tests/main.nf.test.snap new file mode 100644 index 0000000..1e3e6b6 --- /dev/null +++ b/modules/nf-core/salmon/index/tests/main.nf.test.snap @@ -0,0 +1,10 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,563eeafb4577be0b13801d7021c0bf42" + ] + ], + "timestamp": "2023-11-22T14:26:33.32036" + } +} \ No newline at end of file diff --git a/modules/nf-core/salmon/index/tests/tags.yml b/modules/nf-core/salmon/index/tests/tags.yml new file mode 100644 index 0000000..0299789 --- /dev/null +++ b/modules/nf-core/salmon/index/tests/tags.yml @@ -0,0 +1,2 @@ +salmon/index: + - modules/nf-core/salmon/index/** diff --git a/modules/nf-core/salmon/quant/environment.yml b/modules/nf-core/salmon/quant/environment.yml new file mode 100644 index 0000000..8626672 --- /dev/null +++ b/modules/nf-core/salmon/quant/environment.yml @@ -0,0 +1,7 @@ +name: salmon_quant +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::salmon=1.10.1 diff --git a/modules/nf-core/salmon/quant/main.nf b/modules/nf-core/salmon/quant/main.nf new file mode 100644 index 0000000..e115d39 --- /dev/null +++ b/modules/nf-core/salmon/quant/main.nf @@ -0,0 +1,79 @@ +process SALMON_QUANT { + tag "$meta.id" + label "process_medium" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/salmon:1.10.1--h7e5ed60_0' : + 'biocontainers/salmon:1.10.1--h7e5ed60_0' }" + + input: + tuple val(meta), path(reads) + path index + path gtf + path transcript_fasta + val alignment_mode + val lib_type + + output: + tuple val(meta), path("${prefix}") , emit: results + tuple val(meta), path("*info.json"), emit: json_info, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + + def reference = "--index $index" + def reads1 = [], reads2 = [] + meta.single_end ? [reads].flatten().each{reads1 << it} : reads.eachWithIndex{ v, ix -> ( ix & 1 ? reads2 : reads1) << v } + def input_reads = meta.single_end ? "-r ${reads1.join(" ")}" : "-1 ${reads1.join(" ")} -2 ${reads2.join(" ")}" + if (alignment_mode) { + reference = "-t $transcript_fasta" + input_reads = "-a $reads" + } + + def strandedness_opts = [ + 'A', 'U', 'SF', 'SR', + 'IS', 'IU' , 'ISF', 'ISR', + 'OS', 'OU' , 'OSF', 'OSR', + 'MS', 'MU' , 'MSF', 'MSR' + ] + def strandedness = 'A' + if (lib_type) { + if (strandedness_opts.contains(lib_type)) { + strandedness = lib_type + } else { + log.info "[Salmon Quant] Invalid library type specified '--libType=${lib_type}', defaulting to auto-detection with '--libType=A'." + } + } else { + strandedness = meta.single_end ? 'U' : 'IU' + if (meta.strandedness == 'forward') { + strandedness = meta.single_end ? 'SF' : 'ISF' + } else if (meta.strandedness == 'reverse') { + strandedness = meta.single_end ? 'SR' : 'ISR' + } + } + """ + salmon quant \\ + --geneMap $gtf \\ + --threads $task.cpus \\ + --libType=$strandedness \\ + $reference \\ + $input_reads \\ + $args \\ + -o $prefix + + if [ -f $prefix/aux_info/meta_info.json ]; then + cp $prefix/aux_info/meta_info.json "${prefix}_meta_info.json" + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + salmon: \$(echo \$(salmon --version) | sed -e "s/salmon //g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/salmon/quant/meta.yml b/modules/nf-core/salmon/quant/meta.yml new file mode 100644 index 0000000..49d7137 --- /dev/null +++ b/modules/nf-core/salmon/quant/meta.yml @@ -0,0 +1,62 @@ +name: salmon_quant +description: gene/transcript quantification with Salmon +keywords: + - index + - fasta + - genome + - reference +tools: + - salmon: + description: | + Salmon is a tool for wicked-fast transcript quantification from RNA-seq data + homepage: https://salmon.readthedocs.io/en/latest/salmon.html + manual: https://salmon.readthedocs.io/en/latest/salmon.html + doi: 10.1038/nmeth.4197 + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files for single-end or paired-end data. + Multiple single-end fastqs or pairs of paired-end fastqs are + handled. + - index: + type: directory + description: Folder containing the star index files + - gtf: + type: file + description: GTF of the reference transcriptome + - transcriptome_fasta: + type: file + description: Fasta file of the reference transcriptome + - alignment_mode: + type: boolean + description: whether to run salmon in alignment mode + - lib_type: + type: string + description: | + Override library type inferred based on strandedness defined in meta object +output: + - results: + type: directory + description: Folder containing the quantification results for a specific sample + pattern: "${prefix}" + - json_info: + type: file + description: File containing meta information from Salmon quant + pattern: "*info.json" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@kevinmenden" + - "@drpatelh" +maintainers: + - "@kevinmenden" + - "@drpatelh" diff --git a/modules/nf-core/salmon/quant/tests/main.nf.test b/modules/nf-core/salmon/quant/tests/main.nf.test new file mode 100644 index 0000000..7b28db3 --- /dev/null +++ b/modules/nf-core/salmon/quant/tests/main.nf.test @@ -0,0 +1,192 @@ +nextflow_process { + + name "Test Process SALMON_QUANT" + script "../main.nf" + process "SALMON_QUANT" + config "./nextflow.config" + tag "modules" + tag "modules_nfcore" + tag "salmon" + tag "salmon/quant" + + test("sarscov2 - single_end") { + + setup { + run("SALMON_INDEX") { + script "../../../salmon/index/main.nf" + process { + """ + input[0] = Channel.of([file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)]) + input[1] = Channel.of([file(params.test_data['sarscov2']['genome']['transcriptome_fasta'], checkIfExists: true)]) + """ + } + } + } + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ]) + input[1] = SALMON_INDEX.out.index + input[2] = Channel.of([file(params.test_data['sarscov2']['genome']['genome_gtf'], checkIfExists: true)]) + input[3] = Channel.of([file(params.test_data['sarscov2']['genome']['transcriptome_fasta'], checkIfExists: true)]) + input[4] = false + input[5] = '' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.json_info.get(0).get(1)).exists() }, + { assert path(process.out.results.get(0).get(1)).exists() }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("sarscov2 - single_end lib type A") { + + setup { + run("SALMON_INDEX") { + script "../../../salmon/index/main.nf" + process { + """ + input[0] = Channel.of([file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)]) + input[1] = Channel.of([file(params.test_data['sarscov2']['genome']['transcriptome_fasta'], checkIfExists: true)]) + """ + } + } + } + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ]) + input[1] = SALMON_INDEX.out.index + input[2] = Channel.of([file(params.test_data['sarscov2']['genome']['genome_gtf'], checkIfExists: true)]) + input[3] = Channel.of([file(params.test_data['sarscov2']['genome']['transcriptome_fasta'], checkIfExists: true)]) + input[4] = false + input[5] = 'A' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.json_info.get(0).get(1)).exists() }, + { assert path(process.out.results.get(0).get(1)).exists() }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("sarscov2 - pair_end") { + + setup { + run("SALMON_INDEX") { + script "../../../salmon/index/main.nf" + process { + """ + input[0] = Channel.of([file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)]) + input[1] = Channel.of([file(params.test_data['sarscov2']['genome']['transcriptome_fasta'], checkIfExists: true)]) + """ + } + } + } + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ]) + input[1] = SALMON_INDEX.out.index + input[2] = Channel.of([file(params.test_data['sarscov2']['genome']['genome_gtf'], checkIfExists: true)]) + input[3] = Channel.of([file(params.test_data['sarscov2']['genome']['transcriptome_fasta'], checkIfExists: true)]) + input[4] = false + input[5] = '' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.json_info.get(0).get(1)).exists() }, + { assert path(process.out.results.get(0).get(1)).exists() }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + test("sarscov2 - pair_end multiple") { + + setup { + run("SALMON_INDEX") { + script "../../../salmon/index/main.nf" + process { + """ + input[0] = Channel.of([file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)]) + input[1] = Channel.of([file(params.test_data['sarscov2']['genome']['transcriptome_fasta'], checkIfExists: true)]) + """ + } + } + } + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) + ] + ]) + input[1] = SALMON_INDEX.out.index + input[2] = Channel.of([file(params.test_data['sarscov2']['genome']['genome_gtf'], checkIfExists: true)]) + input[3] = Channel.of([file(params.test_data['sarscov2']['genome']['transcriptome_fasta'], checkIfExists: true)]) + input[4] = false + input[5] = '' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.json_info.get(0).get(1)).exists() }, + { assert path(process.out.results.get(0).get(1)).exists() }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } +} \ No newline at end of file diff --git a/modules/nf-core/salmon/quant/tests/main.nf.test.snap b/modules/nf-core/salmon/quant/tests/main.nf.test.snap new file mode 100644 index 0000000..386a7a3 --- /dev/null +++ b/modules/nf-core/salmon/quant/tests/main.nf.test.snap @@ -0,0 +1,10 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,80eb3d2ad36960c7e9263f81ede9d263" + ] + ], + "timestamp": "2023-11-22T14:47:10.132112" + } +} \ No newline at end of file diff --git a/modules/nf-core/salmon/quant/tests/nextflow.config b/modules/nf-core/salmon/quant/tests/nextflow.config new file mode 100644 index 0000000..37c0821 --- /dev/null +++ b/modules/nf-core/salmon/quant/tests/nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: SALMON_QUANT { + ext.args = '--minAssignedFrags 1' + } + +} diff --git a/modules/nf-core/salmon/quant/tests/tags.yml b/modules/nf-core/salmon/quant/tests/tags.yml new file mode 100644 index 0000000..048d816 --- /dev/null +++ b/modules/nf-core/salmon/quant/tests/tags.yml @@ -0,0 +1,2 @@ +salmon/quant: + - modules/nf-core/salmon/quant/** diff --git a/modules/nf-core/sortmerna/environment.yml b/modules/nf-core/sortmerna/environment.yml new file mode 100644 index 0000000..f40f995 --- /dev/null +++ b/modules/nf-core/sortmerna/environment.yml @@ -0,0 +1,7 @@ +name: sortmerna +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::sortmerna=4.3.6 diff --git a/modules/nf-core/sortmerna/main.nf b/modules/nf-core/sortmerna/main.nf new file mode 100644 index 0000000..29c640c --- /dev/null +++ b/modules/nf-core/sortmerna/main.nf @@ -0,0 +1,96 @@ +process SORTMERNA { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/sortmerna:4.3.6--h9ee0642_0' : + 'biocontainers/sortmerna:4.3.6--h9ee0642_0' }" + + input: + tuple val(meta), path(reads) + path fastas + + output: + tuple val(meta), path("*non_rRNA.fastq.gz"), emit: reads + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + """ + sortmerna \\ + ${'--ref '+fastas.join(' --ref ')} \\ + --reads $reads \\ + --threads $task.cpus \\ + --workdir . \\ + --aligned rRNA_reads \\ + --fastx \\ + --other non_rRNA_reads \\ + $args + + mv non_rRNA_reads.f*q.gz ${prefix}.non_rRNA.fastq.gz + mv rRNA_reads.log ${prefix}.sortmerna.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sortmerna: \$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') + END_VERSIONS + """ + } else { + """ + sortmerna \\ + ${'--ref '+fastas.join(' --ref ')} \\ + --reads ${reads[0]} \\ + --reads ${reads[1]} \\ + --threads $task.cpus \\ + --workdir . \\ + --aligned rRNA_reads \\ + --fastx \\ + --other non_rRNA_reads \\ + --paired_in \\ + --out2 \\ + $args + + mv non_rRNA_reads_fwd.f*q.gz ${prefix}_1.non_rRNA.fastq.gz + mv non_rRNA_reads_rev.f*q.gz ${prefix}_2.non_rRNA.fastq.gz + mv rRNA_reads.log ${prefix}.sortmerna.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sortmerna: \$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') + END_VERSIONS + """ + } + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + """ + touch ${prefix}.non_rRNA.fastq.gz + touch ${prefix}.sortmerna.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sortmerna: \$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') + END_VERSIONS + """ + } else { + """ + touch ${prefix}_1.non_rRNA.fastq.gz + touch ${prefix}_2.non_rRNA.fastq.gz + touch ${prefix}.sortmerna.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sortmerna: \$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/sortmerna/meta.yml b/modules/nf-core/sortmerna/meta.yml new file mode 100644 index 0000000..de0b18e --- /dev/null +++ b/modules/nf-core/sortmerna/meta.yml @@ -0,0 +1,53 @@ +name: sortmerna +description: Local sequence alignment tool for filtering, mapping and clustering. +keywords: + - filtering + - mapping + - clustering + - rRNA + - ribosomal RNA +tools: + - SortMeRNA: + description: The core algorithm is based on approximate seeds and allows for sensitive analysis of NGS reads. The main application of SortMeRNA is filtering rRNA from metatranscriptomic data. SortMeRNA takes as input files of reads (fasta, fastq, fasta.gz, fastq.gz) and one or multiple rRNA database file(s), and sorts apart aligned and rejected reads into two files. Additional applications include clustering and taxonomy assignation available through QIIME v1.9.1. SortMeRNA works with Illumina, Ion Torrent and PacBio data, and can produce SAM and BLAST-like alignments. + homepage: https://hpc.nih.gov/apps/sortmeRNA.html + documentation: https://github.com/biocore/sortmerna/wiki/ + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - fastas: + type: file + description: | + Path to reference file(s) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: The filtered fastq reads + pattern: "*fastq.gz" + - log: + type: file + description: SortMeRNA log file + pattern: "*sortmerna.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@mashehu" +maintainers: + - "@drpatelh" + - "@mashehu" diff --git a/modules/nf-core/sortmerna/tests/main.nf.test b/modules/nf-core/sortmerna/tests/main.nf.test new file mode 100644 index 0000000..8a01e2a --- /dev/null +++ b/modules/nf-core/sortmerna/tests/main.nf.test @@ -0,0 +1,144 @@ +nextflow_process { + + name "Test Process SORTMERNA" + script "../main.nf" + process "SORTMERNA" + tag "modules" + tag "modules_nfcore" + tag "sortmerna" + + test("sarscov2 single_end") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + input[1] = [ file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.reads }, + { assert file(process.out.log[0][1]).text.contains("Total reads passing E-value threshold = 100 (100.00)") }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + ).sort() + ).match("sarscov2 single_end-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("sarscov2 single_end stub") { + + options '-stub' + + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + input[1] = [ file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + ).sort() + ).match("sarscov2 single_end-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("sarscov2 paired_end") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = [ file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.reads }, + { assert file(process.out.log[0][1]).text.contains("Total reads passing E-value threshold = 200 (100.00)") }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { it[1].collect { item -> file(item).getName() } } + + process.out.log.collect { file(it[1]).getName() } + ).sort() + ).match("sarscov2 paired_end-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("sarscov2 paired_end stub") { + + options '-stub' + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = [ file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { it[1].collect { item -> file(item).getName() } } + + process.out.log.collect { file(it[1]).getName() } + ).sort() + ).match("sarscov2 paired_end-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + +} diff --git a/modules/nf-core/sortmerna/tests/main.nf.test.snap b/modules/nf-core/sortmerna/tests/main.nf.test.snap new file mode 100644 index 0000000..e502000 --- /dev/null +++ b/modules/nf-core/sortmerna/tests/main.nf.test.snap @@ -0,0 +1,33 @@ +{ + "sarscov2 single_end-for_stub_match": { + "content": [ + [ + "test.non_rRNA.fastq.gz", + "test.sortmerna.log", + "{id=test, single_end=true}" + ] + ], + "timestamp": "2023-12-21T11:56:00.15356" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,7df9d50209f351e1f75e05a1fad6ba4b" + ] + ], + "timestamp": "2023-12-21T11:56:00.200244" + }, + "sarscov2 paired_end-for_stub_match": { + "content": [ + [ + [ + "test_1.non_rRNA.fastq.gz", + "test_2.non_rRNA.fastq.gz" + ], + "test.sortmerna.log", + "{id=test, single_end=false}" + ] + ], + "timestamp": "2023-12-21T12:00:47.879193" + } +} \ No newline at end of file diff --git a/modules/nf-core/sortmerna/tests/tags.yml b/modules/nf-core/sortmerna/tests/tags.yml new file mode 100644 index 0000000..e088480 --- /dev/null +++ b/modules/nf-core/sortmerna/tests/tags.yml @@ -0,0 +1,2 @@ +sortmerna: + - modules/nf-core/sortmerna/** diff --git a/modules/nf-core/star/genomegenerate/environment.yml b/modules/nf-core/star/genomegenerate/environment.yml new file mode 100644 index 0000000..93e4476 --- /dev/null +++ b/modules/nf-core/star/genomegenerate/environment.yml @@ -0,0 +1,11 @@ +name: star_genomegenerate + +channels: + - conda-forge + - bioconda + - defaults + +dependencies: + - bioconda::samtools=1.18 + - bioconda::star=2.7.10a + - conda-forge::gawk=5.1.0 diff --git a/modules/nf-core/star/genomegenerate/main.nf b/modules/nf-core/star/genomegenerate/main.nf new file mode 100644 index 0000000..b885571 --- /dev/null +++ b/modules/nf-core/star/genomegenerate/main.nf @@ -0,0 +1,119 @@ +process STAR_GENOMEGENERATE { + tag "$fasta" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:ded3841da0194af2701c780e9b3d653a85d27489-0' : + 'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:ded3841da0194af2701c780e9b3d653a85d27489-0' }" + + input: + tuple val(meta), path(fasta) + tuple val(meta2), path(gtf) + + output: + tuple val(meta), path("star") , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args_list = args.tokenize() + def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' + def include_gtf = gtf ? "--sjdbGTFfile $gtf" : '' + if (args_list.contains('--genomeSAindexNbases')) { + """ + mkdir star + STAR \\ + --runMode genomeGenerate \\ + --genomeDir star/ \\ + --genomeFastaFiles $fasta \\ + $include_gtf \\ + --runThreadN $task.cpus \\ + $memory \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } else { + """ + samtools faidx $fasta + NUM_BASES=`gawk '{sum = sum + \$2}END{if ((log(sum)/log(2))/2 - 1 > 14) {printf "%.0f", 14} else {printf "%.0f", (log(sum)/log(2))/2 - 1}}' ${fasta}.fai` + + mkdir star + STAR \\ + --runMode genomeGenerate \\ + --genomeDir star/ \\ + --genomeFastaFiles $fasta \\ + $include_gtf \\ + --runThreadN $task.cpus \\ + --genomeSAindexNbases \$NUM_BASES \\ + $memory \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } + + stub: + if (gtf) { + """ + mkdir star + touch star/Genome + touch star/Log.out + touch star/SA + touch star/SAindex + touch star/chrLength.txt + touch star/chrName.txt + touch star/chrNameLength.txt + touch star/chrStart.txt + touch star/exonGeTrInfo.tab + touch star/exonInfo.tab + touch star/geneInfo.tab + touch star/genomeParameters.txt + touch star/sjdbInfo.txt + touch star/sjdbList.fromGTF.out.tab + touch star/sjdbList.out.tab + touch star/transcriptInfo.tab + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } else { + """ + mkdir star + touch star/Genome + touch star/Log.out + touch star/SA + touch star/SAindex + touch star/chrLength.txt + touch star/chrName.txt + touch star/chrNameLength.txt + touch star/chrStart.txt + touch star/genomeParameters.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/star/genomegenerate/meta.yml b/modules/nf-core/star/genomegenerate/meta.yml new file mode 100644 index 0000000..1061e1b --- /dev/null +++ b/modules/nf-core/star/genomegenerate/meta.yml @@ -0,0 +1,53 @@ +name: star_genomegenerate +description: Create index for STAR +keywords: + - index + - fasta + - genome + - reference +tools: + - star: + description: | + STAR is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: https://github.com/alexdobin/STAR + manual: https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf + doi: 10.1093/bioinformatics/bts635 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Fasta file of the reference genome + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - gtf: + type: file + description: GTF file of the reference genome +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - index: + type: directory + description: Folder containing the star index files + pattern: "star" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@kevinmenden" + - "@drpatelh" +maintainers: + - "@kevinmenden" + - "@drpatelh" diff --git a/modules/nf-core/star/genomegenerate/tests/main.nf.test b/modules/nf-core/star/genomegenerate/tests/main.nf.test new file mode 100644 index 0000000..af0c942 --- /dev/null +++ b/modules/nf-core/star/genomegenerate/tests/main.nf.test @@ -0,0 +1,117 @@ +nextflow_process { + + name "Test Process STAR_GENOMEGENERATE" + script "../main.nf" + process "STAR_GENOMEGENERATE" + tag "modules" + tag "modules_nfcore" + tag "star" + tag "star/genomegenerate" + + test("homo_sapiens") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_with_gtf") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("homo_sapiens-stub") { + + options '-stub' + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_with_gtf") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("homo_sapiens-without_gtf") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] + ]) + input[1] = Channel.of([ [], [] ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_without_gtf") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("homo_sapiens-without_gtf-stub") { + + options '-stub' + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] + ]) + input[1] = Channel.of([ [], [] ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_without_gtf") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap b/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap new file mode 100644 index 0000000..9de08c7 --- /dev/null +++ b/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap @@ -0,0 +1,22 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + ], + "timestamp": "2023-12-19T11:05:51.741109" + }, + "index_with_gtf": { + "content": [ + "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, exonGeTrInfo.tab, exonInfo.tab, geneInfo.tab, genomeParameters.txt, sjdbInfo.txt, sjdbList.fromGTF.out.tab, sjdbList.out.tab, transcriptInfo.tab]" + ], + "timestamp": "2023-12-19T11:38:14.551548" + }, + "index_without_gtf": { + "content": [ + "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, genomeParameters.txt]" + ], + "timestamp": "2023-12-19T11:38:22.382905" + } +} \ No newline at end of file diff --git a/modules/nf-core/star/genomegenerate/tests/tags.yml b/modules/nf-core/star/genomegenerate/tests/tags.yml new file mode 100644 index 0000000..79f619b --- /dev/null +++ b/modules/nf-core/star/genomegenerate/tests/tags.yml @@ -0,0 +1,2 @@ +star/genomegenerate: + - modules/nf-core/star/genomegenerate/** diff --git a/modules/nf-core/trimgalore/environment.yml b/modules/nf-core/trimgalore/environment.yml new file mode 100644 index 0000000..6cd0f51 --- /dev/null +++ b/modules/nf-core/trimgalore/environment.yml @@ -0,0 +1,7 @@ +name: trimgalore +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::trim-galore=0.6.7 diff --git a/modules/nf-core/trimgalore/main.nf b/modules/nf-core/trimgalore/main.nf new file mode 100644 index 0000000..24ead87 --- /dev/null +++ b/modules/nf-core/trimgalore/main.nf @@ -0,0 +1,75 @@ +process TRIMGALORE { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/trim-galore:0.6.7--hdfd78af_0' : + 'biocontainers/trim-galore:0.6.7--hdfd78af_0' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*{3prime,5prime,trimmed,val}*.fq.gz"), emit: reads + tuple val(meta), path("*report.txt") , emit: log , optional: true + tuple val(meta), path("*unpaired*.fq.gz") , emit: unpaired, optional: true + tuple val(meta), path("*.html") , emit: html , optional: true + tuple val(meta), path("*.zip") , emit: zip , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + // Calculate number of --cores for TrimGalore based on value of task.cpus + // See: https://github.com/FelixKrueger/TrimGalore/blob/master/Changelog.md#version-060-release-on-1-mar-2019 + // See: https://github.com/nf-core/atacseq/pull/65 + def cores = 1 + if (task.cpus) { + cores = (task.cpus as int) - 4 + if (meta.single_end) cores = (task.cpus as int) - 3 + if (cores < 1) cores = 1 + if (cores > 8) cores = 8 + } + + // Added soft-links to original fastqs for consistent naming in MultiQC + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + def args_list = args.split("\\s(?=--)").toList() + args_list.removeAll { it.toLowerCase().contains('_r2 ') } + """ + [ ! -f ${prefix}.fastq.gz ] && ln -s $reads ${prefix}.fastq.gz + trim_galore \\ + ${args_list.join(' ')} \\ + --cores $cores \\ + --gzip \\ + ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + trimgalore: \$(echo \$(trim_galore --version 2>&1) | sed 's/^.*version //; s/Last.*\$//') + cutadapt: \$(cutadapt --version) + END_VERSIONS + """ + } else { + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz + [ ! -f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz + trim_galore \\ + $args \\ + --cores $cores \\ + --paired \\ + --gzip \\ + ${prefix}_1.fastq.gz \\ + ${prefix}_2.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + trimgalore: \$(echo \$(trim_galore --version 2>&1) | sed 's/^.*version //; s/Last.*\$//') + cutadapt: \$(cutadapt --version) + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/trimgalore/meta.yml b/modules/nf-core/trimgalore/meta.yml new file mode 100644 index 0000000..e649088 --- /dev/null +++ b/modules/nf-core/trimgalore/meta.yml @@ -0,0 +1,68 @@ +name: trimgalore +description: Trim FastQ files using Trim Galore! +keywords: + - trimming + - adapters + - sequencing adapters + - fastq +tools: + - trimgalore: + description: | + A wrapper tool around Cutadapt and FastQC to consistently apply quality + and adapter trimming to FastQ files, with some extra functionality for + MspI-digested RRBS-type (Reduced Representation Bisufite-Seq) libraries. + homepage: https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/ + documentation: https://github.com/FelixKrueger/TrimGalore/blob/master/Docs/Trim_Galore_User_Guide.md + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input adapter trimmed FastQ files of size 1 and 2 for + single-end and paired-end data, respectively. + pattern: "*{3prime,5prime,trimmed,val}*.fq.gz" + - unpaired: + type: file + description: | + FastQ files containing unpaired reads from read 1 or read 2 + pattern: "*unpaired*.fq.gz" + - html: + type: file + description: FastQC report (optional) + pattern: "*_{fastqc.html}" + - zip: + type: file + description: FastQC report archive (optional) + pattern: "*_{fastqc.zip}" + - log: + type: file + description: Trim Galore! trimming report + pattern: "*_{report.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@FelixKrueger" +maintainers: + - "@drpatelh" + - "@ewels" + - "@FelixKrueger" diff --git a/modules/nf-core/trimgalore/tests/main.nf.test b/modules/nf-core/trimgalore/tests/main.nf.test new file mode 100644 index 0000000..bc6812c --- /dev/null +++ b/modules/nf-core/trimgalore/tests/main.nf.test @@ -0,0 +1,105 @@ +nextflow_process { + + name "Test Process TRIMGALORE" + script "../main.nf" + process "TRIMGALORE" + tag "modules" + tag "modules_nfcore" + tag "trimgalore" + + test("test_trimgalore_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + def read_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { report1_lines.each { report1_line -> + { assert path(process.out.log.get(0).get(1)).getText().contains(report1_line) } + } + } + ) + } + } + + test("test_trimgalore_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + def read1_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { report1_lines.each { report1_line -> + { assert path(process.out.log.get(0).get(1).get(0)).getText().contains(report1_line) } + } + }, + { report2_lines.each { report2_line -> + { assert path(process.out.log.get(0).get(1).get(1)).getText().contains(report2_line) } + } + } + ) + } + } +} diff --git a/modules/nf-core/trimgalore/tests/main.nf.test.snap b/modules/nf-core/trimgalore/tests/main.nf.test.snap new file mode 100644 index 0000000..84feacc --- /dev/null +++ b/modules/nf-core/trimgalore/tests/main.nf.test.snap @@ -0,0 +1,148 @@ +{ + "test_trimgalore_single_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test_trimmed.fq.gz:md5,e0a7516b8ea8d6467d6306acb2cd13c4" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastq.gz_trimming_report.txt:md5,a1ab3958205f1ddf48af623242b5b429" + ] + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + "versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ], + "html": [ + + ], + "log": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastq.gz_trimming_report.txt:md5,a1ab3958205f1ddf48af623242b5b429" + ] + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test_trimmed.fq.gz:md5,e0a7516b8ea8d6467d6306acb2cd13c4" + ] + ], + "unpaired": [ + + ], + "versions": [ + "versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ], + "zip": [ + + ] + } + ], + "timestamp": "2023-10-17T15:24:57.782141441" + }, + "test_trimgalore_paired_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1_val_1.fq.gz:md5,e0a7516b8ea8d6467d6306acb2cd13c4", + "test_2_val_2.fq.gz:md5,f3d61189e6d10202da7b8686f1dbb71b" + ] + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastq.gz_trimming_report.txt:md5,315d40465412f9909bbaabf52269274d", + "test_2.fastq.gz_trimming_report.txt:md5,34436303da1c78811103427a2fb57f7b" + ] + ] + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + "versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ], + "html": [ + + ], + "log": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastq.gz_trimming_report.txt:md5,315d40465412f9909bbaabf52269274d", + "test_2.fastq.gz_trimming_report.txt:md5,34436303da1c78811103427a2fb57f7b" + ] + ] + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1_val_1.fq.gz:md5,e0a7516b8ea8d6467d6306acb2cd13c4", + "test_2_val_2.fq.gz:md5,f3d61189e6d10202da7b8686f1dbb71b" + ] + ] + ], + "unpaired": [ + + ], + "versions": [ + "versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ], + "zip": [ + + ] + } + ], + "timestamp": "2023-10-17T15:25:08.513589909" + } +} \ No newline at end of file diff --git a/modules/nf-core/trimgalore/tests/tags.yml b/modules/nf-core/trimgalore/tests/tags.yml new file mode 100644 index 0000000..e993769 --- /dev/null +++ b/modules/nf-core/trimgalore/tests/tags.yml @@ -0,0 +1,2 @@ +trimgalore: + - modules/nf-core/trimgalore/** diff --git a/modules/nf-core/umitools/extract/environment.yml b/modules/nf-core/umitools/extract/environment.yml new file mode 100644 index 0000000..7d08ac0 --- /dev/null +++ b/modules/nf-core/umitools/extract/environment.yml @@ -0,0 +1,7 @@ +name: umitools_extract +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::umi_tools=1.1.4 diff --git a/modules/nf-core/umitools/extract/main.nf b/modules/nf-core/umitools/extract/main.nf new file mode 100644 index 0000000..4bd79e7 --- /dev/null +++ b/modules/nf-core/umitools/extract/main.nf @@ -0,0 +1,56 @@ +process UMITOOLS_EXTRACT { + tag "$meta.id" + label "process_single" + label "process_long" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.4--py38hbff2b2d_1' : + 'biocontainers/umi_tools:1.1.4--py38hbff2b2d_1' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.fastq.gz"), emit: reads + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + """ + umi_tools \\ + extract \\ + -I $reads \\ + -S ${prefix}.umi_extract.fastq.gz \\ + $args \\ + > ${prefix}.umi_extract.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + umitools: \$( umi_tools --version | sed '/version:/!d; s/.*: //' ) + END_VERSIONS + """ + } else { + """ + umi_tools \\ + extract \\ + -I ${reads[0]} \\ + --read2-in=${reads[1]} \\ + -S ${prefix}.umi_extract_1.fastq.gz \\ + --read2-out=${prefix}.umi_extract_2.fastq.gz \\ + $args \\ + > ${prefix}.umi_extract.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + umitools: \$( umi_tools --version | sed '/version:/!d; s/.*: //' ) + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/umitools/extract/meta.yml b/modules/nf-core/umitools/extract/meta.yml new file mode 100644 index 0000000..7695b27 --- /dev/null +++ b/modules/nf-core/umitools/extract/meta.yml @@ -0,0 +1,48 @@ +name: umitools_extract +description: Extracts UMI barcode from a read and add it to the read name, leaving any sample barcode in place +keywords: + - UMI + - barcode + - extract + - umitools +tools: + - umi_tools: + description: > + UMI-tools contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs) and single cell RNA-Seq cell barcodes + documentation: https://umi-tools.readthedocs.io/en/latest/ + license: "MIT" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: list + description: | + List of input FASTQ files whose UMIs will be extracted. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: > + Extracted FASTQ files. | For single-end reads, pattern is \${prefix}.umi_extract.fastq.gz. | For paired-end reads, pattern is \${prefix}.umi_extract_{1,2}.fastq.gz. + pattern: "*.{fastq.gz}" + - log: + type: file + description: Logfile for umi_tools + pattern: "*.{log}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@grst" +maintainers: + - "@drpatelh" + - "@grst" diff --git a/modules/nf-core/umitools/extract/tests/main.nf.test b/modules/nf-core/umitools/extract/tests/main.nf.test new file mode 100644 index 0000000..22242d1 --- /dev/null +++ b/modules/nf-core/umitools/extract/tests/main.nf.test @@ -0,0 +1,35 @@ +nextflow_process { + + name "Test Process UMITOOLS_EXTRACT" + script "../main.nf" + process "UMITOOLS_EXTRACT" + config "./nextflow.config" + tag "modules_nfcore" + tag "modules" + tag "umitools" + tag "umitools/extract" + + test("Should run without failures") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } +} \ No newline at end of file diff --git a/modules/nf-core/umitools/extract/tests/main.nf.test.snap b/modules/nf-core/umitools/extract/tests/main.nf.test.snap new file mode 100644 index 0000000..6d5944f --- /dev/null +++ b/modules/nf-core/umitools/extract/tests/main.nf.test.snap @@ -0,0 +1,10 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,5a18da2d3a5a4de15e7aaae9082d7abb" + ] + ], + "timestamp": "2023-12-08T09:41:43.540658352" + } +} \ No newline at end of file diff --git a/modules/nf-core/umitools/extract/tests/nextflow.config b/modules/nf-core/umitools/extract/tests/nextflow.config new file mode 100644 index 0000000..c866f5a --- /dev/null +++ b/modules/nf-core/umitools/extract/tests/nextflow.config @@ -0,0 +1,9 @@ +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + + withName: UMITOOLS_EXTRACT { + ext.args = '--bc-pattern="NNNN"' + } + +} diff --git a/modules/nf-core/umitools/extract/tests/tags.yml b/modules/nf-core/umitools/extract/tests/tags.yml new file mode 100644 index 0000000..c3fb23d --- /dev/null +++ b/modules/nf-core/umitools/extract/tests/tags.yml @@ -0,0 +1,2 @@ +umitools/extract: + - modules/nf-core/umitools/extract/** diff --git a/modules/nf-core/untar/environment.yml b/modules/nf-core/untar/environment.yml new file mode 100644 index 0000000..0c9cbb1 --- /dev/null +++ b/modules/nf-core/untar/environment.yml @@ -0,0 +1,11 @@ +name: untar + +channels: + - conda-forge + - bioconda + - defaults + +dependencies: + - conda-forge::grep=3.11 + - conda-forge::sed=4.7 + - conda-forge::tar=1.34 diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf new file mode 100644 index 0000000..8a75bb9 --- /dev/null +++ b/modules/nf-core/untar/main.nf @@ -0,0 +1,63 @@ +process UNTAR { + tag "$archive" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$prefix"), emit: untar + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, "")) + + """ + mkdir $prefix + + ## Ensures --strip-components only applied when top level of tar contents is a directory + ## If just files or multiple directories, place all in prefix + if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then + tar \\ + -C $prefix --strip-components 1 \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + else + tar \\ + -C $prefix \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, "")) + """ + mkdir $prefix + touch ${prefix}/file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml new file mode 100644 index 0000000..a9a2110 --- /dev/null +++ b/modules/nf-core/untar/meta.yml @@ -0,0 +1,46 @@ +name: untar +description: Extract files. +keywords: + - untar + - uncompress + - extract +tools: + - untar: + description: | + Extract tar.gz files. + documentation: https://www.gnu.org/software/tar/manual/ + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be untar + pattern: "*.{tar}.{gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - untar: + type: directory + description: Directory containing contents of archive + pattern: "*/" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" diff --git a/modules/nf-core/untar/tests/main.nf.test b/modules/nf-core/untar/tests/main.nf.test new file mode 100644 index 0000000..679e83c --- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test @@ -0,0 +1,55 @@ +nextflow_process { + + name "Test Process UNTAR" + script "../main.nf" + process "UNTAR" + + tag "modules" + tag "modules_nfcore" + tag "untar" + + test("test_untar") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kraken2.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.untar).match("test_untar") }, + ) + } + + } + + test("test_untar_onlyfiles") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'generic/tar/hello.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.untar).match("test_untar_onlyfiles") }, + ) + } + + } + +} diff --git a/modules/nf-core/untar/tests/main.nf.test.snap b/modules/nf-core/untar/tests/main.nf.test.snap new file mode 100644 index 0000000..ace4257 --- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test.snap @@ -0,0 +1,34 @@ +{ + "test_untar_onlyfiles": { + "content": [ + [ + [ + [ + + ], + [ + "hello.txt:md5,e59ff97941044f85df5297e1c302d260" + ] + ] + ] + ], + "timestamp": "2023-10-18T11:56:46.878844" + }, + "test_untar": { + "content": [ + [ + [ + [ + + ], + [ + "hash.k2d:md5,8b8598468f54a7087c203ad0190555d9", + "opts.k2d:md5,a033d00cf6759407010b21700938f543", + "taxo.k2d:md5,094d5891cdccf2f1468088855c214b2c" + ] + ] + ] + ], + "timestamp": "2023-10-18T11:56:08.16574" + } +} \ No newline at end of file diff --git a/modules/nf-core/untar/tests/tags.yml b/modules/nf-core/untar/tests/tags.yml new file mode 100644 index 0000000..feb6f15 --- /dev/null +++ b/modules/nf-core/untar/tests/tags.yml @@ -0,0 +1,2 @@ +untar: + - modules/nf-core/untar/** diff --git a/nextflow.config b/nextflow.config index e2f3488..7308f0f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,15 +9,91 @@ // Global default params, used in configs params { - // TODO nf-core: Specify your pipeline's command line flags // Input options input = null // References + skip_gtf_filter = false genome = null - igenomes_base = 's3://ngi-igenomes/igenomes/' + splicesites = null + gtf_extra_attributes = 'gene_name' + gtf_group_features = 'gene_id' + skip_gtf_filter = false + skip_gtf_transcript_filter = false + igenomes_base = 's3://ngi-igenomes/igenomes' igenomes_ignore = false + gencode = false + save_reference = false + featurecounts_group_type = 'gene_biotype' + featurecounts_feature_type = 'exon' + // UMI handling + with_umi = false + skip_umi_extract = false + umitools_extract_method = 'string' + umitools_grouping_method = 'directional' + umitools_dedup_stats = false + umitools_bc_pattern = null + umitools_bc_pattern2 = null + umitools_umi_separator = null + umi_discard_read = null + save_umi_intermeds = false + + // Trimming + trimmer = 'trimgalore' + min_trimmed_reads = 10000 + extra_trimgalore_args = null + extra_fastp_args = null + save_trimmed = false + skip_trimming = false + // BBSplit genome filtering + bbsplit_fasta_list = null + save_bbsplit_reads = false + skip_bbsplit = true + + // Ribosomal RNA removal + remove_ribo_rna = true + save_non_ribo_reads = false + ribo_database_manifest = "${projectDir}/assets/rrna-db-defaults.txt" + + // Alignment + aligner = 'star_salmon' + pseudo_aligner = null + pseudo_aligner_kmer_size = 31 + seq_center = null + bam_csi_index = false + star_ignore_sjdbgtf = false + salmon_quant_libtype = null + hisat2_build_memory = '200.GB' // Amount of memory required to build HISAT2 index with splice sites + stringtie_ignore_gtf = false + min_mapped_reads = 5 + extra_star_align_args = null + extra_salmon_quant_args = null + extra_kallisto_quant_args = null + kallisto_quant_fraglen = 200 + kallisto_quant_fraglen_sd = 200 + save_merged_fastq = false + save_unaligned = false + save_align_intermeds = false + skip_markduplicates = false + skip_alignment = true + skip_pseudo_alignment = false + + // QC + skip_qc = false + skip_bigwig = false + skip_stringtie = false + skip_fastqc = false + skip_preseq = true + skip_dupradar = false + skip_qualimap = false + skip_rseqc = false + skip_biotype_qc = false + skip_deseq2_qc = false + skip_multiqc = false + deseq2_vst = true + rseqc_modules = 'bam_stat,inner_distance,infer_experiment,junction_annotation,junction_saturation,read_distribution,read_duplication' + // MultiQC options multiqc_config = null multiqc_title = null @@ -43,6 +119,7 @@ params { custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" config_profile_contact = null config_profile_url = null + test_data_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/riboseq/testdata/' // Max resource options diff --git a/nextflow_schema.json b/nextflow_schema.json index ecc8a6c..800ba5e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -18,6 +18,7 @@ "exists": true, "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", + "schema": "assets/schema_input.json", "description": "Path to comma-separated file containing information about the samples in the experiment.", "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/riboseq/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" @@ -52,7 +53,7 @@ "type": "string", "description": "Name of iGenomes reference.", "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." + "help_text": "If using a reference genome configured in the pipeline using iGenomes (not recommended), use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." }, "fasta": { "type": "string", @@ -61,9 +62,148 @@ "mimetype": "text/plain", "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", "description": "Path to FASTA genome file.", - "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", + "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have the appropriate alignment index available this will be generated for you automatically. Combine with `--save_reference` to save alignment index for future runs.", "fa_icon": "far fa-file-code" }, + "gtf": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", + "pattern": "^\\S+\\.gtf(\\.gz)?$", + "description": "Path to GTF annotation file.", + "fa_icon": "fas fa-code-branch", + "help_text": "This parameter is *mandatory* if `--genome` is not specified." + }, + "gff": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", + "pattern": "^\\S+\\.gff(\\.gz)?$", + "fa_icon": "fas fa-code-branch", + "description": "Path to GFF3 annotation file.", + "help_text": "This parameter must be specified if `--genome` or `--gtf` are not specified." + }, + "gene_bed": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", + "pattern": "^\\S+\\.bed(\\.gz)?$", + "fa_icon": "fas fa-procedures", + "description": "Path to BED file containing gene intervals. This will be created from the GTF file if not specified." + }, + "transcript_fasta": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", + "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", + "fa_icon": "far fa-file-code", + "description": "Path to FASTA transcriptome file." + }, + "additional_fasta": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", + "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", + "fa_icon": "far fa-file-code", + "description": "FASTA file to concatenate to genome FASTA file e.g. containing spike-in sequences.", + "help_text": "If provided, the sequences in this file will get concatenated to the existing genome FASTA file, a GTF file will be automatically created using the entire sequence as the gene, transcript, and exon features, and any alignment index will get created from the combined FASTA and GTF. It is recommended to save the reference with `--save_reference` to re-use the index for future runs so you do not need to create it again." + }, + "splicesites": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", + "fa_icon": "fas fa-hand-scissors", + "description": "Splice sites file required for HISAT2." + }, + "star_index": { + "type": "string", + "format": "path", + "exists": true, + "fa_icon": "fas fa-bezier-curve", + "description": "Path to directory or tar.gz archive for pre-built STAR index." + }, + "hisat2_index": { + "type": "string", + "format": "path", + "exists": true, + "fa_icon": "fas fa-bezier-curve", + "description": "Path to directory or tar.gz archive for pre-built HISAT2 index." + }, + "rsem_index": { + "type": "string", + "format": "path", + "exists": true, + "fa_icon": "fas fa-bezier-curve", + "description": "Path to directory or tar.gz archive for pre-built RSEM index." + }, + "salmon_index": { + "type": "string", + "format": "path", + "exists": true, + "fa_icon": "fas fa-bezier-curve", + "description": "Path to directory or tar.gz archive for pre-built Salmon index." + }, + "kallisto_index": { + "type": "string", + "format": "path", + "exists": true, + "fa_icon": "fas fa-bezier-curve", + "description": "Path to directory or tar.gz archive for pre-built Kallisto index." + }, + "hisat2_build_memory": { + "type": "string", + "default": "200.GB", + "fa_icon": "fas fa-memory", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "description": "Minimum memory required to use splice sites and exons in the HiSAT2 index build process.", + "help_text": "HiSAT2 requires a huge amount of RAM to build a genome index for larger genomes, if including splice sites and exons e.g. the human genome might typically require 200GB. If you specify less than this threshold for the `HISAT2_BUILD` process then the splice sites and exons will be ignored, meaning that the process will require a lot less memory. If you are working with a small genome, set this parameter to a lower value to reduce the threshold for skipping this check. If using a larger genome, consider supplying more memory to the `HISAT2_BUILD` process." + }, + "gencode": { + "type": "boolean", + "fa_icon": "fas fa-code-branch", + "description": "Specify if your GTF annotation is in GENCODE format.", + "help_text": "If your GTF file is in GENCODE format and you would like to run Salmon i.e. `--pseudo_aligner salmon`, you will need to provide this parameter in order to build the Salmon index appropriately." + }, + "gtf_extra_attributes": { + "type": "string", + "default": "gene_name", + "fa_icon": "fas fa-plus-square", + "description": "By default, the pipeline uses the `gene_name` field to obtain additional gene identifiers from the input GTF file when running Salmon.", + "help_text": "This behaviour can be modified by specifying `--gtf_extra_attributes` when running the pipeline. Note that you can also specify more than one desired value, separated by a comma e.g. `--gtf_extra_attributes gene_id,...`.\n" + }, + "gtf_group_features": { + "type": "string", + "default": "gene_id", + "description": "Define the attribute type used to group features in the GTF file when running Salmon.", + "fa_icon": "fas fa-layer-group" + }, + "featurecounts_group_type": { + "type": "string", + "default": "gene_biotype", + "fa_icon": "fas fa-layer-group", + "description": "The attribute type used to group feature types in the GTF file when generating the biotype plot with featureCounts." + }, + "featurecounts_feature_type": { + "type": "string", + "default": "exon", + "description": "By default, the pipeline assigns reads based on the 'exon' attribute within the GTF file.", + "fa_icon": "fas fa-indent", + "help_text": "The feature type used from the GTF file when generating the biotype plot with featureCounts." + }, + "igenomes_base": { + "type": "string", + "format": "directory-path", + "description": "Directory / URL base for iGenomes references.", + "default": "s3://ngi-igenomes/igenomes", + "fa_icon": "fas fa-cloud-download-alt", + "hidden": true + }, "igenomes_ignore": { "type": "boolean", "description": "Do not load the iGenomes reference config.", @@ -71,6 +211,421 @@ "hidden": true, "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." } + }, + "required": ["fasta"] + }, + "read_trimming_options": { + "title": "Read trimming options", + "type": "object", + "fa_icon": "fas fa-cut", + "description": "Options to adjust read trimming criteria.", + "properties": { + "trimmer": { + "type": "string", + "default": "trimgalore", + "description": "Specifies the trimming tool to use - available options are 'trimgalore' and 'fastp'.", + "fa_icon": "fas fa-cut", + "enum": ["trimgalore", "fastp"] + }, + "extra_trimgalore_args": { + "type": "string", + "description": "Extra arguments to pass to Trim Galore! command in addition to defaults defined by the pipeline.", + "fa_icon": "fas fa-plus" + }, + "extra_fastp_args": { + "type": "string", + "description": "Extra arguments to pass to fastp command in addition to defaults defined by the pipeline.", + "fa_icon": "fas fa-plus" + }, + "min_trimmed_reads": { + "type": "integer", + "default": 10000, + "fa_icon": "fas fa-hand-paper", + "description": "Minimum number of trimmed reads below which samples are removed from further processing. Some downstream steps in the pipeline will fail if this threshold is too low." + } + } + }, + "read_filtering_options": { + "title": "Read filtering options", + "type": "object", + "description": "Options for filtering reads prior to alignment", + "default": "", + "properties": { + "bbsplit_fasta_list": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", + "fa_icon": "fas fa-list-alt", + "description": "Path to comma-separated file containing a list of reference genomes to filter reads against with BBSplit. You have to also explicitly set `--skip_bbsplit false` if you want to use BBSplit.", + "help_text": "The file should contain 2 columns: short name and full path to reference genome(s) e.g. \n```\nmm10,/path/to/mm10.fa\necoli,/path/to/ecoli.fa\n```" + }, + "bbsplit_index": { + "type": "string", + "format": "path", + "exists": true, + "fa_icon": "fas fa-bezier-curve", + "description": "Path to directory or tar.gz archive for pre-built BBSplit index.", + "help_text": "The BBSplit index will have to be built at least once with this pipeline (see `--save_reference` to save index). It can then be provided via `--bbsplit_index` for future runs." + }, + "remove_ribo_rna": { + "type": "boolean", + "fa_icon": "fas fa-trash-alt", + "description": "Enable the removal of reads derived from ribosomal RNA using SortMeRNA.", + "help_text": "Any patterns found in the sequences defined by the '--ribo_database_manifest' parameter will be used." + }, + "ribo_database_manifest": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", + "default": "${projectDir}/assets/rrna-db-defaults.txt", + "fa_icon": "fas fa-database", + "description": "Text file containing paths to fasta files (one per line) that will be used to create the database for SortMeRNA.", + "help_text": "By default, [rRNA databases](https://github.com/biocore/sortmerna/tree/master/data/rRNA_databases) defined in the SortMeRNA GitHub repo are used. You can see an example in the pipeline Github repository in `assets/rrna-default-dbs.txt`.\nPlease note that commercial/non-academic entities require [`licensing for SILVA`](https://www.arb-silva.de/silva-license-information) for these default databases." + } + }, + "fa_icon": "fas fa-trash-alt" + }, + "umi_options": { + "title": "UMI options", + "type": "object", + "description": "Options for processing reads with unique molecular identifiers", + "default": "", + "properties": { + "with_umi": { + "type": "boolean", + "fa_icon": "fas fa-barcode", + "description": "Enable UMI-based read deduplication." + }, + "umitools_extract_method": { + "type": "string", + "default": "string", + "fa_icon": "fas fa-barcode", + "description": "UMI pattern to use. Can be either 'string' (default) or 'regex'.", + "help_text": "More details can be found in the [UMI-tools documentation](https://umi-tools.readthedocs.io/en/latest/reference/extract.html#extract-method).\n" + }, + "umitools_bc_pattern": { + "type": "string", + "fa_icon": "fas fa-barcode", + "help_text": "More details can be found in the [UMI-tools documentation](https://umi-tools.readthedocs.io/en/latest/reference/extract.html#extract-method).", + "description": "The UMI barcode pattern to use e.g. 'NNNNNN' indicates that the first 6 nucleotides of the read are from the UMI." + }, + "umitools_bc_pattern2": { + "type": "string", + "fa_icon": "fas fa-barcode", + "description": "The UMI barcode pattern to use if the UMI is located in read 2." + }, + "umi_discard_read": { + "type": "integer", + "fa_icon": "fas fa-barcode", + "description": "After UMI barcode extraction discard either R1 or R2 by setting this parameter to 1 or 2, respectively." + }, + "umitools_umi_separator": { + "type": "string", + "fa_icon": "fas fa-star-half-alt", + "description": "The character that separates the UMI in the read name. Most likely a colon if you skipped the extraction with UMI-tools and used other software." + }, + "umitools_grouping_method": { + "type": "string", + "default": "directional", + "fa_icon": "far fa-object-ungroup", + "description": "Method to use to determine read groups by subsuming those with similar UMIs. All methods start by identifying the reads with the same mapping position, but treat similar yet nonidentical UMIs differently.", + "enum": ["unique", "percentile", "cluster", "adjacency", "directional"] + }, + "umitools_dedup_stats": { + "type": "boolean", + "fa_icon": "fas fa-barcode", + "help_text": "It can be quite time consuming generating these output stats - see [#827](https://github.com/nf-core/rnaseq/issues/827).", + "description": "Generate output stats when running \"umi_tools dedup\"." + } + }, + "fa_icon": "fas fa-barcode" + }, + "alignment_options": { + "title": "Alignment options", + "type": "object", + "fa_icon": "fas fa-map-signs", + "description": "Options to adjust parameters and filtering criteria for read alignments.", + "properties": { + "aligner": { + "type": "string", + "default": "star_salmon", + "description": "Specifies the alignment algorithm to use - available options are 'star_salmon', 'star_rsem' and 'hisat2'.", + "fa_icon": "fas fa-map-signs", + "enum": ["star_salmon", "star_rsem", "hisat2"] + }, + "pseudo_aligner": { + "type": "string", + "description": "Specifies the pseudo aligner to use - available options are 'salmon'. Runs in addition to '--aligner'.", + "fa_icon": "fas fa-hamburger", + "enum": ["salmon", "kallisto"] + }, + "pseudo_aligner_kmer_size": { + "type": "integer", + "default": 31, + "description": "Kmer length passed to indexing step of pseudoaligners", + "help_text": "Failure to set a good kmer size could cause issues with quantification with Kallisto or Salmon. This is mostly an issue for short reads (<50bp), where the default kmer size of 31 is an problem.", + "fa_icon": "fas fa-ruler-horizontal" + }, + "bam_csi_index": { + "type": "boolean", + "description": "Create a CSI index for BAM files instead of the traditional BAI index. This will be required for genomes with larger chromosome sizes.", + "fa_icon": "fas fa-sort-alpha-down" + }, + "star_ignore_sjdbgtf": { + "type": "boolean", + "fa_icon": "fas fa-ban", + "description": "When using pre-built STAR indices do not re-extract and use splice junctions from the GTF file." + }, + "salmon_quant_libtype": { + "type": "string", + "fa_icon": "fas fa-fast-forward", + "description": " Override Salmon library type inferred based on strandedness defined in meta object.", + "help_text": "See [Salmon docs](https://salmon.readthedocs.io/en/latest/library_type.html).", + "enum": [ + "A", + "IS", + "ISF", + "ISR", + "IU", + "MS", + "MSF", + "MSR", + "MU", + "OS", + "OSF", + "OSR", + "OU", + "SF", + "SR", + "U" + ] + }, + "min_mapped_reads": { + "type": "number", + "default": 5, + "fa_icon": "fas fa-percentage", + "description": "Minimum percentage of uniquely mapped reads below which samples are removed from further processing.", + "help_text": "Some downstream steps in the pipeline will fail if this threshold is too low." + }, + "seq_center": { + "type": "string", + "description": "Sequencing center information to be added to read group of BAM files.", + "fa_icon": "fas fa-synagogue" + }, + "stringtie_ignore_gtf": { + "type": "boolean", + "description": "Perform reference-guided de novo assembly of transcripts using StringTie i.e. dont restrict to those in GTF file.", + "fa_icon": "fas fa-ban" + }, + "extra_star_align_args": { + "type": "string", + "description": "Extra arguments to pass to STAR alignment command in addition to defaults defined by the pipeline. Only available for the STAR-Salmon route.", + "fa_icon": "fas fa-plus" + }, + "extra_salmon_quant_args": { + "type": "string", + "description": "Extra arguments to pass to Salmon quant command in addition to defaults defined by the pipeline.", + "fa_icon": "fas fa-plus" + }, + "extra_kallisto_quant_args": { + "type": "string", + "description": "Extra arguments to pass to Kallisto quant command in addition to defaults defined by the pipeline.", + "fa_icon": "fas fa-plus" + }, + "kallisto_quant_fraglen": { + "type": "integer", + "description": "In single-end mode Kallisto requires an estimated fragment length. Specify a default value for that here. TODO: use existing RSeQC results to do this dynamically.", + "default": 200, + "fa_icon": "fas fa-ruler-horizontal" + }, + "kallisto_quant_fraglen_sd": { + "type": "integer", + "description": "In single-end mode, Kallisto requires an estimated standard error for fragment length. Specify a default value for that here. TODO: use existing RSeQC results to do this dynamically.", + "default": 200, + "fa_icon": "fas fa-sort-amount-up-alt" + } + } + }, + "optional_outputs": { + "title": "Optional outputs", + "type": "object", + "description": "Additional output files produces as intermediates that can be saved", + "default": "", + "properties": { + "save_merged_fastq": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Save FastQ files after merging re-sequenced libraries in the results directory." + }, + "save_umi_intermeds": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "If this option is specified, intermediate FastQ and BAM files produced by UMI-tools are also saved in the results directory." + }, + "save_non_ribo_reads": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "If this option is specified, intermediate FastQ files containing non-rRNA reads will be saved in the results directory." + }, + "save_bbsplit_reads": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "If this option is specified, FastQ files split by reference will be saved in the results directory." + }, + "save_reference": { + "type": "boolean", + "description": "If generated by the pipeline save the STAR index in the results directory.", + "help_text": "If an alignment index is generated by the pipeline use this parameter to save it to your results folder. These can then be used for future pipeline runs, reducing processing times.", + "fa_icon": "fas fa-save" + }, + "save_trimmed": { + "type": "boolean", + "description": "Save the trimmed FastQ files in the results directory.", + "help_text": "By default, trimmed FastQ files will not be saved to the results directory. Specify this flag (or set to true in your config file) to copy these files to the results directory when complete.", + "fa_icon": "fas fa-save" + }, + "save_align_intermeds": { + "type": "boolean", + "description": "Save the intermediate BAM files from the alignment step.", + "help_text": "By default, intermediate BAM files will not be saved. The final BAM files created after the appropriate filtering step are always saved to limit storage usage. Set this parameter to also save other intermediate BAM files.", + "fa_icon": "fas fa-save" + }, + "save_unaligned": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Where possible, save unaligned reads from either STAR, HISAT2 or Salmon to the results directory.", + "help_text": "This may either be in the form of FastQ or BAM files depending on the options available for that particular tool." + } + } + }, + "quality_control": { + "title": "Quality Control", + "type": "object", + "description": "Additional quality control options.", + "default": "", + "properties": { + "deseq2_vst": { + "type": "boolean", + "description": "Use vst transformation instead of rlog with DESeq2.", + "help_text": "See [DESeq2 docs](http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization).", + "fa_icon": "fas fa-dolly", + "default": true + }, + "rseqc_modules": { + "type": "string", + "default": "bam_stat,inner_distance,infer_experiment,junction_annotation,junction_saturation,read_distribution,read_duplication", + "fa_icon": "fas fa-chart-pie", + "description": "Specify the RSeQC modules to run." + } + } + }, + "process_skipping_options": { + "title": "Process skipping options", + "type": "object", + "fa_icon": "fas fa-fast-forward", + "description": "Options to skip various steps within the workflow.", + "properties": { + "skip_gtf_filter": { + "type": "boolean", + "fa_icon": "fas fa-forward", + "description": "Skip filtering of GTF for valid scaffolds and/ or transcript IDs.", + "help_text": "If you're confident on the validity of the GTF with respect to the genome fasta file, or wish to disregard failures thriggered by the filtering module, activate this option." + }, + "skip_gtf_transcript_filter": { + "type": "boolean", + "fa_icon": "fas fa-forward", + "description": "Skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline." + }, + "skip_bbsplit": { + "type": "boolean", + "default": true, + "fa_icon": "fas fa-fast-forward", + "description": "Skip BBSplit for removal of non-reference genome reads." + }, + "skip_umi_extract": { + "type": "boolean", + "fa_icon": "fas fa-compress-alt", + "description": "Skip the UMI extraction from the read in case the UMIs have been moved to the headers in advance of the pipeline run." + }, + "skip_trimming": { + "type": "boolean", + "description": "Skip the adapter trimming step.", + "help_text": "Use this if your input FastQ files have already been trimmed outside of the workflow or if you're very confident that there is no adapter contamination in your data.", + "fa_icon": "fas fa-fast-forward" + }, + "skip_alignment": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "description": "Skip all of the alignment-based processes within the pipeline." + }, + "skip_pseudo_alignment": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "description": "Skip all of the pseudoalignment-based processes within the pipeline." + }, + "skip_markduplicates": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "description": "Skip picard MarkDuplicates step." + }, + "skip_bigwig": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "description": "Skip bigWig file creation." + }, + "skip_stringtie": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "description": "Skip StringTie." + }, + "skip_fastqc": { + "type": "boolean", + "description": "Skip FastQC.", + "fa_icon": "fas fa-fast-forward" + }, + "skip_preseq": { + "type": "boolean", + "description": "Skip Preseq.", + "fa_icon": "fas fa-fast-forward", + "default": true + }, + "skip_dupradar": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "description": "Skip dupRadar." + }, + "skip_qualimap": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "description": "Skip Qualimap." + }, + "skip_rseqc": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "description": "Skip RSeQC." + }, + "skip_biotype_qc": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "description": "Skip additional featureCounts process for biotype QC." + }, + "skip_deseq2_qc": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "description": "Skip DESeq2 PCA and heatmap plotting." + }, + "skip_multiqc": { + "type": "boolean", + "description": "Skip MultiQC.", + "fa_icon": "fas fa-fast-forward" + }, + "skip_qc": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "description": "Skip all QC steps except for MultiQC." + } } }, "institutional_config_options": { @@ -118,6 +673,13 @@ "description": "Institutional config URL link.", "hidden": true, "fa_icon": "fas fa-users-cog" + }, + "test_data_base": { + "type": "string", + "default": "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq3", + "description": "Base path / URL for data used in the test profiles", + "help_text": "Warning: The `-profile test` samplesheet file itself contains remote paths. Setting this parameter does not alter the contents of that file.", + "hidden": true } } }, @@ -201,7 +763,6 @@ "max_multiqc_email_size": { "type": "string", "description": "File size limit when attaching MultiQC reports to summary emails.", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", "default": "25.MB", "fa_icon": "fas fa-file-upload", "hidden": true @@ -222,18 +783,26 @@ "multiqc_config": { "type": "string", "format": "file-path", + "exists": true, + "mimetype": "text/plain", "description": "Custom config file to supply to MultiQC.", "fa_icon": "fas fa-cog", "hidden": true }, "multiqc_logo": { "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", "description": "Custom logo file to supply to MultiQC. File name must also be set in the MultiQC config file", "fa_icon": "fas fa-image", "hidden": true }, "multiqc_methods_description": { "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", "description": "Custom MultiQC yaml file containing HTML including a methods description.", "fa_icon": "fas fa-cog" }, @@ -275,6 +844,27 @@ { "$ref": "#/definitions/reference_genome_options" }, + { + "$ref": "#/definitions/read_trimming_options" + }, + { + "$ref": "#/definitions/read_filtering_options" + }, + { + "$ref": "#/definitions/umi_options" + }, + { + "$ref": "#/definitions/alignment_options" + }, + { + "$ref": "#/definitions/optional_outputs" + }, + { + "$ref": "#/definitions/quality_control" + }, + { + "$ref": "#/definitions/process_skipping_options" + }, { "$ref": "#/definitions/institutional_config_options" }, diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf new file mode 100644 index 0000000..0840c77 --- /dev/null +++ b/subworkflows/local/prepare_genome/main.nf @@ -0,0 +1,304 @@ +// +// Uncompress and prepare reference genome files +// + +include { GUNZIP as GUNZIP_FASTA } from '../../../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_GTF } from '../../../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_GFF } from '../../../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_GENE_BED } from '../../../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_TRANSCRIPT_FASTA } from '../../../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_ADDITIONAL_FASTA } from '../../../modules/nf-core/gunzip' + +include { UNTAR as UNTAR_BBSPLIT_INDEX } from '../../../modules/nf-core/untar' +include { UNTAR as UNTAR_STAR_INDEX } from '../../../modules/nf-core/untar' +include { UNTAR as UNTAR_RSEM_INDEX } from '../../../modules/nf-core/untar' +include { UNTAR as UNTAR_HISAT2_INDEX } from '../../../modules/nf-core/untar' +include { UNTAR as UNTAR_SALMON_INDEX } from '../../../modules/nf-core/untar' +include { UNTAR as UNTAR_KALLISTO_INDEX } from '../../../modules/nf-core/untar' + +include { CUSTOM_GETCHROMSIZES } from '../../../modules/nf-core/custom/getchromsizes' +include { GFFREAD } from '../../../modules/nf-core/gffread' +include { BBMAP_BBSPLIT } from '../../../modules/nf-core/bbmap/bbsplit' +include { STAR_GENOMEGENERATE } from '../../../modules/nf-core/star/genomegenerate' +include { HISAT2_EXTRACTSPLICESITES } from '../../../modules/nf-core/hisat2/extractsplicesites' +include { HISAT2_BUILD } from '../../../modules/nf-core/hisat2/build' +include { SALMON_INDEX } from '../../../modules/nf-core/salmon/index' +include { KALLISTO_INDEX } from '../../../modules/nf-core/kallisto/index' +include { RSEM_PREPAREREFERENCE as RSEM_PREPAREREFERENCE_GENOME } from '../../../modules/nf-core/rsem/preparereference' +include { RSEM_PREPAREREFERENCE as MAKE_TRANSCRIPTS_FASTA } from '../../../modules/nf-core/rsem/preparereference' + +include { PREPROCESS_TRANSCRIPTS_FASTA_GENCODE } from '../../../modules/local/preprocess_transcripts_fasta_gencode' +include { GTF2BED } from '../../../modules/local/gtf2bed' +include { CAT_ADDITIONAL_FASTA } from '../../../modules/local/cat_additional_fasta' +include { GTF_FILTER } from '../../../modules/local/gtf_filter' +include { STAR_GENOMEGENERATE_IGENOMES } from '../../../modules/local/star_genomegenerate_igenomes' + +workflow PREPARE_GENOME { + take: + fasta // file: /path/to/genome.fasta + gtf // file: /path/to/genome.gtf + gff // file: /path/to/genome.gff + additional_fasta // file: /path/to/additional.fasta + transcript_fasta // file: /path/to/transcript.fasta + gene_bed // file: /path/to/gene.bed + splicesites // file: /path/to/splicesites.txt + bbsplit_fasta_list // file: /path/to/bbsplit_fasta_list.txt + star_index // directory: /path/to/star/index/ + rsem_index // directory: /path/to/rsem/index/ + salmon_index // directory: /path/to/salmon/index/ + kallisto_index // directory: /path/to/kallisto/index/ + hisat2_index // directory: /path/to/hisat2/index/ + bbsplit_index // directory: /path/to/rsem/index/ + gencode // boolean: whether the genome is from GENCODE + is_aws_igenome // boolean: whether the genome files are from AWS iGenomes + biotype // string: if additional fasta file is provided biotype value to use when appending entries to GTF file + prepare_tool_indices // list: tools to prepare indices for + filter_gtf // boolean: whether to filter GTF file + + main: + + ch_versions = Channel.empty() + + // + // Uncompress genome fasta file if required + // + if (fasta.endsWith('.gz')) { + ch_fasta = GUNZIP_FASTA ( [ [:], fasta ] ).gunzip.map { it[1] } + ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions) + } else { + ch_fasta = Channel.value(file(fasta)) + } + + // + // Uncompress GTF annotation file or create from GFF3 if required + // + if (gtf || gff) { + if (gtf) { + if (gtf.endsWith('.gz')) { + ch_gtf = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] } + ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) + } else { + ch_gtf = Channel.value(file(gtf)) + } + } else if (gff) { + if (gff.endsWith('.gz')) { + ch_gff = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] } + ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions) + } else { + ch_gff = Channel.value(file(gff)) + } + ch_gtf = GFFREAD ( ch_gff ).gtf + ch_versions = ch_versions.mix(GFFREAD.out.versions) + } + + if (filter_gtf) { + GTF_FILTER ( ch_fasta, ch_gtf ) + ch_gtf = GTF_FILTER.out.genome_gtf + ch_versions = ch_versions.mix(GTF_FILTER.out.versions) + } + } + + // + // Uncompress additional fasta file and concatenate with reference fasta and gtf files + // + if (additional_fasta) { + if (additional_fasta.endsWith('.gz')) { + ch_add_fasta = GUNZIP_ADDITIONAL_FASTA ( [ [:], additional_fasta ] ).gunzip.map { it[1] } + ch_versions = ch_versions.mix(GUNZIP_ADDITIONAL_FASTA.out.versions) + } else { + ch_add_fasta = Channel.value(file(additional_fasta)) + } + CAT_ADDITIONAL_FASTA ( ch_fasta, ch_gtf, ch_add_fasta, biotype ) + ch_fasta = CAT_ADDITIONAL_FASTA.out.fasta + ch_gtf = CAT_ADDITIONAL_FASTA.out.gtf + ch_versions = ch_versions.mix(CAT_ADDITIONAL_FASTA.out.versions) + } + + // + // Uncompress gene BED annotation file or create from GTF if required + // + if (gene_bed) { + if (gene_bed.endsWith('.gz')) { + ch_gene_bed = GUNZIP_GENE_BED ( [ [:], gene_bed ] ).gunzip.map { it[1] } + ch_versions = ch_versions.mix(GUNZIP_GENE_BED.out.versions) + } else { + ch_gene_bed = Channel.value(file(gene_bed)) + } + } else { + ch_gene_bed = GTF2BED ( ch_gtf ).bed + ch_versions = ch_versions.mix(GTF2BED.out.versions) + } + + // + // Uncompress transcript fasta file / create if required + // + if (transcript_fasta) { + if (transcript_fasta.endsWith('.gz')) { + ch_transcript_fasta = GUNZIP_TRANSCRIPT_FASTA ( [ [:], transcript_fasta ] ).gunzip.map { it[1] } + ch_versions = ch_versions.mix(GUNZIP_TRANSCRIPT_FASTA.out.versions) + } else { + ch_transcript_fasta = Channel.value(file(transcript_fasta)) + } + if (gencode) { + PREPROCESS_TRANSCRIPTS_FASTA_GENCODE ( ch_transcript_fasta ) + ch_transcript_fasta = PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.fasta + ch_versions = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions) + } + } else { + ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_gtf ).transcript_fasta + ch_versions = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions) + } + + // + // Create chromosome sizes file + // + CUSTOM_GETCHROMSIZES ( ch_fasta.map { [ [:], it ] } ) + ch_fai = CUSTOM_GETCHROMSIZES.out.fai.map { it[1] } + ch_chrom_sizes = CUSTOM_GETCHROMSIZES.out.sizes.map { it[1] } + ch_versions = ch_versions.mix(CUSTOM_GETCHROMSIZES.out.versions) + + // + // Uncompress BBSplit index or generate from scratch if required + // + ch_bbsplit_index = Channel.empty() + if ('bbsplit' in prepare_tool_indices) { + if (bbsplit_index) { + if (bbsplit_index.endsWith('.tar.gz')) { + ch_bbsplit_index = UNTAR_BBSPLIT_INDEX ( [ [:], bbsplit_index ] ).untar.map { it[1] } + ch_versions = ch_versions.mix(UNTAR_BBSPLIT_INDEX.out.versions) + } else { + ch_bbsplit_index = Channel.value(file(bbsplit_index)) + } + } else { + Channel + .from(file(bbsplit_fasta_list)) + .splitCsv() // Read in 2 column csv file: short_name,path_to_fasta + .flatMap { id, fasta -> [ [ 'id', id ], [ 'fasta', file(fasta, checkIfExists: true) ] ] } // Flatten entries to be able to groupTuple by a common key + .groupTuple() + .map { it -> it[1] } // Get rid of keys and keep grouped values + .collect { [ it ] } // Collect entries as a list to pass as "tuple val(short_names), path(path_to_fasta)" to module + .set { ch_bbsplit_fasta_list } + + ch_bbsplit_index = BBMAP_BBSPLIT ( [ [:], [] ], [], ch_fasta, ch_bbsplit_fasta_list, true ).index + ch_versions = ch_versions.mix(BBMAP_BBSPLIT.out.versions) + } + } + + // + // Uncompress STAR index or generate from scratch if required + // + ch_star_index = Channel.empty() + if ('star_salmon' in prepare_tool_indices) { + if (star_index) { + if (star_index.endsWith('.tar.gz')) { + ch_star_index = UNTAR_STAR_INDEX ( [ [:], star_index ] ).untar.map { it[1] } + ch_versions = ch_versions.mix(UNTAR_STAR_INDEX.out.versions) + } else { + ch_star_index = Channel.value(file(star_index)) + } + } else { + if (is_aws_igenome) { + ch_star_index = STAR_GENOMEGENERATE_IGENOMES ( ch_fasta, ch_gtf ).index + ch_versions = ch_versions.mix(STAR_GENOMEGENERATE_IGENOMES.out.versions) + } else { + ch_star_index = STAR_GENOMEGENERATE ( ch_fasta.map { [ [:], it ] }, ch_gtf.map { [ [:], it ] } ).index.map { it[1] } + ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions) + } + } + } + + // + // Uncompress RSEM index or generate from scratch if required + // + ch_rsem_index = Channel.empty() + if ('star_rsem' in prepare_tool_indices) { + if (rsem_index) { + if (rsem_index.endsWith('.tar.gz')) { + ch_rsem_index = UNTAR_RSEM_INDEX ( [ [:], rsem_index ] ).untar.map { it[1] } + ch_versions = ch_versions.mix(UNTAR_RSEM_INDEX.out.versions) + } else { + ch_rsem_index = Channel.value(file(rsem_index)) + } + } else { + ch_rsem_index = RSEM_PREPAREREFERENCE_GENOME ( ch_fasta, ch_gtf ).index + ch_versions = ch_versions.mix(RSEM_PREPAREREFERENCE_GENOME.out.versions) + } + } + + // + // Uncompress HISAT2 index or generate from scratch if required + // + ch_splicesites = Channel.empty() + ch_hisat2_index = Channel.empty() + if ('hisat2' in prepare_tool_indices) { + if (!splicesites) { + ch_splicesites = HISAT2_EXTRACTSPLICESITES ( ch_gtf.map { [ [:], it ] } ).txt.map { it[1] } + ch_versions = ch_versions.mix(HISAT2_EXTRACTSPLICESITES.out.versions) + } else { + ch_splicesites = Channel.value(file(splicesites)) + } + if (hisat2_index) { + if (hisat2_index.endsWith('.tar.gz')) { + ch_hisat2_index = UNTAR_HISAT2_INDEX ( [ [:], hisat2_index ] ).untar.map { it[1] } + ch_versions = ch_versions.mix(UNTAR_HISAT2_INDEX.out.versions) + } else { + ch_hisat2_index = Channel.value(file(hisat2_index)) + } + } else { + ch_hisat2_index = HISAT2_BUILD ( ch_fasta.map { [ [:], it ] }, ch_gtf.map { [ [:], it ] }, ch_splicesites.map { [ [:], it ] } ).index.map { it[1] } + ch_versions = ch_versions.mix(HISAT2_BUILD.out.versions) + } + } + + // + // Uncompress Salmon index or generate from scratch if required + // + ch_salmon_index = Channel.empty() + if (salmon_index) { + if (salmon_index.endsWith('.tar.gz')) { + ch_salmon_index = UNTAR_SALMON_INDEX ( [ [:], salmon_index ] ).untar.map { it[1] } + ch_versions = ch_versions.mix(UNTAR_SALMON_INDEX.out.versions) + } else { + ch_salmon_index = Channel.value(file(salmon_index)) + } + } else { + if ('salmon' in prepare_tool_indices) { + ch_salmon_index = SALMON_INDEX ( ch_fasta, ch_transcript_fasta ).index + ch_versions = ch_versions.mix(SALMON_INDEX.out.versions) + } + } + + // + // Uncompress Kallisto index or generate from scratch if required + // + ch_kallisto_index = Channel.empty() + if (kallisto_index) { + if (kallisto_index.endsWith('.tar.gz')) { + ch_kallisto_index = UNTAR_KALLISTO_INDEX ( [ [:], kallisto_index ] ).untar + ch_versions = ch_versions.mix(UNTAR_KALLISTO_INDEX.out.versions) + } else { + ch_kallisto_index = Channel.value([[:], file(kallisto_index)]) + } + } else { + if ('kallisto' in prepare_tool_indices) { + ch_kallisto_index = KALLISTO_INDEX ( ch_transcript_fasta.map{[ [:], it]} ).index + ch_versions = ch_versions.mix(KALLISTO_INDEX.out.versions) + } + } + + emit: + fasta = ch_fasta // channel: path(genome.fasta) + gtf = ch_gtf // channel: path(genome.gtf) + fai = ch_fai // channel: path(genome.fai) + gene_bed = ch_gene_bed // channel: path(gene.bed) + transcript_fasta = ch_transcript_fasta // channel: path(transcript.fasta) + chrom_sizes = ch_chrom_sizes // channel: path(genome.sizes) + splicesites = ch_splicesites // channel: path(genome.splicesites.txt) + bbsplit_index = ch_bbsplit_index // channel: path(bbsplit/index/) + star_index = ch_star_index // channel: path(star/index/) + rsem_index = ch_rsem_index // channel: path(rsem/index/) + hisat2_index = ch_hisat2_index // channel: path(hisat2/index/) + salmon_index = ch_salmon_index // channel: path(salmon/index/) + kallisto_index = ch_kallisto_index // channel: [ meta, path(kallisto/index/) ] + versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] +} diff --git a/subworkflows/local/preprocess_rnaseq/main.nf b/subworkflows/local/preprocess_rnaseq/main.nf new file mode 100644 index 0000000..6de3371 --- /dev/null +++ b/subworkflows/local/preprocess_rnaseq/main.nf @@ -0,0 +1,252 @@ +import groovy.json.JsonSlurper + +include { CAT_FASTQ } from '../../../modules/nf-core/cat/fastq/main' +include { FASTQC } from '../../../modules/nf-core/fastqc/main' +include { SORTMERNA } from '../../../modules/nf-core/sortmerna/main' + +include { FASTQ_SUBSAMPLE_FQ_SALMON } from '../../../subworkflows/nf-core/fastq_subsample_fq_salmon' +include { FASTQ_FASTQC_UMITOOLS_TRIMGALORE } from '../../../subworkflows/nf-core/fastq_fastqc_umitools_trimgalore' +include { FASTQ_FASTQC_UMITOOLS_FASTP } from '../../../subworkflows/nf-core/fastq_fastqc_umitools_fastp' + +def pass_trimmed_reads = [:] + +public static String getSalmonInferredStrandedness(json_file) { + def lib_type = new JsonSlurper().parseText(json_file.text).get('library_types')[0] + def strandedness = 'reverse' + if (lib_type) { + if (lib_type in ['U', 'IU']) { + strandedness = 'unstranded' + } else if (lib_type in ['SF', 'ISF']) { + strandedness = 'forward' + } else if (lib_type in ['SR', 'ISR']) { + strandedness = 'reverse' + } + } + return strandedness +} + +// +// Create MultiQC tsv custom content from a list of values +// +public static String multiqcTsvFromList(tsv_data, header) { + def tsv_string = "" + if (tsv_data.size() > 0) { + tsv_string += "${header.join('\t')}\n" + tsv_string += tsv_data.join('\n') + } + return tsv_string +} + +workflow PREPROCESS_RNASEQ { + + take: + ch_reads // channel: [ val(meta), [ reads ] ] + ch_fasta // channel: /path/to/genome.fasta + ch_transcript_fasta // channel: /path/to/transcript.fasta + ch_gtf // channel: /path/to/genome.gtf + make_salmon_index // boolean: Whether to create salmon index before running salmon quant + ch_salmon_index // channel: /path/to/salmon/index/ (optional) + skip_bbsplit // boolean: Skip BBSplit for removal of non-reference genome reads. + ch_bbsplit_index // channel: /path/to/bbsplit/index/ (optional) + skip_fastqc // boolean: true/false + skip_trimming // boolean: true/false + trimmer // string: 'fastp' or 'trimgalore' + min_trimmed_reads // integer: > 0 + save_trimmed // boolean: true/false + remove_ribo_rna // boolean: true/false: whether to run sortmerna to remove rrnas + ch_ribo_db // Text file containing paths to fasta files (one per line) that will be used to create the database for SortMeRNA. (optional) + with_umi // boolean: true/false: Enable UMI-based read deduplication. + skip_umi_extract // boolean: true/false + umi_discard_read // integer: 0, 1 or 2 + + main: + + ch_versions = Channel.empty() + ch_filtered_reads = Channel.empty() + ch_trim_read_count = Channel.empty() + ch_multiqc_files = Channel.empty() + + ch_reads + .branch { + meta, fastqs -> + single : fastqs.size() == 1 + return [ meta, fastqs.flatten() ] + multiple: fastqs.size() > 1 + return [ meta, fastqs.flatten() ] + } + .set { ch_fastq } + + // + // MODULE: Concatenate FastQ files from same sample if required + // + CAT_FASTQ ( + ch_fastq.multiple + ) + .reads + .mix(ch_fastq.single) + .set { ch_filtered_reads } + + ch_versions = ch_versions.mix(CAT_FASTQ.out.versions.first().ifEmpty(null)) + + // + // MODULE: Remove ribosomal RNA reads + // + if (remove_ribo_rna) { + ch_sortmerna_fastas = Channel.from(ch_ribo_db.readLines()).map { row -> file(row, checkIfExists: true) }.collect() + + SORTMERNA ( + ch_filtered_reads, + ch_sortmerna_fastas + ) + .reads + .set { ch_filtered_reads } + + ch_multiqc_files = ch_multiqc_files.mix(SORTMERNA.out.log.map{it[1]}) + + ch_versions = ch_versions.mix(SORTMERNA.out.versions.first()) + } + + // + // SUBWORKFLOW: Read QC, extract UMI and trim adapters with TrimGalore! + // + if (trimmer == 'trimgalore') { + FASTQ_FASTQC_UMITOOLS_TRIMGALORE ( + ch_filtered_reads, + skip_fastqc, + with_umi, + skip_umi_extract, + skip_trimming, + umi_discard_read, + min_trimmed_reads + ) + ch_filtered_reads = FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.reads + ch_trim_read_count = FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.trim_read_count + + ch_versions = ch_versions.mix(FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.versions) + ch_multiqc_files = FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.fastqc_zip + .mix(FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.trim_zip) + .mix(FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.trim_log) + .map{it[1]} + .mix(ch_multiqc_files) + } + + // + // SUBWORKFLOW: Read QC, extract UMI and trim adapters with fastp + // + if (trimmer == 'fastp') { + FASTQ_FASTQC_UMITOOLS_FASTP ( + ch_filtered_reads, + skip_fastqc, + with_umi, + skip_umi_extract, + umi_discard_read, + skip_trimming, + [], + save_trimmed, + save_trimmed, + min_trimmed_reads + ) + ch_filtered_reads = FASTQ_FASTQC_UMITOOLS_FASTP.out.reads + ch_trim_read_count = FASTQ_FASTQC_UMITOOLS_FASTP.out.trim_read_count + ch_versions = ch_versions.mix(FASTQ_FASTQC_UMITOOLS_FASTP.out.versions) + + ch_multiqc_files = FASTQ_FASTQC_UMITOOLS_FASTP.out.fastqc_zip + .mix(FASTQ_FASTQC_UMITOOLS_FASTP.out.trim_zip) + .mix(FASTQ_FASTQC_UMITOOLS_FASTP.out.trim_log) + .map{it[1]} + .mix(ch_multiqc_files) + } + + // + // Get list of samples that failed trimming threshold for MultiQC report + // + + ch_trim_read_count + .map { + meta, num_reads -> + pass_trimmed_reads[meta.id] = true + if (num_reads <= min_trimmed_reads.toFloat()) { + pass_trimmed_reads[meta.id] = false + return [ "$meta.id\t$num_reads" ] + } + } + .collect() + .map { + tsv_data -> + def header = ["Sample", "Reads after trimming"] + multiqcTsvFromList(tsv_data, header) + } + .set { ch_fail_trimming_multiqc } + + ch_multiqc_files = ch_multiqc_files + .mix( + ch_fail_trimming_multiqc.collectFile(name: 'fail_trimmed_samples_mqc.tsv').ifEmpty([]) + ) + + // + // MODULE: Remove genome contaminant reads + // + if (!skip_bbsplit) { + BBMAP_BBSPLIT ( + ch_filtered_reads, + ch_bbsplit_index, + [], + [ [], [] ], + false + ) + .primary_fastq + .set { ch_filtered_reads } + ch_versions = ch_versions.mix(BBMAP_BBSPLIT.out.versions.first()) + } + + // Branch FastQ channels if 'auto' specified to infer strandedness + ch_filtered_reads + .branch { + meta, fastq -> + auto_strand : meta.strandedness == 'auto' + return [ meta, fastq ] + known_strand: meta.strandedness != 'auto' + return [ meta, fastq ] + } + .set { ch_strand_fastq } + + // + // SUBWORKFLOW: Sub-sample FastQ files and pseudoalign with Salmon to auto-infer strandedness + // + // Return empty channel if ch_strand_fastq.auto_strand is empty so salmon index isn't created + + ch_fasta + .combine(ch_strand_fastq.auto_strand) + .map { it.first() } + .first() + .set { ch_genome_fasta } + + FASTQ_SUBSAMPLE_FQ_SALMON ( + ch_strand_fastq.auto_strand, + ch_genome_fasta, + ch_transcript_fasta, + ch_gtf, + ch_salmon_index, + make_salmon_index + ) + ch_versions = ch_versions.mix(FASTQ_SUBSAMPLE_FQ_SALMON.out.versions) + + FASTQ_SUBSAMPLE_FQ_SALMON + .out + .json_info + .join(ch_strand_fastq.auto_strand) + .map { meta, json, reads -> + return [ meta + [ strandedness: getSalmonInferredStrandedness(json) ], reads ] + } + .mix(ch_strand_fastq.known_strand) + .set { ch_strand_inferred_fastq } + + emit: + + reads = ch_strand_inferred_fastq + trim_read_count = ch_trim_read_count + + multiqc_files = ch_multiqc_files + versions = ch_versions // channel: [ versions.yml ] +} + diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf new file mode 100644 index 0000000..3dbb27e --- /dev/null +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf @@ -0,0 +1,140 @@ +// +// Read QC, UMI extraction and trimming +// + +include { FASTQC as FASTQC_RAW } from '../../../modules/nf-core/fastqc/main' +include { FASTQC as FASTQC_TRIM } from '../../../modules/nf-core/fastqc/main' +include { UMITOOLS_EXTRACT } from '../../../modules/nf-core/umitools/extract/main' +include { FASTP } from '../../../modules/nf-core/fastp/main' + +// +// Function that parses fastp json output file to get total number of reads after trimming +// +import groovy.json.JsonSlurper + +def getFastpReadsAfterFiltering(json_file) { + def Map json = (Map) new JsonSlurper().parseText(json_file.text).get('summary') + return json['after_filtering']['total_reads'].toLong() +} + +workflow FASTQ_FASTQC_UMITOOLS_FASTP { + take: + reads // channel: [ val(meta), [ reads ] ] + skip_fastqc // boolean: true/false + with_umi // boolean: true/false + skip_umi_extract // boolean: true/false + umi_discard_read // integer: 0, 1 or 2 + skip_trimming // boolean: true/false + adapter_fasta // file: adapter.fasta + save_trimmed_fail // boolean: true/false + save_merged // boolean: true/false + min_trimmed_reads // integer: > 0 + + main: + ch_versions = Channel.empty() + fastqc_raw_html = Channel.empty() + fastqc_raw_zip = Channel.empty() + if (!skip_fastqc) { + FASTQC_RAW ( + reads + ) + fastqc_raw_html = FASTQC_RAW.out.html + fastqc_raw_zip = FASTQC_RAW.out.zip + ch_versions = ch_versions.mix(FASTQC_RAW.out.versions.first()) + } + + umi_reads = reads + umi_log = Channel.empty() + if (with_umi && !skip_umi_extract) { + UMITOOLS_EXTRACT ( + reads + ) + umi_reads = UMITOOLS_EXTRACT.out.reads + umi_log = UMITOOLS_EXTRACT.out.log + ch_versions = ch_versions.mix(UMITOOLS_EXTRACT.out.versions.first()) + + // Discard R1 / R2 if required + if (umi_discard_read in [1,2]) { + UMITOOLS_EXTRACT + .out + .reads + .map { + meta, reads -> + meta.single_end ? [ meta, reads ] : [ meta + [single_end: true], reads[umi_discard_read % 2] ] + } + .set { umi_reads } + } + } + + trim_reads = umi_reads + trim_json = Channel.empty() + trim_html = Channel.empty() + trim_log = Channel.empty() + trim_reads_fail = Channel.empty() + trim_reads_merged = Channel.empty() + fastqc_trim_html = Channel.empty() + fastqc_trim_zip = Channel.empty() + trim_read_count = Channel.empty() + if (!skip_trimming) { + FASTP ( + umi_reads, + adapter_fasta, + save_trimmed_fail, + save_merged + ) + trim_json = FASTP.out.json + trim_html = FASTP.out.html + trim_log = FASTP.out.log + trim_reads_fail = FASTP.out.reads_fail + trim_reads_merged = FASTP.out.reads_merged + ch_versions = ch_versions.mix(FASTP.out.versions.first()) + + // + // Filter FastQ files based on minimum trimmed read count after adapter trimming + // + FASTP + .out + .reads + .join(trim_json) + .map { meta, reads, json -> [ meta, reads, getFastpReadsAfterFiltering(json) ] } + .set { ch_num_trimmed_reads } + + ch_num_trimmed_reads + .filter { meta, reads, num_reads -> num_reads >= min_trimmed_reads.toLong() } + .map { meta, reads, num_reads -> [ meta, reads ] } + .set { trim_reads } + + ch_num_trimmed_reads + .map { meta, reads, num_reads -> [ meta, num_reads ] } + .set { trim_read_count } + + if (!skip_fastqc) { + FASTQC_TRIM ( + trim_reads + ) + fastqc_trim_html = FASTQC_TRIM.out.html + fastqc_trim_zip = FASTQC_TRIM.out.zip + ch_versions = ch_versions.mix(FASTQC_TRIM.out.versions.first()) + } + } + + emit: + reads = trim_reads // channel: [ val(meta), [ reads ] ] + + fastqc_raw_html // channel: [ val(meta), [ html ] ] + fastqc_raw_zip // channel: [ val(meta), [ zip ] ] + + umi_log // channel: [ val(meta), [ log ] ] + + trim_json // channel: [ val(meta), [ json ] ] + trim_html // channel: [ val(meta), [ html ] ] + trim_log // channel: [ val(meta), [ log ] ] + trim_reads_fail // channel: [ val(meta), [ fastq.gz ] ] + trim_reads_merged // channel: [ val(meta), [ fastq.gz ] ] + trim_read_count // channel: [ val(meta), val(count) ] + + fastqc_trim_html // channel: [ val(meta), [ html ] ] + fastqc_trim_zip // channel: [ val(meta), [ zip ] ] + + versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml new file mode 100644 index 0000000..220e8db --- /dev/null +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml @@ -0,0 +1,128 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +# yaml-language-server: $schema=yaml-schema.json +name: "fastq_fastqc_umitools_fastp" +description: Read QC, UMI extraction and trimming +keywords: + - fastq + - fastqc + - qc + - UMI + - trimming + - fastp +components: + - fastqc + - umitools/extract + - fastp +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - skip_fastqc: + type: boolean + description: | + Skip fastqc process + - with_umi: + type: boolean + description: | + With or without umi detection + - skip_umi_extract: + type: boolean + description: | + With or without umi extrection + - umi_discard_read: + type: integer + description: | + Discard R1 / R2 if required + - skip_trimming: + type: boolean + description: | + Allows to skip trimgalore execution + - adapter_fasta: + type: file + description: | + Fasta file of adapter sequences + - save_trimmed_fail: + type: boolean + description: | + Save trimmed fastqs of failed samples + - save_merged: + type: boolean + description: | + Save merged fastqs + - min_trimmed_reads: + type: integer + description: | + Inputs with fewer than this reads will be filtered out of the "reads" output channel +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - reads: + type: file + description: > + Extracted FASTQ files. | For single-end reads, pattern is \${prefix}.umi_extract.fastq.gz. | + + + + For paired-end reads, pattern is \${prefix}.umi_extract_{1,2}.fastq.gz. + pattern: "*.{fastq.gz}" + - fastqc_html: + type: file + description: FastQC report + pattern: "*_{fastqc.html}" + - fastqc_zip: + type: file + description: FastQC report archive + pattern: "*_{fastqc.zip}" + - log: + type: file + description: Logfile for umi_tools + pattern: "*.{log}" + - trim_json: + type: file + description: FastP Trimming report + pattern: "*.{fastp.json}" + - trim_html: + type: file + description: FastP Trimming report + pattern: "*.{fastp.html}" + - log: + type: file + description: Logfile FastP + pattern: "*.{fastp.log}" + - trim_reads_fail: + type: file + description: Trimmed fastq files failing QC + pattern: "*.{fastq.gz}" + - trim_reads_merged: + type: file + description: Trimmed and merged fastq files + pattern: "*.{fastq.gz}" + - trim_read_count: + type: integer + description: Number of reads after trimming + - fastqc_trim_html: + type: file + description: FastQC report + pattern: "*_{fastqc.html}" + - fastqc_trim_zip: + type: file + description: FastQC report archive + pattern: "*_{fastqc.zip}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@robsyme" +maintainers: + - "@robsyme" diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test new file mode 100644 index 0000000..cdd7398 --- /dev/null +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test @@ -0,0 +1,60 @@ +nextflow_workflow { + + name "Test Workflow FASTQ_FASTQC_UMITOOLS_FASTP" + script "../main.nf" + workflow "FASTQ_FASTQC_UMITOOLS_FASTP" + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/fastq_fastqc_umitools_fastp" + tag "fastq_fastqc_umitools_fastp" + tag "fastqc" + tag "umitools/extract" + tag "fastp" + + + test("sarscov2 paired-end [fastq]") { + + when { + workflow { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = false // skip_fastqc + input[2] = false // with_umi + input[3] = false // skip_umi_extract + input[4] = 1 // umi_discard_read + input[5] = false // skip_trimming + input[6] = [] // adapter_fasta + input[7] = false // save_trimmed_fail + input[8] = false // save_merged + input[9] = 1 // min_trimmed_reads + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out.reads).match("reads") }, + { assert snapshot(workflow.out.umi_log).match("umi_log") }, + { assert snapshot(workflow.out.trim_json).match("trim_json") }, + { assert snapshot(workflow.out.trim_reads_fail).match("trim_reads_fail") }, + { assert snapshot(workflow.out.trim_reads_merged).match("trim_reads_merged") }, + { assert snapshot(workflow.out.trim_read_count).match("trim_read_count") }, + { assert snapshot(workflow.out.versions).match("versions") }, + + { assert workflow.out.fastqc_raw_html }, + { assert workflow.out.fastqc_raw_zip }, + { assert workflow.out.trim_html }, + { assert workflow.out.trim_log }, + { assert workflow.out.fastqc_trim_html }, + { assert workflow.out.fastqc_trim_zip } + ) + } + } +} diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test.snap b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test.snap new file mode 100644 index 0000000..38a65ae --- /dev/null +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test.snap @@ -0,0 +1,81 @@ +{ + "trim_reads_merged": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-26T02:28:26.26920982" + }, + "trim_reads_fail": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-26T02:28:26.25861515" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,85bd0117e5778fff18e3920972a296ad", + "versions.yml:md5,c50aa59475ab901bc6f9a2cf7b1a14e0", + "versions.yml:md5,f3dcaae948e8eed92b4a5557b4c6668e" + ] + ], + "timestamp": "2023-11-26T02:28:26.30891403" + }, + "trim_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.fastp.json:md5,1e0f8e27e71728e2b63fc64086be95cd" + ] + ] + ], + "timestamp": "2023-11-26T02:28:26.24768259" + }, + "reads": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastp.fastq.gz:md5,67b2bbae47f073e05a97a9c2edce23c7", + "test_2.fastp.fastq.gz:md5,25cbdca08e2083dbd4f0502de6b62f39" + ] + ] + ] + ], + "timestamp": "2023-12-04T11:30:32.061644815" + }, + "umi_log": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-26T02:28:26.238536" + }, + "trim_read_count": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + 198 + ] + ] + ], + "timestamp": "2023-11-26T02:28:26.27984169" + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/tags.yml b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/tags.yml new file mode 100644 index 0000000..84a4b56 --- /dev/null +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/fastq_fastqc_umitools_fastp: + - subworkflows/nf-core/fastq_fastqc_umitools_fastp/** diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/main.nf b/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/main.nf new file mode 100644 index 0000000..db2e5b3 --- /dev/null +++ b/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/main.nf @@ -0,0 +1,123 @@ +// +// Read QC, UMI extraction and trimming +// + +include { FASTQC } from '../../../modules/nf-core/fastqc/main' +include { UMITOOLS_EXTRACT } from '../../../modules/nf-core/umitools/extract/main' +include { TRIMGALORE } from '../../../modules/nf-core/trimgalore/main' + +// +// Function that parses TrimGalore log output file to get total number of reads after trimming +// +def getTrimGaloreReadsAfterFiltering(log_file) { + def total_reads = 0 + def filtered_reads = 0 + log_file.eachLine { line -> + def total_reads_matcher = line =~ /([\d\.]+)\ssequences processed in total/ + def filtered_reads_matcher = line =~ /shorter than the length cutoff[^:]+:\s([\d\.]+)/ + if (total_reads_matcher) total_reads = total_reads_matcher[0][1].toFloat() + if (filtered_reads_matcher) filtered_reads = filtered_reads_matcher[0][1].toFloat() + } + return total_reads - filtered_reads +} + +workflow FASTQ_FASTQC_UMITOOLS_TRIMGALORE { + take: + reads // channel: [ val(meta), [ reads ] ] + skip_fastqc // boolean: true/false + with_umi // boolean: true/false + skip_umi_extract // boolean: true/false + skip_trimming // boolean: true/false + umi_discard_read // integer: 0, 1 or 2 + min_trimmed_reads // integer: > 0 + + main: + ch_versions = Channel.empty() + fastqc_html = Channel.empty() + fastqc_zip = Channel.empty() + if (!skip_fastqc) { + FASTQC (reads) + fastqc_html = FASTQC.out.html + fastqc_zip = FASTQC.out.zip + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + } + + umi_reads = reads + umi_log = Channel.empty() + if (with_umi && !skip_umi_extract) { + UMITOOLS_EXTRACT (reads) + umi_reads = UMITOOLS_EXTRACT.out.reads + umi_log = UMITOOLS_EXTRACT.out.log + ch_versions = ch_versions.mix(UMITOOLS_EXTRACT.out.versions.first()) + + // Discard R1 / R2 if required + if (umi_discard_read in [1,2]) { + UMITOOLS_EXTRACT + .out + .reads + .map { + meta, reads -> + meta.single_end ? [ meta, reads ] : [ meta + ['single_end': true], reads[umi_discard_read % 2] ] + } + .set { umi_reads } + } + } + + trim_reads = umi_reads + trim_unpaired = Channel.empty() + trim_html = Channel.empty() + trim_zip = Channel.empty() + trim_log = Channel.empty() + trim_read_count = Channel.empty() + if (!skip_trimming) { + TRIMGALORE (umi_reads) + trim_unpaired = TRIMGALORE.out.unpaired + trim_html = TRIMGALORE.out.html + trim_zip = TRIMGALORE.out.zip + trim_log = TRIMGALORE.out.log + ch_versions = ch_versions.mix(TRIMGALORE.out.versions.first()) + + // + // Filter FastQ files based on minimum trimmed read count after adapter trimming + // + TRIMGALORE + .out + .reads + .join(trim_log, remainder: true) + .map { + meta, reads, trim_log -> + if (trim_log) { + num_reads = getTrimGaloreReadsAfterFiltering(meta.single_end ? trim_log : trim_log[-1]) + [ meta, reads, num_reads ] + } else { + [ meta, reads, min_trimmed_reads.toFloat() + 1 ] + } + } + .set { ch_num_trimmed_reads } + + ch_num_trimmed_reads + .filter { meta, reads, num_reads -> num_reads >= min_trimmed_reads.toFloat() } + .map { meta, reads, num_reads -> [ meta, reads ] } + .set { trim_reads } + + ch_num_trimmed_reads + .map { meta, reads, num_reads -> [ meta, num_reads ] } + .set { trim_read_count } + } + + emit: + reads = trim_reads // channel: [ val(meta), [ reads ] ] + + fastqc_html // channel: [ val(meta), [ html ] ] + fastqc_zip // channel: [ val(meta), [ zip ] ] + + umi_log // channel: [ val(meta), [ log ] ] + + trim_unpaired // channel: [ val(meta), [ reads ] ] + trim_html // channel: [ val(meta), [ html ] ] + trim_zip // channel: [ val(meta), [ zip ] ] + trim_log // channel: [ val(meta), [ txt ] ] + trim_read_count // channel: [ val(meta), val(count) ] + + versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/meta.yml b/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/meta.yml new file mode 100644 index 0000000..a7df97f --- /dev/null +++ b/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/meta.yml @@ -0,0 +1,101 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "fastq_fastqc_umitools_trimgalore" +description: Read QC, UMI extraction and trimming +keywords: + - fastq + - fastqc + - qc + - UMI + - trimming + - trimgalore +components: + - fastqc + - umitools/extract + - trimgalore +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - skip_fastqc: + type: boolean + description: | + Skip fastqc process + - with_umi: + type: boolean + description: | + With or without umi detection + - skip_umi_extract: + type: boolean + description: | + With or without umi extrection + - skip_trimming: + type: boolean + description: | + Allows to skip trimgalore execution + - umi_discard_read: + type: integer + description: | + Discard R1 / R2 if required + - min_trimmed_reads: + type: integer + description: | + Inputs with fewer than this reads will be filtered out of the "reads" output channel +output: + - reads: + type: file + description: > + Extracted FASTQ files. | For single-end reads, pattern is \${prefix}.umi_extract.fastq.gz. | + + + + For paired-end reads, pattern is \${prefix}.umi_extract_{1,2}.fastq.gz. + pattern: "*.{fastq.gz}" + - fastqc_html: + type: file + description: FastQC report + pattern: "*_{fastqc.html}" + - fastqc_zip: + type: file + description: FastQC report archive + pattern: "*_{fastqc.zip}" + - log: + type: file + description: Logfile for umi_tools + pattern: "*.{log}" + - trim_unpaired: + type: file + description: | + FastQ files containing unpaired reads from read 1 or read 2 + pattern: "*unpaired*.fq.gz" + - trim_html: + type: file + description: FastQC report (optional) + pattern: "*_{fastqc.html}" + - trim_zip: + type: file + description: FastQC report archive (optional) + pattern: "*_{fastqc.zip}" + - trim_log: + type: file + description: Trim Galore! trimming report + pattern: "*_{report.txt}" + - trim_read_count: + type: integer + description: Number of reads remaining after trimming for all input samples + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@KamilMaliszArdigen" +maintainers: + - "@drpatelh" + - "@KamilMaliszArdigen" diff --git a/subworkflows/nf-core/fastq_subsample_fq_salmon/main.nf b/subworkflows/nf-core/fastq_subsample_fq_salmon/main.nf new file mode 100644 index 0000000..0ac3e53 --- /dev/null +++ b/subworkflows/nf-core/fastq_subsample_fq_salmon/main.nf @@ -0,0 +1,54 @@ +// +// Sub-sample FastQ files and pseudo-align with Salmon +// can be used to infer strandedness of library +// + +include { SALMON_INDEX } from '../../../modules/nf-core/salmon/index/main' +include { FQ_SUBSAMPLE } from '../../../modules/nf-core/fq/subsample/main' +include { SALMON_QUANT } from '../../../modules/nf-core/salmon/quant/main' + +workflow FASTQ_SUBSAMPLE_FQ_SALMON { + take: + ch_reads // channel: [ val(meta), [ reads ] ] + ch_genome_fasta // channel: /path/to/genome.fasta + ch_transcript_fasta // channel: /path/to/transcript.fasta + ch_gtf // channel: /path/to/genome.gtf + ch_index // channel: /path/to/salmon/index/ + make_index // boolean: Whether to create salmon index before running salmon quant + + main: + + ch_versions = Channel.empty() + + // + // Create Salmon index if required + // + if (make_index) { + ch_index = SALMON_INDEX ( ch_genome_fasta, ch_transcript_fasta ).index + ch_versions = ch_versions.mix(SALMON_INDEX.out.versions) + } + + // + // Sub-sample FastQ files with fq + // + FQ_SUBSAMPLE ( ch_reads ) + ch_versions = ch_versions.mix(FQ_SUBSAMPLE.out.versions.first()) + + // + // Pseudo-alignment with Salmon + // + def lib_type = 'A' + def alignment_mode = false + SALMON_QUANT ( FQ_SUBSAMPLE.out.fastq, ch_index, ch_gtf, ch_transcript_fasta, alignment_mode, lib_type ) + ch_versions = ch_versions.mix(SALMON_QUANT.out.versions.first()) + + emit: + index = ch_index // channel: [ index ] + + reads = FQ_SUBSAMPLE.out.fastq // channel: [ val(meta), fastq ] + + results = SALMON_QUANT.out.results // channel: [ val(meta), results_dir ] + json_info = SALMON_QUANT.out.json_info // channel: [ val(meta), json_info + + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/fastq_subsample_fq_salmon/meta.yml b/subworkflows/nf-core/fastq_subsample_fq_salmon/meta.yml new file mode 100644 index 0000000..7e2f109 --- /dev/null +++ b/subworkflows/nf-core/fastq_subsample_fq_salmon/meta.yml @@ -0,0 +1,70 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "fastq_subsample_fq_salmon" +description: Subsample fastq +keywords: + - fastq + - subsample + - strandedness +components: + - fq/subsample + - salmon/quant + - salmon/index +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - ch_reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - ch_genome_fasta: + type: file + description: Genome fasta file + pattern: "Path to genome sequence in fasta format" + - ch_transcript_fasta: + type: file + description: Transcript fasta file + pattern: "Path to transcript sequence in fasta format" + - ch_gtf: + type: file + description: GTF features file + pattern: "Path features in GTF format" + - ch_index: + type: file + description: Salmon index files + pattern: "Directory containing Salmon index" + - make_index: + type: boolean + description: Whether to create salmon index before running salmon quant +output: + - index: + type: directory + description: Directory containing salmon index + pattern: "salmon" + - reads: + type: file + description: Subsampled fastq reads. + pattern: "*.{fq,fastq}{,.gz}" + - results: + type: directory + description: Folder containing the quantification results for a specific sample + pattern: "${prefix}" + - json_info: + type: file + description: | + File containing meta information from Salmon quant + Which could be used to infer strandedness among other things + pattern: "*info.json" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@robsyme" + - "@drpatelh" +maintainers: + - "@robsyme" + - "@drpatelh" diff --git a/subworkflows/nf-core/fastq_subsample_fq_salmon/tests/main.nf.test b/subworkflows/nf-core/fastq_subsample_fq_salmon/tests/main.nf.test new file mode 100644 index 0000000..6342449 --- /dev/null +++ b/subworkflows/nf-core/fastq_subsample_fq_salmon/tests/main.nf.test @@ -0,0 +1,65 @@ +nextflow_workflow { + + name "Test Workflow FASTQ_SUBSAMPLE_FQ_SALMON" + script "../main.nf" + workflow "FASTQ_SUBSAMPLE_FQ_SALMON" + config "./nextflow.config" + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/fastq_subsample_fq_salmon" + tag "fastq_subsample_fq_salmon" + tag "salmon/index" + tag "fq/subsample" + tag "salmon/quant" + + test("homo_sapiens paired-end [fastq]") { + + setup { + run("SALMON_INDEX") { + script "../../../../modules/nf-core/salmon/index/main.nf" + process { + """ + input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true)) // genome_fasta + input[1] = Channel.of(file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/transcriptome.fasta', checkIfExists: true)) // transcriptome_fasta + """ + } + } + } + + when { + workflow { + """ + make_index = false + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) + input[1] = Channel.of(file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true)) // genome_fasta + input[2] = Channel.of(file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/transcriptome.fasta', checkIfExists: true)) // transcriptome_fasta + input[3] = Channel.of(file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true)) // genome_gtf + input[4] = SALMON_INDEX.out.index + input[5] = make_index + """ + } + } + + then { + def readlines1 = path(workflow.out.reads[0][1][0]).linesGzip + def readlines2 = path(workflow.out.reads[0][1][1]).linesGzip + assertAll( + { assert workflow.success }, + { assert snapshot(readlines1[0..5]).match("test_reads_1_lines") }, + { assert snapshot(readlines1.size()).match("test_reads_1_size") }, + { assert snapshot(readlines2[0..5]).match("test_reads_2_lines") }, + { assert snapshot(readlines2.size()).match("test_reads_2_size") }, + { assert snapshot(workflow.out.versions).match("versions") }, + + { assert workflow.out.index }, + { assert workflow.out.results }, + { assert workflow.out.json_info } + ) + } + } +} diff --git a/subworkflows/nf-core/fastq_subsample_fq_salmon/tests/main.nf.test.snap b/subworkflows/nf-core/fastq_subsample_fq_salmon/tests/main.nf.test.snap new file mode 100644 index 0000000..afbe0b5 --- /dev/null +++ b/subworkflows/nf-core/fastq_subsample_fq_salmon/tests/main.nf.test.snap @@ -0,0 +1,49 @@ +{ + "test_reads_1_size": { + "content": [ + 1066944 + ], + "timestamp": "2024-01-11T10:33:54.747992124" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,12c0d1f67c2afb97470ae0974e5e01bb", + "versions.yml:md5,885fde9e7beac002b3a17b66b92db4bd" + ] + ], + "timestamp": "2023-11-26T16:41:10.396971682" + }, + "test_reads_1_lines": { + "content": [ + [ + "@normal#21#998579#1/1", + "CCTTCTCCCTGCTGGGGTTGCTTGTCAGTAGCGGGCAAGGTAGGAGTGTGGCGCTTTATTGCATTTACTTTCCCTCCCCCTTCCCCCCGGCCAAGAGAGG", + "+", + "102302000331;3333;23133320233330*33/233333333333333/313232333/3;3;3/333000;11/00;;01//103*1032323233", + "@normal#21#998572#2/1", + "CTCCTCTCCTTCTACCTGCTGGGGTTGCTTGTCAGTAGCGGGCAAGGTCGGAGTGTTGCGCTTTATTGCATTTACTTTCCCTCCCCCTTCCACCCGGCCA" + ] + ], + "timestamp": "2024-01-11T10:33:54.730250665" + }, + "test_reads_2_lines": { + "content": [ + [ + "@normal#21#998579#1/2", + "AAAAAAAAAGAAGAAGCAGAAGCTGTTTCCCTGGATATCCTGCTCACCGATTCCCCTCTCCAATTCTGTATTTTCCCTTCTCTTATTTAAGGGTCTCCAC", + "+", + "023333233332333310333302333211/3333;0300;*/;000/32;201003031/22;21333032;;11/23030322;2332333313/030", + "@normal#21#998572#2/2", + "TTCCCCTCTCCAATTGAGTATTTTCCCTTCTCTTATTTAAGGGTCTCCACACAAACAGATACAATTTTAGGGACAGCTAGGAGAAAGAACGAAAATAATAA" + ] + ], + "timestamp": "2024-01-11T10:33:54.756723613" + }, + "test_reads_2_size": { + "content": [ + 1066944 + ], + "timestamp": "2024-01-11T10:33:54.763399473" + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/fastq_subsample_fq_salmon/tests/nextflow.config b/subworkflows/nf-core/fastq_subsample_fq_salmon/tests/nextflow.config new file mode 100644 index 0000000..7fc4d63 --- /dev/null +++ b/subworkflows/nf-core/fastq_subsample_fq_salmon/tests/nextflow.config @@ -0,0 +1,9 @@ +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + + withName: FQ_SUBSAMPLE { + ext.args = '--record-count 1000000 --seed 1' + } + +} diff --git a/subworkflows/nf-core/fastq_subsample_fq_salmon/tests/tags.yml b/subworkflows/nf-core/fastq_subsample_fq_salmon/tests/tags.yml new file mode 100644 index 0000000..cc809c5 --- /dev/null +++ b/subworkflows/nf-core/fastq_subsample_fq_salmon/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/fastq_subsample_fq_salmon: + - subworkflows/nf-core/fastq_subsample_fq_salmon/** diff --git a/tests/nextflow.config b/tests/nextflow.config index ec574b3..c82fb21 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -4,4 +4,30 @@ ======================================================================================== */ +process { + + // Impose sensible resource limits for testing + + withName: '.*' { + cpus = 2 + memory = 3.GB + time = 2.h + } + + // Override modules.config so module snapshots match + + withName: FQ_SUBSAMPLE { + ext.prefix = '' + } + withName: GFFREAD { + ext.args = null + } +} + +params { + test_data_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules' + modules_testdata_base_path = 's3://ngi-igenomes/testdata/nf-core/modules/' + hisat2_build_memory = '3.GB' +} + includeConfig 'https://raw.githubusercontent.com/nf-core/modules/0094ae45ba8f5a2e30644d4e252970775a03ee91/tests/config/test_data.config' diff --git a/workflows/riboseq.nf b/workflows/riboseq.nf index 47abb81..e797e26 100644 --- a/workflows/riboseq.nf +++ b/workflows/riboseq.nf @@ -1,10 +1,29 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + GENOME PARAMETER VALUES +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') +params.transcript_fasta = WorkflowMain.getGenomeAttribute(params, 'transcript_fasta') +params.additional_fasta = WorkflowMain.getGenomeAttribute(params, 'additional_fasta') +params.gtf = WorkflowMain.getGenomeAttribute(params, 'gtf') +params.gff = WorkflowMain.getGenomeAttribute(params, 'gff') +params.gene_bed = WorkflowMain.getGenomeAttribute(params, 'bed12') +params.bbsplit_index = WorkflowMain.getGenomeAttribute(params, 'bbsplit') +params.star_index = WorkflowMain.getGenomeAttribute(params, 'star') +params.hisat2_index = WorkflowMain.getGenomeAttribute(params, 'hisat2') +params.rsem_index = WorkflowMain.getGenomeAttribute(params, 'rsem') +params.salmon_index = WorkflowMain.getGenomeAttribute(params, 'salmon') +params.kallisto_index = WorkflowMain.getGenomeAttribute(params, 'kallisto') + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PRINT PARAMS SUMMARY ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' +include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf-validation' def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) def citation = '\n' + WorkflowMain.citation(workflow) + '\n' @@ -13,8 +32,59 @@ def summary_params = paramsSummaryMap(workflow) // Print parameter summary log to screen log.info logo + paramsSummaryLog(workflow) + citation +// Check if an AWS iGenome has been provided to use the appropriate version of STAR +def is_aws_igenome = false +if (params.fasta && params.gtf) { + if ((file(params.fasta).getName() - '.gz' == 'genome.fa') && (file(params.gtf).getName() - '.gz' == 'genes.gtf')) { + is_aws_igenome = true + } +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + VALIDATE INPUTS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + WorkflowRiboseq.initialise(params, log) +// Check rRNA databases for sortmerna +if (params.remove_ribo_rna) { + ch_ribo_db = file(params.ribo_database_manifest) + if (ch_ribo_db.isEmpty()) {exit 1, "File provided with --ribo_database_manifest is empty: ${ch_ribo_db.getName()}!"} +} + +// Check if file with list of fastas is provided when running BBSplit +if (!params.skip_bbsplit && !params.bbsplit_index && params.bbsplit_fasta_list) { + ch_bbsplit_fasta_list = file(params.bbsplit_fasta_list) + if (ch_bbsplit_fasta_list.isEmpty()) {exit 1, "File provided with --bbsplit_fasta_list is empty: ${ch_bbsplit_fasta_list.getName()}!"} +} + +// Check alignment parameters +def prepareToolIndices = [] +if (!params.skip_bbsplit) { prepareToolIndices << 'bbsplit' } +if (!params.skip_alignment) { prepareToolIndices << params.aligner } +if (!params.skip_pseudo_alignment && params.pseudo_aligner) { prepareToolIndices << params.pseudo_aligner } + +// Determine whether to filter the GTF or not +def filterGtf = + (( + // Condition 1: Alignment is required and aligner is set + !params.skip_alignment && params.aligner + ) || + ( + // Condition 2: Pseudoalignment is required and pseudoaligner is set + !params.skip_pseudo_alignment && params.pseudo_aligner + ) || + ( + // Condition 3: Transcript FASTA file is not provided + !params.transcript_fasta + )) && + ( + // Condition 4: --skip_gtf_filter is not provided + !params.skip_gtf_filter + ) + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES @@ -35,7 +105,8 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' +include { PREPROCESS_RNASEQ } from '../subworkflows/local/preprocess_rnaseq' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -46,7 +117,6 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check' // // MODULE: Installed directly from nf-core/modules // -include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' @@ -61,53 +131,114 @@ def multiqc_report = [] workflow RIBOSEQ { - ch_versions = Channel.empty() - + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + // - // SUBWORKFLOW: Read in samplesheet, validate and stage input files + // SUBWORKFLOW: Uncompress and prepare reference genome files // - INPUT_CHECK ( - file(params.input) + def biotype = params.gencode ? "gene_type" : params.featurecounts_group_type + PREPARE_GENOME ( + params.fasta, + params.gtf, + params.gff, + params.additional_fasta, + params.transcript_fasta, + params.gene_bed, + params.splicesites, + params.bbsplit_fasta_list, + params.star_index, + params.rsem_index, + params.salmon_index, + params.kallisto_index, + params.hisat2_index, + params.bbsplit_index, + params.gencode, + is_aws_igenome, + biotype, + prepareToolIndices, + filterGtf ) - ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) - // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") - // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ - // ! There is currently no tooling to help you write a sample sheet schema + ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions) + + // Check if contigs in genome fasta file > 512 Mbp + if (!params.skip_alignment && !params.bam_csi_index) { + PREPARE_GENOME + .out + .fai + .map { WorkflowRiboseq.checkMaxContigSize(it, log) } + } // - // MODULE: Run FastQC + // Create input channel from input file provided through params.input // - FASTQC ( - INPUT_CHECK.out.reads + Channel + .fromSamplesheet("input") + .map { + meta, fastq_1, fastq_2 -> + if (!fastq_2) { + return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ] + } else { + return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ] + } + } + .groupTuple() + .map { + WorkflowRiboseq.validateInput(it) + } + .set { ch_fastq } + + PREPROCESS_RNASEQ ( + ch_fastq, + PREPARE_GENOME.out.fasta, + PREPARE_GENOME.out.transcript_fasta, + PREPARE_GENOME.out.gtf, + PREPARE_GENOME.out.salmon_index, + !params.salmon_index && !('salmon' in prepareToolIndices), + params.skip_bbsplit, + PREPARE_GENOME.out.bbsplit_index, + params.skip_fastqc || params.skip_qc, + params.skip_trimming, + params.trimmer, + params.min_trimmed_reads, + params.save_trimmed, + params.remove_ribo_rna, + ch_ribo_db, + params.with_umi, + params.skip_umi_extract, + params.umi_discard_read ) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix(PREPROCESS_RNASEQ.out.multiqc_files) + ch_versions = ch_versions.mix(PREPROCESS_RNASEQ.out.versions) CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') ) + ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml) // // MODULE: MultiQC // - workflow_summary = WorkflowRiboseq.paramsSummaryMultiqc(workflow, summary_params) - ch_workflow_summary = Channel.value(workflow_summary) + if (!params.skip_multiqc) { + workflow_summary = WorkflowRiboseq.paramsSummaryMultiqc(workflow, summary_params) + ch_workflow_summary = Channel.value(workflow_summary) - methods_description = WorkflowRiboseq.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) - ch_methods_description = Channel.value(methods_description) + methods_description = WorkflowRiboseq.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) + ch_methods_description = Channel.value(methods_description) - ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) - - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList() - ) - multiqc_report = MULTIQC.out.report.toList() + ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) + + MULTIQC ( + ch_multiqc_files.collect(), + ch_multiqc_config.toList(), + ch_multiqc_custom_config.toList(), + ch_multiqc_logo.toList() + ) + multiqc_report = MULTIQC.out.report.toList() + + } } /*
    Process Name \\", + " \\ Software Version
    CUSTOM_DUMPSOFTWAREVERSIONSpython3.11.7
    yaml5.4.1
    TOOL1tool10.11.9
    TOOL2tool21.9
    WorkflowNextflow12.922000 K (92.984097%)", + "single end (151 cycles)" ] + def log_text = [ "Q20 bases: 12922(92.9841%)", + "reads passed filter: 99" ] + def read_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { assert snapshot(process.out.json).match("test_fastp_single_end_json") }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { file(it[1]).getName() } + + process.out.json.collect { file(it[1]).getName() } + + process.out.html.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + + process.out.reads_fail.collect { file(it[1]).getName() } + + process.out.reads_merged.collect { file(it[1]).getName() } + ).sort() + ).match("test_fastp_single_end-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_single_end-stub") { + + options '-stub' + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = Channel.of([ + [ id:'test', single_end:true ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ]) + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + + assertAll( + { assert process.success }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { file(it[1]).getName() } + + process.out.json.collect { file(it[1]).getName() } + + process.out.html.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + + process.out.reads_fail.collect { file(it[1]).getName() } + + process.out.reads_merged.collect { file(it[1]).getName() } + ).sort() + ).match("test_fastp_single_end-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:25.719000 K (93.033098%)", + "The input has little adapter percentage (~0.000000%), probably it's trimmed before."] + def log_text = [ "No adapter detected for read1", + "Q30 bases: 12281(88.3716%)"] + def json_text = ['"passed_filter_reads": 198'] + def read1_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { it[1].collect { item -> file(item).getName() } } + + process.out.json.collect { file(it[1]).getName() } + + process.out.html.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + + process.out.reads_fail.collect { file(it[1]).getName() } + + process.out.reads_merged.collect { file(it[1]).getName() } + ).sort() + ).match("test_fastp_paired_end-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end-stub") { + + options '-stub' + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + assertAll( + { assert process.success }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { it[1].collect { item -> file(item).getName() } } + + process.out.json.collect { file(it[1]).getName() } + + process.out.html.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + + process.out.reads_fail.collect { file(it[1]).getName() } + + process.out.reads_merged.collect { file(it[1]).getName() } + ).sort() + ).match("test_fastp_paired_end-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("fastp test_fastp_interleaved") { + config './nextflow.config' + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) ] + ]) + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:25.719000 K (93.033098%)", + "paired end (151 cycles + 151 cycles)"] + def log_text = [ "Q20 bases: 12922(92.9841%)", + "reads passed filter: 198"] + def read_lines = [ "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { assert snapshot(process.out.json).match("fastp test_fastp_interleaved_json") }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { file(it[1]).getName() } + + process.out.json.collect { file(it[1]).getName() } + + process.out.html.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + + process.out.reads_fail.collect { file(it[1]).getName() } + + process.out.reads_merged.collect { file(it[1]).getName() } + ).sort() + ).match("test_fastp_interleaved-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("fastp test_fastp_interleaved-stub") { + + options '-stub' + + config './nextflow.config' + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) ] + ]) + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + assertAll( + { assert process.success }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { file(it[1]).getName() } + + process.out.json.collect { file(it[1]).getName() } + + process.out.html.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + + process.out.reads_fail.collect { file(it[1]).getName() } + + process.out.reads_merged.collect { file(it[1]).getName() } + ).sort() + ).match("test_fastp_interleaved-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_single_end_trim_fail") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = true + save_merged = false + + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ]) + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:12.922000 K (92.984097%)", + "single end (151 cycles)"] + def log_text = [ "Q20 bases: 12922(92.9841%)", + "reads passed filter: 99" ] + def read_lines = [ "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { failed_read_lines.each { failed_read_line -> + { assert path(process.out.reads_fail.get(0).get(1)).linesGzip.contains(failed_read_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { assert snapshot(process.out.json).match("test_fastp_single_end_trim_fail_json") }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end_trim_fail") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = true + save_merged = false + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)] + ]) + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:25.719000 K (93.033098%)", + "The input has little adapter percentage (~0.000000%), probably it's trimmed before."] + def log_text = [ "No adapter detected for read1", + "Q30 bases: 12281(88.3716%)"] + def json_text = ['"passed_filter_reads": 198'] + def read1_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { failed_read2_lines.each { failed_read2_line -> + { assert path(process.out.reads_fail.get(0).get(1).get(1)).linesGzip.contains(failed_read2_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end_merged") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = true + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "
    "] + def log_text = [ "Merged and filtered:", + "total reads: 75", + "total bases: 13683"] + def json_text = ['"merged_and_filtered": {', '"total_reads": 75', '"total_bases": 13683'] + def read1_lines = [ "@ERR5069949.1066259 NS500628:121:HK3MMAFX2:1:11312:18369:8333/1", + "CCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCTCTGTTACTTC", + "AAAAAEAEEAEEEEEEEEEEEEEEEEAEEEEAEEEEEEEEAEEEEEEEEEEEEEEEEE/EAEEEEEE/6EEEEEEEEEEAEEAEEE/EE/AEEAEEEEEAEEEA/EEAAEAE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { read_merged_lines.each { read_merged_line -> + { assert path(process.out.reads_merged.get(0).get(1)).linesGzip.contains(read_merged_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { it[1].collect { item -> file(item).getName() } } + + process.out.json.collect { file(it[1]).getName() } + + process.out.html.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + + process.out.reads_fail.collect { file(it[1]).getName() } + + process.out.reads_merged.collect { file(it[1]).getName() } + ).sort() + ).match("test_fastp_paired_end_merged-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end_merged-stub") { + + options '-stub' + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = true + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + assertAll( + { assert process.success }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { it[1].collect { item -> file(item).getName() } } + + process.out.json.collect { file(it[1]).getName() } + + process.out.html.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + + process.out.reads_fail.collect { file(it[1]).getName() } + + process.out.reads_merged.collect { file(it[1]).getName() } + ).sort() + ).match("test_fastp_paired_end_merged-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end_merged_adapterlist") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = Channel.of([ file(params.modules_testdata_base_path + 'delete_me/fastp/adapters.fasta', checkIfExists: true) ]) + save_trimmed_fail = false + save_merged = true + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "
    "] + def log_text = [ "Merged and filtered:", + "total reads: 75", + "total bases: 13683"] + def json_text = ['"merged_and_filtered": {', '"total_reads": 75', '"total_bases": 13683',"--adapter_fasta"] + def read1_lines = ["@ERR5069949.1066259 NS500628:121:HK3MMAFX2:1:11312:18369:8333/1", + "CCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCTCTGTTACTTC", + "AAAAAEAEEAEEEEEEEEEEEEEEEEAEEEEAEEEEEEEEAEEEEEEEEEEEEEEEEE/EAEEEEEE/6EEEEEEEEEEAEEAEEE/EE/AEEAEEEEEAEEEA/EEAAEAE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { read_merged_lines.each { read_merged_line -> + { assert path(process.out.reads_merged.get(0).get(1)).linesGzip.contains(read_merged_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } +} diff --git a/modules/nf-core/fastp/tests/main.nf.test.snap b/modules/nf-core/fastp/tests/main.nf.test.snap new file mode 100644 index 0000000..6a71b68 --- /dev/null +++ b/modules/nf-core/fastp/tests/main.nf.test.snap @@ -0,0 +1,107 @@ +{ + "test_fastp_paired_end-for_stub_match": { + "content": [ + [ + [ + "test_1.fastp.fastq.gz", + "test_2.fastp.fastq.gz" + ], + "test.fastp.html", + "test.fastp.json", + "test.fastp.log", + "{id=test, single_end=false}" + ] + ], + "timestamp": "2024-01-17T18:07:15.398827" + }, + "fastp test_fastp_interleaved_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastp.json:md5,168f516f7bd4b7b6c32da7cba87299a4" + ] + ] + ], + "timestamp": "2024-01-17T18:08:06.123035" + }, + "test_fastp_paired_end_merged-for_stub_match": { + "content": [ + [ + [ + "test_1.fastp.fastq.gz", + "test_2.fastp.fastq.gz" + ], + "test.fastp.html", + "test.fastp.json", + "test.fastp.log", + "test.merged.fastq.gz", + "{id=test, single_end=false}" + ] + ], + "timestamp": "2024-01-17T18:10:13.467574" + }, + "test_fastp_single_end_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastp.json:md5,c852d7a6dba5819e4ac8d9673bedcacc" + ] + ] + ], + "timestamp": "2024-01-17T18:06:00.223817" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,48ffc994212fb1fc9f83a74fa69c9f02" + ] + ], + "timestamp": "2024-01-17T18:06:00.248422" + }, + "test_fastp_interleaved-for_stub_match": { + "content": [ + [ + "test.fastp.fastq.gz", + "test.fastp.html", + "test.fastp.json", + "test.fastp.log", + "{id=test, single_end=true}" + ] + ], + "timestamp": "2024-01-17T18:08:06.127974" + }, + "test_fastp_single_end-for_stub_match": { + "content": [ + [ + "test.fastp.fastq.gz", + "test.fastp.html", + "test.fastp.json", + "test.fastp.log", + "{id=test, single_end=true}" + ] + ], + "timestamp": "2024-01-17T18:06:00.244202" + }, + "test_fastp_single_end_trim_fail_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastp.json:md5,9a7ee180f000e8d00c7fb67f06293eb5" + ] + ] + ], + "timestamp": "2024-01-17T18:08:41.942317" + } +} \ No newline at end of file diff --git a/modules/nf-core/fastp/tests/nextflow.config b/modules/nf-core/fastp/tests/nextflow.config new file mode 100644 index 0000000..0f7849a --- /dev/null +++ b/modules/nf-core/fastp/tests/nextflow.config @@ -0,0 +1,6 @@ +process { + + withName: FASTP { + ext.args = "--interleaved_in" + } +} diff --git a/modules/nf-core/fastp/tests/tags.yml b/modules/nf-core/fastp/tests/tags.yml new file mode 100644 index 0000000..c1afcce --- /dev/null +++ b/modules/nf-core/fastp/tests/tags.yml @@ -0,0 +1,2 @@ +fastp: + - modules/nf-core/fastp/** diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test index b9e8f92..1f21c66 100644 --- a/modules/nf-core/fastqc/tests/main.nf.test +++ b/modules/nf-core/fastqc/tests/main.nf.test @@ -3,24 +3,20 @@ nextflow_process { name "Test Process FASTQC" script "../main.nf" process "FASTQC" + tag "modules" tag "modules_nfcore" tag "fastqc" - test("Single-Read") { + test("sarscov2 single-end [fastq]") { when { - params { - outdir = "$outputDir" - } process { """ - input[0] = [ + input[0] = Channel.of([ [ id: 'test', single_end:true ], - [ - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) - ] - ] + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ]) """ } } @@ -28,82 +24,189 @@ nextflow_process { then { assertAll ( { assert process.success }, + // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. // looks like this:
    Mon 2 Oct 2023
    test.gz
    // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 - { assert process.out.html.get(0).get(1) ==~ ".*/test_fastqc.html" }, - { assert path(process.out.html.get(0).get(1)).getText().contains("
    File typeConventional base calls
    File typeConventional base calls
    File typeConventional base calls
    File typeConventional base calls
    File typeConventional base calls
    File typeConventional base calls
    File typeConventional base calls
    File typeConventional base calls
    File typeConventional base calls
    File typeConventional base calls
    File typeConventional base calls