Merge pull request nf-core#1013 from nf-core/dsl2-nf-validation
DSL2: replaces input_check with fromSamplesheet (nf-validation)
scarlhoff authored May 24, 2024
2 parents 62f1b57 + 09ab6aa commit 45cc81e
Showing 13 changed files with 336 additions and 101 deletions.
129 changes: 129 additions & 0 deletions assets/schema_fasta.json
@@ -0,0 +1,129 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/nf-core/eager/master/assets/schema_fasta.json",
"title": "nf-core/eager pipeline - params.fasta schema",
"description": "Schema for the file provided with params.fasta",
"type": "array",
"items": {
"type": "object",
"properties": {
"reference_name": {
"type": "string",
"pattern": "^\\S+$",
"meta": ["id"],
"unique": true,
"errorMessage": "Reference name must be provided and cannot contain spaces."
},
"fasta": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.f(na|asta|a|as)(\\.gz)?$",
"exists": true,
"unique": true,
"errorMessage": "Fasta files for the mapping reference must be provided with file extensions '.fasta', '.fa', '.fas', '.fna', '.fasta.gz','.fa.gz','.fas.gz', '.fna.gz' and cannot contain any spaces."
},
"fai": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.fai$",
"exists": true,
"errorMessage": "Fasta index files for the mapping reference cannot have any spaces and must have file extension '.fai'."
},
"dict": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.dict$",
"exists": true,
"errorMessage": "Picard sequence dictionary files for the mapping reference cannot have any spaces and must have file extensions '.dict'."
},
"mapper_index": {
"type": "string",
"format": "directory-path",
"pattern": "^\\S+$",
"exists": true,
"errorMessage": "The directories of the index files for the mapping reference for a given mapper must not contain any spaces and have file extensions ''."
},
"circular_target": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "The headers of the chromosome to be extended by circularmapper must not contain any spaces and no leading '>'."
},
"mitochondrion_header": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "The names of the fasta entry of the mapping reference corresponding to the mitochondrial genome must not contain any spaces and no leading '>'."
},
"snpcapture_bed": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.bed(\\.gz)?$",
"exists": true,
"errorMessage": "SNP capture bed files must not contain any spaces and have file extensions '.bed' or '.bed.gz'."
},
"pileupcaller_bedfile": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.bed(\\.gz)?$",
"exists": true,
"dependentRequired": ["pileupcaller_snpfile"],
"errorMessage": "SNP capture bed files for pileupcaller must not contain any spaces, have file extensions '.bed' or '.bed.gz' and be provided alongside a pileupcall_bedfile."
},
"pileupcaller_snpfile": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.snp$",
"exists": true,
"dependentRequired": ["pileupcaller_bedfile"],
"errorMessage": "SNP panel files for pileupcaller must not contain any spaces, have file extension '.snp' and be provided alongside a pileupcaller_snpfile."
},
"hapmap_file": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+$",
"exists": true,
"errorMessage": "HapMap files for contamination estimation with ANGSD must not contain any spaces."
},
"pmdtools_masked_fasta": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.f(na|asta|a|as)(\\.gz)?$",
"exists": true,
"errorMessage": "Masked fasta files for PMDtools must not contain any spaces and have file extensions '.fasta', '.fa', '.fas', '.fna', '.fasta.gz','.fa.gz','.fas.gz' or '.fna.gz'."
},
"pmdtools_bed_for_masking": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.bed(\\.gz)?$",
"exists": true,
"errorMessage": "SNP capture bed files to mask the reference for PMDtools must not contain any spaces and have file extensions '.bed' or '.bed.gz'."
},
"sexdeterrmine_snp_bed": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.bed?(\\.gz)$",
"exists": true,
"errorMessage": "SNP capture bed files for SexDetERRmine must not contain any spaces and have file extensions '.bed' or '.bed.gz'."
},
"bedtools_feature_file": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.(bed|gff|gff3)(\\.gz)?$",
"exists": true,
"errorMessage": "Feature files must not contain any spaces and have file extensions '.bed', '.bed.gz', '.gff', '.gff.gz', '.gff3' or '.gff3.gz'."
},
"genotyping_reference_ploidy": {
"type": "integer",
"meta": ["genotyping_ploidy"],
"errorMessage": "Organism ploidy for GATK or FreeBayes must be provided as integers."
},
"genotyping_gatk_dbsnp": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.vcf$",
"exists": true,
"errorMessage": "SNP annotation files for GATK must not contain any spaces and have file extension '.vcf'."
}
},
"required": ["reference_name", "fasta"]
}
}
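
For orientation, the sheet validated by this schema is consumed with the nf-validation plugin's `fromSamplesheet` channel factory. A minimal sketch, assuming `params.fasta_sheet` is linked to `assets/schema_fasta.json` in `nextflow_schema.json`:

```nextflow
include { fromSamplesheet } from 'plugin/nf-validation'

// Each valid row becomes a tuple: columns tagged with "meta" in the schema
// (here reference_name -> id) are collected into a leading meta map, and the
// remaining columns follow in schema order.
ch_reference_sheet = Channel.fromSamplesheet('fasta_sheet')
```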
31 changes: 17 additions & 14 deletions assets/schema_input.json
@@ -54,23 +54,17 @@
"format": "file-path",
"pattern": "^\\S+\\.f(ast)?q\\.gz$",
"exists": true,
"unique": true,
"errorMessage": "FastQ file for reads 1 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'."
},
"r2": {
"errorMessage": "FastQ file for reads 2 require files for reads 1, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'.",
"anyOf": [
{
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.f(ast)?q\\.gz$",
"exists": true,
"dependentRequired": ["r1"]
},
{
"type": "string",
"maxLength": 0
}
]
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.f(ast)?q\\.gz$",
"exists": true,
"unique": true,
"dependentRequired": ["r1"],
"errorMessage": "FastQ file for reads 2 require files for reads 1, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'."
},
"bam": {
"type": "string",
@@ -84,6 +78,7 @@
"bam_reference_id": {
"type": "string",
"meta": ["bam_reference_id"],
"dependentRequired": ["bam"],
"errorMessage": "A BAM reference ID (corresponding to what is supplied to `--fasta`) must always be provided when providing a BAM file."
}
},
@@ -95,6 +90,14 @@
"pairment",
"strandedness",
"damage_treatment"
],
"anyOf": [
{
"required": ["r1"]
},
{
"required": ["bam"]
}
]
}
}
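
With this change, `r2` becomes a plain optional property (the old `anyOf` workaround for empty cells is gone) and a new top-level `anyOf` enforces that every row provides either `r1` or `bam`. Downstream, the two row types might be separated as in this hedged sketch, which assumes each validated row arrives as `[ meta, r1, r2, bam ]` and that empty optional columns come through as empty lists:

```nextflow
include { fromSamplesheet } from 'plugin/nf-validation'

ch_input = Channel.fromSamplesheet('input')

ch_rows = ch_input.branch { meta, r1, r2, bam ->
    bam:   bam != []   // rows that provide a BAM file
    fastq: true        // all remaining rows must carry r1 (the anyOf guarantees it)
}
```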
5 changes: 3 additions & 2 deletions conf/test_multiref.config
@@ -5,7 +5,8 @@
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/eager -profile test,<docker/singularity> --outdir <OUTDIR>
fasta_sheet = 'https://github.com/nf-core/test-datasets/raw/eager/reference/reference_sheet_multiref.csv'
nextflow run nf-core/eager -profile test_multiref,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/
@@ -23,7 +24,7 @@ params {
input = params.pipelines_testdata_base_path + 'eager/testdata/Mammoth/samplesheet_multilane_multilib.tsv'

// Genome references
fasta = params.pipelines_testdata_base_path + 'eager/reference/reference_sheet_multiref.csv'
fasta_sheet = params.pipelines_testdata_base_path + 'eager/reference/reference_sheet_multiref.csv'

// BAM filtering
run_bamfiltering = true
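
With the rename, multi-reference runs point `params.fasta_sheet` (rather than `params.fasta`) at the reference sheet, while `params.fasta` keeps its single-reference meaning. A hypothetical user config for illustration:

```nextflow
params {
    fasta_sheet = 'reference_sheet_multiref.csv'   // multi-reference sheet
    // fasta    = 'genome.fa'                      // single-reference alternative
}
```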
46 changes: 46 additions & 0 deletions conf/test_nothing.config
@@ -0,0 +1,46 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/eager -profile test_nothing,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input data
// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
// TODO nf-core: Give any required params for the test so that command line flags are not needed
input = params.pipelines_testdata_base_path + 'eager/testdata/Human/human_design_bam_eager3.tsv'

// Genome references
fasta = params.pipelines_testdata_base_path + 'eager/reference/Human/hs37d5_chr21-MT.fa.gz'

skip_preprocessing = true
skip_deduplication = true
skip_qualimap = true
skip_damage_calculation = true
mapstats_skip_preseq = true

run_fastq_sharding = false
run_bamfiltering = false
run_bedtools_coverage = false
run_metagenomicscreening = false
run_contamination_estimation_angsd = false
run_mtnucratio = false
run_mapdamage_rescaling = false
run_pmd_filtering = false
run_trim_bam = false

}
31 changes: 23 additions & 8 deletions docs/development/dev_docs.md
@@ -17,18 +17,33 @@ To add new input files or options to the reference sheet, you have to complete a
### Multi-reference input workflow

1. Add new column named <SOFTWARE_FILETYPE> and test data to the test reference sheet (https://github.com/nf-core/test-datasets/blob/eager/reference/reference_sheet_multiref.csv).
2. Read in new input within the reference_indexing_multi local subworkflow.
1. Add new line to the large `.map{}` operation starting on [line 30](https://github.com/nf-core/eager/blob/d4211582f349cc30c88202c12942218f99006041/subworkflows/local/reference_indexing_multi.nf#L30). Add check if the file exists if appropriate. `def <PARAM_NAME> = row["<SOFTWARE_FILETYPE>"] != "" ? file(row["<SOFTWARE_FILETYPE>"], checkIfExists: true) : ""`
2. Read in new input via the nf-validation plugin within the reference_indexing_multi local subworkflow.
1. Add new "property" <SOFTWARE_FILETYPE> to the fasta validation schema (assets/schema_fasta.json).
1. Add "type" of your object, e.g. `"type": "string"` for file paths and `"type": "integer"` for numbers.
2. Add `"meta": ["<NEW_META>"]` to add your input to the meta map.
3. Add `"format": "file-path"` to check if the entered string is a file path.
4. Add `"pattern": "<REGEX_PATTERN>"` to check that the string doesn't contain spaces (`"pattern": "^\\S+$"`), contains a certain file extension (`"pattern": "^\\S+\\.vcf$"`), etc.
5. Add `"exists": true` to check if the file exists.
6. Add an `"errorMessage":` describing the criteria above, to be displayed if one of the checks fails.
2. Add a new line to the large `.map{}` operation starting on [line 22](https://github.com/nf-core/eager/blob/927efb7a45ba1312983056213bc425612db445c7/subworkflows/local/reference_indexing_multi.nf#L22) and add conversion from empty array `[]` to empty string `""` (see the sketch after this list).
`<PARAM_NAME> = <PARAM_NAME> != [] ? <PARAM_NAME> : ""`
2. Add <PARAM_NAME> to the result of the `.map{}` operation. Double-check the order!
3. With the `ch_input_from_referencesheet.multiMap{}` below you add the reference name as a meta. You can also combine your new parameter with others if useful for the workflow step. `<NEW_SUBCHANNEL>: [ meta, <PARAM_NAME> ]`
4. Add ch_input_from_referencesheet.<NEW_SUBCHANNEL> to the final emit. `<NEW_EMIT> = ch_input_from_referencesheet.<NEW_SUBCHANNEL>`
3. With the `ch_input_from_referencesheet.multiMap{}` below, you add the reference name as a meta. You can also combine your new parameter with others if useful for the workflow step.
`<NEW_SUBCHANNEL>: [ meta, <PARAM_NAME> ]`
4. Add ch_input_from_referencesheet.<NEW_SUBCHANNEL> to the final emit.
`<NEW_EMIT> = ch_input_from_referencesheet.<NEW_SUBCHANNEL>`
5. Add the corresponding `params.<NEW>` to the warning on [line 23](https://github.com/nf-core/eager/blob/927efb7a45ba1312983056213bc425612db445c7/subworkflows/local/reference_indexing.nf#L23) of the reference indexing subworkflow.
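
Put together, the `.map{}`, `multiMap{}` and emit steps above might look like the following sketch (hypothetical placeholders: `my_file` for `<PARAM_NAME>`, `my_file_ch` for `<NEW_SUBCHANNEL>`, `my_file_emit` for `<NEW_EMIT>`; the real `.map{}` destructures many more columns):

```nextflow
include { fromSamplesheet } from 'plugin/nf-validation'

workflow REFERENCE_INDEXING_MULTI {
    main:
    ch_input_from_referencesheet = Channel.fromSamplesheet('fasta_sheet')
        .map { meta, fasta, fai, my_file ->
            // nf-validation emits [] for empty optional columns;
            // normalise to "" so the value can be filtered downstream
            my_file = my_file != [] ? my_file : ""
            [ meta, fasta, fai, my_file ]
        }
        .multiMap { meta, fasta, fai, my_file ->
            // meta already carries the reference name via "meta": ["id"]
            reference:  [ meta, fasta, fai ]
            my_file_ch: [ meta, my_file ]
        }

    emit:
    reference    = ch_input_from_referencesheet.reference
    my_file_emit = ch_input_from_referencesheet.my_file_ch
}
```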

### Combining in the Reference Indexing workflow

1. Add your new parameter channel to the `if` condition selecting between the direct parameter input or the reference sheet input.
1. below "REFERENCE_INDEXING_MULTI" for reference sheet input `<NEW_CHANNEL> = REFERENCE_INDEXING_MULTI.out.<NEW_EMIT>`
2. below "REFERENCE_INDEXING_SINGLE" `<NEW_CHANNEL> = REFERENCE_INDEXING_SINGLE.out.<NEW_EMIT>`
3. Filter out options that have not been provided. `<NEW_CHANNEL> = <NEW_CHANNEL>.filter{ it[1] != "" }`
1. below "REFERENCE_INDEXING_MULTI" for reference sheet input
`<NEW_CHANNEL> = REFERENCE_INDEXING_MULTI.out.<NEW_EMIT>`
2. below "REFERENCE_INDEXING_SINGLE"
`<NEW_CHANNEL> = REFERENCE_INDEXING_SINGLE.out.<NEW_EMIT>`
3. Filter out options that have not been provided (see the sketch after this list).
`<NEW_CHANNEL> = <NEW_CHANNEL>.filter{ it[1] != "" }`
4. Add unzipping of zipped input files with GUNZIP.
5. Add <NEW_CHANNEL> to the final emit. `<NEW_EMIT> = <NEW_CHANNEL>`
5. Add <NEW_CHANNEL> to the final emit.
`<NEW_EMIT> = <NEW_CHANNEL>`
6. Call new inputs within the main eager.nf with `REFERENCE_INDEXING.out.<NEW_EMIT>`.
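
A hedged sketch of the selection and filtering described above, assuming `params.fasta_sheet` drives the choice and using the hypothetical names `my_channel` for `<NEW_CHANNEL>` and `my_emit` for `<NEW_EMIT>` (subworkflow call arguments elided):

```nextflow
if ( params.fasta_sheet ) {
    // reference sheet input
    my_channel = REFERENCE_INDEXING_MULTI.out.my_emit
} else {
    // direct parameter input
    my_channel = REFERENCE_INDEXING_SINGLE.out.my_emit
}

// filter out options that have not been provided
my_channel = my_channel.filter { it[1] != "" }
```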