Merge pull request nf-core#1013 from nf-core/dsl2-nf-validation
DSL2: replaces input_check with fromSamplesheet (nf-validation)
scarlhoff authored May 24, 2024
2 parents 62f1b57 + 09ab6aa commit 45cc81e
Showing 13 changed files with 336 additions and 101 deletions.
129 changes: 129 additions & 0 deletions assets/schema_fasta.json
@@ -0,0 +1,129 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/nf-core/eager/master/assets/schema_fasta.json",
"title": "nf-core/eager pipeline - params.fasta schema",
"description": "Schema for the file provided with params.fasta",
"type": "array",
"items": {
"type": "object",
"properties": {
"reference_name": {
"type": "string",
"pattern": "^\\S+$",
"meta": ["id"],
"unique": true,
"errorMessage": "Reference name must be provided and cannot contain spaces."
},
"fasta": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.f(na|asta|a|as)(\\.gz)?$",
"exists": true,
"unique": true,
"errorMessage": "Fasta files for the mapping reference must be provided with file extensions '.fasta', '.fa', '.fas', '.fna', '.fasta.gz','.fa.gz','.fas.gz', '.fna.gz' and cannot contain any spaces."
},
"fai": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.fai$",
"exists": true,
"errorMessage": "Fasta index files for the mapping reference cannot have any spaces and must have file extension '.fai'."
},
"dict": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.dict$",
"exists": true,
"errorMessage": "Picard sequence dictionary files for the mapping reference cannot have any spaces and must have file extensions '.dict'."
},
"mapper_index": {
"type": "string",
"format": "directory-path",
"pattern": "^\\S+$",
"exists": true,
"errorMessage": "The directories of the index files for the mapping reference for a given mapper must not contain any spaces and have file extensions ''."
},
"circular_target": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "The headers of the chromosome to be extended by circularmapper must not contain any spaces and no leading '>'."
},
"mitochondrion_header": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "The names of the fasta entry of the mapping reference corresponding to the mitochondrial genome must not contain any spaces and no leading '>'."
},
"snpcapture_bed": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.bed(\\.gz)?$",
"exists": true,
"errorMessage": "SNP capture bed files must not contain any spaces and have file extensions '.bed' or '.bed.gz'."
},
"pileupcaller_bedfile": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.bed(\\.gz)?$",
"exists": true,
"dependentRequired": ["pileupcaller_snpfile"],
"errorMessage": "SNP capture bed files for pileupcaller must not contain any spaces, have file extensions '.bed' or '.bed.gz' and be provided alongside a pileupcall_bedfile."
},
"pileupcaller_snpfile": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.snp$",
"exists": true,
"dependentRequired": ["pileupcaller_bedfile"],
"errorMessage": "SNP panel files for pileupcaller must not contain any spaces, have file extension '.snp' and be provided alongside a pileupcaller_snpfile."
},
"hapmap_file": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+$",
"exists": true,
"errorMessage": "HapMap files for contamination estimation with ANGSD must not contain any spaces."
},
"pmdtools_masked_fasta": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.f(na|asta|a|as)(\\.gz)?$",
"exists": true,
"errorMessage": "Masked fasta files for PMDtools must not contain any spaces and have file extensions '.fasta', '.fa', '.fas', '.fna', '.fasta.gz','.fa.gz','.fas.gz' or '.fna.gz'."
},
"pmdtools_bed_for_masking": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.bed(\\.gz)?$",
"exists": true,
"errorMessage": "SNP capture bed files to mask the reference for PMDtools must not contain any spaces and have file extensions '.bed' or '.bed.gz'."
},
"sexdeterrmine_snp_bed": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.bed?(\\.gz)$",
"exists": true,
"errorMessage": "SNP capture bed files for SexDetERRmine must not contain any spaces and have file extensions '.bed' or '.bed.gz'."
},
"bedtools_feature_file": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.(bed|gff|gff3)(\\.gz)?$",
"exists": true,
"errorMessage": "Feature files must not contain any spaces and have file extensions '.bed', '.bed.gz', '.gff', '.gff.gz', '.gff3' or '.gff3.gz'."
},
"genotyping_reference_ploidy": {
"type": "integer",
"meta": ["genotyping_ploidy"],
"errorMessage": "Organism ploidy for GATK or FreeBayes must be provided as integers."
},
"genotyping_gatk_dbsnp": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.vcf$",
"exists": true,
"errorMessage": "SNP annotation files for GATK must not contain any spaces and have file extension '.vcf'."
}
},
"required": ["reference_name", "fasta"]
}
}
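
For orientation, the sheet validated by this schema is consumed with the nf-validation plugin's `fromSamplesheet` channel factory. A minimal sketch, assuming `params.fasta_sheet` is linked to `assets/schema_fasta.json` in `nextflow_schema.json`:

```nextflow
include { fromSamplesheet } from 'plugin/nf-validation'

// Each valid row becomes a tuple: columns tagged with "meta" in the schema
// (here reference_name -> id) are collected into a leading meta map, and the
// remaining columns follow in schema order.
ch_reference_sheet = Channel.fromSamplesheet('fasta_sheet')
```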
31 changes: 17 additions & 14 deletions assets/schema_input.json
@@ -54,23 +54,17 @@
"format": "file-path",
"pattern": "^\\S+\\.f(ast)?q\\.gz$",
"exists": true,
"unique": true,
"errorMessage": "FastQ file for reads 1 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'."
},
"r2": {
"errorMessage": "FastQ file for reads 2 require files for reads 1, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'.",
"anyOf": [
{
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.f(ast)?q\\.gz$",
"exists": true,
"dependentRequired": ["r1"]
},
{
"type": "string",
"maxLength": 0
}
]
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.f(ast)?q\\.gz$",
"exists": true,
"unique": true,
"dependentRequired": ["r1"],
"errorMessage": "FastQ file for reads 2 require files for reads 1, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'."
},
"bam": {
"type": "string",
@@ -84,6 +78,7 @@
"bam_reference_id": {
"type": "string",
"meta": ["bam_reference_id"],
"dependentRequired": ["bam"],
"errorMessage": "A BAM reference ID (corresponding to what is supplied to `--fasta`) must always be provided when providing a BAM file."
}
},
@@ -95,6 +90,14 @@
"pairment",
"strandedness",
"damage_treatment"
],
"anyOf": [
{
"required": ["r1"]
},
{
"required": ["bam"]
}
]
}
}
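
With this change, `r2` becomes a plain optional property (the old `anyOf` workaround for empty cells is gone) and a new top-level `anyOf` enforces that every row provides either `r1` or `bam`. Downstream, the two row types might be separated as in this hedged sketch, which assumes each validated row arrives as `[ meta, r1, r2, bam ]` and that empty optional columns come through as empty lists:

```nextflow
include { fromSamplesheet } from 'plugin/nf-validation'

ch_input = Channel.fromSamplesheet('input')

ch_rows = ch_input.branch { meta, r1, r2, bam ->
    bam:   bam != []   // rows that provide a BAM file
    fastq: true        // all remaining rows must carry r1 (the anyOf guarantees it)
}
```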
5 changes: 3 additions & 2 deletions conf/test_multiref.config
@@ -5,7 +5,8 @@
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/eager -profile test,<docker/singularity> --outdir <OUTDIR>
fasta_sheet = 'https://github.com/nf-core/test-datasets/raw/eager/reference/reference_sheet_multiref.csv'
nextflow run nf-core/eager -profile test_multiref,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/
@@ -23,7 +24,7 @@ params {
input = params.pipelines_testdata_base_path + 'eager/testdata/Mammoth/samplesheet_multilane_multilib.tsv'

// Genome references
fasta = params.pipelines_testdata_base_path + 'eager/reference/reference_sheet_multiref.csv'
fasta_sheet = params.pipelines_testdata_base_path + 'eager/reference/reference_sheet_multiref.csv'

// BAM filtering
run_bamfiltering = true
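
With the rename, multi-reference runs point `params.fasta_sheet` (rather than `params.fasta`) at the reference sheet, while `params.fasta` keeps its single-reference meaning. A hypothetical user config for illustration:

```nextflow
params {
    fasta_sheet = 'reference_sheet_multiref.csv'   // multi-reference sheet
    // fasta    = 'genome.fa'                      // single-reference alternative
}
```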
46 changes: 46 additions & 0 deletions conf/test_nothing.config
@@ -0,0 +1,46 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/eager -profile test_nothing,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input data
// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
// TODO nf-core: Give any required params for the test so that command line flags are not needed
input = params.pipelines_testdata_base_path + 'eager/testdata/Human/human_design_bam_eager3.tsv'

// Genome references
fasta = params.pipelines_testdata_base_path + 'eager/reference/Human/hs37d5_chr21-MT.fa.gz'

skip_preprocessing = true
skip_deduplication = true
skip_qualimap = true
skip_damage_calculation = true
mapstats_skip_preseq = true

run_fastq_sharding = false
run_bamfiltering = false
run_bedtools_coverage = false
run_metagenomicscreening = false
run_contamination_estimation_angsd = false
run_mtnucratio = false
run_mapdamage_rescaling = false
run_pmd_filtering = false
run_trim_bam = false

}
31 changes: 23 additions & 8 deletions docs/development/dev_docs.md
@@ -17,18 +17,33 @@ To add new input files or options to the reference sheet, you have to complete a
### Multi-reference input workflow

1. Add new column named <SOFTWARE_FILETYPE> and test data to the test reference sheet (https://github.com/nf-core/test-datasets/blob/eager/reference/reference_sheet_multiref.csv).
2. Read in new input within the reference_indexing_multi local subworkflow.
1. Add new line to the large `.map{}` operation starting on [line 30](https://github.com/nf-core/eager/blob/d4211582f349cc30c88202c12942218f99006041/subworkflows/local/reference_indexing_multi.nf#L30). Add check if the file exists if appropriate. `def <PARAM_NAME> = row["<SOFTWARE_FILETYPE>"] != "" ? file(row["<SOFTWARE_FILETYPE>"], checkIfExists: true) : ""`
2. Read in new input via the nf-validation plugin within the reference_indexing_multi local subworkflow.
1. Add new "property" <SOFTWARE_FILETYPE> to the fasta validation schema (assets/schema_fasta.json).
1. Add "type" of your object, e.g. `"type": "string"` for file paths and `"type": "integer"` for numbers.
2. Add `"meta": ["<NEW_META>"]` to add your input to the meta map.
3. Add `"format": "file-path"` to check if the entered string is a file path.
4. Add `"pattern": "<REGEX_PATTERN>"` to check that the string doesn't contain spaces (`"pattern": "^\\S+$"`), contains a certain file extension (`"pattern": "^\\S+\\.vcf$"`), etc.
5. Add `"exists": true` to check if the file exists.
6. Add an `"errorMessage":` describing the criteria above, to be displayed if one of the checks fails.
2. Add a new line to the large `.map{}` operation starting on [line 22](https://github.com/nf-core/eager/blob/927efb7a45ba1312983056213bc425612db445c7/subworkflows/local/reference_indexing_multi.nf#L22) and add conversion from empty array `[]` to empty string `""` (see the sketch after this list).
`<PARAM_NAME> = <PARAM_NAME> != [] ? <PARAM_NAME> : ""`
2. Add <PARAM_NAME> to the result of the `.map{}` operation. Double-check the order!
3. With the `ch_input_from_referencesheet.multiMap{}` below you add the reference name as a meta. You can also combine your new parameter with others if useful for the workflow step. `<NEW_SUBCHANNEL>: [ meta, <PARAM_NAME> ]`
4. Add ch_input_from_referencesheet.<NEW_SUBCHANNEL> to the final emit. `<NEW_EMIT> = ch_input_from_referencesheet.<NEW_SUBCHANNEL>`
3. With the `ch_input_from_referencesheet.multiMap{}` below, you add the reference name as a meta. You can also combine your new parameter with others if useful for the workflow step.
`<NEW_SUBCHANNEL>: [ meta, <PARAM_NAME> ]`
4. Add ch_input_from_referencesheet.<NEW_SUBCHANNEL> to the final emit.
`<NEW_EMIT> = ch_input_from_referencesheet.<NEW_SUBCHANNEL>`
5. Add the corresponding `params.<NEW>` to the warning on [line 23](https://github.com/nf-core/eager/blob/927efb7a45ba1312983056213bc425612db445c7/subworkflows/local/reference_indexing.nf#L23) of the reference indexing subworkflow.
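
Put together, the `.map{}`, `multiMap{}` and emit steps above might look like the following sketch (hypothetical placeholders: `my_file` for `<PARAM_NAME>`, `my_file_ch` for `<NEW_SUBCHANNEL>`, `my_file_emit` for `<NEW_EMIT>`; the real `.map{}` destructures many more columns):

```nextflow
include { fromSamplesheet } from 'plugin/nf-validation'

workflow REFERENCE_INDEXING_MULTI {
    main:
    ch_input_from_referencesheet = Channel.fromSamplesheet('fasta_sheet')
        .map { meta, fasta, fai, my_file ->
            // nf-validation emits [] for empty optional columns;
            // normalise to "" so the value can be filtered downstream
            my_file = my_file != [] ? my_file : ""
            [ meta, fasta, fai, my_file ]
        }
        .multiMap { meta, fasta, fai, my_file ->
            // meta already carries the reference name via "meta": ["id"]
            reference:  [ meta, fasta, fai ]
            my_file_ch: [ meta, my_file ]
        }

    emit:
    reference    = ch_input_from_referencesheet.reference
    my_file_emit = ch_input_from_referencesheet.my_file_ch
}
```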

### Combining in the Reference Indexing workflow

1. Add your new parameter channel to the `if` condition selecting between the direct parameter input or the reference sheet input.
1. below "REFERENCE_INDEXING_MULTI" for reference sheet input `<NEW_CHANNEL> = REFERENCE_INDEXING_MULTI.out.<NEW_EMIT>`
2. below "REFERENCE_INDEXING_SINGLE" `<NEW_CHANNEL> = REFERENCE_INDEXING_SINGLE.out.<NEW_EMIT>`
3. Filter out options that have not been provided. `<NEW_CHANNEL> = <NEW_CHANNEL>.filter{ it[1] != "" }`
1. below "REFERENCE_INDEXING_MULTI" for reference sheet input
`<NEW_CHANNEL> = REFERENCE_INDEXING_MULTI.out.<NEW_EMIT>`
2. below "REFERENCE_INDEXING_SINGLE"
`<NEW_CHANNEL> = REFERENCE_INDEXING_SINGLE.out.<NEW_EMIT>`
3. Filter out options that have not been provided (see the sketch after this list).
`<NEW_CHANNEL> = <NEW_CHANNEL>.filter{ it[1] != "" }`
4. Add unzipping of zipped input files with GUNZIP.
5. Add <NEW_CHANNEL> to the final emit. `<NEW_EMIT> = <NEW_CHANNEL>`
5. Add <NEW_CHANNEL> to the final emit.
`<NEW_EMIT> = <NEW_CHANNEL>`
6. Call new inputs within the main eager.nf with `REFERENCE_INDEXING.out.<NEW_EMIT>`.
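
A hedged sketch of the selection and filtering described above, assuming `params.fasta_sheet` drives the choice and using the hypothetical names `my_channel` for `<NEW_CHANNEL>` and `my_emit` for `<NEW_EMIT>` (subworkflow call arguments elided):

```nextflow
if ( params.fasta_sheet ) {
    // reference sheet input
    my_channel = REFERENCE_INDEXING_MULTI.out.my_emit
} else {
    // direct parameter input
    my_channel = REFERENCE_INDEXING_SINGLE.out.my_emit
}

// filter out options that have not been provided
my_channel = my_channel.filter { it[1] != "" }
```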