Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Presupplied ORFs functionality #264

Closed
wants to merge 9 commits into from
10 changes: 10 additions & 0 deletions assets/multiqc_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,16 @@ report_section_order:
"nf-core-funcscan-summary":
order: -1002

run_modules:
- prokka
- custom_content

prokka_fn_snames: True

table_columns_visible:
Prokka:
organism: False

export_plots: true

custom_logo: "nf-core-funcscan_logo_flat_light.png"
Expand Down
63 changes: 56 additions & 7 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,16 @@ class RowChecker:
".fasta.gz",
)

VALID_PROTEIN_FORMATS = (".faa", ".fasta", ".fa")

VALID_FEATURE_FORMATS = (".gbk", ".gff")

def __init__(
self,
sample_col="sample",
contig_col="fasta",
protein_col="protein",
feature_col="feature",
**kwargs,
):
"""
Expand All @@ -46,18 +52,23 @@ def __init__(
sample_col (str): The name of the column that contains a contig's
identifier (default "sample").
contig_col (str): The name of the column that contains the contig's
FASTA file path (default "fastqa").

FASTA file path (default "fasta").
protein_col (str): The name of the column that contains the contig's
amino acid FASTA file path (default "protein").
feature_col (str): The name of the column that contains the contig's
feature file path (default "feature").
"""
super().__init__(**kwargs)
self._sample_col = sample_col
self._contig_col = contig_col
self._protein_col = protein_col
self._feature_col = feature_col
self._seen = set()
self.modified = []

def validate_and_transform(self, row):
"""
Perform all validations on the given row and insert the read pairing status.
Perform all validations on the given row

Args:
row (dict): A mapping from column headers (keys) to elements of that row
Expand All @@ -67,7 +78,11 @@ def validate_and_transform(self, row):
self._validate_sample(row)
self._validate_fasta(row)
self._validate_fasta_format(row)
self._seen.add((row[self._sample_col], row[self._contig_col]))
self._validate_protein(row)
self._validate_protein_format(row)
self._validate_feature(row)
self._validate_feature_format(row)
self._seen.add((row[self._sample_col], row[self._contig_col], row[self._protein_col], row[self._feature_col]))
self.modified.append(row)

def _validate_sample(self, row):
Expand All @@ -85,13 +100,46 @@ def _validate_fasta(self, row):
), f"The FASTA filename may not contain any spaces '{row[self._contig_col]}'."

def _validate_fasta_format(self, row):
"""Assert that a given filename has one of the expected FASTQ extensions."""
"""Assert that a given filename has one of the expected FASTA extensions."""
filename = Path(row[self._contig_col]).name
assert any(filename.endswith(extension) for extension in self.VALID_FORMATS), (
f"The FASTA file has an unrecognized extension: {filename}\n"
f"It should be one of: {', '.join(self.VALID_FORMATS)}"
)

def _validate_protein(self, row):
    """Assert that the optional amino acid FASTA entry has the right format.

    The protein column is optional, so nothing is checked when the column
    is absent from the row or when its value is empty.

    Args:
        row (dict): A mapping from column headers (keys) to elements of that row
            (values).

    Raises:
        AssertionError: If the file name of the supplied path contains a space.
    """
    # `.get()` covers a missing column, and the truthiness test covers both the
    # empty string and `None` (the value `csv.DictReader` stores for fields
    # missing from a short row). Calling `len()` on `None` would raise TypeError.
    protein = row.get(self._protein_col)
    if not protein:
        return
    assert (
        " " not in Path(protein).name
    ), f"The protein FASTA filename may not contain any spaces '{protein}'."

def _validate_protein_format(self, row):
    """Assert that a supplied amino acid FASTA file has an expected extension.

    The protein column is optional, so nothing is checked when the column
    is absent from the row or when its value is empty.

    Args:
        row (dict): A mapping from column headers (keys) to elements of that row
            (values).

    Raises:
        AssertionError: If the file name does not end with one of
            ``VALID_PROTEIN_FORMATS``.
    """
    # Look the column up with `.get()` so a samplesheet without the optional
    # protein column does not fail with KeyError; this mirrors the guard in
    # `_validate_feature_format`. `None` values are treated as "not supplied".
    protein = row.get(self._protein_col)
    if not protein:
        return
    filename = Path(protein).name
    # `str.endswith` accepts a tuple of suffixes, so one call checks them all.
    assert filename.endswith(tuple(self.VALID_PROTEIN_FORMATS)), (
        f"The protein FASTA file has an unrecognized extension: {filename}\n"
        f"It should be one of: {', '.join(self.VALID_PROTEIN_FORMATS)}"
    )

def _validate_feature(self, row):
    """Assert that the optional feature file entry has the right format.

    The feature column is optional, so nothing is checked when the column
    is absent from the row or when its value is empty.

    Args:
        row (dict): A mapping from column headers (keys) to elements of that row
            (values).

    Raises:
        AssertionError: If the file name of the supplied path contains a space.
    """
    # `.get()` covers a missing column, and the truthiness test covers both the
    # empty string and `None` (the value `csv.DictReader` stores for fields
    # missing from a short row). Calling `len()` on `None` would raise TypeError.
    feature = row.get(self._feature_col)
    if not feature:
        return
    assert (
        " " not in Path(feature).name
    ), f"The feature GBK/GFF filename may not contain any spaces '{feature}'."

def _validate_feature_format(self, row):
    """Assert that a supplied feature file has an expected extension.

    The feature column is optional, so nothing is checked when the column
    is absent from the row or when its value is empty.

    Args:
        row (dict): A mapping from column headers (keys) to elements of that row
            (values).

    Raises:
        AssertionError: If the file name does not end with one of
            ``VALID_FEATURE_FORMATS``.
    """
    # A single guard replaces the nested `in row` / `len()` checks and also
    # tolerates `None`, for which `Path(None)` would raise TypeError.
    feature = row.get(self._feature_col)
    if not feature:
        return
    filename = Path(feature).name
    # `str.endswith` accepts a tuple of suffixes, so one call checks them all.
    assert filename.endswith(tuple(self.VALID_FEATURE_FORMATS)), (
        f"The feature file has an unrecognized extension: {filename}\n"
        f"It should be one of: {', '.join(self.VALID_FEATURE_FORMATS)}"
    )


def read_head(handle, num_lines=10):
"""Read the specified number of lines from the current position in the file."""
Expand Down Expand Up @@ -141,8 +189,8 @@ def check_samplesheet(file_in, file_out):
Example:
This function checks that the samplesheet follows the following structure::

sample,fasta
contig_1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/bacteroides_fragilis/genome/genome.fna.gz
sample,fasta,protein,feature
contig_1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/bacteroides_fragilis/genome/genome.fna.gz,genome.faa.gz,genome.gbk

"""
required_columns = {"sample", "fasta"}
Expand All @@ -162,6 +210,7 @@ def check_samplesheet(file_in, file_out):
except AssertionError as error:
logger.critical(f"{str(error)} On line {i + 2}.")
sys.exit(1)
## TODO: Update `validate_and_transform()` to not parse protein/gff if not present in file
header = list(reader.fieldnames)
header.insert(1, "single_end")
# See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
Expand Down
6 changes: 3 additions & 3 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ process {
params.annotation_prokka_rawproduct ? '--rawproduct' : '',
params.annotation_prokka_rnammer ? '--rnammer' : '',
params.annotation_prokka_compliant ? '--compliant' : '',
params.annotation_prokka_addgenes ? '--addgenes' : ''
params.annotation_prokka_addgenes ? '--addgenes' : '',
].join(' ').trim()
}

Expand Down Expand Up @@ -130,7 +130,7 @@ process {
path: { "${params.outdir}/annotation/prodigal/${meta.id}" },
mode: params.publish_dir_mode,
enabled: params.save_annotations,
pattern: "*.{faa,fna,gff}",
pattern: "*.{faa.gz,fna.gz,gff.gz}",
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
ext.args = [
Expand All @@ -146,7 +146,7 @@ process {
path: { "${params.outdir}/annotation/prodigal/${meta.id}" },
mode: params.publish_dir_mode,
enabled: params.save_annotations,
pattern: "*.gbk",
pattern: "*.gbk.gz",
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
ext.args = [
Expand Down
18 changes: 11 additions & 7 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,22 @@ nf-core/funcscan takes FASTA files as input, typically contigs or whole genome s
The input samplesheet has to be a comma-separated file (`.csv`) with the 2 required columns `sample` and `fasta`, the 2 optional columns `protein` and `feature`, and a header row as shown in the examples below.

```bash
sample,fasta
sample_1,/<path>/<to>/wastewater_metagenome_contigs_1.fasta.gz
sample_2,/<path>/<to>/wastewater_metagenome_contigs_2.fasta.gz
sample,fasta,protein,feature
sample_1,/<path>/<to>/wastewater_metagenome_contigs_1.fasta.gz,,
sample_2,/<path>/<to>/wastewater_metagenome_contigs_2.fasta.gz,/<path>/<to>/wastewater_metagenome_contigs_2.faa,
```

| Column | Description |
| -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `sample` | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). |
| `fasta` | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`. |
| Column | Description |
| --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `sample` | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). |
| `fasta` | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`. |
| `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`) containing protein annotations of `fasta`. Leave empty if not available. |
| `feature` | Optional path to a pre-generated annotation file (`.gbk` or `.gff`) containing annotation information of `fasta`. Leave empty if not available.           |

An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

If you already have annotated contigs, you can supply these to the pipeline using the optional `protein` and `feature` columns. If either of the two columns is supplied, pipeline annotation will not be performed for the corresponding FASTA file.

> ⚠️ We highly recommend performing quality control on input contigs before running the pipeline. You may not receive results for some tools if none of the contigs in a FASTA file reach certain thresholds. Check parameter documentation for relevant minimum contig parameters.

## Notes on screening tools
Expand Down
4 changes: 2 additions & 2 deletions subworkflows/local/amp.nf
Original file line number Diff line number Diff line change
Expand Up @@ -109,10 +109,10 @@ workflow AMP {
input: [ it[0] ]
summary: it[1]
}

ch_tabix_input = Channel.of(['id':'ampcombi_complete_summary'])
.combine(ch_ampcombi_summaries_out.summary.collectFile(name: 'ampcombi_complete_summary.csv', keepHeader:true))

TABIX_BGZIP(ch_tabix_input)

emit:
Expand Down
102 changes: 102 additions & 0 deletions subworkflows/local/annotation.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
Run annotation tools
*/

include { PROKKA } from '../../modules/nf-core/prokka/main'
include { PRODIGAL as PRODIGAL_GFF } from '../../modules/nf-core/prodigal/main'
include { PRODIGAL as PRODIGAL_GBK } from '../../modules/nf-core/prodigal/main'
include { PYRODIGAL } from '../../modules/nf-core/pyrodigal/main'
include { BAKTA_BAKTADBDOWNLOAD } from '../../modules/nf-core/bakta/baktadbdownload/main'
include { BAKTA_BAKTA } from '../../modules/nf-core/bakta/bakta/main'
include { GUNZIP as GUNZIP_PRODIGAL_FNA } from '../../modules/nf-core/gunzip/main'
include { GUNZIP as GUNZIP_PRODIGAL_FAA } from '../../modules/nf-core/gunzip/main'
include { GUNZIP as GUNZIP_PRODIGAL_GFF } from '../../modules/nf-core/gunzip/main'
include { GUNZIP as GUNZIP_PRODIGAL_GBK } from '../../modules/nf-core/gunzip/main'
include { GUNZIP as GUNZIP_PYRODIGAL_FNA } from '../../modules/nf-core/gunzip/main'
include { GUNZIP as GUNZIP_PYRODIGAL_FAA } from '../../modules/nf-core/gunzip/main'
include { GUNZIP as GUNZIP_PYRODIGAL_GFF } from '../../modules/nf-core/gunzip/main'

/*
    ANNOTATION: run the gene annotation tool selected with `params.annotation_tool`
    (prodigal, pyrodigal, prokka or bakta) on the input contigs and expose its
    outputs through a common set of channels.

    take:
        fasta - tuple val(meta), path(contigs)

    emit:
        versions      - versions files of every process that was run
        multiqc_files - tool reports for MultiQC (only set by the Prokka branch)
        faa, fna, gff, gbk - annotation outputs; `gbk` is an empty channel when
                             the selected tool/configuration does not provide one
*/
workflow ANNOTATION {
    take:
    fasta // tuple val(meta), path(contigs)

    main:
    ch_versions      = Channel.empty()
    ch_multiqc_files = Channel.empty()

    if ( params.annotation_tool == "prodigal" ) {

        // The Prodigal outputs are passed through GUNZIP before being emitted.
        PRODIGAL_GFF ( fasta, "gff" )
        GUNZIP_PRODIGAL_FAA ( PRODIGAL_GFF.out.amino_acid_fasta )
        GUNZIP_PRODIGAL_FNA ( PRODIGAL_GFF.out.nucleotide_fasta )
        GUNZIP_PRODIGAL_GFF ( PRODIGAL_GFF.out.gene_annotations )
        ch_versions       = ch_versions.mix( PRODIGAL_GFF.out.versions )
        ch_versions       = ch_versions.mix( GUNZIP_PRODIGAL_FAA.out.versions )
        ch_versions       = ch_versions.mix( GUNZIP_PRODIGAL_FNA.out.versions )
        ch_versions       = ch_versions.mix( GUNZIP_PRODIGAL_GFF.out.versions )
        ch_annotation_faa = GUNZIP_PRODIGAL_FAA.out.gunzip
        ch_annotation_fna = GUNZIP_PRODIGAL_FNA.out.gunzip
        ch_annotation_gff = GUNZIP_PRODIGAL_GFF.out.gunzip
        ch_annotation_gbk = Channel.empty() // Prodigal GBK and GFF output are mutually exclusive

        // A second Prodigal run is needed for GBK output, so only do it when
        // the annotation files are going to be saved.
        if ( params.save_annotations == true ) {
            PRODIGAL_GBK ( fasta, "gbk" )
            GUNZIP_PRODIGAL_GBK ( PRODIGAL_GBK.out.gene_annotations )
            ch_versions       = ch_versions.mix( PRODIGAL_GBK.out.versions )
            // Record the GUNZIP version as well, as is done for every other GUNZIP call.
            ch_versions       = ch_versions.mix( GUNZIP_PRODIGAL_GBK.out.versions )
            ch_annotation_gbk = PRODIGAL_GBK.out.gene_annotations // Prodigal GBK output stays zipped because it is currently not used by any downstream subworkflow.
        }

    } else if ( params.annotation_tool == "pyrodigal" ) {

        PYRODIGAL ( fasta )
        GUNZIP_PYRODIGAL_FAA ( PYRODIGAL.out.faa )
        GUNZIP_PYRODIGAL_FNA ( PYRODIGAL.out.fna )
        GUNZIP_PYRODIGAL_GFF ( PYRODIGAL.out.gff )
        ch_versions       = ch_versions.mix( PYRODIGAL.out.versions )
        ch_versions       = ch_versions.mix( GUNZIP_PYRODIGAL_FAA.out.versions )
        ch_versions       = ch_versions.mix( GUNZIP_PYRODIGAL_FNA.out.versions )
        ch_versions       = ch_versions.mix( GUNZIP_PYRODIGAL_GFF.out.versions )
        // Each channel must come from its own GUNZIP process; wiring all three
        // to the FAA process would hand protein FASTA to consumers of fna/gff.
        ch_annotation_faa = GUNZIP_PYRODIGAL_FAA.out.gunzip
        ch_annotation_fna = GUNZIP_PYRODIGAL_FNA.out.gunzip
        ch_annotation_gff = GUNZIP_PYRODIGAL_GFF.out.gunzip
        ch_annotation_gbk = Channel.empty() // Pyrodigal doesn't produce GBK

    } else if ( params.annotation_tool == "prokka" ) {

        PROKKA ( fasta, [], [] )
        ch_versions       = ch_versions.mix( PROKKA.out.versions )
        ch_annotation_faa = PROKKA.out.faa
        ch_annotation_fna = PROKKA.out.fna
        ch_annotation_gff = PROKKA.out.gff
        ch_annotation_gbk = PROKKA.out.gbk
        ch_multiqc_files  = PROKKA.out.txt

    } else if ( params.annotation_tool == "bakta" ) {

        // BAKTA prepare download: use the local database when one is supplied,
        // otherwise download it first.
        if ( params.annotation_bakta_db_localpath ) {
            ch_bakta_db = Channel
                .fromPath( params.annotation_bakta_db_localpath )
                .first()
        } else {
            BAKTA_BAKTADBDOWNLOAD ( )
            ch_versions = ch_versions.mix( BAKTA_BAKTADBDOWNLOAD.out.versions )
            ch_bakta_db = ( BAKTA_BAKTADBDOWNLOAD.out.db )
        }

        BAKTA_BAKTA ( fasta, ch_bakta_db, [], [] )
        ch_versions       = ch_versions.mix( BAKTA_BAKTA.out.versions )
        ch_annotation_faa = BAKTA_BAKTA.out.faa
        ch_annotation_fna = BAKTA_BAKTA.out.fna
        ch_annotation_gff = BAKTA_BAKTA.out.gff
        ch_annotation_gbk = BAKTA_BAKTA.out.gbff

    } else {

        // Without this branch the emitted ch_annotation_* variables would be
        // undefined and the run would stop with an unhelpful error.
        error("[funcscan] error: unsupported annotation tool '${params.annotation_tool}'. Expected one of: prodigal, pyrodigal, prokka, bakta.")

    }

    emit:
    versions      = ch_versions
    multiqc_files = ch_multiqc_files
    faa           = ch_annotation_faa // [ [meta], path(faa) ]
    fna           = ch_annotation_fna // [ [meta], path(fna) ]
    gff           = ch_annotation_gff // [ [meta], path(gff) ]
    gbk           = ch_annotation_gbk // [ [meta], path(gbk) ]
}
16 changes: 10 additions & 6 deletions subworkflows/local/input_check.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,10 @@ workflow INPUT_CHECK {
samplesheet // file: /path/to/samplesheet.csv

main:
SAMPLESHEET_CHECK ( samplesheet )
.csv
.splitCsv ( header:true, sep:',' )
.map { create_input_channels(it) }
.set { contigs }
contigs = SAMPLESHEET_CHECK ( samplesheet )
.csv
.splitCsv ( header:true, sep:',' )
.map { create_input_channels(it) }

emit:
contigs // channel: [ val(meta), [ fasta ] ]
Expand All @@ -29,7 +28,12 @@ def create_input_channels(LinkedHashMap row) {
if (!file(row.fasta).exists()) {
error("[funscan] error: please check input samplesheet. FASTA file does not exist for: \n${row.fasta}")
} else {
array = [ meta, file(row.fasta) ]
array = [
meta,
file(row.fasta),
row.protein ? file(row.protein, checkIfExists: true) : null,
row.feature ? file(row.feature, checkIfExists: true) : null
]
}

return array
Expand Down
Loading