nf-core · jfy133 · Apr 26, 2023 · May 10, 2023 · May 10, 2023 · May 24, 2023
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -10,6 +10,16 @@ report_section_order:
   "nf-core-funcscan-summary":
     order: -1002
 
+run_modules:
+  - prokka
+  - custom_content
+
+prokka_fn_snames: True
+
+table_columns_visible:
+  Prokka:
+    organism: False
+
 export_plots: true
 
 custom_logo: "nf-core-funcscan_logo_flat_light.png"

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -33,10 +33,16 @@ class RowChecker:
         ".fasta.gz",
     )
 
+    VALID_PROTEIN_FORMATS = (".faa", ".fasta", ".fa")
+
+    VALID_FEATURE_FORMATS = (".gbk", ".gff")
+
     def __init__(
         self,
         sample_col="sample",
         contig_col="fasta",
+        protein_col="protein",
+        feature_col="feature",
         **kwargs,
     ):
         """
@@ -46,18 +52,23 @@ def __init__(
             sample_col (str): The name of the column that contains a contig's
                 identifier (default "sample").
             contig_col (str): The name of the column that contains the contig's
-                FASTA file path (default "fastqa").
-
+                FASTA file path (default "fasta").
+            protein_col (str): The name of the column that contains the contig's
+                amino acid FASTA file path (default "protein").
+            feature_col (str): The name of the column that contains the contig's
+                feature file path (default "feature").
         """
         super().__init__(**kwargs)
         self._sample_col = sample_col
         self._contig_col = contig_col
+        self._protein_col = protein_col
+        self._feature_col = feature_col
         self._seen = set()
         self.modified = []
 
     def validate_and_transform(self, row):
         """
-        Perform all validations on the given row and insert the read pairing status.
+        Perform all validations on the given row
 
         Args:
             row (dict): A mapping from column headers (keys) to elements of that row
@@ -67,7 +78,11 @@ def validate_and_transform(self, row):
         self._validate_sample(row)
         self._validate_fasta(row)
         self._validate_fasta_format(row)
-        self._seen.add((row[self._sample_col], row[self._contig_col]))
+        self._validate_protein(row)
+        self._validate_protein_format(row)
+        self._validate_feature(row)
+        self._validate_feature_format(row)
+        self._seen.add((row[self._sample_col], row[self._contig_col], row[self._protein_col], row[self._feature_col]))
         self.modified.append(row)
 
     def _validate_sample(self, row):
@@ -85,13 +100,46 @@ def _validate_fasta(self, row):
         ), f"The FASTA filename may not contain any spaces '{row[self._contig_col]}'."
 
     def _validate_fasta_format(self, row):
-        """Assert that a given filename has one of the expected FASTQ extensions."""
+        """Assert that a given filename has one of the expected FASTA extensions."""
         filename = Path(row[self._contig_col]).name
         assert any(filename.endswith(extension) for extension in self.VALID_FORMATS), (
             f"The FASTA file has an unrecognized extension: {filename}\n"
             f"It should be one of: {', '.join(self.VALID_FORMATS)}"
         )
 
+    def _validate_protein(self, row):
+        """Assert that the amino acid FASTA filename, when the column is present and non-empty, contains no spaces."""
+        if self._protein_col in row and len(row[self._protein_col]) > 0:  # optional column: skip when absent or blank
+            assert (
+                " " not in Path(row[self._protein_col]).name  # only the final path component is inspected
+            ), f"The protein FASTA filename may not contain any spaces '{row[self._protein_col]}'."
+
+    def _validate_protein_format(self, row):
+        """Assert that a given filename has one of the expected (if supplied) amino acid FASTA extensions."""
+        filename = Path(row[self._protein_col]).name
+        if len(row[self._protein_col]) > 0:
+            assert any(filename.endswith(extension) for extension in self.VALID_PROTEIN_FORMATS), (
+                f"The protein FASTA file has an unrecognized extension: {filename}\n"
+                f"It should be one of: {', '.join(self.VALID_PROTEIN_FORMATS)}"
+            )
+
+    def _validate_feature(self, row):
+        """Assert that the feature (GBK/GFF) filename, when the column is present and non-empty, contains no spaces."""
+        if self._feature_col in row and len(row[self._feature_col]) > 0:  # optional column: skip when absent or blank
+            assert (
+                " " not in Path(row[self._feature_col]).name  # only the final path component is inspected
+            ), f"The feature GBK/GFF filename may not contain any spaces '{row[self._feature_col]}'."
+
+    def _validate_feature_format(self, row):
+        """Assert that the feature filename, when the column is present and non-empty, ends with one of VALID_FEATURE_FORMATS."""
+        if self._feature_col in row:  # optional column: skip entirely when the samplesheet lacks it
+            filename = Path(row[self._feature_col]).name  # final path component only
+            if len(row[self._feature_col]) > 0:  # an empty cell means no feature file was supplied
+                assert any(filename.endswith(extension) for extension in self.VALID_FEATURE_FORMATS), (
+                    f"The feature file has an unrecognized extension: {filename}\n"
+                    f"It should be one of: {', '.join(self.VALID_FEATURE_FORMATS)}"
+                )
+
 
 def read_head(handle, num_lines=10):
     """Read the specified number of lines from the current position in the file."""
@@ -141,8 +189,8 @@ def check_samplesheet(file_in, file_out):
     Example:
         This function checks that the samplesheet follows the following structure::
 
-            sample,fasta
-            contig_1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/bacteroides_fragilis/genome/genome.fna.gz
+            sample,fasta,protein,feature
+            contig_1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/bacteroides_fragilis/genome/genome.fna.gz,genome.faa,genome.gbk
 
     """
     required_columns = {"sample", "fasta"}
@@ -162,6 +210,7 @@ def check_samplesheet(file_in, file_out):
             except AssertionError as error:
                 logger.critical(f"{str(error)} On line {i + 2}.")
                 sys.exit(1)
+        ## TODO: Update `validate_and_transform()` to not parse the protein/feature columns if they are not present in the file
     header = list(reader.fieldnames)
     header.insert(1, "single_end")
     # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.

diff --git a/conf/modules.config b/conf/modules.config
@@ -80,7 +80,7 @@ process {
             params.annotation_prokka_rawproduct ? '--rawproduct' : '',
             params.annotation_prokka_rnammer ? '--rnammer' : '',
             params.annotation_prokka_compliant ? '--compliant' : '',
-            params.annotation_prokka_addgenes ? '--addgenes' : ''
+            params.annotation_prokka_addgenes ? '--addgenes' : '',
         ].join(' ').trim()
     }
 
@@ -130,7 +130,7 @@ process {
             path: { "${params.outdir}/annotation/prodigal/${meta.id}" },
             mode: params.publish_dir_mode,
             enabled: params.save_annotations,
-            pattern: "*.{faa,fna,gff}",
+            pattern: "*.{faa.gz,fna.gz,gff.gz}",
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
         ext.args = [
@@ -146,7 +146,7 @@ process {
             path: { "${params.outdir}/annotation/prodigal/${meta.id}" },
             mode: params.publish_dir_mode,
             enabled: params.save_annotations,
-            pattern: "*.gbk",
+            pattern: "*.gbk.gz",
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
         ext.args = [

diff --git a/docs/usage.md b/docs/usage.md
@@ -54,18 +54,22 @@ nf-core/funcscan takes FASTA files as input, typically contigs or whole genome s
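-The input samplesheet has to be a comma-separated file (`.csv`) with 2 columns (`sample`, and `fasta`), and a header row as shown in the examples below.
+The input samplesheet has to be a comma-separated file (`.csv`) with a header row, the 2 required columns `sample` and `fasta`, and the 2 optional columns `protein` and `feature` (whose cells may be left empty), as shown in the examples below.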
 
 ```bash
-sample,fasta
-sample_1,/<path>/<to>/wastewater_metagenome_contigs_1.fasta.gz
-sample_2,/<path>/<to>/wastewater_metagenome_contigs_2.fasta.gz
+sample,fasta,protein,feature
+sample_1,/<path>/<to>/wastewater_metagenome_contigs_1.fasta.gz,,
+sample_2,/<path>/<to>/wastewater_metagenome_contigs_2.fasta.gz,/<path>/<to>/wastewater_metagenome_contigs_2.faa,
 ```
 
-| Column   | Description                                                                                                                                                |
-| -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample` | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fasta`  | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`.    |
+| Column    | Description                                                                                                                                                |
+| --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `sample`  | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). |
+| `fasta`   | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`.    |
+| `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`, `.fasta`, `.fa`) containing protein annotations of `fasta`. Leave empty if not available.  |
+| `feature` | Optional path to a pre-generated annotation file (`.gbk` or `.gff`) containing annotation information of `fasta`. Leave empty if not available.            |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 
+If you already have annotated contigs, you can supply these to the pipeline using the optional `protein` and `feature` columns. If either of the two columns is supplied, pipeline annotation will not be performed for the corresponding FASTA file.
+
 > ⚠️ We highly recommend performing quality control on input contigs before running the pipeline. You may not receive results for some tools if none of the contigs in a FASTA file reach certain thresholds. Check parameter documentation for relevant minimum contig parameters.
 
 ## Notes on screening tools

diff --git a/subworkflows/local/amp.nf b/subworkflows/local/amp.nf
@@ -109,10 +109,10 @@ workflow AMP {
                 input: [ it[0] ]
                 summary: it[1]
         }
-    
+
     ch_tabix_input = Channel.of(['id':'ampcombi_complete_summary'])
         .combine(ch_ampcombi_summaries_out.summary.collectFile(name: 'ampcombi_complete_summary.csv', keepHeader:true))
-    
+
     TABIX_BGZIP(ch_tabix_input)
 
     emit:

diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf
@@ -0,0 +1,102 @@
+/*
+    Run annotation tools
+*/
+
+include { PROKKA                         } from '../../modules/nf-core/prokka/main'
+include { PRODIGAL as PRODIGAL_GFF       } from '../../modules/nf-core/prodigal/main'
+include { PRODIGAL as PRODIGAL_GBK       } from '../../modules/nf-core/prodigal/main'
+include { PYRODIGAL                      } from '../../modules/nf-core/pyrodigal/main'
+include { BAKTA_BAKTADBDOWNLOAD          } from '../../modules/nf-core/bakta/baktadbdownload/main'
+include { BAKTA_BAKTA                    } from '../../modules/nf-core/bakta/bakta/main'
+include { GUNZIP as GUNZIP_PRODIGAL_FNA  } from '../../modules/nf-core/gunzip/main'
+include { GUNZIP as GUNZIP_PRODIGAL_FAA  } from '../../modules/nf-core/gunzip/main'
+include { GUNZIP as GUNZIP_PRODIGAL_GFF  } from '../../modules/nf-core/gunzip/main'
+include { GUNZIP as GUNZIP_PRODIGAL_GBK  } from '../../modules/nf-core/gunzip/main'
+include { GUNZIP as GUNZIP_PYRODIGAL_FNA } from '../../modules/nf-core/gunzip/main'
+include { GUNZIP as GUNZIP_PYRODIGAL_FAA } from '../../modules/nf-core/gunzip/main'
+include { GUNZIP as GUNZIP_PYRODIGAL_GFF } from '../../modules/nf-core/gunzip/main'
+
+workflow ANNOTATION { // Runs the single annotation tool selected by params.annotation_tool and emits its FAA/FNA/GFF/GBK channels
+    take:
+    fasta // tuple val(meta), path(contigs)
+
+    main:
+    ch_versions      = Channel.empty()
+    ch_multiqc_files = Channel.empty()
+
+    if ( params.annotation_tool == "prodigal" ) {
+        PRODIGAL_GFF ( fasta, "gff" )
+        GUNZIP_PRODIGAL_FAA ( PRODIGAL_GFF.out.amino_acid_fasta )
+        GUNZIP_PRODIGAL_FNA ( PRODIGAL_GFF.out.nucleotide_fasta)
+        GUNZIP_PRODIGAL_GFF ( PRODIGAL_GFF.out.gene_annotations )
+        ch_versions       = ch_versions.mix(PRODIGAL_GFF.out.versions)
+        ch_versions       = ch_versions.mix(GUNZIP_PRODIGAL_FAA.out.versions)
+        ch_versions       = ch_versions.mix(GUNZIP_PRODIGAL_FNA.out.versions)
+        ch_versions       = ch_versions.mix(GUNZIP_PRODIGAL_GFF.out.versions)
+        ch_annotation_faa = GUNZIP_PRODIGAL_FAA.out.gunzip
+        ch_annotation_fna = GUNZIP_PRODIGAL_FNA.out.gunzip
+        ch_annotation_gff = GUNZIP_PRODIGAL_GFF.out.gunzip
+        ch_annotation_gbk = Channel.empty() // Prodigal GBK and GFF output are mutually exclusive
+
+        if ( params.save_annotations == true ) {
+            PRODIGAL_GBK ( fasta, "gbk" )
+            GUNZIP_PRODIGAL_GBK ( PRODIGAL_GBK.out.gene_annotations) // NOTE(review): this output is not consumed below and its versions are not collected - confirm the call is needed
+            ch_versions              = ch_versions.mix(PRODIGAL_GBK.out.versions)
+            ch_annotation_gbk        = PRODIGAL_GBK.out.gene_annotations // Prodigal GBK output stays zipped because it is currently not used by any downstream subworkflow.
+        }
+
+    } else if ( params.annotation_tool == "pyrodigal" ) {
+
+        PYRODIGAL ( fasta )
+        GUNZIP_PYRODIGAL_FAA ( PYRODIGAL.out.faa )
+        GUNZIP_PYRODIGAL_FNA ( PYRODIGAL.out.fna)
+        GUNZIP_PYRODIGAL_GFF ( PYRODIGAL.out.gff )
+        ch_versions       = ch_versions.mix(PYRODIGAL.out.versions)
+        ch_versions       = ch_versions.mix(GUNZIP_PYRODIGAL_FAA.out.versions)
+        ch_versions       = ch_versions.mix(GUNZIP_PYRODIGAL_FNA.out.versions)
+        ch_versions       = ch_versions.mix(GUNZIP_PYRODIGAL_GFF.out.versions)
+        ch_annotation_faa = GUNZIP_PYRODIGAL_FAA.out.gunzip
+        ch_annotation_fna = GUNZIP_PYRODIGAL_FNA.out.gunzip // each channel must come from its own gunzip step, not the FAA one
+        ch_annotation_gff = GUNZIP_PYRODIGAL_GFF.out.gunzip
+        ch_annotation_gbk = Channel.empty() // Pyrodigal doesn't produce GBK
+
+    } else if ( params.annotation_tool == "prokka" ) {
+
+        PROKKA ( fasta, [], [] )
+        ch_versions              = ch_versions.mix(PROKKA.out.versions)
+        ch_annotation_faa        = PROKKA.out.faa
+        ch_annotation_fna        = PROKKA.out.fna
+        ch_annotation_gff        = PROKKA.out.gff
+        ch_annotation_gbk        = PROKKA.out.gbk
+        ch_multiqc_files         = PROKKA.out.txt // Prokka is the only branch here that contributes a MultiQC input
+
+    } else if ( params.annotation_tool == "bakta" ) {
+
+        // BAKTA prepare download
+        if ( params.annotation_bakta_db_localpath ) {
+            ch_bakta_db = Channel
+                .fromPath( params.annotation_bakta_db_localpath )
+                .first()
+        } else {
+            BAKTA_BAKTADBDOWNLOAD ( )
+            ch_versions = ch_versions.mix( BAKTA_BAKTADBDOWNLOAD.out.versions )
+            ch_bakta_db = ( BAKTA_BAKTADBDOWNLOAD.out.db )
+        }
+
+        BAKTA_BAKTA ( fasta, ch_bakta_db, [], [] )
+        ch_versions              = ch_versions.mix(BAKTA_BAKTA.out.versions)
+        ch_annotation_faa        = BAKTA_BAKTA.out.faa
+        ch_annotation_fna        = BAKTA_BAKTA.out.fna
+        ch_annotation_gff        = BAKTA_BAKTA.out.gff
+        ch_annotation_gbk        = BAKTA_BAKTA.out.gbff
+
+    }
+
+    emit:
+    versions        = ch_versions
+    multiqc_files   = ch_multiqc_files
+    faa             = ch_annotation_faa // [ [meta], path(faa) ]
+    fna             = ch_annotation_fna // [ [meta], path(fna) ]
+    gff             = ch_annotation_gff // [ [meta], path(gff) ]
+    gbk             = ch_annotation_gbk // [ [meta], path(gbk) ]
+}
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
@@ -9,11 +9,10 @@ workflow INPUT_CHECK {
     samplesheet // file: /path/to/samplesheet.csv
 
     main:
-    SAMPLESHEET_CHECK ( samplesheet )
-        .csv
-        .splitCsv ( header:true, sep:',' )
-        .map { create_input_channels(it) }
-        .set { contigs }
+    contigs = SAMPLESHEET_CHECK ( samplesheet )
+                .csv
+                .splitCsv ( header:true, sep:',' )
+                .map { create_input_channels(it) }
 
     emit:
     contigs                                   // channel: [ val(meta), [ fasta ] ]
@@ -29,7 +28,12 @@ def create_input_channels(LinkedHashMap row) {
     if (!file(row.fasta).exists()) {
         error("[funscan] error: please check input samplesheet. FASTA file does not exist for: \n${row.fasta}")
     } else {
-        array = [ meta, file(row.fasta) ]
+        array = [
+            meta,
+            file(row.fasta),
+            row.protein ? file(row.protein, checkIfExists: true) : null,
+            row.feature ? file(row.feature, checkIfExists: true) : null
+        ]
     }
 
     return array