oschwengers · oschwengers · Jan 20, 2022 · Jan 17, 2022 · Jan 17, 2022 · Jan 17, 2022
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -33,6 +33,9 @@ jobs:
         auto-activate-base: false
         environment-file: environment.yml
         activate-environment: bakta
+    - name: Install DeepSig
+      if: ${{ matrix.os == 'ubuntu-latest' }}
+      run: conda install deepsig
     - name: Install PyTest
       run: conda install pytest
     - name: Conda info

diff --git a/Dockerfile b/Dockerfile
@@ -17,6 +17,7 @@ RUN apk update && apk add wget tar bash \
     && cp /root/.bashrc /opt/conda/bashrc
 
 COPY environment.yml /tmp/
+RUN echo -e '\n  - deepsig>=1.2.5' >> /tmp/environment.yml
 
 SHELL ["bash", "-l" ,"-c"]
 

diff --git a/README.md b/README.md
@@ -103,7 +103,7 @@ Installation instructions, get-started and guides: Singularity [docs](https://sy
 $ python3 -m pip install --user bakta
 ```
 
-Bacta requires the following 3rd party executables which must be installed & executable:
+Bacta requires the following 3rd party software tools which must be installed and executable to use the full set of features:
 
 - tRNAscan-SE (2.0.6) <https://doi.org/10.1101/614032> <http://lowelab.ucsc.edu/tRNAscan-SE>
 - Aragorn (1.2.38) <http://dx.doi.org/10.1093/nar/gkh152> <http://130.235.244.92/ARAGORN>
@@ -114,6 +114,7 @@ Bacta requires the following 3rd party executables which must be installed & exe
 - Diamond (2.0.11) <https://doi.org/10.1038/nmeth.3176> <https://github.com/bbuchfink/diamond>
 - Blast+ (2.7.1) <https://www.ncbi.nlm.nih.gov/pubmed/2231712> <https://blast.ncbi.nlm.nih.gov>
 - AMRFinderPlus (3.10.16) <https://github.com/ncbi/amr>
+- DeepSig (1.2.5) <https://doi.org/10.1093/bioinformatics/btx818>
 
 ### Database download
 
@@ -349,7 +350,7 @@ Annotation:
                         Path to existing Prodigal training file to use for CDS prediction
   --translation-table {11,4}
                         Translation table: 11/4 (default = 11)
-  --gram {+,-,?}        Gram type: +/-/? (default = '?')
+  --gram {+,-,?}        Gram type for signal peptide predictions: +/-/? (default = '?')
   --locus LOCUS         Locus prefix (default = 'contig')
   --locus-tag LOCUS_TAG
                         Locus tag prefix (default = autogenerated)
@@ -427,12 +428,13 @@ Conceptual terms:
 2. discard spurious CDS via AntiFam
 3. Detection of UPSs via MD5 digests and lookup of related IPS and PCS
 4. Sequence alignments of remainder via Diamond vs. PSC (query/subject coverage=0.8, identity=0.5)
-5. Assign protein sequences to UniRef90 or UniRef50 clusters if alignment hits meet an identity larger than 0.9 or 0.5, respectively
+5. Assignment to UniRef90 or UniRef50 clusters if alignment hits achieve identities larger than 0.9 or 0.5, respectively
 6. Execution of expert systems:
   - AMR: AMRFinderPlus
   - Expert proteins: NCBI BlastRules, VFDB
-  - User proteins
-7. Combination of available IPS, PSC, PSCC and expert system information favouring more specific annotations and avoiding redundancy
+  - User proteins (optionally via `--proteins <Fasta/GenBank>`)
+7. Prediction of signal peptides (optionally via `--gram <+/->`)
+8. Combination of IPS, PSC, PSCC and expert system information favouring more specific annotations and avoiding redundancy
 
 CDS without IPS or PSC hits as well as those without gene symbols or product descriptions different from `hypothetical` will be marked as `hypothetical`.
 
@@ -449,6 +451,7 @@ Such hypothetical CDS are further analyzed:
 4. Detection of UPS via MD5 hashes and lookup of related IPS
 5. Sequence alignments of remainder via Diamond vs. an sORF subset of PSCs (coverage=0.9, identity=0.9)
 6. Exclude sORF without sufficient annotation information
+7. Prediction of signal peptides (optionally via `--gram <+/->`)
 
 sORF not identified via IPS or PSC will be discarded. Additionally, all sORF without gene symbols or product descriptions different from `hypothetical` will be discarded.
 Due due to uncertain nature of sORF prediction, only those identified via IPS / PSC hits exhibiting proper gene symbols or product descriptions different from `hypothetical` will be included in the final annotation.
@@ -587,6 +590,7 @@ Bakta is *standing on the shoulder of giants* taking advantage of many great sof
 - BLAST+ <10.1186/1471-2105-10-421>
 - HMMER <10.1371/journal.pcbi.1002195>
 - AMRFinderPlus <10.1038/s41598-021-91456-0>
+- DeepSig <https://doi.org/10.1093/bioinformatics/btx818>
 
 ### Databases
 
@@ -607,6 +611,9 @@ Bakta is *standing on the shoulder of giants* taking advantage of many great sof
 - __AMRFinder fails__
 If AMRFinder constantly crashes even on fresh setups and Bakta's database was downloaded manually, then AMRFinder needs to setup its own internal database. This is required only once: `amrfinder_update --force_update --database <bakta-db>/amrfinderplus-db`. You could also try Bakta's internal database download logic automatically taking care of this: `bakta_db download --output <bakta-db>`
 
+- __DeepSig not found in Conda environment__
+For the prediction of signal predictions, Bakta uses DeepSig that is currently not available for MacOS. Therefore, we decided to exclude DeepSig from Bakta's default Conda dependencies because otherwise it would not be installable on MacOS systems. On Linux systems it can be installed via `conda install -c conda-forge -c bioconda python=3.8 deepsig`.
+
 - __Nice, but I'm mising XYZ...__
 Bakta is quite new and we're keen to constantly improve it and further expand its feature set. In case there's anything missing, please do not hesitate to open an issue and ask for it!
 

diff --git a/bakta/constants.py b/bakta/constants.py
@@ -69,6 +69,7 @@
 FEATURE_ORF = 'orf'
 FEATURE_SORF = 'sorf'
 FEATURE_CDS = 'cds'
+FEATURE_SIGNAL_PEPTIDE = 'signal-peptide'
 FEATURE_GAP = 'gap'
 FEATURE_ORIC = 'oriC'
 FEATURE_ORIV = 'oriV'
@@ -94,6 +95,7 @@
 INSDC_FEATURE_REPEAT_UNIT_RANGE = 'rpt_unit_range'  # /rpt_unit_range=<base_range>
 INSDC_FEATURE_REPEAT_UNIT_SEQ = 'rpt_unit_seq'  # /rpt_unit_seq="text"
 INSDC_FEATURE_CDS = 'CDS'
+INSDC_FEATURE_SIGNAL_PEPTIDE = 'sig_peptide'
 INSDC_FEATURE_GAP = 'gap'
 INSDC_FEATURE_ASSEMBLY_GAP = 'assembly_gap'
 INSDC_FEATURE_MISC_FEATURE = 'misc_feature'
@@ -130,6 +132,13 @@
 STRAND_UNKNOWN = '?'
 STRAND_NA = '.'
 
+############################################################################
+# Gram types
+############################################################################
+GRAM_POSITIVE = '+'
+GRAM_NEGATIVE = '-'
+GRAM_UNKNOWN = '?'
+
 ############################################################################
 # Replicon types, length thresholds & topology
 ############################################################################

diff --git a/bakta/features/signal_peptides.py b/bakta/features/signal_peptides.py
@@ -0,0 +1,86 @@
+import logging
+import sys
+import subprocess as sp
+from collections import OrderedDict
+
+import bakta
+import bakta.constants as bc
+import bakta.config as cfg
+
+log = logging.getLogger('SIGNAL_PEPTIDES')
+
+def search(orfs, orf_aa_path):
+    """Search for signal peptides with DeepSig."""
+
+    deepsig_output_path = cfg.tmp_path.joinpath('deepsig.gff3')
+    gram = 'gramn' if cfg.gram == '-' else 'gramp'
+    cmd = [
+        'deepsig',
+        '--fasta', str(orf_aa_path),
+        '--organism', gram,
+        '--outf', str(deepsig_output_path),
+        '--outfmt', 'gff3',
+        '--threads', str(cfg.threads)
+    ]
+    log.debug('cmd=%s', cmd)
+    proc = sp.run(
+        cmd,
+        cwd=str(cfg.tmp_path),
+        env=cfg.env,
+        stdout=sp.PIPE,
+        stderr=sp.PIPE,
+        universal_newlines=True
+    )
+    if(proc.returncode != 0):
+        log.debug('stdout=\'%s\', stderr=\'%s\'', proc.stdout, proc.stderr)
+        log.warning('signal peptides failed! deep-sig-error-code=%d', proc.returncode)
+        raise Exception(f'deepsig error! error code: {proc.returncode}')
+
+    sequences_by_hexdigest = {f"{orf['aa_hexdigest']}": orf for orf in orfs}
+
+    sig_peps = []
+    with deepsig_output_path.open() as fh:
+        for line in fh:
+            (identifier, tool, feature_type, start_aa, stop_aa, score, placeholder, placeholder2, description) = line.split('\t')
+            if feature_type == 'Signal peptide':
+                start_aa = int(start_aa)
+                stop_aa = int(stop_aa)
+                score = float(score)
+                aa_identifier = identifier.split('-')[0]
+                if (aa_identifier in sequences_by_hexdigest):
+                    orf = sequences_by_hexdigest[aa_identifier]
+                    start_nt, stop_nt = orf_nt_start_stop(orf, start_aa, stop_aa)
+
+                    sig_pep = {
+                        'type': bc.FEATURE_SIGNAL_PEPTIDE,
+                        'start': start_nt,
+                        'stop': stop_nt,
+                        'score': score
+                    }
+                    if (bc.FEATURE_SIGNAL_PEPTIDE not in orf):
+                        orf[bc.FEATURE_SIGNAL_PEPTIDE] = {}
+                    orf[bc.FEATURE_SIGNAL_PEPTIDE] = sig_pep
+                    log.debug(
+                        'hit: contig=%s, nt-start=%i, nt-stop=%i, aa-start=%i, aa-stop=%i, score=%0.2f',
+                        orf['contig'], start_nt, stop_nt, start_aa, stop_aa, score
+                    )
+                    sig_peps.append(sig_pep)
+                else:
+                    log.error('signal peptide: unknown aa_identifier=%s', aa_identifier)
+                    sys.exit('ERROR: signal peptide found for unknown aa_identifier=%s!', aa_identifier)
+    return sig_peps
+
+
+def orf_nt_start_stop(orf, start_aa, stop_aa):
+    """Determin signal peptide nucleotide position.
+        orf: dictonary of ORF, either CDS or sORF
+        start_aa: sig pep start position within aa seq
+        stop_aa: sig pep stop position within aa seq
+    """
+    if(orf['strand'] == bc.STRAND_FORWARD):
+        start_nt = orf['start'] + ((start_aa-1) * 3)
+        stop_nt = orf['start'] + ((stop_aa-1) * 3) + 2
+    else:
+        start_nt = orf['stop'] - ((stop_aa-1) * 3 + 2)
+        stop_nt = orf['stop'] - ((start_aa-1) * 3)
+    return start_nt, stop_nt
diff --git a/bakta/io/gff.py b/bakta/io/gff.py
@@ -212,6 +212,8 @@ def write_gff3(genome, features_by_contig, gff3_path):
                         fh.write(f"{feat['contig']}\tProdigal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     annotations = encode_annotations(annotations)
                     fh.write(f"{feat['contig']}\tProdigal\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
+                    if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
+                        write_signal_peptide(fh, feat)
                 elif(feat['type'] is bc.FEATURE_SORF):
                     annotations = {
                         'ID': feat['locus'],
@@ -241,6 +243,8 @@ def write_gff3(genome, features_by_contig, gff3_path):
                         fh.write(f"{feat['contig']}\tBakta\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     annotations = encode_annotations(annotations)
                     fh.write(f"{feat['contig']}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
+                    if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
+                        write_signal_peptide(fh, feat)
                 elif(feat['type'] is bc.FEATURE_GAP):
                     annotations = {
                         'ID': feat['id'],
@@ -308,3 +312,16 @@ def encode_annotations(annotations):
         else:
             annotation_strings.append(f'{key}={encode_attribute(val)}')
     return ';'.join(annotation_strings)
+
+
+def write_signal_peptide(fh, feat):
+    sig_peptide = feat[bc.FEATURE_SIGNAL_PEPTIDE]
+    annotations = {
+        'ID': f"{feat['locus']}_sigpep",
+        'Name': 'signal peptide',
+        'product': 'signal peptide',
+        'score': sig_peptide['score'],
+        'Parent': feat['locus']
+    }
+    annotations = encode_annotations(annotations)
+    fh.write(f"{feat['contig']}\tDeepSig\t{so.SO_SIGNAL_PEPTIDE.name}\t{sig_peptide['start']}\t{sig_peptide['stop']}\t{sig_peptide['score']:.2f}\t{feat['strand']}\t.\t{annotations}\n")
diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py
@@ -102,6 +102,7 @@ def write_insdc(genome, features, genbank_output_path, embl_output_path):
             if('locus' in feature):
                 qualifiers['locus_tag'] = feature['locus']
 
+            accompanying_features=[]
             if(feature['type'] == bc.FEATURE_GAP):
                 insdc_feature_type = bc.INSDC_FEATURE_GAP
                 qualifiers['estimated_length'] = feature['length']
@@ -148,6 +149,15 @@ def write_insdc(genome, features, genbank_output_path, embl_output_path):
                         if(bc.DB_XREF_EC in note):
                             qualifiers['EC_number'] = note.replace('EC:', '')
                     qualifiers['note'] = [note for note in qualifiers['note'] if bc.DB_XREF_EC not in note]
+                if(bc.FEATURE_SIGNAL_PEPTIDE in feature):
+                    sigpep_qualifiers = {}
+                    sigpep_qualifiers['locus_tag'] = feature['locus']
+                    sigpep_qualifiers['inference'] = 'ab initio prediction:DeepSig:1.2'
+                    sigpep = feature[bc.FEATURE_SIGNAL_PEPTIDE]
+                    sigpep_strand = 1 if feature['strand'] == bc.STRAND_FORWARD else -1 if feature['strand'] == bc.STRAND_REVERSE else 0
+                    sigpep_location = FeatureLocation(sigpep['start'] - 1, sigpep['stop'], strand = sigpep_strand)
+                    sigpep_feature = SeqFeature(sigpep_location, type=bc.INSDC_FEATURE_SIGNAL_PEPTIDE, qualifiers=sigpep_qualifiers)
+                    accompanying_features.append(sigpep_feature)
             elif(feature['type'] == bc.FEATURE_T_RNA):
                 if('amino_acid' in feature and 'anti_codon' in feature):
                     if('anti_codon_pos' in feature):
@@ -238,6 +248,8 @@ def write_insdc(genome, features, genbank_output_path, embl_output_path):
                 seq_feature_list.append(gen_seqfeat)
             feat_seqfeat = SeqFeature(feature_location, type=insdc_feature_type, qualifiers=qualifiers)
             seq_feature_list.append(feat_seqfeat)
+            for acc_feature in accompanying_features:  # add accompanying features, e.g. signal peptides
+                seq_feature_list.append(acc_feature)
         contig_rec.features = seq_feature_list
         contig_list.append(contig_rec)
 

diff --git a/bakta/main.py b/bakta/main.py
@@ -29,6 +29,7 @@
 import bakta.features.crispr as crispr
 import bakta.features.orf as orf
 import bakta.features.cds as feat_cds
+import bakta.features.signal_peptides as sig_peptides
 import bakta.features.s_orf as s_orf
 import bakta.features.gaps as gaps
 import bakta.features.ori as ori
@@ -289,6 +290,10 @@ def main():
                 user_aa_found = exp_aa_seq.search(cdss, cds_aa_path, 'user_proteins', user_aa_path)
                 print(f'\t\tuser protein sequences: {len(user_aa_found)}')
 
+            if(cfg.gram != bc.GRAM_UNKNOWN):
+                sig_peptides_found = sig_peptides.search(cdss, cds_aa_path)
+                print(f"\tsignal peptides: {len(sig_peptides_found)}")
+
             print('\tcombine annotations and mark hypotheticals...')
             log.debug('combine CDS annotations')
             for cds in cdss:
@@ -361,6 +366,14 @@ def main():
             anno.combine_annotation(feat)  # combine IPS and PSC annotations
         genome['features'][bc.FEATURE_SORF] = sorfs_filtered
         print(f'\tfiltered sORFs: {len(sorfs_filtered)}')
+
+        if(cfg.gram != bc.GRAM_UNKNOWN  and  len(sorfs_filtered) > 0):
+            sorf_aa_path = cfg.tmp_path.joinpath('sorfs.faa')
+            with sorf_aa_path.open(mode='wt') as fh:
+                for sorf in sorfs_filtered:
+                    fh.write(f">{sorf['aa_hexdigest']}-{sorf['contig']}-{sorf['start']}\n{sorf['aa']}\n")
+            sig_peptides_found = sig_peptides.search(sorfs_filtered, sorf_aa_path)
+            print(f"\tsignal peptides: {len(sig_peptides_found)}")
 
     ############################################################################
     # gap annotation
@@ -474,6 +487,7 @@ def main():
     cdss = [f for f in features if f['type'] == bc.FEATURE_CDS]
     print(f"\tCDSs: {len(cdss)}")
     print(f"\t  hypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}")
+    print(f"\t  signal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}")
     print(f"\tsORFs: {len([f for f in features if f['type'] == bc.FEATURE_SORF])}")
     print(f"\tgaps: {len([f for f in features if f['type'] == bc.FEATURE_GAP])}")
     print(f"\toriCs/oriVs: {len([f for f in features if (f['type'] == bc.FEATURE_ORIC or f['type'] == bc.FEATURE_ORIV)])}")
@@ -552,6 +566,7 @@ def main():
         fh_out.write(f"CRISPR arrays: {len([f for f in features if f['type'] == bc.FEATURE_CRISPR])}\n")
         fh_out.write(f"CDSs: {len(cdss)}\n")
         fh_out.write(f"hypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}\n")
+        fh_out.write(f"signal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}\n")
         fh_out.write(f"sORFs: {len([f for f in features if f['type'] == bc.FEATURE_SORF])}\n")
         fh_out.write(f"gaps: {len([f for f in features if f['type'] == bc.FEATURE_GAP])}\n")
         fh_out.write(f"oriCs: {len([f for f in features if f['type'] == bc.FEATURE_ORIC])}\n")

diff --git a/bakta/utils.py b/bakta/utils.py
@@ -39,6 +39,7 @@ def print_version(self):
 DEPENDENCY_PRODIGAL = (Version(2, 6, 3), Version(VERSION_MAX_DIGIT, VERSION_MAX_DIGIT, VERSION_MAX_DIGIT), VERSION_REGEX, ('prodigal', '-v'), ['--skip-cds'])
 DEPENDENCY_HMMSEARCH = (Version(3, 3, 1), Version(VERSION_MAX_DIGIT, VERSION_MAX_DIGIT, VERSION_MAX_DIGIT), VERSION_REGEX, ('hmmsearch', '-h'), ['--skip-cds', '--skip-sorf'])
 DEPENDENCY_DIAMOND = (Version(2, 0, 11), Version(VERSION_MAX_DIGIT, VERSION_MAX_DIGIT, VERSION_MAX_DIGIT), VERSION_REGEX, ('diamond', 'help'), ['--skip-cds', '--skip-sorf'])
+DEPENDENCY_DEEPSIG = (Version(1, 2, 5), Version(VERSION_MAX_DIGIT, VERSION_MAX_DIGIT, VERSION_MAX_DIGIT), VERSION_REGEX, ('deepsig', '--version'), ['--gram ?'])
 DEPENDENCY_BLASTN = (Version(2, 7, 1), Version(VERSION_MAX_DIGIT, VERSION_MAX_DIGIT, VERSION_MAX_DIGIT), VERSION_REGEX, ('blastn', '-version'), ['--skip-ori'])
 DEPENDENCY_AMRFINDERPLUS = (Version(3, 10, 16), Version(VERSION_MAX_DIGIT, VERSION_MAX_DIGIT, VERSION_MAX_DIGIT), VERSION_REGEX, ('amrfinder', '--version'), ['--skip-cds'])
 
@@ -74,7 +75,7 @@ def parse_arguments():
     arg_group_annotation.add_argument('--complete', action='store_true', help='All sequences are complete replicons (chromosome/plasmid[s])')
     arg_group_annotation.add_argument('--prodigal-tf', action='store', default=None, dest='prodigal_tf', help='Path to existing Prodigal training file to use for CDS prediction')
     arg_group_annotation.add_argument('--translation-table', action='store', type=int, default=11, choices=[11, 4], dest='translation_table', help='Translation table: 11/4 (default = 11)')
-    arg_group_annotation.add_argument('--gram', action='store', default='?', choices=['+', '-', '?'], help="Gram type: +/-/? (default = '?')")
+    arg_group_annotation.add_argument('--gram', action='store', default=bc.GRAM_UNKNOWN, choices=[bc.GRAM_POSITIVE, bc.GRAM_NEGATIVE, bc.GRAM_UNKNOWN], help=f'Gram type for signal peptide predictions: {bc.GRAM_POSITIVE}/{bc.GRAM_NEGATIVE}/{bc.GRAM_UNKNOWN} (default = {bc.GRAM_UNKNOWN})')
     arg_group_annotation.add_argument('--locus', action='store', default=None, help="Locus prefix (default = 'contig')")
     arg_group_annotation.add_argument('--locus-tag', action='store', default=None, dest='locus_tag', help='Locus tag prefix (default = autogenerated)')
     arg_group_annotation.add_argument('--keep-contig-headers', action='store_true', dest='keep_contig_headers', help='Keep original contig headers')
@@ -208,6 +209,8 @@ def test_dependencies():
     if(not cfg.skip_cds or not cfg.skip_sorf):
         test_dependency(DEPENDENCY_HMMSEARCH)
         test_dependency(DEPENDENCY_DIAMOND)
+        if(cfg.gram != '?'):
+            test_dependency(DEPENDENCY_DEEPSIG)
 
     if(not cfg.skip_ori):
         test_dependency(DEPENDENCY_BLASTN)