From 53df2bf0c5baa5915ed765428d6a44ccacd8ee77 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Tue, 24 Oct 2023 17:59:36 +0200
Subject: [PATCH] Improve PILER-CR CRISPR parser (#249)

* implement parsing of CRISPR repeats & spacers
* write CRISPR repeats & spacers to GFF
* add CRISPR repeat & spacer output to TSV
* add CRISPR PILER-CR test
* refactor test
* fix full black box test for expanded CRISPR features
* refactor test code
---
 bakta/constants.py          |   2 +
 bakta/features/crispr.py    |  93 ++++++++++++++++++++++++--------
 bakta/io/gff.py             |  23 ++++++++
 bakta/io/tsv.py             |  37 +++++++++----
 test/test_args.py           |   4 +-
 test/test_bakta.py          | 102 ++++++++++++++++--------------------
 test/test_bakta_proteins.py |   1 +
 test/test_crispr.py         |  43 +++++++++++++++
 test/test_edge_features.py  |   2 -
 test/test_nt_sequences.py   |  10 ++--
 test/test_sORF.py           |  24 ++++-----
 test/test_sig_peps.py       |  24 ++++-----
 test/test_user_proteins.py  |  56 ++++++++++----------
 13 files changed, 269 insertions(+), 152 deletions(-)
 create mode 100644 test/test_crispr.py

diff --git a/bakta/constants.py b/bakta/constants.py
index 1b96efbb..7e3d5598 100644
--- a/bakta/constants.py
+++ b/bakta/constants.py
@@ -97,6 +97,8 @@
 FEATURE_NC_RNA = 'ncRNA'
 FEATURE_NC_RNA_REGION = 'ncRNA-region'
 FEATURE_CRISPR = 'crispr'
+FEATURE_CRISPR_REPEAT = 'crispr-repeat'
+FEATURE_CRISPR_SPACER = 'crispr-spacer'
 FEATURE_ORF = 'orf'
 FEATURE_SORF = 'sorf'
 FEATURE_CDS = 'cds'
diff --git a/bakta/features/crispr.py b/bakta/features/crispr.py
index d6012058..ec66f04d 100644
--- a/bakta/features/crispr.py
+++ b/bakta/features/crispr.py
@@ -38,18 +38,72 @@ def predict_crispr(genome: dict, contigs_path: Path):
         log.warning('CRISPRs failed! pilercr-error-code=%d', proc.returncode)
         raise Exception(f'PILER-CR error! error code: {proc.returncode}')
 
-    # parse orfs
-    crispr_arrays = []
+    # parse crispr arrays
+    crispr_arrays = {}
     contigs = {c['id']: c for c in genome['contigs']}
     with output_path.open() as fh:
+        output_section = None
         contig_id = None
+        array_id = None
         skip_lines = True
+        crispr_array = None
+        gap_count = 0
         for line in fh:
             line = line.strip()
-            if(line == 'SUMMARY BY POSITION'):
+            if(line == ''):
+                continue
+            if(line == 'DETAIL REPORT'):
+                output_section = 'DETAIL'
+                skip_lines = False
+            elif(line == 'SUMMARY BY POSITION'):
+                output_section = 'POSITION'
+                skip_lines = False
+            elif(line == 'SUMMARY BY SIMILARITY'):
+                output_section = 'SIMILARITY'
                 skip_lines = False
             elif(skip_lines is False):
-                if(len(line) > 0):
+                if(output_section == 'DETAIL'):
+                    if(line[0:5] == 'Array'):
+                        gap_count = 0
+                        array_id = line.split()[1]
+                        crispr_array = OrderedDict()
+                        crispr_array['type'] = bc.FEATURE_CRISPR
+                        crispr_array['strand'] = bc.STRAND_UNKNOWN
+                        crispr_array['repeats'] = []
+                        crispr_array['spacers'] = []
+                        crispr_arrays[array_id] = crispr_array
+                    elif(line[0] == '>'):
+                        contig_id = line[1:]
+                        crispr_array['contig'] = contig_id
+                    elif(line[0] != '='):
+                        cols = line.split()
+                        if(len(cols) == 7  and  cols[0] != 'Pos'):
+                            (position, repeat_length, id, spacer_length, left_flank, repeat_seq, spacer_seq) = cols
+                            position, repeat_length, spacer_length = int(position), int(repeat_length), int(spacer_length)
+                            spacer_seq = spacer_seq.upper()
+                            crispr_repeat = OrderedDict()
+                            crispr_repeat['strand'] = bc.STRAND_UNKNOWN
+                            crispr_repeat['start'] = position - gap_count
+                            crispr_repeat['stop'] = position + repeat_length - 1 - gap_count
+                            crispr_array['repeats'].append(crispr_repeat)
+                            gap_count += repeat_seq.count('-')  # correct wrong PILER-CR detail positions by gaps
+                            crispr_spacer = OrderedDict()
+                            crispr_spacer['strand'] = bc.STRAND_UNKNOWN
+                            crispr_spacer['start'] = position + repeat_length  - gap_count
+                            crispr_spacer['stop'] = position + repeat_length + spacer_length - 1 - gap_count
+                            crispr_spacer['sequence'] = spacer_seq
+                            crispr_array['spacers'].append(crispr_spacer)
+                            spacer_genome_seq = bu.extract_feature_sequence(crispr_spacer, contigs[contig_id])
+                            assert spacer_seq == spacer_genome_seq  # assure PILER-CR spacer sequence equal extraction from genome
+                        elif(len(cols) == 6  and  cols[0] != 'Pos'):  # last line in array without spacer
+                            (position, repeat_length, id, left_flank, repeat_seq, last_col) = cols  # sixth column is not used for this terminal repeat
+                            position, repeat_length = int(position), int(repeat_length)  # this row has no spacer length column, so nothing else is converted
+                            crispr_repeat = OrderedDict()
+                            crispr_repeat['strand'] = bc.STRAND_UNKNOWN
+                            crispr_repeat['start'] = position - gap_count  # shift by the gaps counted in the preceding repeats of this array
+                            crispr_repeat['stop'] = position + repeat_length - 1 - gap_count
+                            crispr_array['repeats'].append(crispr_repeat)
+                elif(output_section == 'POSITION'):
                     if(line[0] == '>'):
                         contig_id = line[1:]
                     elif(line[0] != 'A' and line[0] != '='):
@@ -58,27 +112,22 @@ def predict_crispr(genome: dict, contigs_path: Path):
                             (array_id, contig, position, length, copies, repeat_length, spacer_length, repeat_consensus) = cols
                         else:
                             (array_id, contig, position, length, copies, repeat_length, spacer_length, distance, repeat_consensus) = cols
+                        crispr_array = crispr_arrays[array_id]  # array created while parsing the DETAIL section
+                        crispr_array['start'] = int(position)
+                        crispr_array['stop'] = int(position) + int(length) - 1
+                        crispr_array['product'] = f'CRISPR array with {copies} repeats of length {repeat_length}, consensus sequence {repeat_consensus} and spacer length {spacer_length}'
+                        crispr_array['spacer_length'] = int(spacer_length)
+                        crispr_array['repeat_length'] = int(repeat_length)
+                        assert len(crispr_array['repeats']) == int(copies), f"len(reps)={len(crispr_array['repeats'])}, int(copies)={int(copies)}"  # message is passed directly so it is attached to the AssertionError
+                        crispr_array['repeat_consensus'] = repeat_consensus
+                        crispr_array['db_xrefs'] = [so.SO_CRISPR.id]
 
-                        crispr = OrderedDict()
-                        crispr['type'] = bc.FEATURE_CRISPR
-                        crispr['contig'] = contig_id
-                        crispr['start'] = int(position)
-                        crispr['stop'] = int(position) + int(length) - 1
-                        crispr['strand'] = bc.STRAND_UNKNOWN
-                        crispr['product'] = f'CRISPR array with {copies} repeats of length {repeat_length}, consensus sequence {repeat_consensus} and spacer length {spacer_length}'
-                        crispr['spacer_length'] = int(spacer_length)
-                        crispr['repeat_length'] = int(repeat_length)
-                        crispr['repeats'] = int(copies)
-                        crispr['repeat_consensus'] = repeat_consensus
-                        crispr['db_xrefs'] = [so.SO_CRISPR.id]
-
-                        nt = bu.extract_feature_sequence(crispr, contigs[contig_id])  # extract nt sequences
-                        crispr['nt'] = nt
-
-                        crispr_arrays.append(crispr)
+                        nt = bu.extract_feature_sequence(crispr_array, contigs[contig_id])  # extract nt sequences
+                        crispr_array['nt'] = nt
                         log.info(
                             'contig=%s, start=%i, stop=%i, spacer-length=%i, repeat-length=%i, # repeats=%i, repeat-consensus=%s, nt=[%s..%s]',
-                            crispr['contig'], crispr['start'], crispr['stop'], crispr['spacer_length'], crispr['repeat_length'], crispr['repeats'], crispr['repeat_consensus'], nt[:10], nt[-10:]
+                            crispr_array['contig'], crispr_array['start'], crispr_array['stop'], crispr_array['spacer_length'], crispr_array['repeat_length'], len(crispr_array['repeats']), crispr_array['repeat_consensus'], nt[:10], nt[-10:]
                         )
+    crispr_arrays = list(crispr_arrays.values())  # return a list, as before the switch to a dict keyed by array id
     log.info('predicted=%i', len(crispr_arrays))
     return crispr_arrays
diff --git a/bakta/io/gff.py b/bakta/io/gff.py
index 4af60252..91d0445b 100644
--- a/bakta/io/gff.py
+++ b/bakta/io/gff.py
@@ -186,6 +186,29 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
                         annotations[bc.INSDC_FEATURE_REPEAT_UNIT_SEQ] = feat['repeat_consensus']
                     annotations = encode_annotations(annotations)
                     fh.write(f"{feat['contig']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
+                    if(not cfg.compliant):  # repeat/spacer sub-features are only written in non-compliant mode
+                        i = 0
+                        while i < len(feat['spacers']):  # write each repeat followed by the spacer at the same index
+                            repeat = feat['repeats'][i]
+                            annotations = {
+                                'ID': f"{feat['id']}_repeat_{i+1}"
+                            }
+                            annotations = encode_annotations(annotations)
+                            fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
+                            spacer = feat['spacers'][i]
+                            annotations = {
+                                'ID': f"{feat['id']}_spacer_{i+1}",
+                                'sequence': spacer['sequence']
+                            }
+                            annotations = encode_annotations(annotations)
+                            fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n")
+                            i += 1
+                        repeat = feat['repeats'][i]  # after the loop i == len(spacers): the repeat following the last spacer
+                        annotations = {
+                            'ID': f"{feat['id']}_repeat_{i+1}"
+                        }
+                        annotations = encode_annotations(annotations)
+                        fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] is bc.FEATURE_CDS):
                     annotations = {
                         'ID': feat['locus'],
diff --git a/bakta/io/tsv.py b/bakta/io/tsv.py
index bfe68e07..426917da 100644
--- a/bakta/io/tsv.py
+++ b/bakta/io/tsv.py
@@ -27,21 +27,38 @@ def write_tsv(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_
         for contig in contigs:
             for feat in features_by_contig[contig['id']]:
                 feat_type = feat['type']
-                if(feat['type'] == bc.FEATURE_GAP):
+                if(feat_type == bc.FEATURE_GAP):
                     feat_type = bc.INSDC_FEATURE_ASSEMBLY_GAP if feat['length'] >= 100 else bc.INSDC_FEATURE_GAP
 
                 gene = feat['gene'] if feat.get('gene', None) else ''
                 product = f"(pseudo) {feat.get('product', '')}" if feat.get('pseudo', False) else feat.get('product', '')
-                fh.write('\t'.join([feat['contig'],
-                                    feat_type,
-                                    str(feat['start']),
-                                    str(feat['stop']),
-                                    feat['strand'],
-                                    feat.get('locus', ''),
-                                    gene,
-                                    product,
-                                    ', '.join(sorted(feat.get('db_xrefs', [])))]))
+                fh.write('\t'.join(
+                    [
+                        feat['contig'],
+                        feat_type,
+                        str(feat['start']),
+                        str(feat['stop']),
+                        feat['strand'],
+                        feat.get('locus', ''),
+                        gene,
+                        product,
+                        ', '.join(sorted(feat.get('db_xrefs', [])))
+                    ])
+                )
                 fh.write('\n')
+                if(feat_type == bc.FEATURE_CRISPR):
+                    i = 0
+                    while i < len(feat['spacers']):
+                        repeat = feat['repeats'][i]
+                        fh.write('\t'.join([feat['contig'], 'CRISPR repeat', str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
+                        fh.write('\n')
+                        spacer = feat['spacers'][i]
+                        fh.write('\t'.join([feat['contig'], 'CRISPR spacer', str(spacer['start']), str(spacer['stop']), spacer['strand'], '', '', f"CRISPR spacer, sequence {spacer['sequence']}", '']))
+                        fh.write('\n')
+                        i += 1
+                    repeat = feat['repeats'][i]
+                    fh.write('\t'.join([feat['contig'], 'CRISPR repeat', str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
+                    fh.write('\n')
     return
 
 
diff --git a/test/test_args.py b/test/test_args.py
index 5cf323a9..6c09ed92 100644
--- a/test/test_args.py
+++ b/test/test_args.py
@@ -1,10 +1,10 @@
-from inspect import Parameter
 import os
-import pytest
 
 from pathlib import Path
 from subprocess import run
 
+import pytest
+
 from .conftest import FILES, SKIP_PARAMETERS
 
 
diff --git a/test/test_bakta.py b/test/test_bakta.py
index ace2f865..31297f49 100644
--- a/test/test_bakta.py
+++ b/test/test_bakta.py
@@ -1,11 +1,13 @@
+import json
 import subprocess as sp
-import sys
 
 from pathlib import Path
 from subprocess import run
 
 import pytest
 
+import bakta.constants as bc
+
 from .conftest import FILES, SKIP_PARAMETERS
 
 
@@ -58,24 +60,28 @@ def test_bakta_plasmid(tmpdir):
         assert Path.exists(output_path)
         assert output_path.stat().st_size > 0
 
-    output_path = tmpdir_path.joinpath('test.tsv')
-    feature_count, feature_counts = count_features(output_path)
-    assert feature_count == 3
+    results_path = tmpdir_path.joinpath('test.json')
+    results = None
+    with results_path.open() as fh:
+        results = json.load(fh)
+    assert results is not None
+    features = results['features']
+    assert len(features) == 3
     feature_counts_expected = {
-        'tRNA': 0,
-        'tmRNA': 0,
-        'rRNA': 0,
-        'ncRNA': 0,
-        'ncRNA-region': 0,
-        'crispr': 0,
-        'sorf': 0,
-        'oriV': 0,
-        'oriC': 0,
-        'oriT': 0,
-        'cds': 3
+        bc.FEATURE_T_RNA: 0,
+        bc.FEATURE_TM_RNA: 0,
+        bc.FEATURE_R_RNA: 0,
+        bc.FEATURE_NC_RNA: 0,
+        bc.FEATURE_NC_RNA_REGION: 0,
+        bc.FEATURE_CRISPR: 0,
+        bc.FEATURE_CDS: 3,
+        bc.FEATURE_SORF: 0,
+        bc.FEATURE_ORIC: 0,
+        bc.FEATURE_ORIV: 0,
+        bc.FEATURE_ORIT: 0
     }
-    for type in feature_counts:
-        assert feature_counts[type] == feature_counts_expected[type]
+    for type, count in feature_counts_expected.items():
+        assert len([f for f in features if f['type'] == type]) == count
 
 
 @pytest.mark.parametrize(
@@ -96,46 +102,26 @@ def test_bakta_genome(db, tmpdir):
         assert Path.exists(output_path)
         assert output_path.stat().st_size > 0
 
-    output_path = tmpdir_path.joinpath('test.tsv')
-    feature_count, feature_counts = count_features(output_path)
-    assert feature_count == 5551
+    results_path = tmpdir_path.joinpath('test.json')
+    results = None
+    with results_path.open() as fh:
+        results = json.load(fh)
+    assert results is not None
+    features = results['features']
+    assert len(features) == 5551
     feature_counts_expected = {
-        'tRNA': 107,
-        'tmRNA': 1,
-        'rRNA': 7,
-        'ncRNA': 57,
-        'ncRNA-region': 1,
-        'crispr': 1,
-        'sorf': 2,
-        'oriV': 0,
-        'oriC': 0,
-        'oriT': 0,
-        'cds': 5375
+        bc.FEATURE_T_RNA: 107,
+        bc.FEATURE_TM_RNA: 1,
+        bc.FEATURE_R_RNA: 7,
+        bc.FEATURE_NC_RNA: 57,
+        bc.FEATURE_NC_RNA_REGION: 1,
+        bc.FEATURE_CRISPR: 1,
+        bc.FEATURE_CDS: 5375,
+        bc.FEATURE_SORF: 2,
+        bc.FEATURE_ORIC: 0,
+        bc.FEATURE_ORIV: 0,
+        bc.FEATURE_ORIT: 0
     }
-    for type in feature_counts:
-        assert feature_counts[type] == feature_counts_expected[type]
-
-
-def count_features(file_path):
-    with open(file_path, 'r') as fh:
-        feature_count = 0
-        feature_counts = {
-            'tRNA': 0,
-            'tmRNA': 0,
-            'rRNA': 0,
-            'ncRNA': 0,
-            'ncRNA-region': 0,
-            'crispr': 0,
-            'sorf': 0,
-            'oriV': 0,
-            'oriC': 0,
-            'oriT': 0,
-            'cds': 0
-        }
-        for line in fh:
-            if not line.startswith('#'):
-                feature_count += 1
-                feature = line.split('\t')[1]
-                if feature in feature_counts:
-                    feature_counts[feature] += 1
-    return feature_count, feature_counts
+    for type, count in feature_counts_expected.items():
+        assert len([f for f in features if f['type'] == type]) == count
+
diff --git a/test/test_bakta_proteins.py b/test/test_bakta_proteins.py
index 6452df00..4e04ba02 100644
--- a/test/test_bakta_proteins.py
+++ b/test/test_bakta_proteins.py
@@ -3,6 +3,7 @@
 
 import pytest
 
+
 FILES = [
     'test.tsv',
     'test.hypotheticals.tsv',
diff --git a/test/test_crispr.py b/test/test_crispr.py
new file mode 100644
index 00000000..8196cdc2
--- /dev/null
+++ b/test/test_crispr.py
@@ -0,0 +1,43 @@
+import json
+
+from pathlib import Path
+from subprocess import run
+
+
+CRISPR_ARRAYS = [  # expected CRISPR arrays, compared by index with the 'crispr' features of the result JSON
+    {
+        'repeat_consensus': 'CGGTTTATCCCCGCTGGCGCGGGGAACACA',
+        'spacers': [  # NOTE(review): these expected spacers are never asserted by test_crispr_arrays; confirm whether a check is missing
+            'AACCGAAACACACGATCAATCCGAATATGAG',
+            'TTGGTGACAGTTTTTGTCACTGTTTTGGTGA',
+            'CTAAGCATACATATCTGTTTTTAAACA'
+        ],
+        'repeats': 3  # expected length of the feature's 'repeats' list
+    }
+]
+
+
+def test_crispr_arrays(tmpdir):  # runs the bakta CLI and checks the CRISPR arrays in its JSON output
+    proc = run(
+        [
+            'bin/bakta', '--db', 'test/db', '--output', tmpdir, '--force', '--prefix', 'test',
+            '--skip-tmrna', '--skip-trna', '--skip-rrna', '--skip-ncrna', '--skip-ncrna-region', '--skip-cds', '--skip-sorf', '--skip-ori', '--skip-gap', '--skip-plot',  # no CRISPR skip flag is passed
+            'test/data/GCF_000008865.2.fna.gz'
+        ]
+    )
+    assert proc.returncode == 0
+
+    results_path = Path(tmpdir).joinpath('test.json')  # JSON result file named after the 'test' prefix
+    assert Path.exists(results_path)
+    
+    results = None
+    with results_path.open() as fh:
+        results = json.load(fh)
+    assert results is not None
+    
+    crispr_arrays = [feat for feat in results['features'] if feat['type'] == 'crispr']  # keep only CRISPR array features
+    assert len(crispr_arrays) == 1
+
+    for idx, crispr_array in enumerate(crispr_arrays):  # compare each array with the expectation at the same index
+        assert crispr_array['repeat_consensus'] == CRISPR_ARRAYS[idx]['repeat_consensus']
+        assert len(crispr_array['repeats']) == CRISPR_ARRAYS[idx]['repeats']
diff --git a/test/test_edge_features.py b/test/test_edge_features.py
index 170fe805..38b381ce 100644
--- a/test/test_edge_features.py
+++ b/test/test_edge_features.py
@@ -3,8 +3,6 @@
 from pathlib import Path
 from subprocess import run
 
-import pytest
-
 from bakta import constants as bc
 
 
diff --git a/test/test_nt_sequences.py b/test/test_nt_sequences.py
index dc264059..93e2bd2f 100644
--- a/test/test_nt_sequences.py
+++ b/test/test_nt_sequences.py
@@ -3,10 +3,8 @@
 from pathlib import Path
 from subprocess import run
 
-import pytest
-
-cds = 'TTGACTACGCCATTGAAAAAGATTGTGATTGTCGGCGGCGGTGCTGGTGGGCTGGAAATGGCAACACAGCTGGGGCATAAGCTGGGACGCAAGAAAAAAGCCAAAATTACGCTGGTCGATCGTAACCACAGCCATCTGTGGAAACCGCTGCTGCACGAAGTGGCGACTGGCTCGCTTGATGAAGGCGTCGATGCGTTGAGCTATCTGGCCCATGCGCGCAATCATGGTTTCCAGTTCCAGCTGGGTTCCGTCATTGATATTGATCGTGAAGCGAAAACAATCACTATTGCAGAACTGCGCGATGAGAAAGGTGAACTGCTGGTTCCGGAACGTAAAATCGCCTATGACACCCTGGTAATGGCGCTGGGTAGCACCTCTAACGATTTCAATACGCCAGGTGTCAAAGAGAACTGCATTTTCCTCGATAACCCGCACCAGGCGCGTCGCTTTCACCAGGAGATGCTGAATCTCTTCCTGAAATACTCCGCCAACCTGGGCGCAAATGGCAAAGTGAACATTGCGATTGTCGGCGGCGGCGCGACGGGTGTAGAACTCTCCGCTGAATTGCACAACGCGGTCAAGCAACTGCACAGCTACGGTTACAAAGGCCTGACCAACGAAGCCCTGAACGTAACGCTGGTAGAAGCGGGAGAACGTATTTTGCCTGCATTACCGCCACGTATCTCTGCTGCGGCCCACAACGAGCTAACGAAACTTGGCGTTCGCGTGCTGACGCAAACCATGGTCACCAGTGCTGATGAAGGCGGCCTGCACACTAAAGATGGCGAATATATTGAGGCTGATCTGATGGTGTGGGCAGCCGGGATCAAAGCGCCAGACTTCCTGAAAGATATCGGTGGTCTTGAAACTAACCGTATCAACCAGCTGGTGGTGGAACCGACGCTGCAAACCACCCGCGATCCAGACATTTACGCTATTGGCGACTGCGCGTCATGCCCGCGTCCGGAAGGGGGCTTTGTTCCGCCGCGTGCTCAGGCTGCACACCAGATGGCGACTTGCGCAATGAACAACATTCTGGCGCAGATGAATGGTAAGCCGCTGAAAAATTATCAGTATAAAGATCATGGTTCGCTGGTATCGCTGTCGAACTTCTCCACCGTTGGTAGCCTGATGGGTAACCTGACGCGCGGCTCAATGATGATTGAAGGACGAATTGCGCGCTTTGTATATATCTCGCTATACCGAATGCATCAGATTGCGCTGCATGGTTACTTTAAAACCGGATTAATGATGCTGGTGGGGAGTATTAACCGCGTTATCCGTCCGCGTTTGAAGTTGCATTAA'
-sorf = 'ATGGTGAATACCGGCGGCAATAAACGTCAGGTGCCGGCGAAACGTCAGAATCGTGGCTCCCGTAATTCCAAAGATGATGGCGGCTAA'
+CDS = 'TTGACTACGCCATTGAAAAAGATTGTGATTGTCGGCGGCGGTGCTGGTGGGCTGGAAATGGCAACACAGCTGGGGCATAAGCTGGGACGCAAGAAAAAAGCCAAAATTACGCTGGTCGATCGTAACCACAGCCATCTGTGGAAACCGCTGCTGCACGAAGTGGCGACTGGCTCGCTTGATGAAGGCGTCGATGCGTTGAGCTATCTGGCCCATGCGCGCAATCATGGTTTCCAGTTCCAGCTGGGTTCCGTCATTGATATTGATCGTGAAGCGAAAACAATCACTATTGCAGAACTGCGCGATGAGAAAGGTGAACTGCTGGTTCCGGAACGTAAAATCGCCTATGACACCCTGGTAATGGCGCTGGGTAGCACCTCTAACGATTTCAATACGCCAGGTGTCAAAGAGAACTGCATTTTCCTCGATAACCCGCACCAGGCGCGTCGCTTTCACCAGGAGATGCTGAATCTCTTCCTGAAATACTCCGCCAACCTGGGCGCAAATGGCAAAGTGAACATTGCGATTGTCGGCGGCGGCGCGACGGGTGTAGAACTCTCCGCTGAATTGCACAACGCGGTCAAGCAACTGCACAGCTACGGTTACAAAGGCCTGACCAACGAAGCCCTGAACGTAACGCTGGTAGAAGCGGGAGAACGTATTTTGCCTGCATTACCGCCACGTATCTCTGCTGCGGCCCACAACGAGCTAACGAAACTTGGCGTTCGCGTGCTGACGCAAACCATGGTCACCAGTGCTGATGAAGGCGGCCTGCACACTAAAGATGGCGAATATATTGAGGCTGATCTGATGGTGTGGGCAGCCGGGATCAAAGCGCCAGACTTCCTGAAAGATATCGGTGGTCTTGAAACTAACCGTATCAACCAGCTGGTGGTGGAACCGACGCTGCAAACCACCCGCGATCCAGACATTTACGCTATTGGCGACTGCGCGTCATGCCCGCGTCCGGAAGGGGGCTTTGTTCCGCCGCGTGCTCAGGCTGCACACCAGATGGCGACTTGCGCAATGAACAACATTCTGGCGCAGATGAATGGTAAGCCGCTGAAAAATTATCAGTATAAAGATCATGGTTCGCTGGTATCGCTGTCGAACTTCTCCACCGTTGGTAGCCTGATGGGTAACCTGACGCGCGGCTCAATGATGATTGAAGGACGAATTGCGCGCTTTGTATATATCTCGCTATACCGAATGCATCAGATTGCGCTGCATGGTTACTTTAAAACCGGATTAATGATGCTGGTGGGGAGTATTAACCGCGTTATCCGTCCGCGTTTGAAGTTGCATTAA'
+SORF = 'ATGGTGAATACCGGCGGCAATAAACGTCAGGTGCCGGCGAAACGTCAGAATCGTGGCTCCCGTAATTCCAAAGATGATGGCGGCTAA'
 
 
 def test_bakta_cds_nt_sequence(tmpdir):
@@ -30,7 +28,7 @@ def test_bakta_cds_nt_sequence(tmpdir):
 
     for feat in results['features']:
         if(feat['contig'] != 'dummy'):
-            assert feat['nt'] == cds
+            assert feat['nt'] == CDS
 
 
 def test_bakta_sorf_nt_sequence(tmpdir):
@@ -53,4 +51,4 @@ def test_bakta_sorf_nt_sequence(tmpdir):
         results = json.load(fh)
 
     for feat in results['features']:
-        assert feat['nt'] == sorf
+        assert feat['nt'] == SORF
diff --git a/test/test_sORF.py b/test/test_sORF.py
index ef190fb8..72214a6a 100644
--- a/test/test_sORF.py
+++ b/test/test_sORF.py
@@ -5,39 +5,39 @@
 from bakta.features import s_orf as bu
 
 
-contig_1 = {
+CONTIG_1 = {
     'id': 1,
     'description': 'no sORFs',
     'sequence': 'GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG'
 }
-contig_2 = {
+CONTIG_2 = {
     'id': 2,
     'description': 'out of limits',
     'sequence': 'ATGAAAAAATAGGGGATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAG'
 }
-contig_3 = {
+CONTIG_3 = {
     'id': 3,
     'description': 'two sORFs',
     'sequence': 'ATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAG'
 }
 
-genome_1 = {
-    'contigs': [contig_1]
+GENOME_1 = {
+    'contigs': [CONTIG_1]
 }
-genome_2 = {
-    'contigs': [contig_2]
+GENOME_2 = {
+    'contigs': [CONTIG_2]
 }
-genome_3 = {
-    'contigs': [contig_3]
+GENOME_3 = {
+    'contigs': [CONTIG_3]
 }
 
 
 @pytest.mark.parametrize(
     "genome, expected",
     [
-        (genome_1, 0),
-        (genome_2, 0),
-        (genome_3, 2)
+        (GENOME_1, 0),
+        (GENOME_2, 0),
+        (GENOME_3, 2)
     ]
 )
 def test_sORF(genome, expected):
diff --git a/test/test_sig_peps.py b/test/test_sig_peps.py
index 8eaf5471..469a6f6f 100644
--- a/test/test_sig_peps.py
+++ b/test/test_sig_peps.py
@@ -3,25 +3,25 @@
 from bakta.features import signal_peptides as bsp
 
 
-one_twentyone_fwd = {'start': 1, 'stop': 21, 'strand': '+'}
-one_twentyone_rev = {'start': 1, 'stop': 21, 'strand': '-'}
-four_fifteen_fwd = {'start': 4, 'stop': 15, 'strand': '+'}
-four_fifteen_rev = {'start': 4, 'stop': 15, 'strand': '-'}
+ONE_TWENTYONE_FWD = {'start': 1, 'stop': 21, 'strand': '+'}
+ONE_TWENTYONE_REV = {'start': 1, 'stop': 21, 'strand': '-'}
+FOUR_FIFTEEN_FWD = {'start': 4, 'stop': 15, 'strand': '+'}
+FOUR_FIFTEEN_REV = {'start': 4, 'stop': 15, 'strand': '-'}
 
 
 @pytest.mark.parametrize(
     "orf, start_aa, stop_aa, expected",
     [
         # Forward strand
-        (one_twentyone_fwd, 1, 3, (1,9)),   # ORF spans whole sequence length, signal peptide starts at AA 1
-        (one_twentyone_fwd, 2, 7, (4,21)),  # ORF spans whole sequence length, signal peptide stops at last AA
-        (four_fifteen_fwd, 1, 3, (4,12)),   # ORF does not span whole sequence length, signal peptide starts at AA 1
-        (four_fifteen_fwd, 2, 4, (7,15)),   # ORF does not span whole sequence length, signal peptide stops at last AA
+        (ONE_TWENTYONE_FWD, 1, 3, (1,9)),   # ORF spans whole sequence length, signal peptide starts at AA 1
+        (ONE_TWENTYONE_FWD, 2, 7, (4,21)),  # ORF spans whole sequence length, signal peptide stops at last AA
+        (FOUR_FIFTEEN_FWD, 1, 3, (4,12)),   # ORF does not span whole sequence length, signal peptide starts at AA 1
+        (FOUR_FIFTEEN_FWD, 2, 4, (7,15)),   # ORF does not span whole sequence length, signal peptide stops at last AA
         # Reverse strand
-        (one_twentyone_rev, 1, 3, (13,21)), # ORF spans whole sequence length, signal peptide starts at AA 1
-        (one_twentyone_rev, 3, 7, (1,15)),  # ORF spans whole sequence length, signal peptide stops at last AA
-        (four_fifteen_rev, 1, 2, (10,15)),  # ORF does not span whole sequence length, signal peptide starts at AA 1
-        (four_fifteen_rev, 2, 3, (7,12))    # ORF does not span whole sequence length, signal peptide in the middle of ORF
+        (ONE_TWENTYONE_REV, 1, 3, (13,21)), # ORF spans whole sequence length, signal peptide starts at AA 1
+        (ONE_TWENTYONE_REV, 3, 7, (1,15)),  # ORF spans whole sequence length, signal peptide stops at last AA
+        (FOUR_FIFTEEN_REV, 1, 2, (10,15)),  # ORF does not span whole sequence length, signal peptide starts at AA 1
+        (FOUR_FIFTEEN_REV, 2, 3, (7,12))    # ORF does not span whole sequence length, signal peptide in the middle of ORF
     ]
 )
 def test_start_stop(orf, start_aa, stop_aa, expected):
diff --git a/test/test_user_proteins.py b/test/test_user_proteins.py
index 759f9b92..082da8f2 100644
--- a/test/test_user_proteins.py
+++ b/test/test_user_proteins.py
@@ -12,74 +12,74 @@
 SEQUENCE = 'MRADEEPGDLSAVAQDYLKVIWTAQEWSQDKVSTKMLAERIGVSASTASESIRKLAEQGLVDHEKYGAVTLTDSGRRAALAMVRRHRLLETFLVNELGYRWDEVHDEA'
 
 
-aa_min = {
+AA_MIN = {
     'id': 'min',
     'description': '~~~product~~~',
     'sequence': SEQUENCE
 }
-aa_min_gene = {
+AA_MIN_GENE = {
     'id': 'min',
     'description': 'gene~~~product~~~',
     'sequence': SEQUENCE
 }
-aa_min_dbxref = {
+AA_MIN_DBXREF = {
     'id': 'min',
     'description': 'gene~~~product~~~db-1:id-1',
     'sequence': SEQUENCE
 }
-aa_min_dbxrefs = {
+AA_MIN_DBXREFS = {
     'id': 'min',
     'description': 'gene~~~product~~~db-1:id-1,db-2:id-2',
     'sequence': SEQUENCE
 }
-aa_full = {
+AA_FULL = {
     'id': 'full',
     'description': '90.0~~~80.0~~~80.0~~~gene~~~product~~~db-1:id-1,db-2:id-2',
     'sequence': SEQUENCE
 }
 
 
-aa_wrong_1 = {
+AA_WRONG_1 = {
     'id': 'low-cols',
     'description': '~~~product',
     'sequence': SEQUENCE
 }
-aa_wrong_2 = {
+AA_WRONG_2 = {
     'id': 'high-cols',
     'description': '90~~~80~~~80~~~gene~~~product~~~dbxref:dbxref~~~',
     'sequence': SEQUENCE
 }
-aa_wrong_3 = {
+AA_WRONG_3 = {
     'id': 'no-product',
     'description': 'gene~~~~~~dbxref:dbxref',
     'sequence': SEQUENCE
 }
-aa_wrong_4 = {
+AA_WRONG_4 = {
     'id': 'no-product-full',
     'description': '90~~~80~~~80~~~gene~~~~~~dbxref:dbxref',
     'sequence': SEQUENCE
 }
-aa_wrong_5 = {
+AA_WRONG_5 = {
     'id': 'wrong-dbxref',
     'description': 'gene~~~product~~~dbxrefdbxref',
     'sequence': SEQUENCE
 }
-aa_wrong_6 = {
+AA_WRONG_6 = {
     'id': 'wrong-dbxref-full',
     'description': '90~~~80~~~80~~~gene~~~product~~~dbxrefdbxref',
     'sequence': SEQUENCE
 }
-aa_wrong_7 = {
+AA_WRONG_7 = {
     'id': 'wrong-id',
     'description': 'ninety~~~80~~~80~~~gene~~~product~~~dbxref:dbxref',
     'sequence': SEQUENCE
 }
-aa_wrong_8 = {
+AA_WRONG_8 = {
     'id': 'wrong-min-query-cov',
     'description': '90~~~eighty~~~80~~~gene~~~product~~~dbxref:dbxref',
     'sequence': SEQUENCE
 }
-aa_wrong_9 = {
+AA_WRONG_9 = {
     'id': 'wrong-min-model-cov',
     'description': '90~~~80~~~eighty~~~gene~~~product~~~dbxref:dbxref',
     'sequence': SEQUENCE
@@ -89,15 +89,15 @@
 @pytest.mark.parametrize(
     "parameters",
     [
-        (aa_wrong_1),
-        (aa_wrong_2),
-        (aa_wrong_3),
-        (aa_wrong_4),
-        (aa_wrong_5),
-        (aa_wrong_6),
-        (aa_wrong_7),
-        (aa_wrong_8),
-        (aa_wrong_9)
+        AA_WRONG_1,
+        AA_WRONG_2,
+        AA_WRONG_3,
+        AA_WRONG_4,
+        AA_WRONG_5,
+        AA_WRONG_6,
+        AA_WRONG_7,
+        AA_WRONG_8,
+        AA_WRONG_9
     ]
 )
 def test_wrong_user_proteins_io(tmpdir, parameters):
@@ -114,11 +114,11 @@ def test_wrong_user_proteins_io(tmpdir, parameters):
 @pytest.mark.parametrize(
     "parameters",
     [
-        (aa_min),
-        (aa_min_gene),
-        (aa_min_dbxref),
-        (aa_min_dbxrefs),
-        (aa_full)
+        AA_MIN,
+        AA_MIN_GENE,
+        AA_MIN_DBXREF,
+        AA_MIN_DBXREFS,
+        AA_FULL
     ]
 )
 def test_user_proteins_io(parameters, tmpdir):