allow uneven numbers of CRISPR repeats & spacers #265

oschwengers · Jan 11, 2024 · d1aafb0 · d1aafb0
1 parent decb615
commit d1aafb0
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 31 deletions.
diff --git a/bakta/features/crispr.py b/bakta/features/crispr.py
@@ -1,4 +1,5 @@
 import logging
+import re
 import subprocess as sp
 
 from collections import OrderedDict
@@ -10,6 +11,9 @@
 import bakta.utils as bu
 
 
+RE_CRISPR = re.compile(r'(\d{1,8})\s+(\d{2})\s+(\d{1,3}\.\d)\s+(?:(\d{2})\s+)?([ATGCN]+)\s+([ATGCN\.]+)\s*(?:([ATGCN]+))?')
+
+
 log = logging.getLogger('CRISPR')
 
 
@@ -76,33 +80,31 @@ def predict_crispr(genome: dict, contigs_path: Path):
                         contig_id = line[1:]
                         crispr_array['contig'] = contig_id
                     elif(line[0] != '='):
-                        cols = line.split()
-                        if(len(cols) == 7  and  cols[0] != 'Pos'):
-                            (position, repeat_length, id, spacer_length, left_flank, repeat_seq, spacer_seq) = cols
-                            position, repeat_length, spacer_length = int(position), int(repeat_length), int(spacer_length)
-                            spacer_seq = spacer_seq.upper()
+                        m = RE_CRISPR.fullmatch(line)
+                        print(f'CRISPR DEBUG: match: {m}')
+                        if(m is not None):
+                            print(f'CRISPR DEBUG: detected line: {line}')
+                            position = int(m.group(1))
+                            repeat_length = int(m.group(2))
+                            repeat_seq = m.group(6)
+                            spacer_seq = m.group(7)
                             crispr_repeat = OrderedDict()
                             crispr_repeat['strand'] = bc.STRAND_UNKNOWN
                             crispr_repeat['start'] = position - gap_count
                             crispr_repeat['stop'] = position + repeat_length - 1 - gap_count
                             crispr_array['repeats'].append(crispr_repeat)
                             gap_count += repeat_seq.count('-')  # correct wrong PILER-CR detail positions by gaps
-                            crispr_spacer = OrderedDict()
-                            crispr_spacer['strand'] = bc.STRAND_UNKNOWN
-                            crispr_spacer['start'] = position + repeat_length  - gap_count
-                            crispr_spacer['stop'] = position + repeat_length + spacer_length - 1 - gap_count
-                            crispr_spacer['sequence'] = spacer_seq
-                            crispr_array['spacers'].append(crispr_spacer)
-                            spacer_genome_seq = bu.extract_feature_sequence(crispr_spacer, contigs[contig_id])
-                            assert spacer_seq == spacer_genome_seq  # assure PILER-CR spacer sequence equal extraction from genome
-                        elif(len(cols) == 6  and  cols[0] != 'Pos'):  # last line in array without spacer
-                            (position, repeat_length, id, left_flank, repeat_seq, spacer_seq) = cols
-                            position, repeat_length, spacer_length = int(position), int(repeat_length), int(spacer_length)
-                            crispr_repeat = OrderedDict()
-                            crispr_repeat['strand'] = bc.STRAND_UNKNOWN
-                            crispr_repeat['start'] = position - gap_count
-                            crispr_repeat['stop'] = position + repeat_length - 1 - gap_count
-                            crispr_array['repeats'].append(crispr_repeat)
+                            if(spacer_seq is not None):
+                                spacer_seq = spacer_seq.upper()
+                                spacer_length = len(spacer_seq)
+                                crispr_spacer = OrderedDict()
+                                crispr_spacer['strand'] = bc.STRAND_UNKNOWN
+                                crispr_spacer['start'] = position + repeat_length  - gap_count
+                                crispr_spacer['stop'] = position + repeat_length + spacer_length - 1 - gap_count
+                                crispr_spacer['sequence'] = spacer_seq
+                                crispr_array['spacers'].append(crispr_spacer)
+                                spacer_genome_seq = bu.extract_feature_sequence(crispr_spacer, contigs[contig_id])
+                                assert spacer_seq == spacer_genome_seq  # assure PILER-CR spacer sequence equal extraction from genome
                 elif(output_section == 'POSITION'):
                     if(line[0] == '>'):
                         contig_id = line[1:]
@@ -118,7 +120,7 @@ def predict_crispr(genome: dict, contigs_path: Path):
                         crispr_array['product'] = f'CRISPR array with {copies} repeats of length {repeat_length}, consensus sequence {repeat_consensus} and spacer length {spacer_length}'
                         crispr_array['spacer_length'] = int(spacer_length)
                         crispr_array['repeat_length'] = int(repeat_length)
-                        assert len(crispr_array['repeats']) == int(copies), print(f"len(reps)={len(crispr_array['repeats'])}, int(copies)={int(copies)}")
+                        assert (len(crispr_array['repeats']) - int(copies)) <= 1, print(f"len(reps)={len(crispr_array['repeats'])}, int(copies)={int(copies)}")
                         crispr_array['repeat_consensus'] = repeat_consensus
                         crispr_array['db_xrefs'] = [so.SO_CRISPR.id]
 

diff --git a/bakta/io/gff.py b/bakta/io/gff.py
@@ -203,12 +203,13 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
                             annotations = encode_annotations(annotations)
                             fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n")
                             i += 1
-                        repeat = feat['repeats'][i]
-                        annotations = {
-                            'ID': f"{feat['id']}_repeat_{i+1}"
-                        }
-                        annotations = encode_annotations(annotations)
-                        fh.write(f"{feat['contig']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
+                        if(len(feat['repeats']) - 1 == i):
+                            repeat = feat['repeats'][i]
+                            annotations = {
+                                'ID': f"{feat['id']}_repeat_{i+1}"
+                            }
+                            annotations = encode_annotations(annotations)
+                            fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_CDS):
                     annotations = {
                         'ID': feat['locus'],

diff --git a/bakta/io/tsv.py b/bakta/io/tsv.py
@@ -56,9 +56,10 @@ def write_tsv(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_
                         fh.write('\t'.join([feat['contig'], 'CRISPR spacer', str(spacer['start']), str(spacer['stop']), spacer['strand'], '', '', f"CRISPR spacer, sequence {spacer['sequence']}", '']))
                         fh.write('\n')
                         i += 1
-                    repeat = feat['repeats'][i]
-                    fh.write('\t'.join([feat['contig'], 'CRISPR repeat', str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
-                    fh.write('\n')
+                    if(len(feat['repeats']) - 1 == i):
+                        repeat = feat['repeats'][i]
+                        fh.write('\t'.join([feat['contig'], 'CRISPR repeat', str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
+                        fh.write('\n')
     return