Skip to content

Commit

Permalink
allow uneven numbers of CRISPR repeats & spacers #265
Browse files Browse the repository at this point in the history
  • Loading branch information
oschwengers committed Jan 11, 2024
1 parent decb615 commit d1aafb0
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 31 deletions.
46 changes: 24 additions & 22 deletions bakta/features/crispr.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import re
import subprocess as sp

from collections import OrderedDict
Expand All @@ -10,6 +11,9 @@
import bakta.utils as bu


RE_CRISPR = re.compile(r'(\d{1,8})\s+(\d{2})\s+(\d{1,3}\.\d)\s+(?:(\d{2})\s+)?([ATGCN]+)\s+([ATGCN\.]+)\s*(?:([ATGCN]+))?')


log = logging.getLogger('CRISPR')


Expand Down Expand Up @@ -76,33 +80,31 @@ def predict_crispr(genome: dict, contigs_path: Path):
contig_id = line[1:]
crispr_array['contig'] = contig_id
elif(line[0] != '='):
cols = line.split()
if(len(cols) == 7 and cols[0] != 'Pos'):
(position, repeat_length, id, spacer_length, left_flank, repeat_seq, spacer_seq) = cols
position, repeat_length, spacer_length = int(position), int(repeat_length), int(spacer_length)
spacer_seq = spacer_seq.upper()
m = RE_CRISPR.fullmatch(line)
print(f'CRISPR DEBUG: match: {m}')
if(m is not None):
print(f'CRISPR DEBUG: detected line: {line}')
position = int(m.group(1))
repeat_length = int(m.group(2))
repeat_seq = m.group(6)
spacer_seq = m.group(7)
crispr_repeat = OrderedDict()
crispr_repeat['strand'] = bc.STRAND_UNKNOWN
crispr_repeat['start'] = position - gap_count
crispr_repeat['stop'] = position + repeat_length - 1 - gap_count
crispr_array['repeats'].append(crispr_repeat)
gap_count += repeat_seq.count('-') # correct wrong PILER-CR detail positions by gaps
crispr_spacer = OrderedDict()
crispr_spacer['strand'] = bc.STRAND_UNKNOWN
crispr_spacer['start'] = position + repeat_length - gap_count
crispr_spacer['stop'] = position + repeat_length + spacer_length - 1 - gap_count
crispr_spacer['sequence'] = spacer_seq
crispr_array['spacers'].append(crispr_spacer)
spacer_genome_seq = bu.extract_feature_sequence(crispr_spacer, contigs[contig_id])
assert spacer_seq == spacer_genome_seq # assure PILER-CR spacer sequence equal extraction from genome
elif(len(cols) == 6 and cols[0] != 'Pos'): # last line in array without spacer
(position, repeat_length, id, left_flank, repeat_seq, spacer_seq) = cols
position, repeat_length, spacer_length = int(position), int(repeat_length), int(spacer_length)
crispr_repeat = OrderedDict()
crispr_repeat['strand'] = bc.STRAND_UNKNOWN
crispr_repeat['start'] = position - gap_count
crispr_repeat['stop'] = position + repeat_length - 1 - gap_count
crispr_array['repeats'].append(crispr_repeat)
if(spacer_seq is not None):
spacer_seq = spacer_seq.upper()
spacer_length = len(spacer_seq)
crispr_spacer = OrderedDict()
crispr_spacer['strand'] = bc.STRAND_UNKNOWN
crispr_spacer['start'] = position + repeat_length - gap_count
crispr_spacer['stop'] = position + repeat_length + spacer_length - 1 - gap_count
crispr_spacer['sequence'] = spacer_seq
crispr_array['spacers'].append(crispr_spacer)
spacer_genome_seq = bu.extract_feature_sequence(crispr_spacer, contigs[contig_id])
assert spacer_seq == spacer_genome_seq # assure PILER-CR spacer sequence equal extraction from genome
elif(output_section == 'POSITION'):
if(line[0] == '>'):
contig_id = line[1:]
Expand All @@ -118,7 +120,7 @@ def predict_crispr(genome: dict, contigs_path: Path):
crispr_array['product'] = f'CRISPR array with {copies} repeats of length {repeat_length}, consensus sequence {repeat_consensus} and spacer length {spacer_length}'
crispr_array['spacer_length'] = int(spacer_length)
crispr_array['repeat_length'] = int(repeat_length)
assert len(crispr_array['repeats']) == int(copies), print(f"len(reps)={len(crispr_array['repeats'])}, int(copies)={int(copies)}")
assert (len(crispr_array['repeats']) - int(copies)) <= 1, print(f"len(reps)={len(crispr_array['repeats'])}, int(copies)={int(copies)}")
crispr_array['repeat_consensus'] = repeat_consensus
crispr_array['db_xrefs'] = [so.SO_CRISPR.id]

Expand Down
13 changes: 7 additions & 6 deletions bakta/io/gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,12 +203,13 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n")
i += 1
repeat = feat['repeats'][i]
annotations = {
'ID': f"{feat['id']}_repeat_{i+1}"
}
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
if(len(feat['repeats']) - 1 == i):
repeat = feat['repeats'][i]
annotations = {
'ID': f"{feat['id']}_repeat_{i+1}"
}
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
elif(feat['type'] == bc.FEATURE_CDS):
annotations = {
'ID': feat['locus'],
Expand Down
7 changes: 4 additions & 3 deletions bakta/io/tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,10 @@ def write_tsv(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_
fh.write('\t'.join([feat['contig'], 'CRISPR spacer', str(spacer['start']), str(spacer['stop']), spacer['strand'], '', '', f"CRISPR spacer, sequence {spacer['sequence']}", '']))
fh.write('\n')
i += 1
repeat = feat['repeats'][i]
fh.write('\t'.join([feat['contig'], 'CRISPR repeat', str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
fh.write('\n')
if(len(feat['repeats']) - 1 == i):
repeat = feat['repeats'][i]
fh.write('\t'.join([feat['contig'], 'CRISPR repeat', str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
fh.write('\n')
return


Expand Down

0 comments on commit d1aafb0

Please sign in to comment.