From 53df2bf0c5baa5915ed765428d6a44ccacd8ee77 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Tue, 24 Oct 2023 17:59:36 +0200 Subject: [PATCH] Improve PILER-CR CRISPR parser (#249) * implement parsing of CRISPR repeats & spacers * write CRISPR repeats & spacers to GFF * add CRISPR repeat & spacer output to TSV * add CRISPR PILER-CR test * refactor test * fix full black box test for expanded CRISPR features * refactor test code --- bakta/constants.py | 2 + bakta/features/crispr.py | 93 ++++++++++++++++++++++++-------- bakta/io/gff.py | 23 ++++++++ bakta/io/tsv.py | 37 +++++++++---- test/test_args.py | 4 +- test/test_bakta.py | 102 ++++++++++++++++-------------------- test/test_bakta_proteins.py | 1 + test/test_crispr.py | 43 +++++++++++++++ test/test_edge_features.py | 2 - test/test_nt_sequences.py | 10 ++-- test/test_sORF.py | 24 ++++----- test/test_sig_peps.py | 24 ++++----- test/test_user_proteins.py | 56 ++++++++++---------- 13 files changed, 269 insertions(+), 152 deletions(-) create mode 100644 test/test_crispr.py diff --git a/bakta/constants.py b/bakta/constants.py index 1b96efbb..7e3d5598 100644 --- a/bakta/constants.py +++ b/bakta/constants.py @@ -97,6 +97,8 @@ FEATURE_NC_RNA = 'ncRNA' FEATURE_NC_RNA_REGION = 'ncRNA-region' FEATURE_CRISPR = 'crispr' +FEATURE_CRISPR_REPEAT = 'crispr-repeat' +FEATURE_CRISPR_SPACER = 'crispr-spacer' FEATURE_ORF = 'orf' FEATURE_SORF = 'sorf' FEATURE_CDS = 'cds' diff --git a/bakta/features/crispr.py b/bakta/features/crispr.py index d6012058..ec66f04d 100644 --- a/bakta/features/crispr.py +++ b/bakta/features/crispr.py @@ -38,18 +38,72 @@ def predict_crispr(genome: dict, contigs_path: Path): log.warning('CRISPRs failed! pilercr-error-code=%d', proc.returncode) raise Exception(f'PILER-CR error! error code: {proc.returncode}') - # parse orfs - crispr_arrays = [] + # parse crispr arrays + crispr_arrays = {} contigs = {c['id']: c for c in genome['contigs']} with output_path.open() as fh: + output_section = None contig_id = None + array_id = None skip_lines = True + crispr_array = None + gap_count = 0 for line in fh: line = line.strip() - if(line == 'SUMMARY BY POSITION'): + if(line == ''): + continue + if(line == 'DETAIL REPORT'): + output_section = 'DETAIL' + skip_lines = False + elif(line == 'SUMMARY BY POSITION'): + output_section = 'POSITION' + skip_lines = False + elif(line == 'SUMMARY BY SIMILARITY'): + output_section = 'SIMILARITY' skip_lines = False elif(skip_lines is False): - if(len(line) > 0): + if(output_section == 'DETAIL'): + if(line[0:5] == 'Array'): + gap_count = 0 + array_id = line.split()[1] + crispr_array = OrderedDict() + crispr_array['type'] = bc.FEATURE_CRISPR + crispr_array['strand'] = bc.STRAND_UNKNOWN + crispr_array['repeats'] = [] + crispr_array['spacers'] = [] + crispr_arrays[array_id] = crispr_array + elif(line[0] == '>'): + contig_id = line[1:] + crispr_array['contig'] = contig_id + elif(line[0] != '='): + cols = line.split() + if(len(cols) == 7 and cols[0] != 'Pos'): + (position, repeat_length, id, spacer_length, left_flank, repeat_seq, spacer_seq) = cols + position, repeat_length, spacer_length = int(position), int(repeat_length), int(spacer_length) + spacer_seq = spacer_seq.upper() + crispr_repeat = OrderedDict() + crispr_repeat['strand'] = bc.STRAND_UNKNOWN + crispr_repeat['start'] = position - gap_count + crispr_repeat['stop'] = position + repeat_length - 1 - gap_count + crispr_array['repeats'].append(crispr_repeat) + gap_count += repeat_seq.count('-') # correct wrong PILER-CR detail positions by gaps + crispr_spacer = OrderedDict() + crispr_spacer['strand'] = bc.STRAND_UNKNOWN + crispr_spacer['start'] = position + repeat_length - gap_count + crispr_spacer['stop'] = position + repeat_length + spacer_length - 1 - gap_count + crispr_spacer['sequence'] = spacer_seq + crispr_array['spacers'].append(crispr_spacer) + spacer_genome_seq = bu.extract_feature_sequence(crispr_spacer, contigs[contig_id]) + assert spacer_seq == spacer_genome_seq # assure PILER-CR spacer sequence equal extraction from genome + elif(len(cols) == 6 and cols[0] != 'Pos'): # last line in array without spacer + (position, repeat_length, id, left_flank, repeat_seq, spacer_seq) = cols + position, repeat_length, spacer_length = int(position), int(repeat_length), int(spacer_length) + crispr_repeat = OrderedDict() + crispr_repeat['strand'] = bc.STRAND_UNKNOWN + crispr_repeat['start'] = position - gap_count + crispr_repeat['stop'] = position + repeat_length - 1 - gap_count + crispr_array['repeats'].append(crispr_repeat) + elif(output_section == 'POSITION'): if(line[0] == '>'): contig_id = line[1:] elif(line[0] != 'A' and line[0] != '='): @@ -58,27 +112,22 @@ def predict_crispr(genome: dict, contigs_path: Path): (array_id, contig, position, length, copies, repeat_length, spacer_length, repeat_consensus) = cols else: (array_id, contig, position, length, copies, repeat_length, spacer_length, distance, repeat_consensus) = cols + crispr_array = crispr_arrays[array_id] + crispr_array['start'] = int(position) + crispr_array['stop'] = int(position) + int(length) - 1 + crispr_array['product'] = f'CRISPR array with {copies} repeats of length {repeat_length}, consensus sequence {repeat_consensus} and spacer length {spacer_length}' + crispr_array['spacer_length'] = int(spacer_length) + crispr_array['repeat_length'] = int(repeat_length) + assert len(crispr_array['repeats']) == int(copies), print(f"len(reps)={len(crispr_array['repeats'])}, int(copies)={int(copies)}") + crispr_array['repeat_consensus'] = repeat_consensus + crispr_array['db_xrefs'] = [so.SO_CRISPR.id] - crispr = OrderedDict() - crispr['type'] = bc.FEATURE_CRISPR - crispr['contig'] = contig_id - crispr['start'] = int(position) - crispr['stop'] = int(position) + int(length) - 1 - crispr['strand'] = bc.STRAND_UNKNOWN - crispr['product'] = f'CRISPR array with {copies} repeats of length {repeat_length}, consensus sequence {repeat_consensus} and spacer length {spacer_length}' - crispr['spacer_length'] = int(spacer_length) - crispr['repeat_length'] = int(repeat_length) - crispr['repeats'] = int(copies) - crispr['repeat_consensus'] = repeat_consensus - crispr['db_xrefs'] = [so.SO_CRISPR.id] - - nt = bu.extract_feature_sequence(crispr, contigs[contig_id]) # extract nt sequences - crispr['nt'] = nt - - crispr_arrays.append(crispr) + nt = bu.extract_feature_sequence(crispr_array, contigs[contig_id]) # extract nt sequences + crispr_array['nt'] = nt log.info( 'contig=%s, start=%i, stop=%i, spacer-length=%i, repeat-length=%i, # repeats=%i, repeat-consensus=%s, nt=[%s..%s]', - crispr['contig'], crispr['start'], crispr['stop'], crispr['spacer_length'], crispr['repeat_length'], crispr['repeats'], crispr['repeat_consensus'], nt[:10], nt[-10:] + crispr_array['contig'], crispr_array['start'], crispr_array['stop'], crispr_array['spacer_length'], crispr_array['repeat_length'], len(crispr_array['repeats']), crispr_array['repeat_consensus'], nt[:10], nt[-10:] ) + crispr_arrays = crispr_arrays.values() log.info('predicted=%i', len(crispr_arrays)) return crispr_arrays diff --git a/bakta/io/gff.py b/bakta/io/gff.py index 4af60252..91d0445b 100644 --- a/bakta/io/gff.py +++ b/bakta/io/gff.py @@ -186,6 +186,29 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat annotations[bc.INSDC_FEATURE_REPEAT_UNIT_SEQ] = feat['repeat_consensus'] annotations = encode_annotations(annotations) fh.write(f"{feat['contig']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") + if(not cfg.compliant): + i = 0 + while i < len(feat['spacers']): + repeat = feat['repeats'][i] + annotations = { + 'ID': f"{feat['id']}_repeat_{i+1}" + } + annotations = encode_annotations(annotations) + fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n") + spacer = feat['spacers'][i] + annotations = { + 'ID': f"{feat['id']}_spacer_{i+1}", + 'sequence': spacer['sequence'] + } + annotations = encode_annotations(annotations) + fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n") + i += 1 + repeat = feat['repeats'][i] + annotations = { + 'ID': f"{feat['id']}_repeat_{i+1}" + } + annotations = encode_annotations(annotations) + fh.write(f"{feat['contig']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") elif(feat['type'] is bc.FEATURE_CDS): annotations = { 'ID': feat['locus'], diff --git a/bakta/io/tsv.py b/bakta/io/tsv.py index bfe68e07..426917da 100644 --- a/bakta/io/tsv.py +++ b/bakta/io/tsv.py @@ -27,21 +27,38 @@ def write_tsv(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_ for contig in contigs: for feat in features_by_contig[contig['id']]: feat_type = feat['type'] - if(feat['type'] == bc.FEATURE_GAP): + if(feat_type == bc.FEATURE_GAP): feat_type = bc.INSDC_FEATURE_ASSEMBLY_GAP if feat['length'] >= 100 else bc.INSDC_FEATURE_GAP gene = feat['gene'] if feat.get('gene', None) else '' product = f"(pseudo) {feat.get('product', '')}" if feat.get('pseudo', False) else feat.get('product', '') - fh.write('\t'.join([feat['contig'], - feat_type, - str(feat['start']), - str(feat['stop']), - feat['strand'], - feat.get('locus', ''), - gene, - product, - ', '.join(sorted(feat.get('db_xrefs', [])))])) + fh.write('\t'.join( + [ + feat['contig'], + feat_type, + str(feat['start']), + str(feat['stop']), + feat['strand'], + feat.get('locus', ''), + gene, + product, + ', '.join(sorted(feat.get('db_xrefs', []))) + ]) + ) fh.write('\n') + if(feat_type == bc.FEATURE_CRISPR): + i = 0 + while i < len(feat['spacers']): + repeat = feat['repeats'][i] + fh.write('\t'.join([feat['contig'], 'CRISPR repeat', str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", ''])) + fh.write('\n') + spacer = feat['spacers'][i] + fh.write('\t'.join([feat['contig'], 'CRISPR spacer', str(spacer['start']), str(spacer['stop']), spacer['strand'], '', '', f"CRISPR spacer, sequence {spacer['sequence']}", ''])) + fh.write('\n') + i += 1 + repeat = feat['repeats'][i] + fh.write('\t'.join([feat['contig'], 'CRISPR repeat', str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", ''])) + fh.write('\n') return diff --git a/test/test_args.py b/test/test_args.py index 5cf323a9..6c09ed92 100644 --- a/test/test_args.py +++ b/test/test_args.py @@ -1,10 +1,10 @@ -from inspect import Parameter import os -import pytest from pathlib import Path from subprocess import run +import pytest + from .conftest import FILES, SKIP_PARAMETERS diff --git a/test/test_bakta.py b/test/test_bakta.py index ace2f865..31297f49 100644 --- a/test/test_bakta.py +++ b/test/test_bakta.py @@ -1,11 +1,13 @@ +import json import subprocess as sp -import sys from pathlib import Path from subprocess import run import pytest +import bakta.constants as bc + from .conftest import FILES, SKIP_PARAMETERS @@ -58,24 +60,28 @@ def test_bakta_plasmid(tmpdir): assert Path.exists(output_path) assert output_path.stat().st_size > 0 - output_path = tmpdir_path.joinpath('test.tsv') - feature_count, feature_counts = count_features(output_path) - assert feature_count == 3 + results_path = tmpdir_path.joinpath('test.json') + results = None + with results_path.open() as fh: + results = json.load(fh) + assert results is not None + features = results['features'] + assert len(features) == 3 feature_counts_expected = { - 'tRNA': 0, - 'tmRNA': 0, - 'rRNA': 0, - 'ncRNA': 0, - 'ncRNA-region': 0, - 'crispr': 0, - 'sorf': 0, - 'oriV': 0, - 'oriC': 0, - 'oriT': 0, - 'cds': 3 + bc.FEATURE_T_RNA: 0, + bc.FEATURE_TM_RNA: 0, + bc.FEATURE_R_RNA: 0, + bc.FEATURE_NC_RNA: 0, + bc.FEATURE_NC_RNA_REGION: 0, + bc.FEATURE_CRISPR: 0, + bc.FEATURE_CDS: 3, + bc.FEATURE_SORF: 0, + bc.FEATURE_ORIC: 0, + bc.FEATURE_ORIV: 0, + bc.FEATURE_ORIT: 0 } - for type in feature_counts: - assert feature_counts[type] == feature_counts_expected[type] + for type, count in feature_counts_expected.items(): + assert len([f for f in features if f['type'] == type]) == count @pytest.mark.parametrize( @@ -96,46 +102,26 @@ def test_bakta_genome(db, tmpdir): assert Path.exists(output_path) assert output_path.stat().st_size > 0 - output_path = tmpdir_path.joinpath('test.tsv') - feature_count, feature_counts = count_features(output_path) - assert feature_count == 5551 + results_path = tmpdir_path.joinpath('test.json') + results = None + with results_path.open() as fh: + results = json.load(fh) + assert results is not None + features = results['features'] + assert len(features) == 5551 feature_counts_expected = { - 'tRNA': 107, - 'tmRNA': 1, - 'rRNA': 7, - 'ncRNA': 57, - 'ncRNA-region': 1, - 'crispr': 1, - 'sorf': 2, - 'oriV': 0, - 'oriC': 0, - 'oriT': 0, - 'cds': 5375 + bc.FEATURE_T_RNA: 107, + bc.FEATURE_TM_RNA: 1, + bc.FEATURE_R_RNA: 7, + bc.FEATURE_NC_RNA: 57, + bc.FEATURE_NC_RNA_REGION: 1, + bc.FEATURE_CRISPR: 1, + bc.FEATURE_CDS: 5375, + bc.FEATURE_SORF: 2, + bc.FEATURE_ORIC: 0, + bc.FEATURE_ORIV: 0, + bc.FEATURE_ORIT: 0 } - for type in feature_counts: - assert feature_counts[type] == feature_counts_expected[type] - - -def count_features(file_path): - with open(file_path, 'r') as fh: - feature_count = 0 - feature_counts = { - 'tRNA': 0, - 'tmRNA': 0, - 'rRNA': 0, - 'ncRNA': 0, - 'ncRNA-region': 0, - 'crispr': 0, - 'sorf': 0, - 'oriV': 0, - 'oriC': 0, - 'oriT': 0, - 'cds': 0 - } - for line in fh: - if not line.startswith('#'): - feature_count += 1 - feature = line.split('\t')[1] - if feature in feature_counts: - feature_counts[feature] += 1 - return feature_count, feature_counts + for type, count in feature_counts_expected.items(): + assert len([f for f in features if f['type'] == type]) == count + diff --git a/test/test_bakta_proteins.py b/test/test_bakta_proteins.py index 6452df00..4e04ba02 100644 --- a/test/test_bakta_proteins.py +++ b/test/test_bakta_proteins.py @@ -3,6 +3,7 @@ import pytest + FILES = [ 'test.tsv', 'test.hypotheticals.tsv', diff --git a/test/test_crispr.py b/test/test_crispr.py new file mode 100644 index 00000000..8196cdc2 --- /dev/null +++ b/test/test_crispr.py @@ -0,0 +1,43 @@ +import json + +from pathlib import Path +from subprocess import run + + +CRISPR_ARRAYS = [ + { + 'repeat_consensus': 'CGGTTTATCCCCGCTGGCGCGGGGAACACA', + 'spacers': [ + 'AACCGAAACACACGATCAATCCGAATATGAG', + 'TTGGTGACAGTTTTTGTCACTGTTTTGGTGA', + 'CTAAGCATACATATCTGTTTTTAAACA' + ], + 'repeats': 3 + } +] + + +def test_crispr_arrays(tmpdir): + proc = run( + [ + 'bin/bakta', '--db', 'test/db', '--output', tmpdir, '--force', '--prefix', 'test', + '--skip-tmrna', '--skip-trna', '--skip-rrna', '--skip-ncrna', '--skip-ncrna-region', '--skip-cds', '--skip-sorf', '--skip-ori', '--skip-gap', '--skip-plot', + 'test/data/GCF_000008865.2.fna.gz' + ] + ) + assert proc.returncode == 0 + + results_path = Path(tmpdir).joinpath('test.json') + assert Path.exists(results_path) + + results = None + with results_path.open() as fh: + results = json.load(fh) + assert results is not None + + crispr_arrays = [feat for feat in results['features'] if feat['type'] == 'crispr'] + assert len(crispr_arrays) == 1 + + for idx, crispr_array in enumerate(crispr_arrays): + assert crispr_array['repeat_consensus'] == CRISPR_ARRAYS[idx]['repeat_consensus'] + assert len(crispr_array['repeats']) == CRISPR_ARRAYS[idx]['repeats'] diff --git a/test/test_edge_features.py b/test/test_edge_features.py index 170fe805..38b381ce 100644 --- a/test/test_edge_features.py +++ b/test/test_edge_features.py @@ -3,8 +3,6 @@ from pathlib import Path from subprocess import run -import pytest - from bakta import constants as bc diff --git a/test/test_nt_sequences.py b/test/test_nt_sequences.py index dc264059..93e2bd2f 100644 --- a/test/test_nt_sequences.py +++ b/test/test_nt_sequences.py @@ -3,10 +3,8 @@ from pathlib import Path from subprocess import run -import pytest - -cds = 'TTGACTACGCCATTGAAAAAGATTGTGATTGTCGGCGGCGGTGCTGGTGGGCTGGAAATGGCAACACAGCTGGGGCATAAGCTGGGACGCAAGAAAAAAGCCAAAATTACGCTGGTCGATCGTAACCACAGCCATCTGTGGAAACCGCTGCTGCACGAAGTGGCGACTGGCTCGCTTGATGAAGGCGTCGATGCGTTGAGCTATCTGGCCCATGCGCGCAATCATGGTTTCCAGTTCCAGCTGGGTTCCGTCATTGATATTGATCGTGAAGCGAAAACAATCACTATTGCAGAACTGCGCGATGAGAAAGGTGAACTGCTGGTTCCGGAACGTAAAATCGCCTATGACACCCTGGTAATGGCGCTGGGTAGCACCTCTAACGATTTCAATACGCCAGGTGTCAAAGAGAACTGCATTTTCCTCGATAACCCGCACCAGGCGCGTCGCTTTCACCAGGAGATGCTGAATCTCTTCCTGAAATACTCCGCCAACCTGGGCGCAAATGGCAAAGTGAACATTGCGATTGTCGGCGGCGGCGCGACGGGTGTAGAACTCTCCGCTGAATTGCACAACGCGGTCAAGCAACTGCACAGCTACGGTTACAAAGGCCTGACCAACGAAGCCCTGAACGTAACGCTGGTAGAAGCGGGAGAACGTATTTTGCCTGCATTACCGCCACGTATCTCTGCTGCGGCCCACAACGAGCTAACGAAACTTGGCGTTCGCGTGCTGACGCAAACCATGGTCACCAGTGCTGATGAAGGCGGCCTGCACACTAAAGATGGCGAATATATTGAGGCTGATCTGATGGTGTGGGCAGCCGGGATCAAAGCGCCAGACTTCCTGAAAGATATCGGTGGTCTTGAAACTAACCGTATCAACCAGCTGGTGGTGGAACCGACGCTGCAAACCACCCGCGATCCAGACATTTACGCTATTGGCGACTGCGCGTCATGCCCGCGTCCGGAAGGGGGCTTTGTTCCGCCGCGTGCTCAGGCTGCACACCAGATGGCGACTTGCGCAATGAACAACATTCTGGCGCAGATGAATGGTAAGCCGCTGAAAAATTATCAGTATAAAGATCATGGTTCGCTGGTATCGCTGTCGAACTTCTCCACCGTTGGTAGCCTGATGGGTAACCTGACGCGCGGCTCAATGATGATTGAAGGACGAATTGCGCGCTTTGTATATATCTCGCTATACCGAATGCATCAGATTGCGCTGCATGGTTACTTTAAAACCGGATTAATGATGCTGGTGGGGAGTATTAACCGCGTTATCCGTCCGCGTTTGAAGTTGCATTAA' -sorf = 'ATGGTGAATACCGGCGGCAATAAACGTCAGGTGCCGGCGAAACGTCAGAATCGTGGCTCCCGTAATTCCAAAGATGATGGCGGCTAA' +CDS = 'TTGACTACGCCATTGAAAAAGATTGTGATTGTCGGCGGCGGTGCTGGTGGGCTGGAAATGGCAACACAGCTGGGGCATAAGCTGGGACGCAAGAAAAAAGCCAAAATTACGCTGGTCGATCGTAACCACAGCCATCTGTGGAAACCGCTGCTGCACGAAGTGGCGACTGGCTCGCTTGATGAAGGCGTCGATGCGTTGAGCTATCTGGCCCATGCGCGCAATCATGGTTTCCAGTTCCAGCTGGGTTCCGTCATTGATATTGATCGTGAAGCGAAAACAATCACTATTGCAGAACTGCGCGATGAGAAAGGTGAACTGCTGGTTCCGGAACGTAAAATCGCCTATGACACCCTGGTAATGGCGCTGGGTAGCACCTCTAACGATTTCAATACGCCAGGTGTCAAAGAGAACTGCATTTTCCTCGATAACCCGCACCAGGCGCGTCGCTTTCACCAGGAGATGCTGAATCTCTTCCTGAAATACTCCGCCAACCTGGGCGCAAATGGCAAAGTGAACATTGCGATTGTCGGCGGCGGCGCGACGGGTGTAGAACTCTCCGCTGAATTGCACAACGCGGTCAAGCAACTGCACAGCTACGGTTACAAAGGCCTGACCAACGAAGCCCTGAACGTAACGCTGGTAGAAGCGGGAGAACGTATTTTGCCTGCATTACCGCCACGTATCTCTGCTGCGGCCCACAACGAGCTAACGAAACTTGGCGTTCGCGTGCTGACGCAAACCATGGTCACCAGTGCTGATGAAGGCGGCCTGCACACTAAAGATGGCGAATATATTGAGGCTGATCTGATGGTGTGGGCAGCCGGGATCAAAGCGCCAGACTTCCTGAAAGATATCGGTGGTCTTGAAACTAACCGTATCAACCAGCTGGTGGTGGAACCGACGCTGCAAACCACCCGCGATCCAGACATTTACGCTATTGGCGACTGCGCGTCATGCCCGCGTCCGGAAGGGGGCTTTGTTCCGCCGCGTGCTCAGGCTGCACACCAGATGGCGACTTGCGCAATGAACAACATTCTGGCGCAGATGAATGGTAAGCCGCTGAAAAATTATCAGTATAAAGATCATGGTTCGCTGGTATCGCTGTCGAACTTCTCCACCGTTGGTAGCCTGATGGGTAACCTGACGCGCGGCTCAATGATGATTGAAGGACGAATTGCGCGCTTTGTATATATCTCGCTATACCGAATGCATCAGATTGCGCTGCATGGTTACTTTAAAACCGGATTAATGATGCTGGTGGGGAGTATTAACCGCGTTATCCGTCCGCGTTTGAAGTTGCATTAA' +SORF = 'ATGGTGAATACCGGCGGCAATAAACGTCAGGTGCCGGCGAAACGTCAGAATCGTGGCTCCCGTAATTCCAAAGATGATGGCGGCTAA' def test_bakta_cds_nt_sequence(tmpdir): @@ -30,7 +28,7 @@ def test_bakta_cds_nt_sequence(tmpdir): for feat in results['features']: if(feat['contig'] != 'dummy'): - assert feat['nt'] == cds + assert feat['nt'] == CDS def test_bakta_sorf_nt_sequence(tmpdir): @@ -53,4 +51,4 @@ def test_bakta_sorf_nt_sequence(tmpdir): results = json.load(fh) for feat in results['features']: - assert feat['nt'] == sorf + assert feat['nt'] == SORF diff --git a/test/test_sORF.py b/test/test_sORF.py index ef190fb8..72214a6a 100644 --- a/test/test_sORF.py +++ b/test/test_sORF.py @@ -5,39 +5,39 @@ from bakta.features import s_orf as bu -contig_1 = { +CONTIG_1 = { 'id': 1, 'description': 'no sORFs', 'sequence': 'GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG' } -contig_2 = { +CONTIG_2 = { 'id': 2, 'description': 'out of limits', 'sequence': 'ATGAAAAAATAGGGGATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAG' } -contig_3 = { +CONTIG_3 = { 'id': 3, 'description': 'two sORFs', 'sequence': 'ATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAG' } -genome_1 = { - 'contigs': [contig_1] +GENOME_1 = { + 'contigs': [CONTIG_1] } -genome_2 = { - 'contigs': [contig_2] +GENOME_2 = { + 'contigs': [CONTIG_2] } -genome_3 = { - 'contigs': [contig_3] +GENOME_3 = { + 'contigs': [CONTIG_3] } @pytest.mark.parametrize( "genome, expected", [ - (genome_1, 0), - (genome_2, 0), - (genome_3, 2) + (GENOME_1, 0), + (GENOME_2, 0), + (GENOME_3, 2) ] ) def test_sORF(genome, expected): diff --git a/test/test_sig_peps.py b/test/test_sig_peps.py index 8eaf5471..469a6f6f 100644 --- a/test/test_sig_peps.py +++ b/test/test_sig_peps.py @@ -3,25 +3,25 @@ from bakta.features import signal_peptides as bsp -one_twentyone_fwd = {'start': 1, 'stop': 21, 'strand': '+'} -one_twentyone_rev = {'start': 1, 'stop': 21, 'strand': '-'} -four_fifteen_fwd = {'start': 4, 'stop': 15, 'strand': '+'} -four_fifteen_rev = {'start': 4, 'stop': 15, 'strand': '-'} +ONE_TWENTYONE_FWD = {'start': 1, 'stop': 21, 'strand': '+'} +ONE_TWENTYONE_REV = {'start': 1, 'stop': 21, 'strand': '-'} +FOUR_FIFTEEN_FWD = {'start': 4, 'stop': 15, 'strand': '+'} +FOUR_FIFTEEN_REV = {'start': 4, 'stop': 15, 'strand': '-'} @pytest.mark.parametrize( "orf, start_aa, stop_aa, expected", [ # Forward strand - (one_twentyone_fwd, 1, 3, (1,9)), # ORF spans whole sequence length, signal peptide starts at AA 1 - (one_twentyone_fwd, 2, 7, (4,21)), # ORF spans whole sequence length, signal peptide stops at last AA - (four_fifteen_fwd, 1, 3, (4,12)), # ORF does not span whole sequence length, signal peptide starts at AA 1 - (four_fifteen_fwd, 2, 4, (7,15)), # ORF does not span whole sequence length, signal peptide stops at last AA + (ONE_TWENTYONE_FWD, 1, 3, (1,9)), # ORF spans whole sequence length, signal peptide starts at AA 1 + (ONE_TWENTYONE_FWD, 2, 7, (4,21)), # ORF spans whole sequence length, signal peptide stops at last AA + (FOUR_FIFTEEN_FWD, 1, 3, (4,12)), # ORF does not span whole sequence length, signal peptide starts at AA 1 + (FOUR_FIFTEEN_FWD, 2, 4, (7,15)), # ORF does not span whole sequence length, signal peptide stops at last AA # Reverse strand - (one_twentyone_rev, 1, 3, (13,21)), # ORF spans whole sequence length, signal peptide starts at AA 1 - (one_twentyone_rev, 3, 7, (1,15)), # ORF spans whole sequence length, signal peptide stops at last AA - (four_fifteen_rev, 1, 2, (10,15)), # ORF does not span whole sequence length, signal peptide starts at AA 1 - (four_fifteen_rev, 2, 3, (7,12)) # ORF does not span whole sequence length, signal peptide in the middle of ORF + (ONE_TWENTYONE_REV, 1, 3, (13,21)), # ORF spans whole sequence length, signal peptide starts at AA 1 + (ONE_TWENTYONE_REV, 3, 7, (1,15)), # ORF spans whole sequence length, signal peptide stops at last AA + (FOUR_FIFTEEN_REV, 1, 2, (10,15)), # ORF does not span whole sequence length, signal peptide starts at AA 1 + (FOUR_FIFTEEN_REV, 2, 3, (7,12)) # ORF does not span whole sequence length, signal peptide in the middle of ORF ] ) def test_start_stop(orf, start_aa, stop_aa, expected): diff --git a/test/test_user_proteins.py b/test/test_user_proteins.py index 759f9b92..082da8f2 100644 --- a/test/test_user_proteins.py +++ b/test/test_user_proteins.py @@ -12,74 +12,74 @@ SEQUENCE = 'MRADEEPGDLSAVAQDYLKVIWTAQEWSQDKVSTKMLAERIGVSASTASESIRKLAEQGLVDHEKYGAVTLTDSGRRAALAMVRRHRLLETFLVNELGYRWDEVHDEA' -aa_min = { +AA_MIN = { 'id': 'min', 'description': '~~~product~~~', 'sequence': SEQUENCE } -aa_min_gene = { +AA_MIN_GENE = { 'id': 'min', 'description': 'gene~~~product~~~', 'sequence': SEQUENCE } -aa_min_dbxref = { +AA_MIN_DBXREF = { 'id': 'min', 'description': 'gene~~~product~~~db-1:id-1', 'sequence': SEQUENCE } -aa_min_dbxrefs = { +AA_MIN_DBXREFS = { 'id': 'min', 'description': 'gene~~~product~~~db-1:id-1,db-2:id-2', 'sequence': SEQUENCE } -aa_full = { +AA_FULL = { 'id': 'full', 'description': '90.0~~~80.0~~~80.0~~~gene~~~product~~~db-1:id-1,db-2:id-2', 'sequence': SEQUENCE } -aa_wrong_1 = { +AA_WRONG_1 = { 'id': 'low-cols', 'description': '~~~product', 'sequence': SEQUENCE } -aa_wrong_2 = { +AA_WRONG_2 = { 'id': 'high-cols', 'description': '90~~~80~~~80~~~gene~~~product~~~dbxref:dbxref~~~', 'sequence': SEQUENCE } -aa_wrong_3 = { +AA_WRONG_3 = { 'id': 'no-product', 'description': 'gene~~~~~~dbxref:dbxref', 'sequence': SEQUENCE } -aa_wrong_4 = { +AA_WRONG_4 = { 'id': 'no-product-full', 'description': '90~~~80~~~80~~~gene~~~~~~dbxref:dbxref', 'sequence': SEQUENCE } -aa_wrong_5 = { +AA_WRONG_5 = { 'id': 'wrong-dbxref', 'description': 'gene~~~product~~~dbxrefdbxref', 'sequence': SEQUENCE } -aa_wrong_6 = { +AA_WRONG_6 = { 'id': 'wrong-dbxref-full', 'description': '90~~~80~~~80~~~gene~~~product~~~dbxrefdbxref', 'sequence': SEQUENCE } -aa_wrong_7 = { +AA_WRONG_7 = { 'id': 'wrong-id', 'description': 'ninety~~~80~~~80~~~gene~~~product~~~dbxref:dbxref', 'sequence': SEQUENCE } -aa_wrong_8 = { +AA_WRONG_8 = { 'id': 'wrong-min-query-cov', 'description': '90~~~eighty~~~80~~~gene~~~product~~~dbxref:dbxref', 'sequence': SEQUENCE } -aa_wrong_9 = { +AA_WRONG_9 = { 'id': 'wrong-min-model-cov', 'description': '90~~~80~~~eighty~~~gene~~~product~~~dbxref:dbxref', 'sequence': SEQUENCE @@ -89,15 +89,15 @@ @pytest.mark.parametrize( "parameters", [ - (aa_wrong_1), - (aa_wrong_2), - (aa_wrong_3), - (aa_wrong_4), - (aa_wrong_5), - (aa_wrong_6), - (aa_wrong_7), - (aa_wrong_8), - (aa_wrong_9) + AA_WRONG_1, + AA_WRONG_2, + AA_WRONG_3, + AA_WRONG_4, + AA_WRONG_5, + AA_WRONG_6, + AA_WRONG_7, + AA_WRONG_8, + AA_WRONG_9 ] ) def test_wrong_user_proteins_io(tmpdir, parameters): @@ -114,11 +114,11 @@ def test_wrong_user_proteins_io(tmpdir, parameters): @pytest.mark.parametrize( "parameters", [ - (aa_min), - (aa_min_gene), - (aa_min_dbxref), - (aa_min_dbxrefs), - (aa_full) + AA_MIN, + AA_MIN_GENE, + AA_MIN_DBXREF, + AA_MIN_DBXREFS, + AA_FULL ] ) def test_user_proteins_io(parameters, tmpdir):