diff --git a/ariba/card_record.py b/ariba/card_record.py index f1bccd02..f1a1f495 100644 --- a/ariba/card_record.py +++ b/ariba/card_record.py @@ -38,9 +38,9 @@ def _ARO_name_to_fasta_name(aro_name): re_search = aro_regex.search(aro_name) if re_search is not None: - return re_search.group(1) + return re_search.group(1).replace('.', '_') else: - return '_'.join(aro_name.split()[:3]) + return '_'.join(aro_name.split()[:3]).replace('.', '_') @staticmethod diff --git a/ariba/ref_preparer.py b/ariba/ref_preparer.py index 377a8df6..85cbf305 100644 --- a/ariba/ref_preparer.py +++ b/ariba/ref_preparer.py @@ -137,6 +137,7 @@ def run(self, outdir): print('\nLoading and checking input data', flush=True) refdata_outprefix = os.path.join(outdir, 'refcheck') + self.refdata.rename_sequences(refdata_outprefix + '.rename_info') self.refdata.sanity_check(refdata_outprefix) if self.verbose: diff --git a/ariba/reference_data.py b/ariba/reference_data.py index 4a99fbcf..e41e7d14 100644 --- a/ariba/reference_data.py +++ b/ariba/reference_data.py @@ -1,5 +1,6 @@ import os import sys +import re import copy import pyfastaq from ariba import sequence_metadata, cdhit @@ -7,6 +8,8 @@ class Error (Exception): pass +rename_sub_regex = re.compile(r'[^\w.-]') + class ReferenceData: def __init__(self, @@ -105,7 +108,7 @@ def _load_fasta_file(filename): if filename is not None: seq_reader = pyfastaq.sequences.file_reader(filename) for seq in seq_reader: - seq.id = seq.id.split()[0] + #seq.id = seq.id.split()[0] if seq.id in d: raise Error('Duplicate name "' + seq.id + '" found in file ' + filename + '. Cannot continue)') d[seq.id] = copy.copy(seq) @@ -294,6 +297,100 @@ def sanity_check(self, outprefix): self._filter_bad_variant_data(outprefix + '.01.check_variants', variants_only_removed, presence_absence_removed) + @classmethod + def _new_seq_name(cls, name): + name = name.split()[0] + return re.sub(rename_sub_regex, '_', name) + + + @classmethod + def _seq_names_to_rename_dict(cls, names): + used_names = set() + old_name_to_new = {} + + for old_name in sorted(names): + new_name = ReferenceData._new_seq_name(old_name) + if new_name in used_names: + i = 1 + new_name_prefix = new_name + while new_name in used_names: + new_name = new_name_prefix + '_' + str(i) + i += 1 + + assert new_name not in used_names + if new_name != old_name: + old_name_to_new[old_name] = new_name + + used_names.add(new_name) + + return old_name_to_new + + + @classmethod + def _rename_names_in_seq_dicts(cls, seq_dicts, rename_dict): + '''Changes seq_dicts in place''' + for seq_type in ['presence_absence', 'variants_only', 'non_coding']: + new_dict = {} + while len(seq_dicts[seq_type]): + old_name, seq = seq_dicts[seq_type].popitem() + if old_name in rename_dict: + seq.id = rename_dict[old_name] + + new_dict[seq.id] = seq + seq_dicts[seq_type] = new_dict + + + @classmethod + def _rename_metadata_set(cls, metadata_set, new_name): + new_set = set() + for meta in metadata_set: + new_meta = copy.copy(meta) + new_meta.name = new_name + new_set.add(new_meta) + return new_set + + + @classmethod + def _rename_names_in_metadata(cls, meta_dict, rename_dict): + new_dict = {} + + while len(meta_dict): + old_name, gene_dict = meta_dict.popitem() + if old_name in rename_dict: + new_name = rename_dict[old_name] + for seq_type in ['n', 'p']: + for position, metaset in gene_dict[seq_type].items(): + gene_dict[seq_type][position] = ReferenceData._rename_metadata_set(metaset, new_name) + + gene_dict['.'] = ReferenceData._rename_metadata_set(gene_dict['.'], new_name) + else: + new_name = old_name + + new_dict[new_name] = gene_dict + + return new_dict + + + def rename_sequences(self, outfile): + presabs_names = set(self.seq_dicts['presence_absence'].keys()) + noncoding_names = set(self.seq_dicts['non_coding'].keys()) + varonly_names = set(self.seq_dicts['variants_only'].keys()) + # we should have already checked that all the names are unique, but let's do it again! + all_names = presabs_names.union(noncoding_names).union(varonly_names) + if len(all_names) != len(presabs_names) + len(noncoding_names) + len(varonly_names): + raise Error('Got a non-unique name in input data. Cannot continue') + + rename_dict = ReferenceData._seq_names_to_rename_dict(all_names) + if len(rename_dict): + print('Had to rename some sequences. See', outfile, 'for old -> new names', file=sys.stderr) + with open(outfile, 'w') as f: + for old_name, new_name in sorted(rename_dict.items()): + print(old_name, new_name, sep='\t', file=f) + + ReferenceData._rename_names_in_seq_dicts(self.seq_dicts, rename_dict) + self.metadata = ReferenceData._rename_names_in_metadata(self.metadata, rename_dict) + + def make_catted_fasta(self, outfile): f = pyfastaq.utils.open_file_write(outfile) diff --git a/ariba/sequence_metadata.py b/ariba/sequence_metadata.py index 53c03622..a1efde2e 100644 --- a/ariba/sequence_metadata.py +++ b/ariba/sequence_metadata.py @@ -24,10 +24,9 @@ def __init__(self, line): else: self.variant = sequence_variant.Variant(self.variant_type, variant_string) - self.hashed = hash((self.name, self.variant_type, variant_string)) def __eq__(self, other): - return type(other) is type(self) and self.__dict__ == other.__dict__ + return type(other) is type(self) and self.name == other.name and self.variant == other.variant and self.variant_type == other.variant_type and self.free_text == other.free_text def __lt__(self, other): @@ -35,7 +34,7 @@ def __lt__(self, other): def __hash__(self): - return self.hashed + return hash((self.name, self.variant_type, str(self.variant), self.free_text)) def __str__(self): diff --git a/ariba/tests/card_record_test.py b/ariba/tests/card_record_test.py index 8de8ab84..6058eac3 100644 --- a/ariba/tests/card_record_test.py +++ b/ariba/tests/card_record_test.py @@ -46,6 +46,8 @@ def test_ARO_name_to_fasta_name(self): ('abcD foo bar match at the start', 'abcD'), ('foo bar abcD match in the middle', 'abcD'), ('match at the end abcD', 'abcD'), + ('use first three foo bar', 'use_first_three'), + ('remove.any.dots first three', 'remove_any_dots_first_three') ] for aro_name, expected in tests: diff --git a/ariba/tests/data/reference_data_rename_sequences.noncoding.fa b/ariba/tests/data/reference_data_rename_sequences.noncoding.fa new file mode 100644 index 00000000..a8b5192c --- /dev/null +++ b/ariba/tests/data/reference_data_rename_sequences.noncoding.fa @@ -0,0 +1,4 @@ +>noncoding1 +AAAA +>noncoding1 blah +CCCC diff --git a/ariba/tests/data/reference_data_rename_sequences.presence_absence.fa b/ariba/tests/data/reference_data_rename_sequences.presence_absence.fa new file mode 100644 index 00000000..d9a7ab50 --- /dev/null +++ b/ariba/tests/data/reference_data_rename_sequences.presence_absence.fa @@ -0,0 +1,10 @@ +>pres_abs1 foo bar spam eggs +ACGT +>pres_abs1 blah +AAAA +>pres'abs1 +CCCC +>pres_abs2 +TTTT +>pres!abs3 +GGGG diff --git a/ariba/tests/data/reference_data_rename_sequences.variants_only.fa b/ariba/tests/data/reference_data_rename_sequences.variants_only.fa new file mode 100644 index 00000000..87d7377d --- /dev/null +++ b/ariba/tests/data/reference_data_rename_sequences.variants_only.fa @@ -0,0 +1,8 @@ +>var_only1 hello +AAAA +>var:only1 boo +CCCC +>var_only1 +GGGG +>var_only2 +TTTT diff --git a/ariba/tests/data/reference_data_rename_sequences_metadata.tsv b/ariba/tests/data/reference_data_rename_sequences_metadata.tsv new file mode 100644 index 00000000..1dd13740 --- /dev/null +++ b/ariba/tests/data/reference_data_rename_sequences_metadata.tsv @@ -0,0 +1,11 @@ +noncoding1 . . original name "noncoding1" +noncoding1 blah . . original name "noncoding1 blah" +pres_abs1 foo bar spam eggs . . original name "pres_abs1 foo bar spam eggs" +pres_abs1 blah . . original name "pres_abs1 blah" +pres'abs1 . . original name "pres'abs1" +pres_abs2 . . original name "pres_abs2" +pres!abs3 . . original name "pres!abs3" +var_only1 hello . . original name "var_only1 hello" +var:only1 boo . . original name "var:only1 boo" +var_only1 . . original name "var_only1" +var_only2 . . original name "var_only2" diff --git a/ariba/tests/data/reference_data_test_rename_sequences.out b/ariba/tests/data/reference_data_test_rename_sequences.out new file mode 100644 index 00000000..d47d87c9 --- /dev/null +++ b/ariba/tests/data/reference_data_test_rename_sequences.out @@ -0,0 +1,8 @@ +noncoding1 blah noncoding1_1 +pres!abs3 pres_abs3 +pres'abs1 pres_abs1 +pres_abs1 blah pres_abs1_1 +pres_abs1 foo bar spam eggs pres_abs1_2 +var:only1 boo var_only1 +var_only1 var_only1_1 +var_only1 hello var_only1_2 diff --git a/ariba/tests/reference_data_test.py b/ariba/tests/reference_data_test.py index c820b42b..6b07e8b1 100644 --- a/ariba/tests/reference_data_test.py +++ b/ariba/tests/reference_data_test.py @@ -212,6 +212,202 @@ def test_remove_bad_genes(self): os.unlink(tmp_log) + def test_new_seq_name(self): + '''Test _new_seq_name''' + tests = [ + ('name', 'name'), + ('name ', 'name'), + ('name xyz', 'name'), + ('name_a', 'name_a'), + ('name.a', 'name.a'), + ('name-a', 'name-a'), + ('name spam eggs foo', 'name'), + ('name!', 'name_'), + ('name:foo', 'name_foo'), + ('name:!@foo', 'name___foo'), + ] + + for name, expected in tests: + self.assertEqual(expected, reference_data.ReferenceData._new_seq_name(name)) + + + def test_seq_names_to_rename_dict(self): + '''Test _seq_names_to_rename_dict''' + names = {'foo', 'foo abc', 'foo xyz', 'bar!', 'bar:', 'spam abc', 'eggs'} + got = reference_data.ReferenceData._seq_names_to_rename_dict(names) + expected = { + 'foo abc': 'foo_1', + 'foo xyz': 'foo_2', + 'bar!': 'bar_', + 'bar:': 'bar__1', + 'spam abc': 'spam' + } + self.assertEqual(expected, got) + + + def test_rename_names_in_seq_dicts(self): + '''Test _rename_names_in_seq_dicts''' + rename_dict = { + 'pa abc': 'pa', + 'pa 1': 'pa_1', + 'vo:': 'vo_', + } + seqs_dict = { + 'presence_absence': { + 'pa abc': pyfastaq.sequences.Fasta('pa abc', 'AAAA'), + 'pa 1': pyfastaq.sequences.Fasta('pa 1', 'CCC'), + }, + 'variants_only': { + 'vo:': pyfastaq.sequences.Fasta('vo:', 'GGG'), + }, + 'non_coding': { + 'nonc': pyfastaq.sequences.Fasta('nonc', 'TTT'), + } + } + + got = reference_data.ReferenceData._rename_names_in_seq_dicts(seqs_dict, rename_dict) + expected = { + 'presence_absence': { + 'pa': pyfastaq.sequences.Fasta('pa', 'AAAA'), + 'pa_1': pyfastaq.sequences.Fasta('pa_1', 'CCC'), + }, + 'variants_only': { + 'vo_': pyfastaq.sequences.Fasta('vo_', 'GGG'), + }, + 'non_coding': { + 'nonc': pyfastaq.sequences.Fasta('nonc', 'TTT'), + } + } + self.assertEqual(expected, seqs_dict) + + + def test_rename_metadata_set(self): + '''Test _rename_metadata_set''' + metaset = { + sequence_metadata.SequenceMetadata('foo 1\t.\t.\tdescription'), + sequence_metadata.SequenceMetadata('foo 1\tp\tI42L\tspam eggs') + } + + expected = { + sequence_metadata.SequenceMetadata('new_name\t.\t.\tdescription'), + sequence_metadata.SequenceMetadata('new_name\tp\tI42L\tspam eggs') + } + got = reference_data.ReferenceData._rename_metadata_set(metaset, 'new_name') + self.assertEqual(expected, got) + + + def test_rename_names_in_metadata(self): + '''Test _rename_names_in_metadata''' + meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\tfree text') + meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tA42T\tfree text2') + meta3 = sequence_metadata.SequenceMetadata('gene1\t.\t.\tfree text3') + meta4 = sequence_metadata.SequenceMetadata('gene1\tn\tG13T\tconfers killer rabbit resistance') + meta5 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\tremoves tardigrade's space-living capability") + meta1rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tA42G\tfree text') + meta2rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tA42T\tfree text2') + meta3rename = sequence_metadata.SequenceMetadata('new_gene1\t.\t.\tfree text3') + meta4rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tG13T\tconfers killer rabbit resistance') + + metadata = { + 'gene1': { + 'n': {12: {meta4}, 41: {meta1, meta2}}, + 'p': {}, + '.': {meta3}, + }, + 'gene2': { + 'n': {}, + 'p': {41: {meta5}}, + '.': set(), + } + } + + expected = { + 'new_gene1': { + 'n': {12: {meta4rename}, 41: {meta1rename, meta2rename}}, + 'p': {}, + '.': {meta3rename}, + }, + 'gene2': { + 'n': {}, + 'p': {41: {meta5}}, + '.': set(), + } + } + + rename_dict = {'gene1': 'new_gene1'} + got = reference_data.ReferenceData._rename_names_in_metadata(metadata, rename_dict) + self.assertEqual(expected, got) + + + def test_rename_sequences(self): + '''Test rename_sequences''' + presence_absence_fa = os.path.join(data_dir, 'reference_data_rename_sequences.presence_absence.fa') + variants_only_fa = os.path.join(data_dir, 'reference_data_rename_sequences.variants_only.fa') + noncoding_fa = os.path.join(data_dir, 'reference_data_rename_sequences.noncoding.fa') + metadata_tsv = os.path.join(data_dir, 'reference_data_rename_sequences_metadata.tsv') + refdata = reference_data.ReferenceData( + presence_absence_fa=presence_absence_fa, + variants_only_fa=variants_only_fa, + non_coding_fa=noncoding_fa, + metadata_tsv=metadata_tsv + ) + tmp_out = 'tmp.test_rename_sequences.out' + refdata.rename_sequences(tmp_out) + expected_file = os.path.join(data_dir, 'reference_data_test_rename_sequences.out') + self.assertTrue(filecmp.cmp(expected_file, tmp_out, shallow=False)) + os.unlink(tmp_out) + + meta1 = sequence_metadata.SequenceMetadata('noncoding1\t.\t.\toriginal name "noncoding1"') + meta2 = sequence_metadata.SequenceMetadata('noncoding1_1\t.\t.\toriginal name "noncoding1 blah"') + meta3 = sequence_metadata.SequenceMetadata('pres_abs1_2\t.\t.\toriginal name "pres_abs1 foo bar spam eggs"') + meta4 = sequence_metadata.SequenceMetadata('pres_abs1_1\t.\t.\toriginal name "pres_abs1 blah"') + meta5 = sequence_metadata.SequenceMetadata('pres_abs1\t.\t.\toriginal name "pres\'abs1"') + meta6 = sequence_metadata.SequenceMetadata('pres_abs2\t.\t.\toriginal name "pres_abs2"') + meta7 = sequence_metadata.SequenceMetadata('pres_abs3\t.\t.\toriginal name "pres!abs3"') + meta8 = sequence_metadata.SequenceMetadata('var_only1_2\t.\t.\toriginal name "var_only1 hello"') + meta9 = sequence_metadata.SequenceMetadata('var_only1\t.\t.\toriginal name "var:only1 boo"') + meta10 = sequence_metadata.SequenceMetadata('var_only1_1\t.\t.\toriginal name "var_only1"') + meta11 = sequence_metadata.SequenceMetadata('var_only2\t.\t.\toriginal name "var_only2"') + + expected_meta = { + 'noncoding1': {'n': {}, 'p': {}, '.': {meta1}}, + 'noncoding1_1': {'n': {}, 'p': {}, '.': {meta2}}, + 'pres_abs1_2': {'n': {}, 'p': {}, '.': {meta3}}, + 'pres_abs1_1': {'n': {}, 'p': {}, '.': {meta4}}, + 'pres_abs1': {'n': {}, 'p': {}, '.': {meta5}}, + 'pres_abs2': {'n': {}, 'p': {}, '.': {meta6}}, + 'pres_abs3': {'n': {}, 'p': {}, '.': {meta7}}, + 'var_only1_2': {'n': {}, 'p': {}, '.': {meta8}}, + 'var_only1': {'n': {}, 'p': {}, '.': {meta9}}, + 'var_only1_1': {'n': {}, 'p': {}, '.': {meta10}}, + 'var_only2': {'n': {}, 'p': {}, '.': {meta11}}, + } + + self.assertEqual(expected_meta, refdata.metadata) + + expected_seqs_dict = { + 'non_coding': { + 'noncoding1': pyfastaq.sequences.Fasta('noncoding1', 'AAAA'), + 'noncoding1_1': pyfastaq.sequences.Fasta('noncoding1_1', 'CCCC'), + }, + 'presence_absence': { + 'pres_abs1_2': pyfastaq.sequences.Fasta('pres_abs1_2', 'ACGT'), + 'pres_abs1_1': pyfastaq.sequences.Fasta('pres_abs1_1', 'AAAA'), + 'pres_abs1': pyfastaq.sequences.Fasta('pres_abs1', 'CCCC'), + 'pres_abs2': pyfastaq.sequences.Fasta('pres_abs2', 'TTTT'), + 'pres_abs3': pyfastaq.sequences.Fasta('pres_abs3', 'GGGG'), + }, + 'variants_only': { + 'var_only1_2': pyfastaq.sequences.Fasta('var_only1_2', 'AAAA'), + 'var_only1': pyfastaq.sequences.Fasta('var_only1', 'CCCC'), + 'var_only1_1': pyfastaq.sequences.Fasta('var_only1_1', 'GGGG'), + 'var_only2': pyfastaq.sequences.Fasta('var_only2', 'TTTT'), + } + } + + self.assertEqual(expected_seqs_dict, refdata.seq_dicts) + + def test_make_catted_fasta(self): '''Test make_catted_fasta''' presence_absence_fa = os.path.join(data_dir, 'reference_data_make_catted_fasta.presence_absence.fa')