Skip to content

Commit

Permalink
Merge pull request #66 from martinghunt/rename_sequences
Browse files Browse the repository at this point in the history
Rename sequences
  • Loading branch information
martinghunt committed Apr 27, 2016
2 parents 9ea4ef2 + 970e7b6 commit 3edf1cc
Show file tree
Hide file tree
Showing 11 changed files with 342 additions and 6 deletions.
4 changes: 2 additions & 2 deletions ariba/card_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ def _ARO_name_to_fasta_name(aro_name):

re_search = aro_regex.search(aro_name)
if re_search is not None:
return re_search.group(1)
return re_search.group(1).replace('.', '_')
else:
return '_'.join(aro_name.split()[:3])
return '_'.join(aro_name.split()[:3]).replace('.', '_')


@staticmethod
Expand Down
1 change: 1 addition & 0 deletions ariba/ref_preparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ def run(self, outdir):
print('\nLoading and checking input data', flush=True)

refdata_outprefix = os.path.join(outdir, 'refcheck')
self.refdata.rename_sequences(refdata_outprefix + '.rename_info')
self.refdata.sanity_check(refdata_outprefix)

if self.verbose:
Expand Down
99 changes: 98 additions & 1 deletion ariba/reference_data.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import os
import sys
import re
import copy
import pyfastaq
from ariba import sequence_metadata, cdhit


class Error (Exception): pass

rename_sub_regex = re.compile(r'[^\w.-]')


class ReferenceData:
def __init__(self,
Expand Down Expand Up @@ -105,7 +108,7 @@ def _load_fasta_file(filename):
if filename is not None:
seq_reader = pyfastaq.sequences.file_reader(filename)
for seq in seq_reader:
seq.id = seq.id.split()[0]
#seq.id = seq.id.split()[0]
if seq.id in d:
raise Error('Duplicate name "' + seq.id + '" found in file ' + filename + '. Cannot continue)')
d[seq.id] = copy.copy(seq)
Expand Down Expand Up @@ -294,6 +297,100 @@ def sanity_check(self, outprefix):
self._filter_bad_variant_data(outprefix + '.01.check_variants', variants_only_removed, presence_absence_removed)


@classmethod
def _new_seq_name(cls, name):
name = name.split()[0]
return re.sub(rename_sub_regex, '_', name)


@classmethod
def _seq_names_to_rename_dict(cls, names):
used_names = set()
old_name_to_new = {}

for old_name in sorted(names):
new_name = ReferenceData._new_seq_name(old_name)
if new_name in used_names:
i = 1
new_name_prefix = new_name
while new_name in used_names:
new_name = new_name_prefix + '_' + str(i)
i += 1

assert new_name not in used_names
if new_name != old_name:
old_name_to_new[old_name] = new_name

used_names.add(new_name)

return old_name_to_new


@classmethod
def _rename_names_in_seq_dicts(cls, seq_dicts, rename_dict):
'''Changes seq_dicts in place'''
for seq_type in ['presence_absence', 'variants_only', 'non_coding']:
new_dict = {}
while len(seq_dicts[seq_type]):
old_name, seq = seq_dicts[seq_type].popitem()
if old_name in rename_dict:
seq.id = rename_dict[old_name]

new_dict[seq.id] = seq
seq_dicts[seq_type] = new_dict


@classmethod
def _rename_metadata_set(cls, metadata_set, new_name):
new_set = set()
for meta in metadata_set:
new_meta = copy.copy(meta)
new_meta.name = new_name
new_set.add(new_meta)
return new_set


@classmethod
def _rename_names_in_metadata(cls, meta_dict, rename_dict):
new_dict = {}

while len(meta_dict):
old_name, gene_dict = meta_dict.popitem()
if old_name in rename_dict:
new_name = rename_dict[old_name]
for seq_type in ['n', 'p']:
for position, metaset in gene_dict[seq_type].items():
gene_dict[seq_type][position] = ReferenceData._rename_metadata_set(metaset, new_name)

gene_dict['.'] = ReferenceData._rename_metadata_set(gene_dict['.'], new_name)
else:
new_name = old_name

new_dict[new_name] = gene_dict

return new_dict


def rename_sequences(self, outfile):
presabs_names = set(self.seq_dicts['presence_absence'].keys())
noncoding_names = set(self.seq_dicts['non_coding'].keys())
varonly_names = set(self.seq_dicts['variants_only'].keys())
# we should have already checked that all the names are unique, but let's do it again!
all_names = presabs_names.union(noncoding_names).union(varonly_names)
if len(all_names) != len(presabs_names) + len(noncoding_names) + len(varonly_names):
raise Error('Got a non-unique name in input data. Cannot continue')

rename_dict = ReferenceData._seq_names_to_rename_dict(all_names)
if len(rename_dict):
print('Had to rename some sequences. See', outfile, 'for old -> new names', file=sys.stderr)
with open(outfile, 'w') as f:
for old_name, new_name in sorted(rename_dict.items()):
print(old_name, new_name, sep='\t', file=f)

ReferenceData._rename_names_in_seq_dicts(self.seq_dicts, rename_dict)
self.metadata = ReferenceData._rename_names_in_metadata(self.metadata, rename_dict)


def make_catted_fasta(self, outfile):
f = pyfastaq.utils.open_file_write(outfile)

Expand Down
5 changes: 2 additions & 3 deletions ariba/sequence_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,17 @@ def __init__(self, line):
else:
self.variant = sequence_variant.Variant(self.variant_type, variant_string)

self.hashed = hash((self.name, self.variant_type, variant_string))

def __eq__(self, other):
return type(other) is type(self) and self.__dict__ == other.__dict__
return type(other) is type(self) and self.name == other.name and self.variant == other.variant and self.variant_type == other.variant_type and self.free_text == other.free_text


def __lt__(self, other):
return self.name < other.name or (self.name == other.name and self.variant < other.variant)


def __hash__(self):
return self.hashed
return hash((self.name, self.variant_type, str(self.variant), self.free_text))


def __str__(self):
Expand Down
2 changes: 2 additions & 0 deletions ariba/tests/card_record_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ def test_ARO_name_to_fasta_name(self):
('abcD foo bar match at the start', 'abcD'),
('foo bar abcD match in the middle', 'abcD'),
('match at the end abcD', 'abcD'),
('use first three foo bar', 'use_first_three'),
('remove.any.dots first three', 'remove_any_dots_first_three')
]

for aro_name, expected in tests:
Expand Down
4 changes: 4 additions & 0 deletions ariba/tests/data/reference_data_rename_sequences.noncoding.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>noncoding1
AAAA
>noncoding1 blah
CCCC
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>pres_abs1 foo bar spam eggs
ACGT
>pres_abs1 blah
AAAA
>pres'abs1
CCCC
>pres_abs2
TTTT
>pres!abs3
GGGG
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>var_only1 hello
AAAA
>var:only1 boo
CCCC
>var_only1
GGGG
>var_only2
TTTT
11 changes: 11 additions & 0 deletions ariba/tests/data/reference_data_rename_sequences_metadata.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
noncoding1 . . original name "noncoding1"
noncoding1 blah . . original name "noncoding1 blah"
pres_abs1 foo bar spam eggs . . original name "pres_abs1 foo bar spam eggs"
pres_abs1 blah . . original name "pres_abs1 blah"
pres'abs1 . . original name "pres'abs1"
pres_abs2 . . original name "pres_abs2"
pres!abs3 . . original name "pres!abs3"
var_only1 hello . . original name "var_only1 hello"
var:only1 boo . . original name "var:only1 boo"
var_only1 . . original name "var_only1"
var_only2 . . original name "var_only2"
8 changes: 8 additions & 0 deletions ariba/tests/data/reference_data_test_rename_sequences.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
noncoding1 blah noncoding1_1
pres!abs3 pres_abs3
pres'abs1 pres_abs1
pres_abs1 blah pres_abs1_1
pres_abs1 foo bar spam eggs pres_abs1_2
var:only1 boo var_only1
var_only1 var_only1_1
var_only1 hello var_only1_2
Loading

0 comments on commit 3edf1cc

Please sign in to comment.