Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rename sequences #66

Merged
merged 8 commits into from
Apr 27, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions ariba/card_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ def _ARO_name_to_fasta_name(aro_name):

re_search = aro_regex.search(aro_name)
if re_search is not None:
return re_search.group(1)
return re_search.group(1).replace('.', '_')
else:
return '_'.join(aro_name.split()[:3])
return '_'.join(aro_name.split()[:3]).replace('.', '_')


@staticmethod
Expand Down
1 change: 1 addition & 0 deletions ariba/ref_preparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ def run(self, outdir):
print('\nLoading and checking input data', flush=True)

refdata_outprefix = os.path.join(outdir, 'refcheck')
self.refdata.rename_sequences(refdata_outprefix + '.rename_info')
self.refdata.sanity_check(refdata_outprefix)

if self.verbose:
Expand Down
99 changes: 98 additions & 1 deletion ariba/reference_data.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import os
import sys
import re
import copy
import pyfastaq
from ariba import sequence_metadata, cdhit


class Error (Exception): pass

rename_sub_regex = re.compile(r'[^\w.-]')


class ReferenceData:
def __init__(self,
Expand Down Expand Up @@ -105,7 +108,7 @@ def _load_fasta_file(filename):
if filename is not None:
seq_reader = pyfastaq.sequences.file_reader(filename)
for seq in seq_reader:
seq.id = seq.id.split()[0]
#seq.id = seq.id.split()[0]
if seq.id in d:
raise Error('Duplicate name "' + seq.id + '" found in file ' + filename + '. Cannot continue)')
d[seq.id] = copy.copy(seq)
Expand Down Expand Up @@ -294,6 +297,100 @@ def sanity_check(self, outprefix):
self._filter_bad_variant_data(outprefix + '.01.check_variants', variants_only_removed, presence_absence_removed)


@classmethod
def _new_seq_name(cls, name):
name = name.split()[0]
return re.sub(rename_sub_regex, '_', name)


@classmethod
def _seq_names_to_rename_dict(cls, names):
used_names = set()
old_name_to_new = {}

for old_name in sorted(names):
new_name = ReferenceData._new_seq_name(old_name)
if new_name in used_names:
i = 1
new_name_prefix = new_name
while new_name in used_names:
new_name = new_name_prefix + '_' + str(i)
i += 1

assert new_name not in used_names
if new_name != old_name:
old_name_to_new[old_name] = new_name

used_names.add(new_name)

return old_name_to_new


@classmethod
def _rename_names_in_seq_dicts(cls, seq_dicts, rename_dict):
'''Changes seq_dicts in place'''
for seq_type in ['presence_absence', 'variants_only', 'non_coding']:
new_dict = {}
while len(seq_dicts[seq_type]):
old_name, seq = seq_dicts[seq_type].popitem()
if old_name in rename_dict:
seq.id = rename_dict[old_name]

new_dict[seq.id] = seq
seq_dicts[seq_type] = new_dict


@classmethod
def _rename_metadata_set(cls, metadata_set, new_name):
new_set = set()
for meta in metadata_set:
new_meta = copy.copy(meta)
new_meta.name = new_name
new_set.add(new_meta)
return new_set


@classmethod
def _rename_names_in_metadata(cls, meta_dict, rename_dict):
new_dict = {}

while len(meta_dict):
old_name, gene_dict = meta_dict.popitem()
if old_name in rename_dict:
new_name = rename_dict[old_name]
for seq_type in ['n', 'p']:
for position, metaset in gene_dict[seq_type].items():
gene_dict[seq_type][position] = ReferenceData._rename_metadata_set(metaset, new_name)

gene_dict['.'] = ReferenceData._rename_metadata_set(gene_dict['.'], new_name)
else:
new_name = old_name

new_dict[new_name] = gene_dict

return new_dict


def rename_sequences(self, outfile):
presabs_names = set(self.seq_dicts['presence_absence'].keys())
noncoding_names = set(self.seq_dicts['non_coding'].keys())
varonly_names = set(self.seq_dicts['variants_only'].keys())
# we should have already checked that all the names are unique, but let's do it again!
all_names = presabs_names.union(noncoding_names).union(varonly_names)
if len(all_names) != len(presabs_names) + len(noncoding_names) + len(varonly_names):
raise Error('Got a non-unique name in input data. Cannot continue')

rename_dict = ReferenceData._seq_names_to_rename_dict(all_names)
if len(rename_dict):
print('Had to rename some sequences. See', outfile, 'for old -> new names', file=sys.stderr)
with open(outfile, 'w') as f:
for old_name, new_name in sorted(rename_dict.items()):
print(old_name, new_name, sep='\t', file=f)

ReferenceData._rename_names_in_seq_dicts(self.seq_dicts, rename_dict)
self.metadata = ReferenceData._rename_names_in_metadata(self.metadata, rename_dict)


def make_catted_fasta(self, outfile):
f = pyfastaq.utils.open_file_write(outfile)

Expand Down
5 changes: 2 additions & 3 deletions ariba/sequence_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,17 @@ def __init__(self, line):
else:
self.variant = sequence_variant.Variant(self.variant_type, variant_string)

self.hashed = hash((self.name, self.variant_type, variant_string))

def __eq__(self, other):
return type(other) is type(self) and self.__dict__ == other.__dict__
return type(other) is type(self) and self.name == other.name and self.variant == other.variant and self.variant_type == other.variant_type and self.free_text == other.free_text


def __lt__(self, other):
return self.name < other.name or (self.name == other.name and self.variant < other.variant)


def __hash__(self):
return self.hashed
return hash((self.name, self.variant_type, str(self.variant), self.free_text))


def __str__(self):
Expand Down
2 changes: 2 additions & 0 deletions ariba/tests/card_record_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ def test_ARO_name_to_fasta_name(self):
('abcD foo bar match at the start', 'abcD'),
('foo bar abcD match in the middle', 'abcD'),
('match at the end abcD', 'abcD'),
('use first three foo bar', 'use_first_three'),
('remove.any.dots first three', 'remove_any_dots_first_three')
]

for aro_name, expected in tests:
Expand Down
4 changes: 4 additions & 0 deletions ariba/tests/data/reference_data_rename_sequences.noncoding.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>noncoding1
AAAA
>noncoding1 blah
CCCC
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>pres_abs1 foo bar spam eggs
ACGT
>pres_abs1 blah
AAAA
>pres'abs1
CCCC
>pres_abs2
TTTT
>pres!abs3
GGGG
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>var_only1 hello
AAAA
>var:only1 boo
CCCC
>var_only1
GGGG
>var_only2
TTTT
11 changes: 11 additions & 0 deletions ariba/tests/data/reference_data_rename_sequences_metadata.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
noncoding1 . . original name "noncoding1"
noncoding1 blah . . original name "noncoding1 blah"
pres_abs1 foo bar spam eggs . . original name "pres_abs1 foo bar spam eggs"
pres_abs1 blah . . original name "pres_abs1 blah"
pres'abs1 . . original name "pres'abs1"
pres_abs2 . . original name "pres_abs2"
pres!abs3 . . original name "pres!abs3"
var_only1 hello . . original name "var_only1 hello"
var:only1 boo . . original name "var:only1 boo"
var_only1 . . original name "var_only1"
var_only2 . . original name "var_only2"
8 changes: 8 additions & 0 deletions ariba/tests/data/reference_data_test_rename_sequences.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
noncoding1 blah noncoding1_1
pres!abs3 pres_abs3
pres'abs1 pres_abs1
pres_abs1 blah pres_abs1_1
pres_abs1 foo bar spam eggs pres_abs1_2
var:only1 boo var_only1
var_only1 var_only1_1
var_only1 hello var_only1_2
Loading