Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Contig renaming #130

Merged
merged 9 commits into from
Aug 22, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions ariba/cdhit.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import tempfile
import shutil
import sys
import os
import pyfastaq
from ariba import common, external_progs
Expand Down Expand Up @@ -47,14 +48,26 @@ def fake_run(self):


@staticmethod
def _load_user_clusters_file(filename):
def _load_user_clusters_file(filename, all_ref_seqs, rename_dict=None):
if rename_dict is None:
rename_dict = {}

f = pyfastaq.utils.open_file_read(filename)
clusters = {}
used_names = set()

for line in f:
names_list = line.rstrip().split()
new_names = set(names_list)
to_remove = set()

for name in names_list:
new_name = rename_dict.get(name, name)
if new_name not in all_ref_seqs:
to_remove.add(name)
print('WARNING: ignoring sequence', name, 'from clusters file because not in fasta file. This probably means it failed sanity checks - see the log files 01.filter.check_genes.log, 01.filter.check_metadata.log.', file=sys.stderr)

names_list = [x for x in names_list if x not in to_remove]
new_names = set([rename_dict.get(name, name) for name in names_list])
if len(names_list) != len(new_names) or not new_names.isdisjoint(used_names):
pyfastaq.utils.close(f)
raise Error('Error in user-provided clusters file ' + filename + '. Non unique name found at this line:\n' + line)
Expand All @@ -66,16 +79,19 @@ def _load_user_clusters_file(filename):
return clusters


def run_get_clusters_from_file(self, clusters_infile):
def run_get_clusters_from_file(self, clusters_infile, all_ref_seqs, rename_dict=None):
'''Instead of running cdhit, gets the clusters info from the input file.'''
clusters = self._load_user_clusters_file(clusters_infile)
if rename_dict is None:
rename_dict = {}

# check that every sequence in the clusters file can be
# found in the fasta file
seq_reader = pyfastaq.sequences.file_reader(self.infile)
names_list_from_fasta_file = [seq.id for seq in seq_reader]
names_set_from_fasta_file = set(names_list_from_fasta_file)

clusters = self._load_user_clusters_file(clusters_infile, all_ref_seqs, rename_dict=rename_dict)

if len(names_set_from_fasta_file) != len(names_list_from_fasta_file):
raise Error('At least one duplicate name in fasta file ' + self.infile + '. Cannot continue')

Expand Down
15 changes: 8 additions & 7 deletions ariba/reference_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

class Error (Exception): pass

rename_sub_regex = re.compile(r'''[':!@,-]''')
rename_sub_regex = re.compile(r'''[|()\[\];"':!@,-]''')


class ReferenceData:
Expand All @@ -30,6 +30,7 @@ def __init__(self,

self.genetic_code = genetic_code
pyfastaq.sequences.genetic_code = self.genetic_code
self.rename_dict = None


@classmethod
Expand Down Expand Up @@ -353,15 +354,15 @@ def _rename_names_in_metadata(cls, meta_dict, rename_dict):


def rename_sequences(self, outfile):
rename_dict = ReferenceData._seq_names_to_rename_dict(self.sequences.keys())
if len(rename_dict):
self.rename_dict = ReferenceData._seq_names_to_rename_dict(self.sequences.keys())
if len(self.rename_dict):
print('Had to rename some sequences. See', outfile, 'for old -> new names', file=sys.stderr)
with open(outfile, 'w') as f:
for old_name, new_name in sorted(rename_dict.items()):
for old_name, new_name in sorted(self.rename_dict.items()):
print(old_name, new_name, sep='\t', file=f)

self.sequences = ReferenceData._rename_names_in_seq_dict(self.sequences, rename_dict)
self.metadata = ReferenceData._rename_names_in_metadata(self.metadata, rename_dict)
self.sequences = ReferenceData._rename_names_in_seq_dict(self.sequences, self.rename_dict)
self.metadata = ReferenceData._rename_names_in_metadata(self.metadata, self.rename_dict)


def sequence_type(self, sequence_name):
Expand Down Expand Up @@ -427,7 +428,7 @@ def cluster_with_cdhit(self, outprefix, seq_identity_threshold=0.9, threads=1, l
)

if clusters_file is not None:
new_clusters = cdhit_runner.run_get_clusters_from_file(clusters_file)
new_clusters = cdhit_runner.run_get_clusters_from_file(clusters_file, self.sequences, rename_dict=self.rename_dict)
elif nocluster:
new_clusters = cdhit_runner.fake_run()
else:
Expand Down
39 changes: 36 additions & 3 deletions ariba/tests/cdhit_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,26 @@ def test_load_user_clusters_file_good_file(self):
'2': {'seq5', 'seq6'}
}

got = cdhit.Runner._load_user_clusters_file(infile)
got = cdhit.Runner._load_user_clusters_file(infile, {'seq' + str(i) for i in range(1,7,1)})
self.assertEqual(expected, got)

expected['2'] = {'seq5'}
got = cdhit.Runner._load_user_clusters_file(infile, {'seq' + str(i) for i in range(1,6,1)})
self.assertEqual(expected, got)


def test_load_user_clusters_file_good_file_with_renaming(self):
'''test _load_user_clusters_file with good input file with some renamed'''
rename_dict = {'seq2': 'seq2_renamed', 'seq6': 'seq6_renamed'}
infile = os.path.join(data_dir, 'cdhit_test_load_user_clusters_file.good')
expected = {
'0': {'seq1', 'seq2_renamed', 'seq3'},
'1': {'seq4'},
'2': {'seq5', 'seq6_renamed'}
}

names = {'seq1', 'seq2_renamed', 'seq3', 'seq4', 'seq5', 'seq6_renamed'}
got = cdhit.Runner._load_user_clusters_file(infile, names, rename_dict=rename_dict)
self.assertEqual(expected, got)


Expand All @@ -115,17 +134,31 @@ def test_load_user_clusters_file_bad_file(self):
]
for filename in infiles:
with self.assertRaises(cdhit.Error):
cdhit.Runner._load_user_clusters_file(filename)
cdhit.Runner._load_user_clusters_file(filename, {'seq1', 'seq2', 'seq3'})


def test_run_get_clusters_from_file(self):
'''test run_get_clusters_from_file'''
fa_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict.in.fa')
clusters_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict.in.clusters')
r = cdhit.Runner(fa_infile)
clusters = r.run_get_clusters_from_file(clusters_infile)
clusters = r.run_get_clusters_from_file(clusters_infile, {'seq1', 'seq2', 'seq3'})
expected_clusters = {
'0': {'seq1', 'seq2'},
'1': {'seq3'},
}
self.assertEqual(clusters, expected_clusters)


def test_run_get_clusters_from_file_with_renaming(self):
'''test run_get_clusters_from_file with renaming'''
rename_dict = {'seq2': 'seq2_renamed'}
fa_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict_rename.in.fa')
clusters_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict.in.clusters')
r = cdhit.Runner(fa_infile)
clusters = r.run_get_clusters_from_file(clusters_infile, {'seq1', 'seq2_renamed', 'seq3'}, rename_dict=rename_dict)
expected_clusters = {
'0': {'seq1', 'seq2_renamed'},
'1': {'seq3'},
}
self.assertEqual(clusters, expected_clusters)
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>seq1
ACGT
>seq2_renamed
AAAA
>seq3
CCCC
34 changes: 33 additions & 1 deletion ariba/tests/reference_data_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,12 +278,33 @@ def test_new_seq_name(self):

def test_seq_names_to_rename_dict(self):
'''Test _seq_names_to_rename_dict'''
names = {'foo', 'bar!', 'bar:', 'bar,', 'spam', 'eggs,123'}
names = {
'foo',
'bar!',
'bar:',
'bar,',
'spam',
'eggs,123',
'ab(c1',
'ab(c)2',
'ab[c]3',
'abc;4',
"abc'5",
'abc"6',
'abc|7',
}
got = reference_data.ReferenceData._seq_names_to_rename_dict(names)
expected = {
'bar!': 'bar_',
'bar,': 'bar__1',
'bar:': 'bar__2',
'ab(c1': 'ab_c1',
'ab(c)2': 'ab_c_2',
'ab[c]3': 'ab_c_3',
'abc;4': 'abc_4',
"abc'5": 'abc_5',
'abc"6': 'abc_6',
'abc|7': 'abc_7',
'eggs,123': 'eggs_123'
}

Expand Down Expand Up @@ -423,6 +444,17 @@ def test_rename_sequences(self):

self.assertEqual(expected_seqs_dict, refdata.sequences)

expected_rename_dict = {
'pres!abs3': 'pres_abs3',
'pres\'abs1': 'pres_abs1',
'pres_abs1': 'pres_abs1_1',
'var,only1': 'var_only1',
'var:only1': 'var_only1_1',
'var_only1': 'var_only1_2',
}

self.assertEqual(expected_rename_dict, refdata.rename_dict)


def test_sequence_type(self):
'''Test sequence_type'''
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
setup(
ext_modules=[minimap_mod, fermilite_mod],
name='ariba',
version='2.2.1',
version='2.2.2',
description='ARIBA: Antibiotic Resistance Identification By Assembly',
packages = find_packages(),
package_data={'ariba': ['test_run_data/*']},
Expand Down