Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make cdhit optional #29

Merged
merged 2 commits into from
Jun 10, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions ariba/cdhit.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,25 @@ def __init__(
self.verbose = verbose


def fake_run(self):
    '''Pretend to run cd-hit by putting each input sequence into its own cluster.

    Reads every sequence from self.infile, renames it to its zero-based
    position in the file (as a string) and writes it to self.outfile. So it
    is as if cd-hit was run, but did not cluster anything.

    Returns a dict mapping each cluster number (str) to a one-element set
    holding the original name of the sequence in that cluster.

    Raises Error if a sequence name occurs more than once in the input.
    Sequences seen before the duplicate have already been written, so the
    output file is left behind partially written in that case.
    '''
    cluster_to_name = {}
    found_names = set()
    seq_reader = pyfastaq.sequences.file_reader(self.infile)
    f = pyfastaq.utils.open_file_write(self.outfile)
    try:
        for seq in seq_reader:
            if seq.id in found_names:
                raise Error('Sequence name "' + seq.id + '" not unique. Cannot continue')
            found_names.add(seq.id)
            # One cluster is added per sequence, so the current size of the
            # dict is the next unused cluster number.
            cluster_number = str(len(cluster_to_name))
            cluster_to_name[cluster_number] = {seq.id}
            seq.id = cluster_number
            print(seq, file=f)
    finally:
        # Always release the output handle, including when a duplicate
        # name aborts the loop part-way through.
        pyfastaq.utils.close(f)

    return cluster_to_name


def run(self):
tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
cdhit_fasta = os.path.join(tmpdir, 'cdhit')
Expand All @@ -51,7 +70,7 @@ def run(self):
'-bak 1',
])

common.syscall(cmd, verbose=self.verbose)
common.syscall(cmd, verbose=self.verbose)

cluster_representatives = self._get_ids(cdhit_fasta)
clusters, cluster_rep_to_cluster = self._parse_cluster_info_file(cluster_info_outfile, new_to_old_name, cluster_representatives)
Expand All @@ -64,7 +83,7 @@ def _enumerate_fasta(self, infile, outfile):
rename_file = outfile + '.tmp.rename_info'
assert not os.path.exists(rename_file)
pyfastaq.tasks.enumerate_names(infile, outfile, rename_file=rename_file)

with open(rename_file) as f:
lines = [x.rstrip().split('\t') for x in f.readlines() if x != '#old\tnew\n']
new_to_old_name = {x[1]: x[0] for x in lines}
Expand Down
15 changes: 11 additions & 4 deletions ariba/clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(self,
velvet_exe='velvet', # prefix of velvet{g,h}
cdhit_seq_identity_threshold=0.9,
cdhit_length_diff_cutoff=0.9,
run_cd_hit=True,
clean=1,
):
self.db_fasta = os.path.abspath(db_fasta)
Expand Down Expand Up @@ -111,6 +112,7 @@ def __init__(self,

self.cdhit_seq_identity_threshold = cdhit_seq_identity_threshold
self.cdhit_length_diff_cutoff = cdhit_length_diff_cutoff
self.run_cd_hit = run_cd_hit

for d in [self.outdir, self.clusters_outdir]:
try:
Expand All @@ -121,13 +123,18 @@ def __init__(self,
def _run_cdhit(self):
    '''Cluster the reference sequences and store the result in self.cluster_ids.

    Builds a cdhit.Runner that reads self.db_fasta and writes
    self.db_fasta_clustered. When self.run_cd_hit is true the runner's
    run() method is used; otherwise fake_run() is used instead and, in
    verbose mode, a message saying cd-hit was skipped is printed.
    '''
    r = cdhit.Runner(
        self.db_fasta,
        self.db_fasta_clustered,
        seq_identity_threshold=self.cdhit_seq_identity_threshold,
        threads=self.threads,
        length_diff_cutoff=self.cdhit_length_diff_cutoff,
        verbose=self.verbose,
    )

    # Exactly one of run()/fake_run() is called, so the run_cd_hit flag
    # really does decide whether cd-hit runs.
    if self.run_cd_hit:
        self.cluster_ids = r.run()
    else:
        if self.verbose:
            print('Skipping cd-hit because --no_cdhit option used')
        self.cluster_ids = r.fake_run()


def _write_clusters_info_file(self):
Expand Down Expand Up @@ -338,7 +345,7 @@ def _write_reports(self):

columns[0] = 'gene'
workbook = openpyxl.Workbook()
worksheet = workbook.worksheets[0]
worksheet = workbook.worksheets[0]
worksheet.title = 'ARIBA_report'
worksheet.append(columns)

Expand Down Expand Up @@ -383,7 +390,7 @@ def run(self):

if self.verbose:
print('{:_^79}'.format(' Running cd-hit '), flush=True)
self._run_cdhit()
self._run_cdhit()
self._write_clusters_info_file()
if self.verbose:
print('Finished cd-hit\n')
Expand Down
2 changes: 2 additions & 0 deletions ariba/tasks/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def run():
parser.add_argument('outdir', help='Output directory (must not already exist)')

cdhit_group = parser.add_argument_group('cd-hit options')
cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit')
cdhit_group.add_argument('--cdhit_seq_identity_threshold', type=float, help='Sequence identity threshold (cd-hit option -c) [%(default)s]', default=0.9, metavar='FLOAT')
cdhit_group.add_argument('--cdhit_length_diff_cutoff', type=float, help='length difference cutoff (cd-hit option -s) [%(default)s]', default=0.9, metavar='FLOAT')

Expand Down Expand Up @@ -84,6 +85,7 @@ def run():
cdhit_seq_identity_threshold=options.cdhit_seq_identity_threshold,
cdhit_length_diff_cutoff=options.cdhit_length_diff_cutoff,
clean=options.clean,
run_cd_hit=(not options.no_cdhit)
)
c.run()

29 changes: 29 additions & 0 deletions ariba/tests/cdhit_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,32 @@ def test_run(self):
self.assertEqual(clusters, expected_clusters)
self.assertTrue(filecmp.cmp(tmpfile, expected_outfile, shallow=False))
os.unlink(tmpfile)


def test_fake_run(self):
    '''fake_run puts every input sequence into its own numbered cluster'''
    fasta_in = os.path.join(data_dir, 'cdhit_test_fake_run.in.fa')
    fasta_expected = os.path.join(data_dir, 'cdhit_test_fake_run.out.fa')
    fasta_out = 'tmp.cdhit_test_fake_run.out.fa'

    runner = cdhit.Runner(fasta_in, fasta_out)
    got_clusters = runner.fake_run()

    # Cluster numbers are zero-based and follow the input order.
    want_clusters = {'0': {'seq1'}, '1': {'seq2'}, '2': {'seq3'}, '3': {'seq4'}}
    self.assertEqual(got_clusters, want_clusters)

    # The renamed sequences on disk must match the fixture byte for byte.
    files_match = filecmp.cmp(fasta_out, fasta_expected, shallow=False)
    self.assertTrue(files_match)
    os.unlink(fasta_out)


def test_fake_run_fail(self):
    '''fake_run must raise cdhit.Error when the input has non-unique names'''
    infile = os.path.join(data_dir, 'cdhit_test_fake_run.non-unique.in.fa')
    tmpfile = 'tmp.cdhit_test_fake_run.out.non-unique.fa'
    r = cdhit.Runner(infile, tmpfile)
    with self.assertRaises(cdhit.Error):
        # Only the exception matters here, so the return value is not kept.
        r.fake_run()
    # Remove the output file, which this test expects to exist even though
    # fake_run raised.
    os.unlink(tmpfile)

40 changes: 40 additions & 0 deletions ariba/tests/data/cdhit_test_fake_run.in.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
>seq1
TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC
ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
AACTCTATCGTAGGGTCGCA
>seq2
TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC
ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
AACTCTATGTAGGGTCGCA
>seq3
TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGTTCTATGCAGGCTTGTGAA
GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAGGCCATACACTTAGCTCA
TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGATAGCC
ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGATGCGAATTGAAACAA
AACTCTATGTAGGGTCGCA
>seq4
CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC
TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT
TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC
TTCCAACCACGTCAAGGCAACCTTGGTTTAGCACAGGGCTCCCAGTGGGTGTAAGGGATG
AACACTACCCGGCCCACCGTCGATTTAGCCCTAAATGGTCTATTGCTCACGGGTAGCACA
CAAGTAATAAAAACGTATTCAGCTCGAGTCAGCGTCCAGCCATTTTACTTTGCGTCATCG
AGGGGTAGTGCCTCCGAGAATCAAGGTTTGATTATACTAAACGGAGGGGCCTACCACTCA
GCCAGTCTTTGCATCGTCCATTCCCGCCGTTTATGGGTCACTATTCATTCGGAATTTGGA
TGCGGTCAACAAGTCCAGGT
40 changes: 40 additions & 0 deletions ariba/tests/data/cdhit_test_fake_run.non-unique.in.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
>seq1
TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC
ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
AACTCTATCGTAGGGTCGCA
>seq2
TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC
ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
AACTCTATGTAGGGTCGCA
>seq1
TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGTTCTATGCAGGCTTGTGAA
GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAGGCCATACACTTAGCTCA
TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGATAGCC
ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGATGCGAATTGAAACAA
AACTCTATGTAGGGTCGCA
>seq4
CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC
TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT
TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC
TTCCAACCACGTCAAGGCAACCTTGGTTTAGCACAGGGCTCCCAGTGGGTGTAAGGGATG
AACACTACCCGGCCCACCGTCGATTTAGCCCTAAATGGTCTATTGCTCACGGGTAGCACA
CAAGTAATAAAAACGTATTCAGCTCGAGTCAGCGTCCAGCCATTTTACTTTGCGTCATCG
AGGGGTAGTGCCTCCGAGAATCAAGGTTTGATTATACTAAACGGAGGGGCCTACCACTCA
GCCAGTCTTTGCATCGTCCATTCCCGCCGTTTATGGGTCACTATTCATTCGGAATTTGGA
TGCGGTCAACAAGTCCAGGT
40 changes: 40 additions & 0 deletions ariba/tests/data/cdhit_test_fake_run.out.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
>0
TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC
ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
AACTCTATCGTAGGGTCGCA
>1
TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC
ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
AACTCTATGTAGGGTCGCA
>2
TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGTTCTATGCAGGCTTGTGAA
GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAGGCCATACACTTAGCTCA
TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGATAGCC
ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGATGCGAATTGAAACAA
AACTCTATGTAGGGTCGCA
>3
CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC
TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT
TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC
TTCCAACCACGTCAAGGCAACCTTGGTTTAGCACAGGGCTCCCAGTGGGTGTAAGGGATG
AACACTACCCGGCCCACCGTCGATTTAGCCCTAAATGGTCTATTGCTCACGGGTAGCACA
CAAGTAATAAAAACGTATTCAGCTCGAGTCAGCGTCCAGCCATTTTACTTTGCGTCATCG
AGGGGTAGTGCCTCCGAGAATCAAGGTTTGATTATACTAAACGGAGGGGCCTACCACTCA
GCCAGTCTTTGCATCGTCCATTCCCGCCGTTTATGGGTCACTATTCATTCGGAATTTGGA
TGCGGTCAACAAGTCCAGGT