diff --git a/ariba/cdhit.py b/ariba/cdhit.py index 3c08a05d..32764041 100644 --- a/ariba/cdhit.py +++ b/ariba/cdhit.py @@ -30,6 +30,25 @@ def __init__( self.verbose = verbose + def fake_run(self): + '''Doesn't actually run cd-hit. Instead, puts each input sequence into its own cluster. So it's as if cdhit was run, but didn't cluster anything''' + cluster_to_name = {} + found_names = set() + seq_reader = pyfastaq.sequences.file_reader(self.infile) + f = pyfastaq.utils.open_file_write(self.outfile) + for seq in seq_reader: + if seq.id in found_names: + raise Error('Sequence name "' + seq.id + '" not unique. Cannot continue') + found_names.add(seq.id) + cluster_number = str(len(cluster_to_name)) + cluster_to_name[cluster_number] = {seq.id} + seq.id = cluster_number + print(seq, file=f) + + pyfastaq.utils.close(f) + return cluster_to_name + + def run(self): tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd()) cdhit_fasta = os.path.join(tmpdir, 'cdhit') @@ -51,7 +70,7 @@ def run(self): '-bak 1', ]) - common.syscall(cmd, verbose=self.verbose) + common.syscall(cmd, verbose=self.verbose) cluster_representatives = self._get_ids(cdhit_fasta) clusters, cluster_rep_to_cluster = self._parse_cluster_info_file(cluster_info_outfile, new_to_old_name, cluster_representatives) @@ -64,7 +83,7 @@ def _enumerate_fasta(self, infile, outfile): rename_file = outfile + '.tmp.rename_info' assert not os.path.exists(rename_file) pyfastaq.tasks.enumerate_names(infile, outfile, rename_file=rename_file) - + with open(rename_file) as f: lines = [x.rstrip().split('\t') for x in f.readlines() if x != '#old\tnew\n'] new_to_old_name = {x[1]: x[0] for x in lines} diff --git a/ariba/clusters.py b/ariba/clusters.py index e6784bf1..ba06009d 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -41,6 +41,7 @@ def __init__(self, velvet_exe='velvet', # prefix of velvet{g,h} cdhit_seq_identity_threshold=0.9, cdhit_length_diff_cutoff=0.9, + run_cd_hit=True, clean=1, ): self.db_fasta = os.path.abspath(db_fasta) @@ -111,6 +112,7 @@ def __init__(self, self.cdhit_seq_identity_threshold = cdhit_seq_identity_threshold self.cdhit_length_diff_cutoff = cdhit_length_diff_cutoff + self.run_cd_hit = run_cd_hit for d in [self.outdir, self.clusters_outdir]: try: @@ -121,13 +123,18 @@ def __init__(self, def _run_cdhit(self): r = cdhit.Runner( self.db_fasta, - self.db_fasta_clustered, + self.db_fasta_clustered, seq_identity_threshold=self.cdhit_seq_identity_threshold, threads=self.threads, length_diff_cutoff=self.cdhit_length_diff_cutoff, verbose=self.verbose, ) - self.cluster_ids = r.run() + if self.run_cd_hit: + self.cluster_ids = r.run() + else: + if self.verbose: + print('Skipping cd-hit because --no_cdhit option used') + self.cluster_ids = r.fake_run() def _write_clusters_info_file(self): @@ -338,7 +345,7 @@ def _write_reports(self): columns[0] = 'gene' workbook = openpyxl.Workbook() - worksheet = workbook.worksheets[0] + worksheet = workbook.worksheets[0] worksheet.title = 'ARIBA_report' worksheet.append(columns) @@ -383,7 +390,7 @@ def run(self): if self.verbose: print('{:_^79}'.format(' Running cd-hit '), flush=True) - self._run_cdhit() + self._run_cdhit() self._write_clusters_info_file() if self.verbose: print('Finished cd-hit\n') diff --git a/ariba/tasks/run.py b/ariba/tasks/run.py index 21b5a09a..c52ccba6 100644 --- a/ariba/tasks/run.py +++ b/ariba/tasks/run.py @@ -13,6 +13,7 @@ def run(): parser.add_argument('outdir', help='Output directory (must not already exist)') cdhit_group = parser.add_argument_group('cd-hit options') + cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit') cdhit_group.add_argument('--cdhit_seq_identity_threshold', type=float, help='Sequence identity threshold (cd-hit option -c) [%(default)s]', default=0.9, metavar='FLOAT') cdhit_group.add_argument('--cdhit_length_diff_cutoff', type=float, help='length difference cutoff (cd-hit option -s) [%(default)s]', default=0.9, metavar='FLOAT') @@ -84,6 +85,7 @@ def run(): cdhit_seq_identity_threshold=options.cdhit_seq_identity_threshold, cdhit_length_diff_cutoff=options.cdhit_length_diff_cutoff, clean=options.clean, + run_cd_hit=(not options.no_cdhit) ) c.run() diff --git a/ariba/tests/cdhit_test.py b/ariba/tests/cdhit_test.py index 7cc4fe16..dcb1aec7 100644 --- a/ariba/tests/cdhit_test.py +++ b/ariba/tests/cdhit_test.py @@ -78,3 +78,32 @@ def test_run(self): self.assertEqual(clusters, expected_clusters) self.assertTrue(filecmp.cmp(tmpfile, expected_outfile, shallow=False)) os.unlink(tmpfile) + + + def test_fake_run(self): + '''test fake_run''' + infile = os.path.join(data_dir, 'cdhit_test_fake_run.in.fa') + expected_outfile = os.path.join(data_dir, 'cdhit_test_fake_run.out.fa') + tmpfile = 'tmp.cdhit_test_fake_run.out.fa' + r = cdhit.Runner(infile, tmpfile) + clusters = r.fake_run() + expected_clusters = { + '0': {'seq1'}, + '1': {'seq2'}, + '2': {'seq3'}, + '3': {'seq4'}, + } + self.assertEqual(clusters, expected_clusters) + self.assertTrue(filecmp.cmp(tmpfile, expected_outfile, shallow=False)) + os.unlink(tmpfile) + + + def test_fake_run_fail(self): + '''test fake_run with non-unique names''' + infile = os.path.join(data_dir, 'cdhit_test_fake_run.non-unique.in.fa') + tmpfile = 'tmp.cdhit_test_fake_run.out.non-unique.fa' + r = cdhit.Runner(infile, tmpfile) + with self.assertRaises(cdhit.Error): + clusters = r.fake_run() + os.unlink(tmpfile) + diff --git a/ariba/tests/data/cdhit_test_fake_run.in.fa b/ariba/tests/data/cdhit_test_fake_run.in.fa new file mode 100644 index 00000000..bf8b12c8 --- /dev/null +++ b/ariba/tests/data/cdhit_test_fake_run.in.fa @@ -0,0 +1,40 @@ +>seq1 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA +GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC +ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA +AACTCTATCGTAGGGTCGCA +>seq2 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA +GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC +ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA +AACTCTATGTAGGGTCGCA +>seq3 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGTTCTATGCAGGCTTGTGAA +GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAGGCCATACACTTAGCTCA +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGATAGCC +ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGATGCGAATTGAAACAA +AACTCTATGTAGGGTCGCA +>seq4 +CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC +TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT +TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC +TTCCAACCACGTCAAGGCAACCTTGGTTTAGCACAGGGCTCCCAGTGGGTGTAAGGGATG +AACACTACCCGGCCCACCGTCGATTTAGCCCTAAATGGTCTATTGCTCACGGGTAGCACA +CAAGTAATAAAAACGTATTCAGCTCGAGTCAGCGTCCAGCCATTTTACTTTGCGTCATCG +AGGGGTAGTGCCTCCGAGAATCAAGGTTTGATTATACTAAACGGAGGGGCCTACCACTCA +GCCAGTCTTTGCATCGTCCATTCCCGCCGTTTATGGGTCACTATTCATTCGGAATTTGGA +TGCGGTCAACAAGTCCAGGT diff --git a/ariba/tests/data/cdhit_test_fake_run.non-unique.in.fa b/ariba/tests/data/cdhit_test_fake_run.non-unique.in.fa new file mode 100644 index 00000000..462852a3 --- /dev/null +++ b/ariba/tests/data/cdhit_test_fake_run.non-unique.in.fa @@ -0,0 +1,40 @@ +>seq1 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA +GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC +ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA +AACTCTATCGTAGGGTCGCA +>seq2 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA +GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC +ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA +AACTCTATGTAGGGTCGCA +>seq1 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGTTCTATGCAGGCTTGTGAA +GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAGGCCATACACTTAGCTCA +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGATAGCC +ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGATGCGAATTGAAACAA +AACTCTATGTAGGGTCGCA +>seq4 +CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC +TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT +TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC +TTCCAACCACGTCAAGGCAACCTTGGTTTAGCACAGGGCTCCCAGTGGGTGTAAGGGATG +AACACTACCCGGCCCACCGTCGATTTAGCCCTAAATGGTCTATTGCTCACGGGTAGCACA +CAAGTAATAAAAACGTATTCAGCTCGAGTCAGCGTCCAGCCATTTTACTTTGCGTCATCG +AGGGGTAGTGCCTCCGAGAATCAAGGTTTGATTATACTAAACGGAGGGGCCTACCACTCA +GCCAGTCTTTGCATCGTCCATTCCCGCCGTTTATGGGTCACTATTCATTCGGAATTTGGA +TGCGGTCAACAAGTCCAGGT diff --git a/ariba/tests/data/cdhit_test_fake_run.out.fa b/ariba/tests/data/cdhit_test_fake_run.out.fa new file mode 100644 index 00000000..2a8bbc9e --- /dev/null +++ b/ariba/tests/data/cdhit_test_fake_run.out.fa @@ -0,0 +1,40 @@ +>0 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA +GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC +ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA +AACTCTATCGTAGGGTCGCA +>1 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA +GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC +ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA +AACTCTATGTAGGGTCGCA +>2 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGTTCTATGCAGGCTTGTGAA +GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAGGCCATACACTTAGCTCA +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGATAGCC +ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGATGCGAATTGAAACAA +AACTCTATGTAGGGTCGCA +>3 +CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC +TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT +TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC +TTCCAACCACGTCAAGGCAACCTTGGTTTAGCACAGGGCTCCCAGTGGGTGTAAGGGATG +AACACTACCCGGCCCACCGTCGATTTAGCCCTAAATGGTCTATTGCTCACGGGTAGCACA +CAAGTAATAAAAACGTATTCAGCTCGAGTCAGCGTCCAGCCATTTTACTTTGCGTCATCG +AGGGGTAGTGCCTCCGAGAATCAAGGTTTGATTATACTAAACGGAGGGGCCTACCACTCA +GCCAGTCTTTGCATCGTCCATTCCCGCCGTTTATGGGTCACTATTCATTCGGAATTTGGA +TGCGGTCAACAAGTCCAGGT