From a1f20970c328afff6e284429582ca507313b2801 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 19 Feb 2015 08:21:07 +0000 Subject: [PATCH 01/18] Add function get_total_alignemtn_score --- ariba/mapping.py | 14 ++++++++++++++ ariba/tests/mapping_test.py | 8 ++++++++ 2 files changed, 22 insertions(+) diff --git a/ariba/mapping.py b/ariba/mapping.py index 348c4b61..5131c332 100644 --- a/ariba/mapping.py +++ b/ariba/mapping.py @@ -1,4 +1,5 @@ import os +import pysam from ariba import common class Error (Exception): pass @@ -74,3 +75,16 @@ def run_smalt( common.syscall(index_cmd, verbose=verbose) for fname in clean_files: os.unlink(fname) + + +def get_total_alignment_score(bam): + '''Returns total of AS: tags in the input BAM''' + sam_reader = pysam.Samfile(bam, "rb") + total = 0 + for sam in sam_reader.fetch(until_eof=True): + try: + total += sam.opt('AS') + except: + pass + return total + diff --git a/ariba/tests/mapping_test.py b/ariba/tests/mapping_test.py index 60245d37..5c5f29b2 100644 --- a/ariba/tests/mapping_test.py +++ b/ariba/tests/mapping_test.py @@ -60,3 +60,11 @@ def test_run_smalt_and_sort(self): os.unlink(out_prefix + '.bam.bai') os.unlink(out_prefix + '.unsorted.bam') + + def test_get_total_alignment_score(self): + '''Test get_total_alignment_score''' + bam = os.path.join(data_dir, 'mapping_test_get_total_alignment_score.bam') + expected = 219 + got = mapping.get_total_alignment_score(bam) + self.assertEqual(got, expected) + From ec0b53e752e37898eabd2ea48d019fe9b464ae1b Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 19 Feb 2015 10:59:08 +0000 Subject: [PATCH 02/18] Add test file needed for previous commit --- .../mapping_test_get_total_alignment_score.bam | Bin 0 -> 648 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 ariba/tests/data/mapping_test_get_total_alignment_score.bam diff --git a/ariba/tests/data/mapping_test_get_total_alignment_score.bam b/ariba/tests/data/mapping_test_get_total_alignment_score.bam new file mode 100644 index 0000000000000000000000000000000000000000..e50c639a909d3532e45b91a43c5ff16d0e6bce19 GIT binary patch literal 648 zcmb2|=3rp}f&Xj_PR>jW&l$E(xZ8EeK%iB-VzQlU+yc*48#G-UwG1&gn0nQCv8!`*4Pp*QBD|Y4M>GyERwacfa4qzwNV{Q`9x} zuyEIgwzrz|S8uV&yO9^WzSiR4jK_ z;a9ZClvn4XcND*ro?_n8pnu74SFzp1j2do5X&;;Kx)oCxJsu?eS7dUVa?M3~W$ufb z)$!B#5_De0m|QO}^MSvp7jh5MSN~~Gcqv9qxrL$aqbmI@52Tn>>sxO zV$I>W5!4$G8DrqFM=PPxn+aKWb?!3^gz_BPkLH_XiWuDVhQ|G45 z`1DJoHD3OI+nI#Ti#{86KH$6ct}^r0=_?FI#r0W?OIVYQCQ1n((*KeFy2yyB(AH{K zq2KYFx08Fx85G9>0C!$0vCyFjA^;^?Urm+xp8sktBY4WpDwFZOZM^Zn`y_l zYx_Bi;&%t9{SK&|yh|rX-PGP<`}yRo+6~41(v@FU@8s^3@zm+$tp0EF%&vdg?bR_; z69R)yhR(Whc<z3BjJM!2!dtCcg z)Serj_oi{Xci4t4OKi@4E|5DJs(YDZ0>ho^-H)rpGfo9NwYsegDLC(Vx#rumJ748e zwp@@5-?>Pmx%gkEU+jD5Wji)ZfBCcijAwHHTqftUCXUM_H$B*YD{Sek{kH dUE!Q(`GNV>g6XOkfT;{U4x|~F!ATE9008?iDnI}L literal 0 HcmV?d00001 From 268139578c1242f3bb8d81989597d50484344b61 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 19 Feb 2015 11:00:06 +0000 Subject: [PATCH 03/18] Add cdhit module --- ariba/__init__.py | 1 + ariba/cdhit.py | 121 ++++++++++++++++++ ariba/tests/cdhit_test.py | 80 ++++++++++++ .../data/cdhit_test_enumerate_fasta.in.fa | 6 + .../data/cdhit_test_enumerate_fasta.out.fa | 6 + ariba/tests/data/cdhit_test_get_ids.fa | 6 + .../cdhit_test_parse_cluster_info_file.in.fa | 40 ++++++ ...test_parse_cluster_info_file.in.renamed.fa | 40 ++++++ .../cdhit_test_parse_cluster_info_file.out.fa | 20 +++ ...t_parse_cluster_info_file.out.fa.bak.clstr | 4 + .../tests/data/cdhit_test_rename_fasta.in.fa | 6 + .../tests/data/cdhit_test_rename_fasta.out.fa | 6 + ariba/tests/data/cdhit_test_run.in.fa | 40 ++++++ ariba/tests/data/cdhit_test_run.out.fa | 20 +++ 14 files changed, 396 insertions(+) create mode 100644 ariba/cdhit.py create mode 100644 ariba/tests/cdhit_test.py create mode 100644 ariba/tests/data/cdhit_test_enumerate_fasta.in.fa create mode 100644 ariba/tests/data/cdhit_test_enumerate_fasta.out.fa create mode 100644 ariba/tests/data/cdhit_test_get_ids.fa create mode 100644 ariba/tests/data/cdhit_test_parse_cluster_info_file.in.fa create mode 100644 ariba/tests/data/cdhit_test_parse_cluster_info_file.in.renamed.fa create mode 100644 ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa create mode 100644 ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa.bak.clstr create mode 100644 ariba/tests/data/cdhit_test_rename_fasta.in.fa create mode 100644 ariba/tests/data/cdhit_test_rename_fasta.out.fa create mode 100644 ariba/tests/data/cdhit_test_run.in.fa create mode 100644 ariba/tests/data/cdhit_test_run.out.fa diff --git a/ariba/__init__.py b/ariba/__init__.py index 2c86f770..f8aee982 100644 --- a/ariba/__init__.py +++ b/ariba/__init__.py @@ -1,5 +1,6 @@ __all__ = [ 'bam_parse', + 'cdhit', 'cluster', 'clusters', 'common', diff --git a/ariba/cdhit.py b/ariba/cdhit.py new file mode 100644 index 00000000..f4bfa847 --- /dev/null +++ b/ariba/cdhit.py @@ -0,0 +1,121 @@ +import tempfile +import shutil +import os +import pyfastaq +from ariba import common + +class Error (Exception): pass + + + +class Runner: + def __init__( + self, + infile, + outfile, + seq_idenity_threshold=0.9, + threads=1, + length_diff_cutoff=0.9, + verbose=False, + ): + + if not os.path.exists(infile): + raise Error('File not found: "' + infile + '". Cannot continue') + + self.infile = os.path.abspath(infile) + self.outfile = os.path.abspath(outfile) + self.seq_idenity_threshold = seq_idenity_threshold + self.threads = threads + self.length_diff_cutoff = length_diff_cutoff + self.verbose = verbose + + + def run(self): + tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd()) + cdhit_fasta = os.path.join(tmpdir, 'cdhit') + cluster_info_outfile = cdhit_fasta + '.bak.clstr' + infile_renamed = os.path.join(tmpdir, 'input.renamed.fa') + + # cd-hit truncates all names to 19 bases in its report of which + # sequences belong to which clusters. So need to temporarily + # rename all sequences to have short enough names. Grrr. + new_to_old_name = self._enumerate_fasta(self.infile, infile_renamed) + + cmd = ' '.join([ + 'cd-hit', + '-i', infile_renamed, + '-o', cdhit_fasta, + '-c', str(self.seq_idenity_threshold), + '-T', str(self.threads), + '-s', str(self.length_diff_cutoff), + '-bak 1', + ]) + + common.syscall(cmd, verbose=self.verbose) + + cluster_representatives = self._get_ids(cdhit_fasta) + clusters, cluster_rep_to_cluster = self._parse_cluster_info_file(cluster_info_outfile, new_to_old_name, cluster_representatives) + self._rename_fasta(cdhit_fasta, self.outfile, cluster_rep_to_cluster) + shutil.rmtree(tmpdir) + return clusters + + + def _enumerate_fasta(self, infile, outfile): + rename_file = outfile + '.tmp.rename_info' + assert not os.path.exists(rename_file) + pyfastaq.tasks.enumerate_names(infile, outfile, rename_file=rename_file) + + with open(rename_file) as f: + lines = [x.rstrip().split('\t') for x in f.readlines() if x != '#old\tnew\n'] + new_to_old_name = {x[1]: x[0] for x in lines} + if len(lines) != len(new_to_old_name): + raise Error('Sequence names in input file not unique! Cannot continue') + + os.unlink(rename_file) + return new_to_old_name + + + def _rename_fasta(self, infile, outfile, names_dict): + seq_reader = pyfastaq.sequences.file_reader(infile) + f = pyfastaq.utils.open_file_write(outfile) + for seq in seq_reader: + seq.id = names_dict[seq.id] + print(seq, file=f) + + pyfastaq.utils.close(f) + + + def _parse_cluster_info_file(self, infile, names_dict, cluster_representatives): + f = pyfastaq.utils.open_file_read(infile) + clusters = {} + cluster_representative_to_cluster_number = {} + for line in f: + data = line.rstrip().split() + cluster = data[0] + seqname = data[2] + if not (seqname.startswith('>') and seqname.endswith('...')): + raise Error('Unexpected format of sequence name in line:\n' + line) + seqname = seqname[1:-3] + + if seqname in cluster_representatives: + cluster_representative_to_cluster_number[seqname] = cluster + + seqname = names_dict[seqname] + + if cluster not in clusters: + clusters[cluster] = set() + + if seqname in clusters[cluster]: + raise Error('Duplicate name "' + seqname + '" found in cluster ' + str(cluster)) + + clusters[cluster].add(seqname) + + pyfastaq.utils.close(f) + + return clusters, cluster_representative_to_cluster_number + + + def _get_ids(self, infile): + seq_reader = pyfastaq.sequences.file_reader(infile) + return set([seq.id for seq in seq_reader]) + diff --git a/ariba/tests/cdhit_test.py b/ariba/tests/cdhit_test.py new file mode 100644 index 00000000..7cc4fe16 --- /dev/null +++ b/ariba/tests/cdhit_test.py @@ -0,0 +1,80 @@ +import unittest +import os +import filecmp +from ariba import cdhit + +modules_dir = os.path.dirname(os.path.abspath(cdhit.__file__)) +data_dir = os.path.join(modules_dir, 'tests', 'data') + +class TestCdhit(unittest.TestCase): + def test_init_fail_infile_missing(self): + '''test init_fail_infile_missing''' + with self.assertRaises(cdhit.Error): + r = cdhit.Runner('oopsnotafile', 'out') + + + def test_enumerate_fasta(self): + '''test _enumerate_fasta''' + infile = os.path.join(data_dir, 'cdhit_test_enumerate_fasta.in.fa') + expected_outfile = os.path.join(data_dir, 'cdhit_test_enumerate_fasta.out.fa') + tmpfile = 'tmp.test_enumerate_fasta.out.fa' + expected_dict = {'1': 'a', '2': 'b', '3': 'c'} + r = cdhit.Runner(infile, 'out') + got_dict = r._enumerate_fasta(infile, tmpfile) + self.assertTrue(filecmp.cmp(expected_outfile, tmpfile, shallow=False)) + self.assertEqual(expected_dict, got_dict) + os.unlink(tmpfile) + + + def test_get_ids(self): + '''test _get_ids''' + infile = os.path.join(data_dir, 'cdhit_test_get_ids.fa') + expected = {'id1', 'id2', 'id3'} + r = cdhit.Runner(infile, 'out') + got = r._get_ids(infile) + self.assertEqual(expected, got) + + + def test_parse_cluster_info_file(self): + '''test _parse_cluster_info_file''' + infile = os.path.join(data_dir, 'cdhit_test_parse_cluster_info_file.in.fa') + r = cdhit.Runner(infile, 'out') + names_dict = {str(i): 'seq' + str(i) for i in range(1,5)} + cluster_representatives = {'1', '4'} + cluster_file = os.path.join(data_dir, 'cdhit_test_parse_cluster_info_file.out.fa.bak.clstr') + got_clusters, got_reps = r._parse_cluster_info_file(cluster_file, names_dict, cluster_representatives) + expected_clusters = { + '0': {'seq1', 'seq2', 'seq3'}, + '1': {'seq4'} + } + expected_reps = {'1': '0', '4': '1'} + self.assertEqual(expected_clusters, got_clusters) + self.assertEqual(expected_reps, got_reps) + + + def test_rename_fasta(self): + '''test _rename_fasta''' + infile = os.path.join(data_dir, 'cdhit_test_rename_fasta.in.fa') + tmpfile = 'tmp.rename_fasta.out.fa' + expected = os.path.join(data_dir, 'cdhit_test_rename_fasta.out.fa') + names_dict = {'a': 'seq1', 'b': 'seq2', 'c': 'seq3'} + r = cdhit.Runner(infile, 'out') + r._rename_fasta(infile, tmpfile, names_dict) + self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False)) + os.unlink(tmpfile) + + + def test_run(self): + '''test run''' + infile = os.path.join(data_dir, 'cdhit_test_run.in.fa') + expected_outfile = os.path.join(data_dir, 'cdhit_test_run.out.fa') + tmpfile = 'tmp.cdhit_test_run.out.fa' + r = cdhit.Runner(infile, tmpfile) + clusters = r.run() + expected_clusters = { + '0': {'seq1', 'seq2', 'seq3'}, + '1': {'seq4'}, + } + self.assertEqual(clusters, expected_clusters) + self.assertTrue(filecmp.cmp(tmpfile, expected_outfile, shallow=False)) + os.unlink(tmpfile) diff --git a/ariba/tests/data/cdhit_test_enumerate_fasta.in.fa b/ariba/tests/data/cdhit_test_enumerate_fasta.in.fa new file mode 100644 index 00000000..85ca4cb1 --- /dev/null +++ b/ariba/tests/data/cdhit_test_enumerate_fasta.in.fa @@ -0,0 +1,6 @@ +>a +A +>b +G +>c +T diff --git a/ariba/tests/data/cdhit_test_enumerate_fasta.out.fa b/ariba/tests/data/cdhit_test_enumerate_fasta.out.fa new file mode 100644 index 00000000..4b36e898 --- /dev/null +++ b/ariba/tests/data/cdhit_test_enumerate_fasta.out.fa @@ -0,0 +1,6 @@ +>1 +A +>2 +G +>3 +T diff --git a/ariba/tests/data/cdhit_test_get_ids.fa b/ariba/tests/data/cdhit_test_get_ids.fa new file mode 100644 index 00000000..f2012398 --- /dev/null +++ b/ariba/tests/data/cdhit_test_get_ids.fa @@ -0,0 +1,6 @@ +>id1 +A +>id2 +G +>id3 +T diff --git a/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.fa b/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.fa new file mode 100644 index 00000000..bf8b12c8 --- /dev/null +++ b/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.fa @@ -0,0 +1,40 @@ +>seq1 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA +GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC +ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA +AACTCTATCGTAGGGTCGCA +>seq2 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA +GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC +ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA +AACTCTATGTAGGGTCGCA +>seq3 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGTTCTATGCAGGCTTGTGAA +GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAGGCCATACACTTAGCTCA +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGATAGCC +ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGATGCGAATTGAAACAA +AACTCTATGTAGGGTCGCA +>seq4 +CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC +TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT +TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC +TTCCAACCACGTCAAGGCAACCTTGGTTTAGCACAGGGCTCCCAGTGGGTGTAAGGGATG +AACACTACCCGGCCCACCGTCGATTTAGCCCTAAATGGTCTATTGCTCACGGGTAGCACA +CAAGTAATAAAAACGTATTCAGCTCGAGTCAGCGTCCAGCCATTTTACTTTGCGTCATCG +AGGGGTAGTGCCTCCGAGAATCAAGGTTTGATTATACTAAACGGAGGGGCCTACCACTCA +GCCAGTCTTTGCATCGTCCATTCCCGCCGTTTATGGGTCACTATTCATTCGGAATTTGGA +TGCGGTCAACAAGTCCAGGT diff --git a/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.renamed.fa b/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.renamed.fa new file mode 100644 index 00000000..9f7eca5d --- /dev/null +++ b/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.renamed.fa @@ -0,0 +1,40 @@ +>1 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA +GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC +ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA +AACTCTATCGTAGGGTCGCA +>2 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA +GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC +ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA +AACTCTATGTAGGGTCGCA +>3 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGTTCTATGCAGGCTTGTGAA +GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAGGCCATACACTTAGCTCA +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGATAGCC +ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGATGCGAATTGAAACAA +AACTCTATGTAGGGTCGCA +>4 +CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC +TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT +TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC +TTCCAACCACGTCAAGGCAACCTTGGTTTAGCACAGGGCTCCCAGTGGGTGTAAGGGATG +AACACTACCCGGCCCACCGTCGATTTAGCCCTAAATGGTCTATTGCTCACGGGTAGCACA +CAAGTAATAAAAACGTATTCAGCTCGAGTCAGCGTCCAGCCATTTTACTTTGCGTCATCG +AGGGGTAGTGCCTCCGAGAATCAAGGTTTGATTATACTAAACGGAGGGGCCTACCACTCA +GCCAGTCTTTGCATCGTCCATTCCCGCCGTTTATGGGTCACTATTCATTCGGAATTTGGA +TGCGGTCAACAAGTCCAGGT diff --git a/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa b/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa new file mode 100644 index 00000000..dba7562b --- /dev/null +++ b/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa @@ -0,0 +1,20 @@ +>1 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA +GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC +ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA +AACTCTATCGTAGGGTCGCA +>4 +CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC +TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT +TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC +TTCCAACCACGTCAAGGCAACCTTGGTTTAGCACAGGGCTCCCAGTGGGTGTAAGGGATG +AACACTACCCGGCCCACCGTCGATTTAGCCCTAAATGGTCTATTGCTCACGGGTAGCACA +CAAGTAATAAAAACGTATTCAGCTCGAGTCAGCGTCCAGCCATTTTACTTTGCGTCATCG +AGGGGTAGTGCCTCCGAGAATCAAGGTTTGATTATACTAAACGGAGGGGCCTACCACTCA +GCCAGTCTTTGCATCGTCCATTCCCGCCGTTTATGGGTCACTATTCATTCGGAATTTGGA +TGCGGTCAACAAGTCCAGGT diff --git a/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa.bak.clstr b/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa.bak.clstr new file mode 100644 index 00000000..17451594 --- /dev/null +++ b/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa.bak.clstr @@ -0,0 +1,4 @@ +0 500aa, >1... * +0 499aa, >2... at 99.40% +0 499aa, >3... at 98.40% +1 500aa, >4... * diff --git a/ariba/tests/data/cdhit_test_rename_fasta.in.fa b/ariba/tests/data/cdhit_test_rename_fasta.in.fa new file mode 100644 index 00000000..11d5e25a --- /dev/null +++ b/ariba/tests/data/cdhit_test_rename_fasta.in.fa @@ -0,0 +1,6 @@ +>a +A +>b +C +>c +G diff --git a/ariba/tests/data/cdhit_test_rename_fasta.out.fa b/ariba/tests/data/cdhit_test_rename_fasta.out.fa new file mode 100644 index 00000000..7ab37993 --- /dev/null +++ b/ariba/tests/data/cdhit_test_rename_fasta.out.fa @@ -0,0 +1,6 @@ +>seq1 +A +>seq2 +C +>seq3 +G diff --git a/ariba/tests/data/cdhit_test_run.in.fa b/ariba/tests/data/cdhit_test_run.in.fa new file mode 100644 index 00000000..bf8b12c8 --- /dev/null +++ b/ariba/tests/data/cdhit_test_run.in.fa @@ -0,0 +1,40 @@ +>seq1 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA +GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC +ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA +AACTCTATCGTAGGGTCGCA +>seq2 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA +GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC +ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA +AACTCTATGTAGGGTCGCA +>seq3 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGTTCTATGCAGGCTTGTGAA +GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAGGCCATACACTTAGCTCA +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGATAGCC +ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGATGCGAATTGAAACAA +AACTCTATGTAGGGTCGCA +>seq4 +CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC +TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT +TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC +TTCCAACCACGTCAAGGCAACCTTGGTTTAGCACAGGGCTCCCAGTGGGTGTAAGGGATG +AACACTACCCGGCCCACCGTCGATTTAGCCCTAAATGGTCTATTGCTCACGGGTAGCACA +CAAGTAATAAAAACGTATTCAGCTCGAGTCAGCGTCCAGCCATTTTACTTTGCGTCATCG +AGGGGTAGTGCCTCCGAGAATCAAGGTTTGATTATACTAAACGGAGGGGCCTACCACTCA +GCCAGTCTTTGCATCGTCCATTCCCGCCGTTTATGGGTCACTATTCATTCGGAATTTGGA +TGCGGTCAACAAGTCCAGGT diff --git a/ariba/tests/data/cdhit_test_run.out.fa b/ariba/tests/data/cdhit_test_run.out.fa new file mode 100644 index 00000000..12a02b2b --- /dev/null +++ b/ariba/tests/data/cdhit_test_run.out.fa @@ -0,0 +1,20 @@ +>0 +TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA +GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG +AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC +TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG +AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC +ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA +GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA +TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA +AACTCTATCGTAGGGTCGCA +>1 +CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC +TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT +TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC +TTCCAACCACGTCAAGGCAACCTTGGTTTAGCACAGGGCTCCCAGTGGGTGTAAGGGATG +AACACTACCCGGCCCACCGTCGATTTAGCCCTAAATGGTCTATTGCTCACGGGTAGCACA +CAAGTAATAAAAACGTATTCAGCTCGAGTCAGCGTCCAGCCATTTTACTTTGCGTCATCG +AGGGGTAGTGCCTCCGAGAATCAAGGTTTGATTATACTAAACGGAGGGGCCTACCACTCA +GCCAGTCTTTGCATCGTCCATTCCCGCCGTTTATGGGTCACTATTCATTCGGAATTTGGA +TGCGGTCAACAAGTCCAGGT From 5494765da08088da832d853fe1314adfa58c836a Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 19 Feb 2015 11:01:35 +0000 Subject: [PATCH 04/18] Fix typo in variable name --- ariba/cdhit.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ariba/cdhit.py b/ariba/cdhit.py index f4bfa847..d1ea02a7 100644 --- a/ariba/cdhit.py +++ b/ariba/cdhit.py @@ -13,7 +13,7 @@ def __init__( self, infile, outfile, - seq_idenity_threshold=0.9, + seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.9, verbose=False, @@ -24,7 +24,7 @@ def __init__( self.infile = os.path.abspath(infile) self.outfile = os.path.abspath(outfile) - self.seq_idenity_threshold = seq_idenity_threshold + self.seq_identity_threshold = seq_identity_threshold self.threads = threads self.length_diff_cutoff = length_diff_cutoff self.verbose = verbose @@ -45,7 +45,7 @@ def run(self): 'cd-hit', '-i', infile_renamed, '-o', cdhit_fasta, - '-c', str(self.seq_idenity_threshold), + '-c', str(self.seq_identity_threshold), '-T', str(self.threads), '-s', str(self.length_diff_cutoff), '-bak 1', From 5a3e050d503a6fe9d079b6643526bbcf7ddd3504 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 19 Feb 2015 11:21:05 +0000 Subject: [PATCH 05/18] Add module faidx --- ariba/__init__.py | 1 + ariba/faidx.py | 19 +++++++++++++++++++ .../data/faidx_test_write_fa_subset.in.fa | 8 ++++++++ .../data/faidx_test_write_fa_subset.in.fa.fai | 4 ++++ .../data/faidx_test_write_fa_subset.out.fa | 6 ++++++ ariba/tests/faidx_test.py | 18 ++++++++++++++++++ 6 files changed, 56 insertions(+) create mode 100644 ariba/faidx.py create mode 100644 ariba/tests/data/faidx_test_write_fa_subset.in.fa create mode 100644 ariba/tests/data/faidx_test_write_fa_subset.in.fa.fai create mode 100644 ariba/tests/data/faidx_test_write_fa_subset.out.fa create mode 100644 ariba/tests/faidx_test.py diff --git a/ariba/__init__.py b/ariba/__init__.py index f8aee982..fdffcc0f 100644 --- a/ariba/__init__.py +++ b/ariba/__init__.py @@ -5,6 +5,7 @@ 'clusters', 'common', 'external_progs', + 'faidx', 'flag', 'histogram', 'link', diff --git a/ariba/faidx.py b/ariba/faidx.py new file mode 100644 index 00000000..a9939abe --- /dev/null +++ b/ariba/faidx.py @@ -0,0 +1,19 @@ +import os +from ariba import common + + +def write_fa_subset(seq_names, infile, outfile, samtools_exe='samtools', verbose=False): + if not os.path.exists(infile + '.fai'): + common.syscall(samtools_exe + ' faidx ' + infile, verbose=verbose) + + if os.path.exists(outfile): + os.path.unlink(outfile) + + for name in seq_names: + common.syscall(' '.join([ + samtools_exe + ' faidx', + infile, + name, + '>>', outfile + ])) + diff --git a/ariba/tests/data/faidx_test_write_fa_subset.in.fa b/ariba/tests/data/faidx_test_write_fa_subset.in.fa new file mode 100644 index 00000000..d7a3a928 --- /dev/null +++ b/ariba/tests/data/faidx_test_write_fa_subset.in.fa @@ -0,0 +1,8 @@ +>seq1 +A +>seq2 +G +>seq3 +C +>seq4 +T diff --git a/ariba/tests/data/faidx_test_write_fa_subset.in.fa.fai b/ariba/tests/data/faidx_test_write_fa_subset.in.fa.fai new file mode 100644 index 00000000..0f995e3a --- /dev/null +++ b/ariba/tests/data/faidx_test_write_fa_subset.in.fa.fai @@ -0,0 +1,4 @@ +seq1 1 6 1 2 +seq2 1 14 1 2 +seq3 1 22 1 2 +seq4 1 30 1 2 diff --git a/ariba/tests/data/faidx_test_write_fa_subset.out.fa b/ariba/tests/data/faidx_test_write_fa_subset.out.fa new file mode 100644 index 00000000..ca3257f0 --- /dev/null +++ b/ariba/tests/data/faidx_test_write_fa_subset.out.fa @@ -0,0 +1,6 @@ +>seq1 +A +>seq3 +C +>seq4 +T diff --git a/ariba/tests/faidx_test.py b/ariba/tests/faidx_test.py new file mode 100644 index 00000000..ce83af69 --- /dev/null +++ b/ariba/tests/faidx_test.py @@ -0,0 +1,18 @@ +import unittest +import filecmp +import os +from ariba import faidx + +modules_dir = os.path.dirname(os.path.abspath(faidx.__file__)) +data_dir = os.path.join(modules_dir, 'tests', 'data') + + +class TestFaidx(unittest.TestCase): + def test_write_fa_subset(self): + '''test write_fa_subset''' + infile = os.path.join(data_dir, 'faidx_test_write_fa_subset.in.fa') + expected = os.path.join(data_dir, 'faidx_test_write_fa_subset.out.fa') + tmpfile = 'tmp.test_write_fa_subset.out.fa' + faidx.write_fa_subset(['seq1', 'seq3', 'seq4'], infile, tmpfile) + self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False)) + os.unlink(tmpfile) From 064dd1fea5e1766bdd09c12bae772c4eb6bcf46f Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 19 Feb 2015 11:36:57 +0000 Subject: [PATCH 06/18] Make clusters with genes cluster, instead of one gene --- ariba/clusters.py | 55 +++++++++++++++++++++++------------- ariba/tests/clusters_test.py | 10 ------- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/ariba/clusters.py b/ariba/clusters.py index 4f27a3c4..ba67aca8 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -4,7 +4,7 @@ import openpyxl import pysam import pyfastaq -from ariba import cluster, common, mapping, histogram +from ariba import cdhit, cluster, common, mapping, histogram, faidx class Error (Exception): pass @@ -37,6 +37,8 @@ def __init__(self, spades_exe='spades.py', sspace_exe='SSPACE_Basic_v2.0.pl', velvet_exe='velvet', # prefix of velvet{g,h} + cdhit_seq_identity_threshold=0.9, + cdhit_length_diff_cutoff=0.9, ): self.db_fasta = os.path.abspath(db_fasta) self.reads_1 = os.path.abspath(reads_1) @@ -48,6 +50,8 @@ def __init__(self, self.assembly_kmer = assembly_kmer self.spades_other = spades_other + self.db_fasta_clustered = os.path.join(self.outdir, 'genes.clustered.fa') + self.cluster_ids = {} self.bam_prefix = os.path.join(self.outdir, 'map_all_reads') self.bam = self.bam_prefix + '.bam' self.report_file_tsv = os.path.join(self.outdir, 'report.tsv') @@ -95,17 +99,32 @@ def __init__(self, self.velvet = velvet_exe + self.cdhit_seq_identity_threshold = cdhit_seq_identity_threshold + self.cdhit_length_diff_cutoff = cdhit_length_diff_cutoff + try: os.mkdir(self.outdir) except: raise Error('Error mkdir ' + self.outdir) - def _map_reads(self): + def _run_cdhit(self): + r = cdhit.Runner( + self.db_fasta, + self.db_fasta_clustered, + seq_identity_threshold=self.cdhit_seq_identity_threshold, + threads=self.threads, + length_diff_cutoff=self.cdhit_length_diff_cutoff, + verbose=self.verbose, + ) + self.cluster_ids = r.run() + + + def _map_reads_to_clustered_genes(self): mapping.run_smalt( self.reads_1, self.reads_2, - self.db_fasta, + self.db_fasta_clustered, self.bam_prefix, index_k=self.smalt_k, index_s=self.smalt_s, @@ -215,19 +234,6 @@ def _set_insert_size_data(self): print() - def _write_gene_fa(self, gene_name, outfile): - if not os.path.exists(self.db_fasta + '.fai'): - common.syscall(self.samtools_exe + ' faidx ' + self.db_fasta, verbose=self.verbose) - - common.syscall(' '.join([ - self.samtools_exe + ' faidx', - self.db_fasta, - gene_name, - '>', outfile - ])) - - - def _init_and_run_clusters(self): if len(self.cluster_to_dir) == 0: raise Error('Did not get any reads mapped to genes. Cannot continue') @@ -239,7 +245,15 @@ def _init_and_run_clusters(self): if self.verbose: print('\nAssembling cluster', counter, 'of', str(len(self.cluster_to_dir)) + ':', gene) new_dir = self.cluster_to_dir[gene] - self._write_gene_fa(gene, os.path.join(new_dir, 'gene.fa')) + + faidx.write_fa_subset( + self.cluster_ids[gene], + self.db_fasta, + os.path.join(new_dir, 'genes.fa'), + samtools_exe=self.samtools_exe, + verbose=self.verbose + ) + self.clusters[gene] = cluster.Cluster( new_dir, assembly_kmer=self.assembly_kmer, @@ -307,8 +321,11 @@ def _write_reports(self): def run(self): if self.verbose: - print('{:_^79}'.format(' Mapping reads to reference genes ')) - self._map_reads() + print('{:_^79}'.format(' Running cd-hit ')) + self._run_cdhit() + if self.verbose: + print('{:_^79}'.format(' Mapping reads to clustered genes ')) + self._map_reads_to_clustered_genes() if self.verbose: print('Finished mapping\n') print('{:_^79}'.format(' Generating clusters ')) diff --git a/ariba/tests/clusters_test.py b/ariba/tests/clusters_test.py index d5ed8aeb..7d8a877e 100644 --- a/ariba/tests/clusters_test.py +++ b/ariba/tests/clusters_test.py @@ -116,16 +116,6 @@ def test_set_insert_size_data(self): self.assertEqual(self.clusters.insert_sspace_sd, 0.91) - def test_write_gene_fa(self): - '''Test _write_gene_fa''' - self.clusters.db_fasta = os.path.join(data_dir, 'clusters_test_write_gene_fa.db.fa') - expected = os.path.join(data_dir, 'clusters_test_write_gene_fa.out.fa') - tmp_file = 'tmp.test_write_gene_fa.fa' - self.clusters._write_gene_fa('gene2', tmp_file) - self.assertTrue(filecmp.cmp(expected, tmp_file, shallow=False)) - os.unlink(tmp_file) - - def test_write_reports(self): class FakeCluster: def __init__(self, lines): From 99131d9e78778f94e731cc11d9af96de9e8be39f Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 19 Feb 2015 13:18:32 +0000 Subject: [PATCH 07/18] Find best gene for each cluster --- ariba/cluster.py | 51 +- ariba/tests/cluster_test.py | 71 +- ...cluster_test_assemble_with_spades.gene.fa} | 0 .../genes.fa} | 0 .../cluster_test_assemble_with_velvet/gene.fa | 9 - .../gene.reads_mapped.unsorted.bam | Bin 9453 -> 0 bytes .../reads_1.fq | 772 ------------------ .../reads_2.fq | 772 ------------------ .../cluster_test_choose_best_gene.gene.fa | 6 + .../cluster_test_choose_best_gene/genes.fa | 18 + .../cluster_test_choose_best_gene/reads_1.fq | 180 ++++ .../cluster_test_choose_best_gene/reads_2.fq | 180 ++++ ...uster_test_fix_contig_orientation.gene.fa} | 0 .../genes.fa | 10 + ...uster_test_gapfill_with_gapfiller.gene.fa} | 0 .../genes.fa} | 0 .../gene.fa => cluster_test_generic/genes.fa} | 0 .../genes.fa | 18 + .../reads_1.fq | 180 ++++ .../reads_2.fq | 180 ++++ .../tests/data/cluster_test_get_gene/gene.fa | 2 - .../data/cluster_test_get_gene/reads_1.fq | 4 - .../data/cluster_test_get_gene/reads_2.fq | 4 - .../genes.fa | 18 + .../reads_1.fq | 180 ++++ .../reads_2.fq | 180 ++++ .../genes.fa} | 0 .../genes.fa} | 0 .../genes.fa} | 0 .../genes.fa} | 0 .../{gene.fa => genes.fa} | 0 .../genes.fa} | 0 .../cluster_test_scaffold_with_sspace.gene.fa | 2 + .../genes.fa | 2 + .../cluster_test_set_assembly_kmer/genes.fa | 2 + 35 files changed, 1256 insertions(+), 1585 deletions(-) rename ariba/tests/data/{cluster_test_assemble_with_spades/gene.fa => cluster_test_assemble_with_spades.gene.fa} (100%) rename ariba/tests/data/{cluster_test_gapfill_with_gapfiller/gene.fa => cluster_test_assemble_with_spades/genes.fa} (100%) delete mode 100644 ariba/tests/data/cluster_test_assemble_with_velvet/gene.fa delete mode 100644 ariba/tests/data/cluster_test_assemble_with_velvet/gene.reads_mapped.unsorted.bam delete mode 100644 ariba/tests/data/cluster_test_assemble_with_velvet/reads_1.fq delete mode 100644 ariba/tests/data/cluster_test_assemble_with_velvet/reads_2.fq create mode 100644 ariba/tests/data/cluster_test_choose_best_gene.gene.fa create mode 100644 ariba/tests/data/cluster_test_choose_best_gene/genes.fa create mode 100644 ariba/tests/data/cluster_test_choose_best_gene/reads_1.fq create mode 100644 ariba/tests/data/cluster_test_choose_best_gene/reads_2.fq rename ariba/tests/data/{cluster_test_fix_contig_orientation/gene.fa => cluster_test_fix_contig_orientation.gene.fa} (100%) create mode 100644 ariba/tests/data/cluster_test_fix_contig_orientation/genes.fa rename ariba/tests/data/{cluster_test_generic/gene.fa => cluster_test_gapfill_with_gapfiller.gene.fa} (100%) rename ariba/tests/data/{cluster_test_init_no_reads_1/gene.fa => cluster_test_gapfill_with_gapfiller/genes.fa} (100%) rename ariba/tests/data/{cluster_test_init_no_reads_2/gene.fa => cluster_test_generic/genes.fa} (100%) create mode 100644 ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/genes.fa create mode 100644 ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/reads_1.fq create mode 100644 ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/reads_2.fq delete mode 100644 ariba/tests/data/cluster_test_get_gene/gene.fa delete mode 100644 ariba/tests/data/cluster_test_get_gene/reads_1.fq delete mode 100644 ariba/tests/data/cluster_test_get_gene/reads_2.fq create mode 100644 ariba/tests/data/cluster_test_get_total_alignment_score/genes.fa create mode 100644 ariba/tests/data/cluster_test_get_total_alignment_score/reads_1.fq create mode 100644 ariba/tests/data/cluster_test_get_total_alignment_score/reads_2.fq rename ariba/tests/data/{cluster_test_load_final_contigs/gene.fa => cluster_test_init_no_reads_1/genes.fa} (100%) rename ariba/tests/data/{cluster_test_parse_assembly_bam/gene.fa => cluster_test_init_no_reads_2/genes.fa} (100%) rename ariba/tests/data/{cluster_test_rename_scaffolds/gene.fa => cluster_test_load_final_contigs/genes.fa} (100%) rename ariba/tests/data/{cluster_test_scaffold_with_sspace/gene.fa => cluster_test_parse_assembly_bam/genes.fa} (100%) rename ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/{gene.fa => genes.fa} (100%) rename ariba/tests/data/{cluster_test_set_assembly_kmer/gene.fa => cluster_test_rename_scaffolds/genes.fa} (100%) create mode 100644 ariba/tests/data/cluster_test_scaffold_with_sspace.gene.fa create mode 100644 ariba/tests/data/cluster_test_scaffold_with_sspace/genes.fa create mode 100644 ariba/tests/data/cluster_test_set_assembly_kmer/genes.fa diff --git a/ariba/cluster.py b/ariba/cluster.py index e47e6288..89dbcc1b 100644 --- a/ariba/cluster.py +++ b/ariba/cluster.py @@ -5,7 +5,7 @@ import operator import pyfastaq import pymummer -from ariba import common, mapping, bam_parse, flag +from ariba import common, mapping, bam_parse, flag, faidx class Error (Exception): pass @@ -48,13 +48,13 @@ def __init__(self, self.reads1 = os.path.join(self.root_dir, 'reads_1.fq') self.reads2 = os.path.join(self.root_dir, 'reads_2.fq') self.gene_fa = os.path.join(self.root_dir, 'gene.fa') + self.genes_fa = os.path.join(self.root_dir, 'genes.fa') self.gene_bam = os.path.join(self.root_dir, 'gene.reads_mapped.bam') - for fname in [self.reads1, self.reads2, self.gene_fa]: + for fname in [self.reads1, self.reads2, self.genes_fa]: if not os.path.exists(fname): raise Error('File ' + fname + ' not found. Cannot continue') - self.gene = self._get_gene() self.max_insert = max_insert self.min_scaff_depth = min_scaff_depth @@ -123,7 +123,48 @@ def __init__(self, self.variants = {} - def _get_gene(self): + def _get_total_alignment_score(self, gene_name): + tmp_bam = os.path.join(self.root_dir, 'tmp.get_total_alignment_score.bam') + assert not os.path.exists(tmp_bam) + tmp_fa = os.path.join(self.root_dir, 'tmp.get_total_alignment_score.ref.fa') + assert not os.path.exists(tmp_fa) + faidx.write_fa_subset([gene_name], self.genes_fa, tmp_fa, samtools_exe=self.samtools_exe, verbose=self.verbose) + mapping.run_smalt( + self.reads1, + self.reads2, + tmp_fa, + tmp_bam[:-4], + threads=self.threads, + samtools=self.samtools_exe, + smalt=self.smalt_exe, + verbose=self.verbose, + ) + + score = mapping.get_total_alignment_score(tmp_bam) + os.unlink(tmp_bam) + os.unlink(tmp_fa) + os.unlink(tmp_fa + '.fai') + return score + + + def _get_best_gene_by_alignment_score(self): + file_reader = pyfastaq.sequences.file_reader(self.genes_fa) + best_score = 0 + best_gene_name = None + for seq in file_reader: + score = self._get_total_alignment_score(seq.id) + if self.verbose: + print('Total alignment score for gene', seq.id, 'is', score) + if score > best_score: + best_score = score + best_gene_name = seq.id + + return best_gene_name + + + def _choose_best_gene(self): + gene_name = self._get_best_gene_by_alignment_score() + faidx.write_fa_subset([gene_name], self.genes_fa, self.gene_fa, samtools_exe=self.samtools_exe, verbose=self.verbose) seqs = {} pyfastaq.tasks.file_to_dict(self.gene_fa, seqs) assert len(seqs) == 1 @@ -676,6 +717,8 @@ def _make_report_lines(self): def run(self): + self.gene = self._choose_best_gene() + if self.assembler == 'velvet': self._assemble_with_velvet() elif self.assembler == 'spades': diff --git a/ariba/tests/cluster_test.py b/ariba/tests/cluster_test.py index af46113e..89d70cd4 100644 --- a/ariba/tests/cluster_test.py +++ b/ariba/tests/cluster_test.py @@ -17,7 +17,7 @@ def clean_cluster_dir(d, exclude=None): return '''Cleans up all files made except original ones in a cluster directory''' - keep = set(['gene.fa', 'reads_1.fq', 'reads_2.fq']) + keep = set(['genes.fa', 'reads_1.fq', 'reads_2.fq']) if exclude is not None: for f in exclude: keep.add(f) @@ -31,12 +31,20 @@ def clean_cluster_dir(d, exclude=None): os.unlink(full_path) +def load_gene(filename): + file_reader = pyfastaq.sequences.file_reader(filename) + seq = None + for seq in file_reader: + pass + return seq + + class TestCluster(unittest.TestCase): def test_init_fail_files_missing(self): '''test init_fail_files_missing''' dirs = [ 'cluster_test_directorynotexist' - 'cluster_test_init_no_gene_fa', + 'cluster_test_init_no_genes_fa', 'cluster_test_init_no_reads_1', 'cluster_test_init_no_reads_2', ] @@ -48,14 +56,43 @@ def test_init_fail_files_missing(self): clean_cluster_dir(d) - def test_get_gene(self): - '''test _get_gene''' - cluster_dir = os.path.join(data_dir, 'cluster_test_get_gene') + def test_get_total_alignment_score(self): + '''test _get_total_alignment_score''' + cluster_dir = os.path.join(data_dir, 'cluster_test_get_total_alignment_score') + clean_cluster_dir(cluster_dir) + c = cluster.Cluster(cluster_dir) + got_score = c._get_total_alignment_score('1') + expected_score = 1500 + self.assertEqual(got_score, expected_score) + clean_cluster_dir(cluster_dir) + + + def test_get_best_gene_by_alignment_score(self): + '''test _get_best_gene_by_alignment_score''' + cluster_dir = os.path.join(data_dir, 'cluster_test_get_best_gene_by_alignment_score') clean_cluster_dir(cluster_dir) c = cluster.Cluster(cluster_dir) - expected = pyfastaq.sequences.Fasta('name_of_gene', 'CATGCGAAAGAAAAC') - got = c._get_gene() - self.assertEqual(expected, got) + got_name = c._get_best_gene_by_alignment_score() + self.assertEqual(got_name, '1') + clean_cluster_dir(cluster_dir) + + + def test_choose_best_gene(self): + '''test _choose_best_gene''' + cluster_dir = os.path.join(data_dir, 'cluster_test_choose_best_gene') + clean_cluster_dir(cluster_dir) + c = cluster.Cluster(cluster_dir) + expected_gene = pyfastaq.sequences.Fasta('1', ''.join([ + 'AGCGCCTAGCTTTGGCACTTCAGGAGCGCCCGGAAATAATGGCGGGCGATGAAGGTTCTG', + 'TAGGTACGCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAAC', + 'CCGTGGCTTTCACACTCCCTCCTATGGGTAATCAATCCAGAAAGGGGCCGAAATGCAAAA', + 'GTCTTAAGGACTCTGCGAGGCAAAGTACGGGCGAACTAAACCCCCGTGACAGGTCAGACG', + 'TTGTTTCGGCAATCTGTCGCGCTCCCACACCTATAAGCGTACACCGTCTCTTCTGCCAGC', + ])) + expected_gene_fa = os.path.join(data_dir, 'cluster_test_choose_best_gene.gene.fa') + got = c._choose_best_gene() + self.assertEqual(got, expected_gene) + self.assertTrue(filecmp.cmp(expected_gene_fa, c.gene_fa, shallow=False)) clean_cluster_dir(cluster_dir) @@ -71,21 +108,12 @@ def test_set_assembly_kmer(self): clean_cluster_dir(cluster_dir) - #def test_assemble_with_velvet(self): - # '''test _assemble_with_velvet''' - # cluster_dir = os.path.join(data_dir, 'cluster_test_assemble_with_velvet') - # clean_cluster_dir(cluster_dir, exclude=set(['gene.reads_mapped.unsorted.bam'])) - # c = cluster.Cluster(cluster_dir) - # c._assemble_with_velvet() - # self.assertEqual(c.status_flag.to_number(), 0) - # clean_cluster_dir(cluster_dir, exclude=set(['gene.reads_mapped.unsorted.bam'])) - - def test_assemble_with_spades(self): '''test _assemble_with_spades''' cluster_dir = os.path.join(data_dir, 'cluster_test_assemble_with_spades') clean_cluster_dir(cluster_dir) c = cluster.Cluster(cluster_dir) + shutil.copyfile(os.path.join(data_dir, 'cluster_test_assemble_with_spades.gene.fa'), c.gene_fa) c._assemble_with_spades(unittest=True) self.assertEqual(c.status_flag.to_number(), 0) clean_cluster_dir(cluster_dir) @@ -96,6 +124,7 @@ def test_assemble_with_spades_fail(self): cluster_dir = os.path.join(data_dir, 'cluster_test_assemble_with_spades') clean_cluster_dir(cluster_dir) c = cluster.Cluster(cluster_dir) + shutil.copyfile(os.path.join(data_dir, 'cluster_test_assemble_with_spades.gene.fa'), c.gene_fa) c._assemble_with_spades() self.assertEqual(c.status_flag.to_number(), 64) clean_cluster_dir(cluster_dir) @@ -110,6 +139,7 @@ def test_scaffold_with_sspace(self): os.path.join(data_dir, 'cluster_test_scaffold_with_sspace.contigs.fa'), c.assembly_contigs ) + #shutil.copyfile(os.path.join(data_dir, 'cluster_test_scaffold_with_sspace.gene.fa'), c.gene_fa) c._scaffold_with_sspace() self.assertTrue(os.path.exists(c.scaffolder_scaffolds)) clean_cluster_dir(cluster_dir) @@ -124,6 +154,7 @@ def test_gap_fill_with_gapfiller_no_gaps(self): os.path.join(data_dir, 'cluster_test_gapfill_with_gapfiller.scaffolds_no_gaps.fa'), c.scaffolder_scaffolds ) + c.gene = pyfastaq.sequences.Fasta('name_of_gene', 'AAACCCGGGTTT') c._gap_fill_with_gapfiller() self.assertTrue(os.path.exists(c.gapfilled_scaffolds)) clean_cluster_dir(cluster_dir) @@ -138,6 +169,7 @@ def test_gap_fill_with_gapfiller_with_gaps(self): os.path.join(data_dir, 'cluster_test_gapfill_with_gapfiller.scaffolds_with_gaps.fa'), c.scaffolder_scaffolds ) + c.gene = pyfastaq.sequences.Fasta('name_of_gene', 'AAACCCGGGTTT') c._gap_fill_with_gapfiller() self.assertTrue(os.path.exists(c.gapfilled_scaffolds)) clean_cluster_dir(cluster_dir) @@ -148,6 +180,7 @@ def test_rename_scaffolds(self): cluster_dir = os.path.join(data_dir, 'cluster_test_rename_scaffolds') clean_cluster_dir(cluster_dir) c = cluster.Cluster(cluster_dir) + c.gene = pyfastaq.sequences.Fasta('name_of_gene', 'AAACCCGGGTTT') infile = os.path.join(data_dir, 'cluster_test_rename_scaffolds.in.fa') outfile = os.path.join(data_dir, 'cluster_test_rename_scaffolds.out.fa') tmpfile = 'tmp.fa' @@ -165,6 +198,7 @@ def test_fix_contig_orientation(self): scaffs_in = os.path.join(data_dir, 'cluster_test_fix_contig_orientation.in.fa') scaffs_out = os.path.join(data_dir, 'cluster_test_fix_contig_orientation.out.fa') shutil.copyfile(scaffs_in, c.gapfilled_scaffolds) + shutil.copyfile(os.path.join(data_dir, 'cluster_test_fix_contig_orientation.gene.fa'), c.gene_fa) c._fix_contig_orientation() self.assertTrue(filecmp.cmp(scaffs_out, c.final_assembly_fa, shallow=False)) clean_cluster_dir(cluster_dir) @@ -194,6 +228,7 @@ def test_parse_assembly_vs_gene_coords(self): clean_cluster_dir(cluster_dir) c = cluster.Cluster(cluster_dir) shutil.copyfile(coords_file, c.assembly_vs_gene_coords) + c.gene = pyfastaq.sequences.Fasta('gene', 'AAACCCGGGTTT') c._parse_assembly_vs_gene_coords() line1 = ['1', '1000', '1', '1000', '1000', '1000', '100.00', '1000', '1000', '1', '1', 'gene', 'contig1'] line2 = ['1', '240', '1', '240', '240', '240', '100.00', '1000', '580', '1', '1', 'gene', 'contig2'] diff --git a/ariba/tests/data/cluster_test_assemble_with_spades/gene.fa b/ariba/tests/data/cluster_test_assemble_with_spades.gene.fa similarity index 100% rename from ariba/tests/data/cluster_test_assemble_with_spades/gene.fa rename to ariba/tests/data/cluster_test_assemble_with_spades.gene.fa diff --git a/ariba/tests/data/cluster_test_gapfill_with_gapfiller/gene.fa b/ariba/tests/data/cluster_test_assemble_with_spades/genes.fa similarity index 100% rename from ariba/tests/data/cluster_test_gapfill_with_gapfiller/gene.fa rename to ariba/tests/data/cluster_test_assemble_with_spades/genes.fa diff --git a/ariba/tests/data/cluster_test_assemble_with_velvet/gene.fa b/ariba/tests/data/cluster_test_assemble_with_velvet/gene.fa deleted file mode 100644 index f067cdc9..00000000 --- a/ariba/tests/data/cluster_test_assemble_with_velvet/gene.fa +++ /dev/null @@ -1,9 +0,0 @@ ->02__aac_6prime__1_AY553333 -ATGCAGTACTCCATTCGCTCGGTTCGTGTTTCGGATACATCTGATTGGTTACGCCTTCGC -AATCTCCTGTGGGAAGGGGATGACCACGAAACCGAGATCGCCCAGTTTTTCGCCGGAGCC -CTGGCCGAGCCCAACGAAGTGCTGGTAGCCCATGATGATGCGGGGGCCGTTGTTGGGCAT -GTCGAGTTATCCATCCGCGAGGATGTCGCAGGGCTGGAAGGCATCAGAGCGGGCTATATC -GAAGGCCTGTACATCGAGGAGGCCCATCGCTCGTCCAGCGTCGCGACGCAGTTACTACGG -CACTCCGAGCAATGGGCCCAAAGTCAGGGATGCCGGGCGTTTGCATCGGATCGAGAGGAT -CGCCTGATCATCCATAAGCGGTTTTCTGTGAGTCCGCTTTCTAACCCTTCATTCCAGCGG -ACCGCCTTCGGCGGCCGCTGA diff --git a/ariba/tests/data/cluster_test_assemble_with_velvet/gene.reads_mapped.unsorted.bam b/ariba/tests/data/cluster_test_assemble_with_velvet/gene.reads_mapped.unsorted.bam deleted file mode 100644 index 3b2031f20f707f715427bb3222cbe5ebe283f7e2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9453 zcmZvCWmpt#*tXIL!qO7bAt~Js03Hs(1U zX|$i+S9#?sg;M4$R!$=&CL=qq(o37xm+7m?iR#CetckBXocUo0msRZdNz$W3;=12r zl=O<01Lk@fVqnJ+NlnCZ^5r6;wk^$k;l#BKN8Y57TyDwC{OY5= zg17H1`(KDl=wC>>Bwu`!T~3s)YzsKQUX-zlZsWT%w5}hT%SB*O2i`kgUueZiUrj5! zyAuC1A(Xi-PCO&YzDXRQlM=(#h{Oh${&;X$S4+W8=N;Aj<7H*EsIx(tznhkYq=wo; zvsQFNCmW5BP0;;$wf8A%^Yiul*Q7EXfe-#=m%%r48W_-hwlhk)4+?OHw|I$Fo$`~hKE(U`{Z zF7x2F*)vDWbg0Rxj&n+Y0hQzatd6OU!axVa)HmgzNgaM-ZcN0lVtJd<9DSljzH@8H z#>d%O@M^DM7hb`>5J|URgD}iJzia>3Tu?gJ?P@r{z1VL8MT-NKK*d6+?ib?5J zza^e&x2(5dIt?SETkGZ-Rrf zw&dl&zmf{i^zND_zhlmo8i|G)c>fMtmk3S8qA{G9xHdE6W&BAC!S@q862;a0Kc@VH z^Lu0coK3I6DIyH@+T#SkaYhdVO0W_2lKhqWep0?q@hWbFwb%pe;_gYI)p|6EjK5lp z=!QDSI-3N+TTbw7o35bx;=B>u+X*QKMS2=pY9|{tTA4OtQq`5U;-9(b}aoH_K> z%tuPeX!U%*On>m(h>EUBM4OHf?U80lCa7<`^t%c{fwIFyM?NG*Fa;f@!OAKrU4u~BYDkqQN)@mTem)t z428Xol-g0dhnGR#Z9|L!RX!iP8=FE6+;Rijeqpq;ckX_0eCpjOVIF(>sf`iRe$+v7 zIz;N+vGvAL{VKuOg$Wu}+DZOlx$-Kmr{KM@#a{PJZb@QLC44t(WPg|ke3v$g+NN4RCR-@l?ptq~R6D;W{XE%(H7G51D-ADU1%7c88w*6bZcJGlz0vfst(W@nAI+_?!= z^$X7=qv-agO1w{X#JFX)lbsi{-XFzd#$qs>I&?|rQn;`&U=xUX5J&6z_w6SE^51EW z+@4rZPn-ueJKy!6N=%$x<_OcY)Mo^}HL{u*0wvm4Yk{4H@)1X(`J9NLDXp>lD+M*N zou01kUYE~!8BVDgM8XcTM%wv0b1}}d6}FtXnepB!C-(F^h>v2Gq7I}dg952ezf~wp zM7e=>#Th1A-%o;kb#r(*OV79$g+_nzI=0l}thzvy`q3(~mZI=I{SQx4WwT^Xh%xIH3F`1%)&`~4mcFR90 zeRoyxYoYJnea{#lT{>ce~T#x5f(lP`R1Z6U_ICY)0*gC3T8mipHeZ;z-IfnT>qcEP()g|7hldP;mJxupT$!F~0t07X{?7J`-A`=buw_&>9 zNi|RKHQUQf`Zp3%^*&^p2;GM2xeME3$EXg6XB`N_H;3#T%)nz-+1GZDCp;-!%T!q1 zKrW;xU*X}fTc-QR0AA`)Qko@J6AIOh?3Vh+2 z{hSAufw_GEDRH;F_t(91cwMA6Qgj(JdGxG^qak)pFBImkFe7NZc(s!!ha0gYIX93B zrsXn_jU%;yfX!*~=gIpgBpz!^_jXV(ubp{kOBC~gRhmMQu#I!VbCGiw9`HtMoJRNy z&o=Xq%`!kRmkJGJT2oBsOMIRy1zg1s-h4x?L0v~U#_1YV{@jd>7UA@Sj&q0O>bL|sF4Y0$Hk37x8 z4w>rQN6P9z)f!-O44o06i-o|I>wNmd>Xa%FPBrM+p4~4)fm-sSYPXAjdS`b?oX zK2#UQB7P1RF!;(N+8SO>Pa8x$Jt~m*6}NZ?S6n#rupwiY*+;=8QI^#4Sn{PGN2s*& z-*7EEa$*?_hAX;}X64l#hCW9u#;YT*M19;VEqj6GV#*ho8t7)GqZgPvHPU=P7aO2G<6K-NcMSK{ou(l9RvGt4vEY?WrKP*1overBADaF{Utms5Ony=SaoL_efLDIO%MCL7+n_ z*nCQXm-|`fm?=5$k`3bLrh9KK7~QB-uuh*|`o8X`>UemX(GN&E7cHI_K77TGC6ipv z5xh~rLc9`LKd&Vgh{?3$ooE@4rY`6CmABH{5~(K^sGvKJcI2Jge{BogLnQ(iHDqYA z1#e7u+=ZQ8ep@b}gWi%qw$*%Lc)6hQ%!0xq!WRcsKbMOZBC57$!nQaencidK z4S&mOR(1QQcnxbEuB7HuQY2qRG@xb1Dn|9fvWPAEzQya?HOa&zxHT$=!8c?_N!j=@ zI@FXTN3LMauH@kZSNw=6f=5a`&@%1VB$ofoBdL60g?I7dGYRp8E0%bsU!9VqyT%&0 z-z6JmmwtbDQjM+fOT>aZK;X$bE@aK#=6n=33v^UYCJxnR!UZM{O916?$(=VH&%a53 zRx}^HRFuVuwU6!tB))|uVKG3pr&bh|e82j${?_~}!62VAV%p#U7MYT#iI-lKr72hAP5ldgg7(($$3%!r397>yHNY}QR% zmG>qwpq4HSgov#&LJ_PUUxQ!_6Mt?9pxMaKm!}V9KajNB!8!I$4WDS+j%wL@vw9K{ z8J|TyN6P4odLd2eF0%afPTTAgb-L7pUBIHCUrcndD3*z5Dli`X{`y7lm+)pZPVvsh zou}yX)!Mm4$SNc^tW+|!YPG zPu7ity$s6{_$O>AF`{?ifpvU5U}fagNuM1*tfza=TdLsZ^=_aPn}BKrH|(z;-+)0c z*qb(=-yqRY?1`eQ1e zd)Ke5vS58yh9j<>i@jkzdRmd;VhyM0?b!J1exBn10HgEHzd^sb^Y~9BP-9C3o8TWnsQlm0peStRz zK`~!lr4)0W0W;dD-9xbHxAn{x4Q3Z>RH!R=nG4vUFJJ+r^w|G;Qev;e!+sH5n|=cB zU4?q28&`fLeWMA;BGglx$YEkr;a)fs;vey+UO#HaJh#mJdUM7F+`I}sV=3_HiTvp7 zfm15|&ElEHZ{)^*0I@_fkW|lD9rUvM04<8+?ltg_?GgvxmBpZk`aqpjU;T$rjQF1* z@1?E~f8y|XeQ~juCAx*VAKQl928l8E*CrmY3ERn5^Z-tbkj?C+XGuG1>YU_76q~`V zui0l03LkKb^ZDkp%}>QMp)_HW36&nG4%O#NeIwaxF?=E>p`nP%vlcQR)A%Uy`5{Dt z1FSG~>+dm{nf3NbbNm{2=%YvE)f1%IWXlpv@+rQ1Y=FTG&sI0c5x47zFHkF2Vck>^ z_l$sTsR<>PF#(~weaj53Z=RrsDN9y(zBdMWlw3mQ_+tXF+*%bJ5KbIZdJ7`l=LA}` z5sK!_9GL}@URq=hw5RwQxPJnmo9P1J@_tJ0!y8OMWX+;@Em;;JuT?yZz>tkZuel%F+4X@K3$S> zD(K%RWw#BX|0<-nHe^&?b)GKnJq&FUF{xuSGqf? zsPooiAr$yXFfT@6BIfy4B9(QbC1kMnAV6d>yY+@C_V#sT{qw)y@z!%XSX(g3{cnM9 z@WZ~1+nSc!4@=db5>c@m{un2AH{}%8 zH#C?=Te5^WX*L=~%d|!d667|cCDpV#A{D8_TmGQkbh8{#2uBk$PI#452s(;PyLFQ# z+3La;l^b;|ZkI?NRR^s*p1X=H+Y}HvjY2R_X86=yJXm3yGMZZb20$8S6T%zvShP4s zRMEicWu%1;xC!&=Gt_C8>7QR{Jp0r0h8*9jSVp(G#e(EpcB@`n4l0UVgdPgsFsO}% z@(j_3k)6*IP>bce3oc(XK&)O@&D7~-q_W9pIbJ$f-AT^--;O>c{<4x=aeLzb;%NG1 zWI)gc2wbr2rJjW^-IvW?az+cS5ua$PDeVmSpj&B5Mtw_OIpB_Vogb82Y$_c+X-{Nu zb>;oT%BDG}VDfL7@q?S5741FFR*}TmjybUb#J+q2JlHa z75M?ienzPJv%${e`nql}@qIVZ&z=;}HT4VSsJVv2eHgNbzA#k6umv$?+|Xsd3YC|RE~RF!|SH*ESOBf<=(O$lh4O+QBn{iJmY4m;;^Og&CL&u zp+-A5kUD3wXMgXh_7P*~q76DG4`{voO<}cE<8Ef_46i%*n(9>c^B;@MlDO_bL%2X6j61kf|;yH$Yv0DdaQWy57*q(0u*x)uZ z7s%;Po;yVHwWYgbJbksqnJAgX98rKxrrY+z`7nN#l9IbAXw}B}-n-%VvR7MDjR9OP z&^L=lOQ_X2WqN#FoD-$aAcZd~_#HM!m$2ySTG@A%Dx)_rwupLua?@Chv zQSBJGKSne;`Jdk`(rMO7|5kLPwy7ne419}7Hi>trD0y^|IJ9}o0nFCjR}&T-ED&q4 zd6?_F(ZE3sq&iB9Cm4^aXm#5WR~c_Y;(gnl{gZd$4XYM}uX|WvzHW$Vf`~Y9v$H?h zqp-W9h4AwvEH}_Vwv1gQuqq!Ed^)v-7r1Ki|I@yKN4e#ATAg8Ew~E@hjT|t=o>T4B z|NfT+!Pm>$Hl;#2y!B7mj|qJM;e?M_wox(4wOj)*9z*apAaGku?lpQlv44O(Ob)dW zbLhVFPF6;|zBGGW)#CmPYZ#o4NPCma46#+r#w?wgG+VdGcyuJQ&>Yok9HxY5Ac*Uw zKac)M<5*8S_F|iId-Cghvk}JXSt2WkujNzbHTZvU5_U?q!p@*C`Y5Mu7&bbx6e*KT zbh1sx7<*Cl>A{hMcDA3ol-=g$P$Zu&QN__z$sd)%mrQ*jf=+0!QtgcZ7rGSmz1-E%39@2g>rRhjqnITvK_P z8f+JFp#~G-4~;wBD08Ipm)Z7?nk&L`Za_FrowOyom$L8Tq!rP-#H;!(C9g!LsX!rf zn#v70%CD>56S6(0CdNsyR+cH}+%?S=OR|Jbs+107wLBAg%=a4R%-m)hzUXUD0E`pg z0yq~}%-k}uStp9muvvRz_+MoG%UI`AHCcVv&(xch`RB|HI@eivnW68tDlKgF(_Ixq z%j`jh%k5<($Nm2??Z=zR?1-Wrpd7eBBYG8LI;(|GPLOKa8H{&wcw@feZ_+XtUGRbe zL{fzd5{=I94(UvJsblN;qmvki9fD?mbg9Iz@Us>#)F0JJR?3 zux&DyT(K}%;%fkD_QlO@mbK!Fh2FW%gV2b3f~8lj#Qb3tBJghY@P;m9Ttmb9-`yVr z=WH-?IdEqg4uc(Q-8dW#FE~!?M-`PW#9#&Dmnx_%ZXyy)S>PuNd}v%~DF# z)vum{Ii*~ED}&rmUz==n43wNKobYEs^ehgtAB=e+lq}7T*;r7yCDVuM0Z+QV%m*po zQ~9~HClorG4a5Sbkausa`S|i=RU7*CEnYbLs5flYMuou?fJqW!)ccv{`6v?i^YF~^o|(DC8kYWHeI6F&OC;Vnm)_#7WpIDdt%~i z63FdAT}?U(%xc-Q^OIw97@1OLg>Gy{Nh)l41bep%knvJ6`Re>AGaYgU-fH=t2}p0- zxQS}?qI5M(OxTXC{_y8`gSor0!+uk#tjG!CgHd_Tw&p)EULIWjg8Gm@?JI0dhaT-x zd`>sLMpr&%RRp0($h7~}m7$48$GydLGN()qm!3BENo;8orCP+@Yr<;8=uJbLj|_$R zvb38pvHH+{doSjKc2;sO}Zn$nzSql7+A z%<8G*#Z$UCxb;IqRj1<%aum#6=l2Y$Rwt58D)X78PyQBo*J&VBCHjlVei^a|9mJ1k zqwt3`V;t)=Rn*LhN9UDE`DMzamejsQxJ&D*$0^O>76*%)L`^m=RE#|TaN&@t$Bdba z?|S-SKwW8R|FdJsIBhj-UJY%1N(Zkul?UuWR2HhOb2w_yz~#z# zDOhD9#ZTcz*XOJ_-pBJ%C52G^)|3AmUNCzoDI98u_ndt}5YtgPUC@`V zsw3+|TzL*!W=meZ{;UL($h+uIMo-G$C4FhLkNoj8kga(wgfZh;s=;OMXBjcoh?gY; z?Xfsk=nfp|SnJw3f(&9>V3;C3zHJm|EHA)p;gcEP1g^IzjMdh|_1$$yG3dLI<+1;k zfBi^e{1l(%-zPVAy35%4GU+9!GNG8Qz$Uf2Aak<$ zGI8%02CP=|LZ0e6b~=;gS)zr!lK+HWO|z9(4HSAVkb=jO(D_`$H;n{QpureldeA?A z-fC*)J&?rVT9hEe0=IbU{i zye!V(QdyguZuv-*76ADyNRD1b2|f#_(-@tk_A+ZQZnUVMf4RsWAtc9rO0FIB(q1vC z?p@YL#=M|sXF#JYVhPeFnOI)r)AOQ_<+fi9deVay^Kw@{ZU#s*x;oRxHceTySJdQ& zX?NjdeY+gZO($;hH?v;Yi7kA=_#vcaA2QwcHc+UVe6i6`r#a=g>J!mF_=c1R2oe7_ zkZfa88oI(1$JtK8c%gG~XIt$jhHwR-TSH-QKJxavC*%UB!wo)oYWPrd+h5 z0W*`BVJ8Rpe0JgQRb0rN6QpU(Q)c&_=Me(@0vCAi;d zgO*ftkVQQPV#Y2|p4vlwcgpKOM5$3?aXyF+s?=8rU(z*qR@b3Q8&0LZ=K;5@Az3bB z(LA#N5=E!!e4v}TjkJ<}G_m*iS|{4$9L^c4TE6X{Nbd7F-+1M4IlbHb!%K_;pq=Qw z7hx4sl+$GkZh~Kac6P~WNjeVjg#l0Y*XoZ-yWvMjfCQ@(vKjFoF3ea>3r;DASUn}< ziiT67A`j}ypSLH~es~jxd!cu2hzRgB9HC@S%G1v*#dx%35zmSKlFQH($H%=_Cz;`V zl9i)TI|(3mgQVSmerc@ne!;QpkG&1Y$ULW4z}+F|jiJBATP{gV4V=Io2|{;&b)<;d zX*-S#H2r=VKpcdE+tNcS`q$HyYUFQxhDj z^O!>co`gays1`Tsdv##`Hcmd#^8#)kP^`d% zB|+xA?l&035p>>O-0-(YEiPW%VRM9mIN4;k8}BvZnsAWn@>~UmZ1N8}uM8d-7!Bl@ zOO06cdA{o*4wbX&h^0_g;*fpW3wIE_eKC^&7GUuBM?28=A`@O}cnE3Fu$-KTmr9-= zYKzcG)oAsbV>u+N5j-2ZDnYb4=vR1BM5tUC!pv&vC>4OYoV90n#tSAg6?Ge_==T5lT^nJw=pvEOKb~Cx`~q`=G`xumzn|BSa3Zmr~|`(l}*))eC??he67G^BUH4>YD-CV`qzM{+V4v_MzCki$7o9Y z6gB3=x>eQ9;k)moPG|&Gkazuj+VG%a&J(M@$N@ihp>)Unrds5dWYmlA2qP($qtv$| zUAfr3<8^PJ<6~{1=2pwOQMR;3O|t?!`-dR8rjE|zxl-z#aY;=xY>mPJzf*w3^()R| zf4ZcAcr|U`11yEZHvtc=DhM?nKTF?TY`(9f1GKetqDvs9aGGfN>%JFWZGmUzCuOZ} z$5VCX$fqtBqsE3Zxb2zo;@&kxF7zhJ0zY|LBIys9Fzv1 +AGCGCCTAGCTTTGGCACTTCAGGAGCGCCCGGAAATAATGGCGGGCGATGAAGGTTCTG +TAGGTACGCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAAC +CCGTGGCTTTCACACTCCCTCCTATGGGTAATCAATCCAGAAAGGGGCCGAAATGCAAAA +GTCTTAAGGACTCTGCGAGGCAAAGTACGGGCGAACTAAACCCCCGTGACAGGTCAGACG +TTGTTTCGGCAATCTGTCGCGCTCCCACACCTATAAGCGTACACCGTCTCTTCTGCCAGC diff --git a/ariba/tests/data/cluster_test_choose_best_gene/genes.fa b/ariba/tests/data/cluster_test_choose_best_gene/genes.fa new file mode 100644 index 00000000..ebf51a01 --- /dev/null +++ b/ariba/tests/data/cluster_test_choose_best_gene/genes.fa @@ -0,0 +1,18 @@ +>1 +AGCGCCTAGCTTTGGCACTTCAGGAGCGCCCGGAAATAATGGCGGGCGATGAAGGTTCTG +TAGGTACGCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAAC +CCGTGGCTTTCACACTCCCTCCTATGGGTAATCAATCCAGAAAGGGGCCGAAATGCAAAA +GTCTTAAGGACTCTGCGAGGCAAAGTACGGGCGAACTAAACCCCCGTGACAGGTCAGACG +TTGTTTCGGCAATCTGTCGCGCTCCCACACCTATAAGCGTACACCGTCTCTTCTGCCAGC +>2 +ATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAAT +TATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACT +GCCAGTGGCATCTGTGTAAGCGCTTAGCCACGACGGTTGAGTAGGCCACATCCAAGTTCC +TGACGTTTTTTAGCGAGGTTGTCCGAGCGGCCGGCTGCTAGCTCTCTTATCGTGTGAACT +GATTGTTGTTTCTCAAGAGCTCGTTTTTTGCTTGCGAAG +>3 +AGGTGTACCCGTAAGCCGCTACTAGGTAATGATGAGCTAAGGGTCGACGCCTTCTCCGCA +AGATCCTATCATGATCACAAGTGCAGTACACGCGGACGTTATCGAAGAATGTTGAATTTT +TCATCGGATACAGGCAGGACCAGGTACAGATGGTGAGTTTATTTGTATAATGGCCCCGGT +TAGCTAGTACCGCGCCTGTGCGATATCCCCATTTTGCTCCTGCCGTGCTACTAGCGTCGC +CTCAACATCCGCCTCAGCAAGGACTATAATTGCGCAAAGCAAGTGCCAAGAACATTTGGT diff --git a/ariba/tests/data/cluster_test_choose_best_gene/reads_1.fq b/ariba/tests/data/cluster_test_choose_best_gene/reads_1.fq new file mode 100644 index 00000000..bd98e88d --- /dev/null +++ b/ariba/tests/data/cluster_test_choose_best_gene/reads_1.fq @@ -0,0 +1,180 @@ +@1:1:142:249/1 +CTATGGGTAATCAATCCAGAAAGGGGCCGAAATGCAAAAGTCTTAAGGAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:2:79:164/1 +CTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAACCCGTGGCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:3:66:162/1 +ACGCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:4:113:217/1 +CTGTTAACCCGTGGCTTTCACACTCCCTCCTATGGGTAATCAATCCAGAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:5:69:174/1 +CAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:6:49:156/1 +ATGAAGGTTCTGTAGGTACGCAAGATCCCTCTTAATCACAGTGGTGTAAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:7:103:194/1 +GGGTCAGACCCTGTTAACCCGTGGCTTTCACACTCCCTCCTATGGGTAAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:8:94:181/1 +GTAATCTGCGGGTCAGACCCTGTTAACCCGTGGCTTTCACACTCCCTCCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:9:100:206/1 +TGCGGGTCAGACCCTGTTAACCCGTGGCTTTCACACTCCCTCCTATGGGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:10:93:185/1 +TGTAATCTGCGGGTCAGACCCTGTTAACCCGTGGCTTTCACACTCCCTCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:11:72:166/1 +GATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAACC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:12:98:187/1 +TCTGCGGGTCAGACCCTGTTAACCCGTGGCTTTCACACTCCCTCCTATGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:13:111:229/1 +CCCTGTTAACCCGTGGCTTTCACACTCCCTCCTATGGGTAATCAATCCAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:14:73:170/1 +ATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAACCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:15:68:164/1 +GCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:16:130:234/1 +ATCTGTGTAAGCGCTTAGCCACGACGGTTGAGTTGGCCACATCCAAGTTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:17:78:190/1 +TTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:18:36:130/1 +TGCGCCACGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:19:31:119/1 +GTACTTGCGCCACGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:20:45:166/1 +CTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:21:50:163/1 +TTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:22:119:227/1 +CTGCCAGTGGCATCTGTGTAAGCGCTTAGCCACGACGGTTGAGTTGGCCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:23:85:191/1 +TCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:24:34:149/1 +CTTGCGCCACGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:25:139:236/1 +AGCGCTTAGCCACGACGGTTGAGTTGGCCACATCCAAGTTCCTGACGTTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:26:73:145/1 +TCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:27:39:139/1 +GCCACGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:28:73:180/1 +TCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:29:143:216/1 +CTTAGCCACGACGGTTGAGTTGGCCACATCCAAGTTCCTGACGTTTTTTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:30:73:170/1 +TCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:31:155:249/1 +GAGTTTATTTGTATAATGGCCCCGGTTAGCTAGTACCGCGCCTGTGCGAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:32:27:129/1 +TAATGATGAGCTAAGGGTCGACGCCTTCTCCGCAATGGAGATCCTATCAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:33:15:99/1 +GCCGCTACTAGGTAATGATGAGCTAAGGGTCGACGCCTTCTCCGCAATGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:34:81:172/1 +ACAAGTGCAGTACACGCGGACGTTATCGAAGAATGTTGAATCATCGGATA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:35:145:243/1 +TACAGATGGTGAGTTTATTTGTATAATGGCCCCGGTTAGCTAGTACCGCG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:36:62:163/1 +TGGAGATCCTATCATGATCACAAGTGCAGTACACGCGGACGTTATCGAAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:37:117:219/1 +TGAATCATCGGATACAGGCAGGACCAGGTACAGATGGTGAGTTTATTTGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:38:57:145/1 +CGCAATGGAGATCCTATCATGATCACAAGTGCAGTACACGCGGACGTTAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:39:40:152/1 +AGGGTCGACGCCTTCTCCGCAATGGAGATCCTATCATGATCACAAGTGCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:40:95:200/1 +CGCGGACGTTATCGAAGAATGTTGAATCATCGGATACAGGCAGGACCAGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:41:24:119/1 +AGGTAATGATGAGCTAAGGGTCGACGCCTTCTCCGCAATGGAGATCCTAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:42:41:153/1 +GGGTCGACGCCTTCTCCGCAATGGAGATCCTATCATGATCACAAGTGCAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:43:86:189/1 +TGCAGTACACGCGGACGTTATCGAAGAATGTTGAATCATCGGATACAGGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:44:25:110/1 +GGTAATGATGAGCTAAGGGTCGACGCCTTCTCCGCAATGGAGATCCTATC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:45:99:196/1 +GACGTTATCGAAGAATGTTGAATCATCGGATACAGGCAGGACCAGGTACA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII diff --git a/ariba/tests/data/cluster_test_choose_best_gene/reads_2.fq b/ariba/tests/data/cluster_test_choose_best_gene/reads_2.fq new file mode 100644 index 00000000..23083a69 --- /dev/null +++ b/ariba/tests/data/cluster_test_choose_best_gene/reads_2.fq @@ -0,0 +1,180 @@ +@1:1:142:249/2 +TGGCAGAAGAGACGGTGTACGCTTATAGGTGTGGGAGCGCGACAGATTGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:2:79:164/2 +CGCCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTCGGCCCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:3:66:162/2 +CCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTCGGCCCCTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:4:113:217/2 +GGGAGCGCGACAGATTGCCGAAACAACGTCTGACCTGTCACGGGGGTTTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:5:69:174/2 +GGGTTTAGTTCGCCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:6:49:156/2 +CTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTCGGCCCCTTTCTGGA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:7:103:194/2 +CAACGTCTGACCTGTCACGGGGGTTTAGTTCGCCCGTACTTTGCCTCGCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:8:94:181/2 +GTCACGGGGGTTTAGTTCGCCCGTACTTTGCCTCGCAGAGTCCTTAAGAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:9:100:206/2 +AGATTGCCGAAACAACGTCTGACCTGTCACGGGGGTTTAGTTCGCCCGTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:10:93:185/2 +ACCTGTCACGGGGGTTTAGTTCGCCCGTACTTTGCCTCGCAGAGTCCTTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:11:72:166/2 +TTCGCCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTCGGCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:12:98:187/2 +TGACCTGTCACGGGGGTTTAGTTCGCCCGTACTTTGCCTCGCAGAGTCCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:13:111:229/2 +GCTTATAGGTGTGGGAGCGCGACAGATTGCCGAAACAACGTCTGACCTGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:14:73:170/2 +TTAGTTCGCCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:15:68:164/2 +CGCCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTCGGCCCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:16:130:234/2 +GAACCTTCGCAAGCAAAAAACGAGCTCTTGAGAAACAACAATCAGTTCAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:17:78:190/2 +GTTCACACGATAAGAGAGCTAGCAGCCGGCCGCTCGGACAACCTCGCTAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:18:36:130/2 +GAACTTGGATGTGGCCAACTCAACCGTCGTGGCTAAGCGCTTACACAGAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:19:31:119/2 +TGGCCAACTCAACCGTCGTGGCTAAGCGCTTACACAGATGCCACTGGCAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:20:45:166/2 +GCCGGCCGCTCGGACAACCTCGCTAAAAAACGTCAGGAACTTGGATGTGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:21:50:163/2 +GGCCGCTCGGACAACCTCGCTAAAAAACGTCAGGAACTTGGATGTGGCCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:22:119:227/2 +CGCAAGCAAAAAACGAGCTCTTGAGAAACAACAATCAGTTCACACGATAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:23:85:191/2 +AGTTCACACGATAAGAGAGCTAGCAGCCGGCCGCTCGGACAACCTCGCTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:24:34:149/2 +CCTCGCTAAAAAACGTCAGGAACTTGGATGTGGCCAACTCAACCGTCGTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:25:139:236/2 +CGGAACCTTCGCAAGCAAAAAACGAGCTCTTGAGAAACAACAATCAGTTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:26:73:145/2 +GCTAAAAAACGTCAGGAACTTGGATGTGGCCAACTCAACCGTCGTGGCTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:27:39:139/2 +AAACGTCAGGAACTTGGATGTGGCCAACTCAACCGTCGTGGCTAAGCGCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:28:73:180/2 +TAAGAGAGCTAGCAGCCGGCCGCTCGGACAACCTCGCTAAAAAACGTCAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:29:143:216/2 +AACGAGCTCTTGAGAAACAACAATCAGTTCACACGATAAGAGAGCTAGCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:30:73:170/2 +AGCAGCCGGCCGCTCGGACAACCTCGCTAAAAAACGTCAGGAACTTGGAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:31:155:249/2 +CAAATGTTCTTGGCACTTGCTTTGCGCAATTATAGTCCTTGCTGAGGCGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:32:27:129/2 +CGGGGCCATTATACAAATAAACTCACCATCTGTACCTGGTCCTGCCTGTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:33:15:99/2 +TGTACCTGGTCCTGCCTGTATCCGATGATTCAACATTCTTCGATAACGTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:34:81:172/2 +AGGAGCAAAATGGGGATATCGCACAGGCGCGGTACTAGCTAACCGGGGCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:35:145:243/2 +TTCTTGGCACTTGCTTTGCGCAATTATAGTCCTTGCTGAGGCGGATGTTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:36:62:163/2 +ATGGGGATATCGCACAGGCGCGGTACTAGCTAACCGGGGCCATTATACAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:37:117:219/2 +TATAGTCCTTGCTGAGGCGGATGTTGAGGCGACGCTAGTAGCACGGCAGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:38:57:145/2 +CGCGGTACTAGCTAACCGGGGCCATTATACAAATAAACTCACCATCTGTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:39:40:152/2 +GCACAGGCGCGGTACTAGCTAACCGGGGCCATTATACAAATAAACTCACC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:40:95:200/2 +GATGTTGAGGCGACGCTAGTAGCACGGCAGGAGCAAAATGGGGATATCGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:41:24:119/2 +ATACAAATAAACTCACCATCTGTACCTGGTCCTGCCTGTATCCGATGATT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:42:41:153/2 +CGCACAGGCGCGGTACTAGCTAACCGGGGCCATTATACAAATAAACTCAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:43:86:189/2 +GACGCTAGTAGCACGGCAGGAGCAAAATGGGGATATCGCACAGGCGCGGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:44:25:110/2 +AACTCACCATCTGTACCTGGTCCTGCCTGTATCCGATGATTCAACATTCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:45:99:196/2 +TTGAGGCGACGCTAGTAGCACGGCAGGAGCAAAATGGGGATATCGCACAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII diff --git a/ariba/tests/data/cluster_test_fix_contig_orientation/gene.fa b/ariba/tests/data/cluster_test_fix_contig_orientation.gene.fa similarity index 100% rename from ariba/tests/data/cluster_test_fix_contig_orientation/gene.fa rename to ariba/tests/data/cluster_test_fix_contig_orientation.gene.fa diff --git a/ariba/tests/data/cluster_test_fix_contig_orientation/genes.fa b/ariba/tests/data/cluster_test_fix_contig_orientation/genes.fa new file mode 100644 index 00000000..5d5102b8 --- /dev/null +++ b/ariba/tests/data/cluster_test_fix_contig_orientation/genes.fa @@ -0,0 +1,10 @@ +>gene +ACTTACCGGTTCGGGGTCTAAACCAACCATTAAACTGCGACAACCATTCATCCTGGAGTA +CGCTTCGGTCCACCATGATGGAGCGCCATGTGATGGGATTTCCAACCCCGTTGTTTCAGG +ACTCATGGCATTTACCACCGACAACCGTTTATAATCCATGAGCAAGGAATACAGTGGAGA +CAGGATTGGTTGTATTGGACTGAATACATGCCCCACTGTTACCCCGAAAGTTAACACGTA +CCCATAGTTTATTTAAACTAGGCACTCCCGATCAGCCAAGACTTAAAAAGGGGGATAGGA +ATATCAACGTAGTACTTCTCGGTTGATCCGTGTTTTTTAATCTAAAATATAATGTGTAGG +CAGCTATCGTGCTAATCGTTGAAATGAGCAGGCGAAATGCCGTTTACAACGACGCTAAAC +CTCCAAGTCGAATTAAGCCAAATTGTGCCTTCCATATGACCTCCACAGATTTGGGCTGGC +ACTGTCAGCGTAGTTGCGCT diff --git a/ariba/tests/data/cluster_test_generic/gene.fa b/ariba/tests/data/cluster_test_gapfill_with_gapfiller.gene.fa similarity index 100% rename from ariba/tests/data/cluster_test_generic/gene.fa rename to ariba/tests/data/cluster_test_gapfill_with_gapfiller.gene.fa diff --git a/ariba/tests/data/cluster_test_init_no_reads_1/gene.fa b/ariba/tests/data/cluster_test_gapfill_with_gapfiller/genes.fa similarity index 100% rename from ariba/tests/data/cluster_test_init_no_reads_1/gene.fa rename to ariba/tests/data/cluster_test_gapfill_with_gapfiller/genes.fa diff --git a/ariba/tests/data/cluster_test_init_no_reads_2/gene.fa b/ariba/tests/data/cluster_test_generic/genes.fa similarity index 100% rename from ariba/tests/data/cluster_test_init_no_reads_2/gene.fa rename to ariba/tests/data/cluster_test_generic/genes.fa diff --git a/ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/genes.fa b/ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/genes.fa new file mode 100644 index 00000000..ebf51a01 --- /dev/null +++ b/ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/genes.fa @@ -0,0 +1,18 @@ +>1 +AGCGCCTAGCTTTGGCACTTCAGGAGCGCCCGGAAATAATGGCGGGCGATGAAGGTTCTG +TAGGTACGCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAAC +CCGTGGCTTTCACACTCCCTCCTATGGGTAATCAATCCAGAAAGGGGCCGAAATGCAAAA +GTCTTAAGGACTCTGCGAGGCAAAGTACGGGCGAACTAAACCCCCGTGACAGGTCAGACG +TTGTTTCGGCAATCTGTCGCGCTCCCACACCTATAAGCGTACACCGTCTCTTCTGCCAGC +>2 +ATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAAT +TATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACT +GCCAGTGGCATCTGTGTAAGCGCTTAGCCACGACGGTTGAGTAGGCCACATCCAAGTTCC +TGACGTTTTTTAGCGAGGTTGTCCGAGCGGCCGGCTGCTAGCTCTCTTATCGTGTGAACT +GATTGTTGTTTCTCAAGAGCTCGTTTTTTGCTTGCGAAG +>3 +AGGTGTACCCGTAAGCCGCTACTAGGTAATGATGAGCTAAGGGTCGACGCCTTCTCCGCA +AGATCCTATCATGATCACAAGTGCAGTACACGCGGACGTTATCGAAGAATGTTGAATTTT +TCATCGGATACAGGCAGGACCAGGTACAGATGGTGAGTTTATTTGTATAATGGCCCCGGT +TAGCTAGTACCGCGCCTGTGCGATATCCCCATTTTGCTCCTGCCGTGCTACTAGCGTCGC +CTCAACATCCGCCTCAGCAAGGACTATAATTGCGCAAAGCAAGTGCCAAGAACATTTGGT diff --git a/ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/reads_1.fq b/ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/reads_1.fq new file mode 100644 index 00000000..bd98e88d --- /dev/null +++ b/ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/reads_1.fq @@ -0,0 +1,180 @@ +@1:1:142:249/1 +CTATGGGTAATCAATCCAGAAAGGGGCCGAAATGCAAAAGTCTTAAGGAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:2:79:164/1 +CTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAACCCGTGGCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:3:66:162/1 +ACGCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:4:113:217/1 +CTGTTAACCCGTGGCTTTCACACTCCCTCCTATGGGTAATCAATCCAGAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:5:69:174/1 +CAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:6:49:156/1 +ATGAAGGTTCTGTAGGTACGCAAGATCCCTCTTAATCACAGTGGTGTAAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:7:103:194/1 +GGGTCAGACCCTGTTAACCCGTGGCTTTCACACTCCCTCCTATGGGTAAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:8:94:181/1 +GTAATCTGCGGGTCAGACCCTGTTAACCCGTGGCTTTCACACTCCCTCCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:9:100:206/1 +TGCGGGTCAGACCCTGTTAACCCGTGGCTTTCACACTCCCTCCTATGGGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:10:93:185/1 +TGTAATCTGCGGGTCAGACCCTGTTAACCCGTGGCTTTCACACTCCCTCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:11:72:166/1 +GATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAACC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:12:98:187/1 +TCTGCGGGTCAGACCCTGTTAACCCGTGGCTTTCACACTCCCTCCTATGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:13:111:229/1 +CCCTGTTAACCCGTGGCTTTCACACTCCCTCCTATGGGTAATCAATCCAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:14:73:170/1 +ATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAACCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:15:68:164/1 +GCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:16:130:234/1 +ATCTGTGTAAGCGCTTAGCCACGACGGTTGAGTTGGCCACATCCAAGTTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:17:78:190/1 +TTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:18:36:130/1 +TGCGCCACGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:19:31:119/1 +GTACTTGCGCCACGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:20:45:166/1 +CTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:21:50:163/1 +TTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:22:119:227/1 +CTGCCAGTGGCATCTGTGTAAGCGCTTAGCCACGACGGTTGAGTTGGCCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:23:85:191/1 +TCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:24:34:149/1 +CTTGCGCCACGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:25:139:236/1 +AGCGCTTAGCCACGACGGTTGAGTTGGCCACATCCAAGTTCCTGACGTTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:26:73:145/1 +TCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:27:39:139/1 +GCCACGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:28:73:180/1 +TCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:29:143:216/1 +CTTAGCCACGACGGTTGAGTTGGCCACATCCAAGTTCCTGACGTTTTTTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:30:73:170/1 +TCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:31:155:249/1 +GAGTTTATTTGTATAATGGCCCCGGTTAGCTAGTACCGCGCCTGTGCGAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:32:27:129/1 +TAATGATGAGCTAAGGGTCGACGCCTTCTCCGCAATGGAGATCCTATCAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:33:15:99/1 +GCCGCTACTAGGTAATGATGAGCTAAGGGTCGACGCCTTCTCCGCAATGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:34:81:172/1 +ACAAGTGCAGTACACGCGGACGTTATCGAAGAATGTTGAATCATCGGATA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:35:145:243/1 +TACAGATGGTGAGTTTATTTGTATAATGGCCCCGGTTAGCTAGTACCGCG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:36:62:163/1 +TGGAGATCCTATCATGATCACAAGTGCAGTACACGCGGACGTTATCGAAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:37:117:219/1 +TGAATCATCGGATACAGGCAGGACCAGGTACAGATGGTGAGTTTATTTGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:38:57:145/1 +CGCAATGGAGATCCTATCATGATCACAAGTGCAGTACACGCGGACGTTAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:39:40:152/1 +AGGGTCGACGCCTTCTCCGCAATGGAGATCCTATCATGATCACAAGTGCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:40:95:200/1 +CGCGGACGTTATCGAAGAATGTTGAATCATCGGATACAGGCAGGACCAGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:41:24:119/1 +AGGTAATGATGAGCTAAGGGTCGACGCCTTCTCCGCAATGGAGATCCTAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:42:41:153/1 +GGGTCGACGCCTTCTCCGCAATGGAGATCCTATCATGATCACAAGTGCAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:43:86:189/1 +TGCAGTACACGCGGACGTTATCGAAGAATGTTGAATCATCGGATACAGGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:44:25:110/1 +GGTAATGATGAGCTAAGGGTCGACGCCTTCTCCGCAATGGAGATCCTATC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:45:99:196/1 +GACGTTATCGAAGAATGTTGAATCATCGGATACAGGCAGGACCAGGTACA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII diff --git a/ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/reads_2.fq b/ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/reads_2.fq new file mode 100644 index 00000000..23083a69 --- /dev/null +++ b/ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/reads_2.fq @@ -0,0 +1,180 @@ +@1:1:142:249/2 +TGGCAGAAGAGACGGTGTACGCTTATAGGTGTGGGAGCGCGACAGATTGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:2:79:164/2 +CGCCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTCGGCCCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:3:66:162/2 +CCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTCGGCCCCTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:4:113:217/2 +GGGAGCGCGACAGATTGCCGAAACAACGTCTGACCTGTCACGGGGGTTTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:5:69:174/2 +GGGTTTAGTTCGCCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:6:49:156/2 +CTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTCGGCCCCTTTCTGGA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:7:103:194/2 +CAACGTCTGACCTGTCACGGGGGTTTAGTTCGCCCGTACTTTGCCTCGCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:8:94:181/2 +GTCACGGGGGTTTAGTTCGCCCGTACTTTGCCTCGCAGAGTCCTTAAGAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:9:100:206/2 +AGATTGCCGAAACAACGTCTGACCTGTCACGGGGGTTTAGTTCGCCCGTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:10:93:185/2 +ACCTGTCACGGGGGTTTAGTTCGCCCGTACTTTGCCTCGCAGAGTCCTTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:11:72:166/2 +TTCGCCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTCGGCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:12:98:187/2 +TGACCTGTCACGGGGGTTTAGTTCGCCCGTACTTTGCCTCGCAGAGTCCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:13:111:229/2 +GCTTATAGGTGTGGGAGCGCGACAGATTGCCGAAACAACGTCTGACCTGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:14:73:170/2 +TTAGTTCGCCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:15:68:164/2 +CGCCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTCGGCCCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:16:130:234/2 +GAACCTTCGCAAGCAAAAAACGAGCTCTTGAGAAACAACAATCAGTTCAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:17:78:190/2 +GTTCACACGATAAGAGAGCTAGCAGCCGGCCGCTCGGACAACCTCGCTAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:18:36:130/2 +GAACTTGGATGTGGCCAACTCAACCGTCGTGGCTAAGCGCTTACACAGAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:19:31:119/2 +TGGCCAACTCAACCGTCGTGGCTAAGCGCTTACACAGATGCCACTGGCAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:20:45:166/2 +GCCGGCCGCTCGGACAACCTCGCTAAAAAACGTCAGGAACTTGGATGTGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:21:50:163/2 +GGCCGCTCGGACAACCTCGCTAAAAAACGTCAGGAACTTGGATGTGGCCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:22:119:227/2 +CGCAAGCAAAAAACGAGCTCTTGAGAAACAACAATCAGTTCACACGATAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:23:85:191/2 +AGTTCACACGATAAGAGAGCTAGCAGCCGGCCGCTCGGACAACCTCGCTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:24:34:149/2 +CCTCGCTAAAAAACGTCAGGAACTTGGATGTGGCCAACTCAACCGTCGTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:25:139:236/2 +CGGAACCTTCGCAAGCAAAAAACGAGCTCTTGAGAAACAACAATCAGTTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:26:73:145/2 +GCTAAAAAACGTCAGGAACTTGGATGTGGCCAACTCAACCGTCGTGGCTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:27:39:139/2 +AAACGTCAGGAACTTGGATGTGGCCAACTCAACCGTCGTGGCTAAGCGCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:28:73:180/2 +TAAGAGAGCTAGCAGCCGGCCGCTCGGACAACCTCGCTAAAAAACGTCAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:29:143:216/2 +AACGAGCTCTTGAGAAACAACAATCAGTTCACACGATAAGAGAGCTAGCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:30:73:170/2 +AGCAGCCGGCCGCTCGGACAACCTCGCTAAAAAACGTCAGGAACTTGGAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:31:155:249/2 +CAAATGTTCTTGGCACTTGCTTTGCGCAATTATAGTCCTTGCTGAGGCGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:32:27:129/2 +CGGGGCCATTATACAAATAAACTCACCATCTGTACCTGGTCCTGCCTGTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:33:15:99/2 +TGTACCTGGTCCTGCCTGTATCCGATGATTCAACATTCTTCGATAACGTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:34:81:172/2 +AGGAGCAAAATGGGGATATCGCACAGGCGCGGTACTAGCTAACCGGGGCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:35:145:243/2 +TTCTTGGCACTTGCTTTGCGCAATTATAGTCCTTGCTGAGGCGGATGTTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:36:62:163/2 +ATGGGGATATCGCACAGGCGCGGTACTAGCTAACCGGGGCCATTATACAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:37:117:219/2 +TATAGTCCTTGCTGAGGCGGATGTTGAGGCGACGCTAGTAGCACGGCAGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:38:57:145/2 +CGCGGTACTAGCTAACCGGGGCCATTATACAAATAAACTCACCATCTGTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:39:40:152/2 +GCACAGGCGCGGTACTAGCTAACCGGGGCCATTATACAAATAAACTCACC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:40:95:200/2 +GATGTTGAGGCGACGCTAGTAGCACGGCAGGAGCAAAATGGGGATATCGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:41:24:119/2 +ATACAAATAAACTCACCATCTGTACCTGGTCCTGCCTGTATCCGATGATT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:42:41:153/2 +CGCACAGGCGCGGTACTAGCTAACCGGGGCCATTATACAAATAAACTCAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:43:86:189/2 +GACGCTAGTAGCACGGCAGGAGCAAAATGGGGATATCGCACAGGCGCGGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:44:25:110/2 +AACTCACCATCTGTACCTGGTCCTGCCTGTATCCGATGATTCAACATTCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:45:99:196/2 +TTGAGGCGACGCTAGTAGCACGGCAGGAGCAAAATGGGGATATCGCACAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII diff --git a/ariba/tests/data/cluster_test_get_gene/gene.fa b/ariba/tests/data/cluster_test_get_gene/gene.fa deleted file mode 100644 index c20d11d8..00000000 --- a/ariba/tests/data/cluster_test_get_gene/gene.fa +++ /dev/null @@ -1,2 +0,0 @@ ->name_of_gene -CATGCGAAAGAAAAC diff --git a/ariba/tests/data/cluster_test_get_gene/reads_1.fq b/ariba/tests/data/cluster_test_get_gene/reads_1.fq deleted file mode 100644 index 6ff1e12f..00000000 --- a/ariba/tests/data/cluster_test_get_gene/reads_1.fq +++ /dev/null @@ -1,4 +0,0 @@ -@read1/1 -ACGTACGT -+ -IIIIIIII diff --git a/ariba/tests/data/cluster_test_get_gene/reads_2.fq b/ariba/tests/data/cluster_test_get_gene/reads_2.fq deleted file mode 100644 index 2eb387ff..00000000 --- a/ariba/tests/data/cluster_test_get_gene/reads_2.fq +++ /dev/null @@ -1,4 +0,0 @@ -@read1/2 -ACGTACGT -+ -IIIIIIII diff --git a/ariba/tests/data/cluster_test_get_total_alignment_score/genes.fa b/ariba/tests/data/cluster_test_get_total_alignment_score/genes.fa new file mode 100644 index 00000000..ebf51a01 --- /dev/null +++ b/ariba/tests/data/cluster_test_get_total_alignment_score/genes.fa @@ -0,0 +1,18 @@ +>1 +AGCGCCTAGCTTTGGCACTTCAGGAGCGCCCGGAAATAATGGCGGGCGATGAAGGTTCTG +TAGGTACGCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAAC +CCGTGGCTTTCACACTCCCTCCTATGGGTAATCAATCCAGAAAGGGGCCGAAATGCAAAA +GTCTTAAGGACTCTGCGAGGCAAAGTACGGGCGAACTAAACCCCCGTGACAGGTCAGACG +TTGTTTCGGCAATCTGTCGCGCTCCCACACCTATAAGCGTACACCGTCTCTTCTGCCAGC +>2 +ATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAAT +TATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACT +GCCAGTGGCATCTGTGTAAGCGCTTAGCCACGACGGTTGAGTAGGCCACATCCAAGTTCC +TGACGTTTTTTAGCGAGGTTGTCCGAGCGGCCGGCTGCTAGCTCTCTTATCGTGTGAACT +GATTGTTGTTTCTCAAGAGCTCGTTTTTTGCTTGCGAAG +>3 +AGGTGTACCCGTAAGCCGCTACTAGGTAATGATGAGCTAAGGGTCGACGCCTTCTCCGCA +AGATCCTATCATGATCACAAGTGCAGTACACGCGGACGTTATCGAAGAATGTTGAATTTT +TCATCGGATACAGGCAGGACCAGGTACAGATGGTGAGTTTATTTGTATAATGGCCCCGGT +TAGCTAGTACCGCGCCTGTGCGATATCCCCATTTTGCTCCTGCCGTGCTACTAGCGTCGC +CTCAACATCCGCCTCAGCAAGGACTATAATTGCGCAAAGCAAGTGCCAAGAACATTTGGT diff --git a/ariba/tests/data/cluster_test_get_total_alignment_score/reads_1.fq b/ariba/tests/data/cluster_test_get_total_alignment_score/reads_1.fq new file mode 100644 index 00000000..bd98e88d --- /dev/null +++ b/ariba/tests/data/cluster_test_get_total_alignment_score/reads_1.fq @@ -0,0 +1,180 @@ +@1:1:142:249/1 +CTATGGGTAATCAATCCAGAAAGGGGCCGAAATGCAAAAGTCTTAAGGAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:2:79:164/1 +CTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAACCCGTGGCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:3:66:162/1 +ACGCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:4:113:217/1 +CTGTTAACCCGTGGCTTTCACACTCCCTCCTATGGGTAATCAATCCAGAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:5:69:174/1 +CAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:6:49:156/1 +ATGAAGGTTCTGTAGGTACGCAAGATCCCTCTTAATCACAGTGGTGTAAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:7:103:194/1 +GGGTCAGACCCTGTTAACCCGTGGCTTTCACACTCCCTCCTATGGGTAAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:8:94:181/1 +GTAATCTGCGGGTCAGACCCTGTTAACCCGTGGCTTTCACACTCCCTCCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:9:100:206/1 +TGCGGGTCAGACCCTGTTAACCCGTGGCTTTCACACTCCCTCCTATGGGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:10:93:185/1 +TGTAATCTGCGGGTCAGACCCTGTTAACCCGTGGCTTTCACACTCCCTCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:11:72:166/1 +GATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAACC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:12:98:187/1 +TCTGCGGGTCAGACCCTGTTAACCCGTGGCTTTCACACTCCCTCCTATGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:13:111:229/1 +CCCTGTTAACCCGTGGCTTTCACACTCCCTCCTATGGGTAATCAATCCAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:14:73:170/1 +ATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAACCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:15:68:164/1 +GCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:16:130:234/1 +ATCTGTGTAAGCGCTTAGCCACGACGGTTGAGTTGGCCACATCCAAGTTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:17:78:190/1 +TTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:18:36:130/1 +TGCGCCACGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:19:31:119/1 +GTACTTGCGCCACGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:20:45:166/1 +CTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:21:50:163/1 +TTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:22:119:227/1 +CTGCCAGTGGCATCTGTGTAAGCGCTTAGCCACGACGGTTGAGTTGGCCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:23:85:191/1 +TCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:24:34:149/1 +CTTGCGCCACGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:25:139:236/1 +AGCGCTTAGCCACGACGGTTGAGTTGGCCACATCCAAGTTCCTGACGTTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:26:73:145/1 +TCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:27:39:139/1 +GCCACGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:28:73:180/1 +TCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:29:143:216/1 +CTTAGCCACGACGGTTGAGTTGGCCACATCCAAGTTCCTGACGTTTTTTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:30:73:170/1 +TCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:31:155:249/1 +GAGTTTATTTGTATAATGGCCCCGGTTAGCTAGTACCGCGCCTGTGCGAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:32:27:129/1 +TAATGATGAGCTAAGGGTCGACGCCTTCTCCGCAATGGAGATCCTATCAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:33:15:99/1 +GCCGCTACTAGGTAATGATGAGCTAAGGGTCGACGCCTTCTCCGCAATGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:34:81:172/1 +ACAAGTGCAGTACACGCGGACGTTATCGAAGAATGTTGAATCATCGGATA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:35:145:243/1 +TACAGATGGTGAGTTTATTTGTATAATGGCCCCGGTTAGCTAGTACCGCG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:36:62:163/1 +TGGAGATCCTATCATGATCACAAGTGCAGTACACGCGGACGTTATCGAAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:37:117:219/1 +TGAATCATCGGATACAGGCAGGACCAGGTACAGATGGTGAGTTTATTTGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:38:57:145/1 +CGCAATGGAGATCCTATCATGATCACAAGTGCAGTACACGCGGACGTTAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:39:40:152/1 +AGGGTCGACGCCTTCTCCGCAATGGAGATCCTATCATGATCACAAGTGCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:40:95:200/1 +CGCGGACGTTATCGAAGAATGTTGAATCATCGGATACAGGCAGGACCAGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:41:24:119/1 +AGGTAATGATGAGCTAAGGGTCGACGCCTTCTCCGCAATGGAGATCCTAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:42:41:153/1 +GGGTCGACGCCTTCTCCGCAATGGAGATCCTATCATGATCACAAGTGCAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:43:86:189/1 +TGCAGTACACGCGGACGTTATCGAAGAATGTTGAATCATCGGATACAGGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:44:25:110/1 +GGTAATGATGAGCTAAGGGTCGACGCCTTCTCCGCAATGGAGATCCTATC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:45:99:196/1 +GACGTTATCGAAGAATGTTGAATCATCGGATACAGGCAGGACCAGGTACA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII diff --git a/ariba/tests/data/cluster_test_get_total_alignment_score/reads_2.fq b/ariba/tests/data/cluster_test_get_total_alignment_score/reads_2.fq new file mode 100644 index 00000000..23083a69 --- /dev/null +++ b/ariba/tests/data/cluster_test_get_total_alignment_score/reads_2.fq @@ -0,0 +1,180 @@ +@1:1:142:249/2 +TGGCAGAAGAGACGGTGTACGCTTATAGGTGTGGGAGCGCGACAGATTGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:2:79:164/2 +CGCCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTCGGCCCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:3:66:162/2 +CCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTCGGCCCCTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:4:113:217/2 +GGGAGCGCGACAGATTGCCGAAACAACGTCTGACCTGTCACGGGGGTTTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:5:69:174/2 +GGGTTTAGTTCGCCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:6:49:156/2 +CTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTCGGCCCCTTTCTGGA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:7:103:194/2 +CAACGTCTGACCTGTCACGGGGGTTTAGTTCGCCCGTACTTTGCCTCGCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:8:94:181/2 +GTCACGGGGGTTTAGTTCGCCCGTACTTTGCCTCGCAGAGTCCTTAAGAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:9:100:206/2 +AGATTGCCGAAACAACGTCTGACCTGTCACGGGGGTTTAGTTCGCCCGTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:10:93:185/2 +ACCTGTCACGGGGGTTTAGTTCGCCCGTACTTTGCCTCGCAGAGTCCTTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:11:72:166/2 +TTCGCCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTCGGCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:12:98:187/2 +TGACCTGTCACGGGGGTTTAGTTCGCCCGTACTTTGCCTCGCAGAGTCCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:13:111:229/2 +GCTTATAGGTGTGGGAGCGCGACAGATTGCCGAAACAACGTCTGACCTGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:14:73:170/2 +TTAGTTCGCCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:15:68:164/2 +CGCCCGTACTTTGCCTCGCAGAGTCCTTAAGACTTTTGCATTTCGGCCCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:16:130:234/2 +GAACCTTCGCAAGCAAAAAACGAGCTCTTGAGAAACAACAATCAGTTCAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:17:78:190/2 +GTTCACACGATAAGAGAGCTAGCAGCCGGCCGCTCGGACAACCTCGCTAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:18:36:130/2 +GAACTTGGATGTGGCCAACTCAACCGTCGTGGCTAAGCGCTTACACAGAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:19:31:119/2 +TGGCCAACTCAACCGTCGTGGCTAAGCGCTTACACAGATGCCACTGGCAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:20:45:166/2 +GCCGGCCGCTCGGACAACCTCGCTAAAAAACGTCAGGAACTTGGATGTGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:21:50:163/2 +GGCCGCTCGGACAACCTCGCTAAAAAACGTCAGGAACTTGGATGTGGCCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:22:119:227/2 +CGCAAGCAAAAAACGAGCTCTTGAGAAACAACAATCAGTTCACACGATAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:23:85:191/2 +AGTTCACACGATAAGAGAGCTAGCAGCCGGCCGCTCGGACAACCTCGCTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:24:34:149/2 +CCTCGCTAAAAAACGTCAGGAACTTGGATGTGGCCAACTCAACCGTCGTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:25:139:236/2 +CGGAACCTTCGCAAGCAAAAAACGAGCTCTTGAGAAACAACAATCAGTTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:26:73:145/2 +GCTAAAAAACGTCAGGAACTTGGATGTGGCCAACTCAACCGTCGTGGCTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:27:39:139/2 +AAACGTCAGGAACTTGGATGTGGCCAACTCAACCGTCGTGGCTAAGCGCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:28:73:180/2 +TAAGAGAGCTAGCAGCCGGCCGCTCGGACAACCTCGCTAAAAAACGTCAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:29:143:216/2 +AACGAGCTCTTGAGAAACAACAATCAGTTCACACGATAAGAGAGCTAGCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@2:30:73:170/2 +AGCAGCCGGCCGCTCGGACAACCTCGCTAAAAAACGTCAGGAACTTGGAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:31:155:249/2 +CAAATGTTCTTGGCACTTGCTTTGCGCAATTATAGTCCTTGCTGAGGCGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:32:27:129/2 +CGGGGCCATTATACAAATAAACTCACCATCTGTACCTGGTCCTGCCTGTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:33:15:99/2 +TGTACCTGGTCCTGCCTGTATCCGATGATTCAACATTCTTCGATAACGTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:34:81:172/2 +AGGAGCAAAATGGGGATATCGCACAGGCGCGGTACTAGCTAACCGGGGCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:35:145:243/2 +TTCTTGGCACTTGCTTTGCGCAATTATAGTCCTTGCTGAGGCGGATGTTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:36:62:163/2 +ATGGGGATATCGCACAGGCGCGGTACTAGCTAACCGGGGCCATTATACAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:37:117:219/2 +TATAGTCCTTGCTGAGGCGGATGTTGAGGCGACGCTAGTAGCACGGCAGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:38:57:145/2 +CGCGGTACTAGCTAACCGGGGCCATTATACAAATAAACTCACCATCTGTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:39:40:152/2 +GCACAGGCGCGGTACTAGCTAACCGGGGCCATTATACAAATAAACTCACC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:40:95:200/2 +GATGTTGAGGCGACGCTAGTAGCACGGCAGGAGCAAAATGGGGATATCGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:41:24:119/2 +ATACAAATAAACTCACCATCTGTACCTGGTCCTGCCTGTATCCGATGATT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:42:41:153/2 +CGCACAGGCGCGGTACTAGCTAACCGGGGCCATTATACAAATAAACTCAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:43:86:189/2 +GACGCTAGTAGCACGGCAGGAGCAAAATGGGGATATCGCACAGGCGCGGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:44:25:110/2 +AACTCACCATCTGTACCTGGTCCTGCCTGTATCCGATGATTCAACATTCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@3:45:99:196/2 +TTGAGGCGACGCTAGTAGCACGGCAGGAGCAAAATGGGGATATCGCACAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII diff --git a/ariba/tests/data/cluster_test_load_final_contigs/gene.fa b/ariba/tests/data/cluster_test_init_no_reads_1/genes.fa similarity index 100% rename from ariba/tests/data/cluster_test_load_final_contigs/gene.fa rename to ariba/tests/data/cluster_test_init_no_reads_1/genes.fa diff --git a/ariba/tests/data/cluster_test_parse_assembly_bam/gene.fa b/ariba/tests/data/cluster_test_init_no_reads_2/genes.fa similarity index 100% rename from ariba/tests/data/cluster_test_parse_assembly_bam/gene.fa rename to ariba/tests/data/cluster_test_init_no_reads_2/genes.fa diff --git a/ariba/tests/data/cluster_test_rename_scaffolds/gene.fa b/ariba/tests/data/cluster_test_load_final_contigs/genes.fa similarity index 100% rename from ariba/tests/data/cluster_test_rename_scaffolds/gene.fa rename to ariba/tests/data/cluster_test_load_final_contigs/genes.fa diff --git a/ariba/tests/data/cluster_test_scaffold_with_sspace/gene.fa b/ariba/tests/data/cluster_test_parse_assembly_bam/genes.fa similarity index 100% rename from ariba/tests/data/cluster_test_scaffold_with_sspace/gene.fa rename to ariba/tests/data/cluster_test_parse_assembly_bam/genes.fa diff --git a/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/gene.fa b/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/genes.fa similarity index 100% rename from ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/gene.fa rename to ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/genes.fa diff --git a/ariba/tests/data/cluster_test_set_assembly_kmer/gene.fa b/ariba/tests/data/cluster_test_rename_scaffolds/genes.fa similarity index 100% rename from ariba/tests/data/cluster_test_set_assembly_kmer/gene.fa rename to ariba/tests/data/cluster_test_rename_scaffolds/genes.fa diff --git a/ariba/tests/data/cluster_test_scaffold_with_sspace.gene.fa b/ariba/tests/data/cluster_test_scaffold_with_sspace.gene.fa new file mode 100644 index 00000000..042775d6 --- /dev/null +++ b/ariba/tests/data/cluster_test_scaffold_with_sspace.gene.fa @@ -0,0 +1,2 @@ +>name_of_gene +AAACCCGGGTTT diff --git a/ariba/tests/data/cluster_test_scaffold_with_sspace/genes.fa b/ariba/tests/data/cluster_test_scaffold_with_sspace/genes.fa new file mode 100644 index 00000000..042775d6 --- /dev/null +++ b/ariba/tests/data/cluster_test_scaffold_with_sspace/genes.fa @@ -0,0 +1,2 @@ +>name_of_gene +AAACCCGGGTTT diff --git a/ariba/tests/data/cluster_test_set_assembly_kmer/genes.fa b/ariba/tests/data/cluster_test_set_assembly_kmer/genes.fa new file mode 100644 index 00000000..042775d6 --- /dev/null +++ b/ariba/tests/data/cluster_test_set_assembly_kmer/genes.fa @@ -0,0 +1,2 @@ +>name_of_gene +AAACCCGGGTTT From aa858b452e02a1df2c27f116e42395c61a0982ca Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 19 Feb 2015 13:33:04 +0000 Subject: [PATCH 08/18] chdir so tmp files made in output dir --- ariba/clusters.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ariba/clusters.py b/ariba/clusters.py index ba67aca8..6251e6c8 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -107,6 +107,7 @@ def __init__(self, except: raise Error('Error mkdir ' + self.outdir) + os.chdir(self.outdir) def _run_cdhit(self): r = cdhit.Runner( From aa027e74f8942f7f2ef9711a0e6c615b57e632b6 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 19 Feb 2015 13:56:13 +0000 Subject: [PATCH 09/18] Name each cluster and put name in report --- ariba/cluster.py | 5 +- ariba/clusters.py | 21 ++++--- ariba/tests/cluster_test.py | 59 ++++++++++--------- ariba/tests/clusters_test.py | 8 +-- .../tests/data/clusters_test_write_report.tsv | 2 +- 5 files changed, 53 insertions(+), 42 deletions(-) diff --git a/ariba/cluster.py b/ariba/cluster.py index 89dbcc1b..7987dbc6 100644 --- a/ariba/cluster.py +++ b/ariba/cluster.py @@ -13,6 +13,7 @@ class Error (Exception): pass class Cluster: def __init__(self, root_dir, + name, assembly_kmer=0, assembler='velvet', max_insert=1000, @@ -45,6 +46,7 @@ def __init__(self, if not os.path.exists(self.root_dir): raise Error('Directory ' + self.root_dir + ' not found. Cannot continue') + self.name = name self.reads1 = os.path.join(self.root_dir, 'reads_1.fq') self.reads2 = os.path.join(self.root_dir, 'reads_2.fq') self.gene_fa = os.path.join(self.root_dir, 'gene.fa') @@ -690,7 +692,7 @@ def _make_report_lines(self): self.report_lines = [] if len(self.variants) == 0: - self.report_lines.append([self.gene.id, self.status_flag.to_number(), len(self.gene)] + ['.'] * 11) + self.report_lines.append([self.gene.id, self.status_flag.to_number(), self.name, len(self.gene)] + ['.'] * 11) for contig in self.variants: for variants in self.variants[contig]: @@ -701,6 +703,7 @@ def _make_report_lines(self): self.report_lines.append([ self.gene.id, self.status_flag.to_number(), + self.name, len(self.gene), pymummer.variant.var_types[v.var_type], effect, diff --git a/ariba/clusters.py b/ariba/clusters.py index 6251e6c8..82f632f5 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -44,6 +44,7 @@ def __init__(self, self.reads_1 = os.path.abspath(reads_1) self.reads_2 = os.path.abspath(reads_2) self.outdir = os.path.abspath(outdir) + self.clusters_outdir = os.path.join(self.outdir, 'Clusters') self.assembler = assembler assert self.assembler in ['velvet', 'spades'] @@ -102,12 +103,11 @@ def __init__(self, self.cdhit_seq_identity_threshold = cdhit_seq_identity_threshold self.cdhit_length_diff_cutoff = cdhit_length_diff_cutoff - try: - os.mkdir(self.outdir) - except: - raise Error('Error mkdir ' + self.outdir) - - os.chdir(self.outdir) + for d in [self.outdir, self.clusters_outdir]: + try: + os.mkdir(d) + except: + raise Error('Error mkdir ' + d) def _run_cdhit(self): r = cdhit.Runner( @@ -203,7 +203,7 @@ def _bam_to_clusters_reads(self): assert ref not in filehandles_1 assert ref not in filehandles_2 - new_dir = os.path.join(self.outdir, ref) + new_dir = os.path.join(self.clusters_outdir, ref) try: os.mkdir(new_dir) except: @@ -257,6 +257,7 @@ def _init_and_run_clusters(self): self.clusters[gene] = cluster.Cluster( new_dir, + gene, assembly_kmer=self.assembly_kmer, assembler=self.assembler, max_insert=self.insert_proper_pair_max, @@ -287,6 +288,7 @@ def _write_reports(self): columns = [ '#gene', 'flag', + 'cluster', 'gene_len', 'var_type', 'var_effect', @@ -321,6 +323,9 @@ def _write_reports(self): def run(self): + cwd = os.getcwd() + os.chdir(self.outdir) + if self.verbose: print('{:_^79}'.format(' Running cd-hit ')) self._run_cdhit() @@ -341,3 +346,5 @@ def run(self): self._write_reports() if self.verbose: print('Finished writing report files. All done!') + + os.chdir(cwd) diff --git a/ariba/tests/cluster_test.py b/ariba/tests/cluster_test.py index 89d70cd4..915edd5a 100644 --- a/ariba/tests/cluster_test.py +++ b/ariba/tests/cluster_test.py @@ -52,7 +52,7 @@ def test_init_fail_files_missing(self): for d in dirs: clean_cluster_dir(d) with self.assertRaises(cluster.Error): - c = cluster.Cluster(d) + c = cluster.Cluster(d, 'name') clean_cluster_dir(d) @@ -60,7 +60,7 @@ def test_get_total_alignment_score(self): '''test _get_total_alignment_score''' cluster_dir = os.path.join(data_dir, 'cluster_test_get_total_alignment_score') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') got_score = c._get_total_alignment_score('1') expected_score = 1500 self.assertEqual(got_score, expected_score) @@ -71,7 +71,7 @@ def test_get_best_gene_by_alignment_score(self): '''test _get_best_gene_by_alignment_score''' cluster_dir = os.path.join(data_dir, 'cluster_test_get_best_gene_by_alignment_score') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') got_name = c._get_best_gene_by_alignment_score() self.assertEqual(got_name, '1') clean_cluster_dir(cluster_dir) @@ -81,7 +81,7 @@ def test_choose_best_gene(self): '''test _choose_best_gene''' cluster_dir = os.path.join(data_dir, 'cluster_test_choose_best_gene') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') expected_gene = pyfastaq.sequences.Fasta('1', ''.join([ 'AGCGCCTAGCTTTGGCACTTCAGGAGCGCCCGGAAATAATGGCGGGCGATGAAGGTTCTG', 'TAGGTACGCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAAC', @@ -100,10 +100,10 @@ def test_set_assembly_kmer(self): '''test _set_assembly_kmer''' cluster_dir = os.path.join(data_dir, 'cluster_test_set_assembly_kmer') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir, assembly_kmer=42) + c = cluster.Cluster(cluster_dir, 'name', assembly_kmer=42) self.assertEqual(c.assembly_kmer, 42) clean_cluster_dir(cluster_dir) - c = cluster.Cluster(os.path.join(data_dir, 'cluster_test_set_assembly_kmer')) + c = cluster.Cluster(os.path.join(data_dir, 'cluster_test_set_assembly_kmer'), 'name') self.assertEqual(c.assembly_kmer, 5) clean_cluster_dir(cluster_dir) @@ -112,7 +112,7 @@ def test_assemble_with_spades(self): '''test _assemble_with_spades''' cluster_dir = os.path.join(data_dir, 'cluster_test_assemble_with_spades') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') shutil.copyfile(os.path.join(data_dir, 'cluster_test_assemble_with_spades.gene.fa'), c.gene_fa) c._assemble_with_spades(unittest=True) self.assertEqual(c.status_flag.to_number(), 0) @@ -123,7 +123,7 @@ def test_assemble_with_spades_fail(self): '''test _assemble_with_spades handles spades fail''' cluster_dir = os.path.join(data_dir, 'cluster_test_assemble_with_spades') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') shutil.copyfile(os.path.join(data_dir, 'cluster_test_assemble_with_spades.gene.fa'), c.gene_fa) c._assemble_with_spades() self.assertEqual(c.status_flag.to_number(), 64) @@ -134,7 +134,7 @@ def test_scaffold_with_sspace(self): '''test _scaffold_with_sspace''' cluster_dir = os.path.join(data_dir, 'cluster_test_scaffold_with_sspace') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') shutil.copyfile( os.path.join(data_dir, 'cluster_test_scaffold_with_sspace.contigs.fa'), c.assembly_contigs @@ -149,7 +149,7 @@ def test_gap_fill_with_gapfiller_no_gaps(self): '''test _gap_fill_with_gapfiller no gaps''' cluster_dir = os.path.join(data_dir, 'cluster_test_gapfill_with_gapfiller') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') shutil.copyfile( os.path.join(data_dir, 'cluster_test_gapfill_with_gapfiller.scaffolds_no_gaps.fa'), c.scaffolder_scaffolds @@ -164,7 +164,7 @@ def test_gap_fill_with_gapfiller_with_gaps(self): '''test _gap_fill_with_gapfiller with gaps''' cluster_dir = os.path.join(data_dir, 'cluster_test_gapfill_with_gapfiller') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') shutil.copyfile( os.path.join(data_dir, 'cluster_test_gapfill_with_gapfiller.scaffolds_with_gaps.fa'), c.scaffolder_scaffolds @@ -179,7 +179,7 @@ def test_rename_scaffolds(self): '''test _rename_scaffolds''' cluster_dir = os.path.join(data_dir, 'cluster_test_rename_scaffolds') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') c.gene = pyfastaq.sequences.Fasta('name_of_gene', 'AAACCCGGGTTT') infile = os.path.join(data_dir, 'cluster_test_rename_scaffolds.in.fa') outfile = os.path.join(data_dir, 'cluster_test_rename_scaffolds.out.fa') @@ -194,7 +194,7 @@ def test_fix_contig_orientation(self): '''test _fix_contig_orientation''' cluster_dir = os.path.join(data_dir, 'cluster_test_fix_contig_orientation') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') scaffs_in = os.path.join(data_dir, 'cluster_test_fix_contig_orientation.in.fa') scaffs_out = os.path.join(data_dir, 'cluster_test_fix_contig_orientation.out.fa') shutil.copyfile(scaffs_in, c.gapfilled_scaffolds) @@ -208,7 +208,7 @@ def test_load_final_contigs(self): '''test _load_final_contigs''' cluster_dir = os.path.join(data_dir, 'cluster_test_load_final_contigs') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') contigs_file = os.path.join(data_dir, 'cluster_test_load_final_contigs.contigs.fa') shutil.copyfile(contigs_file, c.final_assembly_fa) c._load_final_contigs() @@ -226,7 +226,7 @@ def test_parse_assembly_vs_gene_coords(self): cluster_dir = os.path.join(data_dir, 'cluster_test_parse_assembly_vs_gene_coords') coords_file = os.path.join(data_dir, 'cluster_test_parse_assembly_vs_gene_coords.coords') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') shutil.copyfile(coords_file, c.assembly_vs_gene_coords) c.gene = pyfastaq.sequences.Fasta('gene', 'AAACCCGGGTTT') c._parse_assembly_vs_gene_coords() @@ -245,7 +245,7 @@ def test_parse_assembly_bam(self): '''test _parse_assembly_bam''' cluster_dir = os.path.join(data_dir, 'cluster_test_parse_assembly_bam') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') bam = os.path.join(data_dir, 'cluster_test_parse_assembly_bam.bam') assembly_fa = os.path.join(data_dir, 'cluster_test_parse_assembly_bam.assembly.fa') shutil.copyfile(bam, c.final_assembly_bam) @@ -261,7 +261,7 @@ def test_nucmer_hits_to_scaff_coords(self): '''test _nucmer_hits_to_scaff_coords''' cluster_dir = os.path.join(data_dir, 'cluster_test_generic') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') hits = [ ['1', '10', '1', '10', '10', '10', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1'], ['9', '42', '9', '42', '34', '34', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1'], @@ -296,7 +296,7 @@ def test_nucmer_hits_to_ref_coords(self): '''test _nucmer_hits_to_ref_coords''' cluster_dir = os.path.join(data_dir, 'cluster_test_generic') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') hits = [ ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'gene', 'contig1'], ['100', '142', '200', '242', '42', '42', '99.42', '1000', '1000', '1', '1', 'gene', 'contig1'] @@ -317,7 +317,7 @@ def test_whole_gene_covered_by_nucmer_hits(self): '''test _whole_gene_covered_by_nucmer_hits''' cluster_dir = os.path.join(data_dir, 'cluster_test_generic') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') c.gene = pyfastaq.sequences.Fasta('gene', 'ACGTGTGCAT') hit1 = ['1', '10', '1', '10', '10', '10', '100.00', '10', '10', '1', '1', 'gene', 'contig1'] hit2 = ['1', '5', '1', '5', '5', '5', '100.00', '10', '10', '1', '1', 'gene', 'contig2'] @@ -339,7 +339,7 @@ def test_gene_coverage_unique(self): '''test _gene_coverage_unique''' cluster_dir = os.path.join(data_dir, 'cluster_test_generic') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') c.gene = pyfastaq.sequences.Fasta('gene', 'ACGTGTGCAT') hit1 = ['1', '10', '1', '10', '10', '10', '100.00', '10', '10', '1', '1', 'gene', 'contig1'] hit2 = ['1', '5', '1', '5', '5', '5', '100.00', '10', '10', '1', '1', 'gene', 'contig2'] @@ -353,7 +353,7 @@ def test_gene_covered_by_complete_contig_with_orf(self): '''test _gene_covered_by_complete_contig_with_orf''' cluster_dir = os.path.join(data_dir, 'cluster_test_generic') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA') gene_no_orf = pyfastaq.sequences.Fasta('gene', 'GATTGAGAAGCGATGACCCATGAAGCGACCGAACGCTGA') c.gene = gene @@ -385,7 +385,7 @@ def test_gene_covered_by_at_least_one_full_length_contig(self): '''test _gene_covered_by_at_least_one_full_length_contig''' cluster_dir = os.path.join(data_dir, 'cluster_test_generic') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') c.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA') hit1 = ['1', '39', '1', '39', '39', '39', '100.00', '39', '39', '1', '1', 'gene', 'contig1'] hit2 = ['1', '20', '1', '20', '20', '20', '100.00', '39', '39', '1', '1', 'gene', 'contig1'] @@ -405,7 +405,7 @@ def test_get_mummer_variants(self): '''test _get_mummer_variants''' cluster_dir = os.path.join(data_dir, 'cluster_test_generic') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') snp_file = os.path.join(data_dir, 'cluster_test_get_mummer_variants.none.snps') shutil.copyfile(snp_file, c.assembly_vs_gene_coords + '.snps') c._get_mummer_variants() @@ -432,7 +432,7 @@ def test_filter_mummer_variants(self): '''test filter_mummer_variants''' cluster_dir = os.path.join(data_dir, 'cluster_test_generic') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') c.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA') v1 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\tT\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig')) v2 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tA\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig')) @@ -448,7 +448,7 @@ def test_get_codon_start(self): '''test _get_codon_start''' cluster_dir = os.path.join(data_dir, 'cluster_test_generic') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') tests = [ (0, 5, 3), (0, 0, 0), @@ -470,7 +470,7 @@ def test_get_variant_effect(self): '''test _get_variant_effect''' cluster_dir = os.path.join(data_dir, 'cluster_test_generic') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') c.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA') v1 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tT\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig')) v1 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tT\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig')) @@ -516,7 +516,7 @@ def test_make_assembly_vcf(self): '''test _make_assembly_vcf''' cluster_dir = os.path.join(data_dir, 'cluster_test_generic') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') c.final_assembly_fa = os.path.join(data_dir, 'cluster_test_make_assembly_vcf.assembly.fa') c.final_assembly_bam = os.path.join(data_dir, 'cluster_test_make_assembly_vcf.assembly.bam') expected_vcf = os.path.join(data_dir, 'cluster_test_make_assembly_vcf.assembly.vcf') @@ -536,7 +536,7 @@ def test_get_vcf_variant_counts(self): '''test _get_vcf_variant_counts''' cluster_dir = os.path.join(data_dir, 'cluster_test_generic') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') hit = ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1'] c.nucmer_hits = { 'scaff1': [pymummer.alignment.Alignment('\t'.join(hit))] @@ -553,7 +553,7 @@ def test_make_report_lines(self): '''test _make_report_lines''' cluster_dir = os.path.join(data_dir, 'cluster_test_generic') clean_cluster_dir(cluster_dir) - c = cluster.Cluster(cluster_dir) + c = cluster.Cluster(cluster_dir, 'cluster_name') c.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA') v1 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tT\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig')) c.variants = {'contig': [[v1]]} @@ -562,6 +562,7 @@ def test_make_report_lines(self): expected = [[ 'gene', 42, + 'cluster_name', 39, 'SNP', 'SYN', diff --git a/ariba/tests/clusters_test.py b/ariba/tests/clusters_test.py index 7d8a877e..4cf47fca 100644 --- a/ariba/tests/clusters_test.py +++ b/ariba/tests/clusters_test.py @@ -79,10 +79,10 @@ def test_bam_to_clusters_reads(self): ] got = [ - os.path.join(clusters_dir, 'ref1/reads_1.fq'), - os.path.join(clusters_dir, 'ref1/reads_2.fq'), - os.path.join(clusters_dir, 'ref2/reads_1.fq'), - os.path.join(clusters_dir, 'ref2/reads_2.fq'), + os.path.join(clusters_dir, 'Clusters/ref1/reads_1.fq'), + os.path.join(clusters_dir, 'Clusters/ref1/reads_2.fq'), + os.path.join(clusters_dir, 'Clusters/ref2/reads_1.fq'), + os.path.join(clusters_dir, 'Clusters/ref2/reads_2.fq'), ] diff --git a/ariba/tests/data/clusters_test_write_report.tsv b/ariba/tests/data/clusters_test_write_report.tsv index 0b219c9f..a7180669 100644 --- a/ariba/tests/data/clusters_test_write_report.tsv +++ b/ariba/tests/data/clusters_test_write_report.tsv @@ -1,3 +1,3 @@ -#gene flag gene_len var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt +#gene flag cluster gene_len var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt gene1 line1 gene2 line2 From f2403ff9359ab7c78c5c5016a90eac65e05b5337 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 19 Feb 2015 14:05:50 +0000 Subject: [PATCH 10/18] Add cd-hit cmdline potions --- ariba/tasks/run.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ariba/tasks/run.py b/ariba/tasks/run.py index 1a18c558..a4b5e0aa 100644 --- a/ariba/tasks/run.py +++ b/ariba/tasks/run.py @@ -12,6 +12,10 @@ def run(): parser.add_argument('reads_2', help='Name of rev reads fastq file') parser.add_argument('outdir', help='Output directory (must not already exist)') + cdhit_group = parser.add_argument_group('cd-hit options') + cdhit_group.add_argument('--cdhit_seq_identity_threshold', type=float, help='Sequence identity threshold (cd-hit option -c) [%(default)s]', default=0.9, metavar='FLOAT') + cdhit_group.add_argument('--cdhit_length_diff_cutoff', type=float, help='length difference cutoff (cd-hit option -s) [%(default)s]', default=0.9, metavar='FLOAT') + smalt_group = parser.add_argument_group('smalt options') smalt_group.add_argument('--smalt_k', type=int, help='kmer to use when indexing with smalt (smalt index -k) [%(default)s]', default=13, metavar='INT') smalt_group.add_argument('--smalt_s', type=int, help='Step length to use when indexing with smalt (see smalt index -s) [%(default)s]', default=2, metavar='INT') @@ -80,6 +84,8 @@ def run(): spades_exe=options.spades, sspace_exe=options.sspace, velvet_exe=options.velvet, + cdhit_seq_identity_threshold=options.cdhit_seq_identity_threshold, + cdhit_length_diff_cutoff=options.cdhit_length_diff_cutoff, ) c.run() From b54d3bed79036ffe0eb46e3ff3be40d4afa40e98 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 19 Feb 2015 14:09:45 +0000 Subject: [PATCH 11/18] Check cd-hit version --- ariba/external_progs.py | 4 ++++ ariba/tasks/run.py | 1 + 2 files changed, 5 insertions(+) diff --git a/ariba/external_progs.py b/ariba/external_progs.py index 7bb37f50..275fcba2 100644 --- a/ariba/external_progs.py +++ b/ariba/external_progs.py @@ -15,6 +15,7 @@ def is_in_path(prog): prog_to_default = { 'bcftools': 'bcftools', + 'cdhit': 'cd-hit', 'gapfiller': 'GapFiller.pl', 'nucmer' : 'nucmer', 'samtools': 'samtools', @@ -35,6 +36,7 @@ def is_in_path(prog): prog_to_version_cmd = { 'bcftools': ('', re.compile('^Version: ([0-9\.]+)')), + 'cdhit': ('', re.compile('CD-HIT version ([0-9\.]+) \(')), 'gapfiller': ('', re.compile('^Usage: .*pl \[GapFiller_(.*)\]')), 'nucmer': ('--version', re.compile('^NUCmer \(NUCleotide MUMmer\) version ([0-9\.]+)')), 'samtools': ('', re.compile('^Version: ([0-9\.]+)')), @@ -48,6 +50,7 @@ def is_in_path(prog): min_versions = { 'bcftools': '1.2', + 'cd-hit': '4.6', 'nucmer': '3.1', 'samtools': '1.2', 'smalt': '0.7.4', @@ -104,6 +107,7 @@ def check_versions(opts, verbose=False): to_check = [ 'bcftools', + 'cdhit', 'nucmer', 'smalt', 'samtools', diff --git a/ariba/tasks/run.py b/ariba/tasks/run.py index a4b5e0aa..40a57709 100644 --- a/ariba/tasks/run.py +++ b/ariba/tasks/run.py @@ -42,6 +42,7 @@ def run(): executables_group = parser.add_argument_group('executables locations') executables_group.add_argument('--bcftools', help='bcftools executable [bcftools]', metavar='PATH') + executables_group.add_argument('--cdhit', help=argparse.SUPPRESS) executables_group.add_argument('--gapfiller', help='GapFiller executable [GapFiller.pl]', metavar='PATH') executables_group.add_argument('--nucmer', help=argparse.SUPPRESS, default='nucmer') executables_group.add_argument('--samtools', help='samtools executable [samtools]', metavar='PATH') From ee8c0a24a9a743fadab9d52e03b2ad10a9380222 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 19 Feb 2015 15:08:56 +0000 Subject: [PATCH 12/18] Add clean option --- ariba/cluster.py | 47 ++++++++++++++++++++++++++++++++++++++++++++++ ariba/clusters.py | 34 +++++++++++++++++++++++++++++++-- ariba/tasks/run.py | 2 ++ 3 files changed, 81 insertions(+), 2 deletions(-) diff --git a/ariba/cluster.py b/ariba/cluster.py index 7987dbc6..a8ccdcda 100644 --- a/ariba/cluster.py +++ b/ariba/cluster.py @@ -40,6 +40,7 @@ def __init__(self, sspace_exe='SSPACE_Basic_v2.0.pl', velvet_exe='velvet', # prefix of velvet{g,h} spades_other=None, + clean=1, ): self.root_dir = os.path.abspath(root_dir) @@ -106,6 +107,7 @@ def __init__(self, self.unique_threshold = unique_threshold self.status_flag = flag.Flag() self.flag_file = os.path.join(self.root_dir, 'flag') + self.clean = clean self.assembly_dir = os.path.join(self.root_dir, 'Assembly') try: @@ -385,6 +387,7 @@ def _fix_contig_orientation(self): else: to_revcomp.add(hit.qry_name) + os.unlink(tmp_coords) in_both = to_revcomp.intersection(not_revcomp) for name in in_both: print('WARNING: hits to both strands of gene for scaffold. Interpretation of any variants cannot be trusted', name, file=sys.stderr) @@ -719,6 +722,49 @@ def _make_report_lines(self): ]) + def _clean(self): + if self.verbose: + print('Cleaning', self.root_dir) + + if self.clean > 0: + if self.verbose: + print(' rm -r', self.assembly_dir) + shutil.rmtree(self.assembly_dir) + + to_clean = [ + [ + 'assembly.reads_mapped.unsorted.bam', + ], + [ + 'assembly.fa.fai', + 'assembly.reads_mapped.bam.scaff', + 'assembly.reads_mapped.bam.soft_clipped', + 'assembly.reads_mapped.bam.unmapped_mates', + 'assembly_vs_gene.coords', + 'assembly_vs_gene.coords.snps', + 'genes.fa', + 'genes.fa.fai', + 'reads_1.fq', + 'reads_2.fq', + ], + [ + 'assembly.fa.fai', + 'assembly.reads_mapped.bam', + 'assembly.reads_mapped.bam.vcf', + 'assembly_vs_gene.coords', + 'assembly_vs_gene.coords.snps', + ] + ] + + for i in range(self.clean + 1): + for fname in to_clean[i]: + fullname = os.path.join(self.root_dir, fname) + if os.path.exists(fullname): + if self.verbose: + print(' rm', fname) + os.unlink(fullname) + + def run(self): self.gene = self._choose_best_gene() @@ -766,3 +812,4 @@ def run(self): self._get_vcf_variant_counts() self._make_report_lines() + self._clean() diff --git a/ariba/clusters.py b/ariba/clusters.py index 82f632f5..d2b92e3d 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -39,12 +39,14 @@ def __init__(self, velvet_exe='velvet', # prefix of velvet{g,h} cdhit_seq_identity_threshold=0.9, cdhit_length_diff_cutoff=0.9, + clean=1, ): self.db_fasta = os.path.abspath(db_fasta) self.reads_1 = os.path.abspath(reads_1) self.reads_2 = os.path.abspath(reads_2) self.outdir = os.path.abspath(outdir) self.clusters_outdir = os.path.join(self.outdir, 'Clusters') + self.clean = clean self.assembler = assembler assert self.assembler in ['velvet', 'spades'] @@ -278,7 +280,8 @@ def _init_and_run_clusters(self): spades_exe=self.spades_exe, sspace_exe=self.sspace_exe, velvet_exe=self.velvet, - spades_other=self.spades_other + spades_other=self.spades_other, + clean=self.clean, ) self.clusters[gene].run() @@ -320,6 +323,30 @@ def _write_reports(self): workbook.save(self.report_file_xls) + def _clean(self): + to_clean = [ + [ + ], + [ + self.bam + ], + [ + self.db_fasta_clustered, + self.db_fasta_clustered + '.fai', + ] + ] + + for i in range(self.clean + 1): + for fname in to_clean[i]: + if os.path.exists(fname): + if self.verbose: + print(' rm', fname) + os.unlink(fname) + + if self.clean >= 2: + if self.verbose: + print(' rm -r', self.clusters_outdir) + shutil.rmtree(self.clusters_outdir) def run(self): @@ -345,6 +372,9 @@ def run(self): print('{:_^79}'.format(' Writing report files ')) self._write_reports() if self.verbose: - print('Finished writing report files. All done!') + print('Finished writing report files. Cleaning files') + self._clean() + if self.verbose: + print('\nAll done!\n') os.chdir(cwd) diff --git a/ariba/tasks/run.py b/ariba/tasks/run.py index 40a57709..627c9209 100644 --- a/ariba/tasks/run.py +++ b/ariba/tasks/run.py @@ -38,6 +38,7 @@ def run(): other_group.add_argument('--threads', type=int, help='Number of threads for smalt and spades [%(default)s]', default=1, metavar='INT') other_group.add_argument('--assembled_threshold', type=float, help='If proportion of gene assembled (regardless of into how many contigs) is at least this value then the flag gene_assembled is set [%(default)s]', default=0.95, metavar='FLOAT (between 0 and 1)') other_group.add_argument('--unique_threshold', type=float, help='If proportion of bases in gene assembled more than once is <= this value, then the flag unique_contig is set [%(default)s]', default=0.03, metavar='FLOAT (between 0 and 1)') + other_group.add_argument('--clean', type=int, choices=[0,1,2], help='Specify how much cleaning to do. 0=none, 1=some, 2=only keep the report [%(default)s]', default=1, metavar='INT') other_group.add_argument('--verbose', action='store_true', help='Be verbose') executables_group = parser.add_argument_group('executables locations') @@ -87,6 +88,7 @@ def run(): velvet_exe=options.velvet, cdhit_seq_identity_threshold=options.cdhit_seq_identity_threshold, cdhit_length_diff_cutoff=options.cdhit_length_diff_cutoff, + clean=options.clean, ) c.run() From aae1251a9d63f8c0d4b4f87bfaca7c6476da0969 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 19 Feb 2015 15:46:51 +0000 Subject: [PATCH 13/18] Do not output name of cluster (it is confusing) --- ariba/clusters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/clusters.py b/ariba/clusters.py index d2b92e3d..2fefc38a 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -246,7 +246,7 @@ def _init_and_run_clusters(self): for gene in sorted(self.cluster_to_dir): counter += 1 if self.verbose: - print('\nAssembling cluster', counter, 'of', str(len(self.cluster_to_dir)) + ':', gene) + print('\nAssembling cluster', counter, 'of', str(len(self.cluster_to_dir))) new_dir = self.cluster_to_dir[gene] faidx.write_fa_subset( From e63dec0d55d58ac027abdda7382f7b9e70803d81 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 19 Feb 2015 15:58:55 +0000 Subject: [PATCH 14/18] Write cdhit clusters info to tsv file --- ariba/clusters.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ariba/clusters.py b/ariba/clusters.py index 2fefc38a..77c59e1b 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -46,6 +46,7 @@ def __init__(self, self.reads_2 = os.path.abspath(reads_2) self.outdir = os.path.abspath(outdir) self.clusters_outdir = os.path.join(self.outdir, 'Clusters') + self.clusters_info_file = os.path.join(self.outdir, 'clusters.tsv') self.clean = clean self.assembler = assembler @@ -123,6 +124,15 @@ def _run_cdhit(self): self.cluster_ids = r.run() + def _write_clusters_info_file(self): + f = pyfastaq.utils.open_file_write(self.clusters_info_file) + print('#Cluster\tGene', file=f) + for c in sorted([int(x) for x in self.cluster_ids]): + for seqname in sorted(list(self.cluster_ids[str(c)])): + print(c, seqname, sep='\t', file=f) + pyfastaq.utils.close(f) + + def _map_reads_to_clustered_genes(self): mapping.run_smalt( self.reads_1, @@ -333,6 +343,7 @@ def _clean(self): [ self.db_fasta_clustered, self.db_fasta_clustered + '.fai', + self.clusters_info_file, ] ] @@ -356,7 +367,9 @@ def run(self): if self.verbose: print('{:_^79}'.format(' Running cd-hit ')) self._run_cdhit() + self._write_clusters_info_file() if self.verbose: + print('Finished cd-hit\n') print('{:_^79}'.format(' Mapping reads to clustered genes ')) self._map_reads_to_clustered_genes() if self.verbose: From 4aafa12fff2df6b4f8ea7b7f479e619b30c437ba Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 19 Feb 2015 16:21:48 +0000 Subject: [PATCH 15/18] Only look for best gene when more than one to choose from --- ariba/cluster.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ariba/cluster.py b/ariba/cluster.py index a8ccdcda..3ddace3b 100644 --- a/ariba/cluster.py +++ b/ariba/cluster.py @@ -152,6 +152,17 @@ def _get_total_alignment_score(self, gene_name): def _get_best_gene_by_alignment_score(self): + cluster_size = pyfastaq.tasks.count_sequences(self.genes_fa) + if cluster_size == 1: + seqs = {} + pyfastaq.tasks.file_to_dict(self.genes_fa, seqs) + assert len(seqs) == 1 + if self.verbose: + print('No need to choose gene for this cluster (it only has one gene)') + return list(seqs.values())[0].id + + if self.verbose: + print('Choosing best gene from cluster of', cluster_size, 'genes...') file_reader = pyfastaq.sequences.file_reader(self.genes_fa) best_score = 0 best_gene_name = None From 9b1458a65b57ea65753530340573e0da19b7364c Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 20 Feb 2015 08:20:03 +0000 Subject: [PATCH 16/18] Tidy up verbose output --- ariba/cluster.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ariba/cluster.py b/ariba/cluster.py index 3ddace3b..6f2689be 100644 --- a/ariba/cluster.py +++ b/ariba/cluster.py @@ -157,12 +157,13 @@ def _get_best_gene_by_alignment_score(self): seqs = {} pyfastaq.tasks.file_to_dict(self.genes_fa, seqs) assert len(seqs) == 1 + gene_name = list(seqs.values())[0].id if self.verbose: - print('No need to choose gene for this cluster (it only has one gene)') - return list(seqs.values())[0].id + print('No need to choose gene for this cluster because only has one gene:', gene_name) + return gene_name if self.verbose: - print('Choosing best gene from cluster of', cluster_size, 'genes...') + print('\nChoosing best gene from cluster of', cluster_size, 'genes...') file_reader = pyfastaq.sequences.file_reader(self.genes_fa) best_score = 0 best_gene_name = None @@ -174,6 +175,10 @@ def _get_best_gene_by_alignment_score(self): best_score = score best_gene_name = seq.id + if self.verbose: + print('Best gene is', best_gene_name, 'with total alignment score of', best_score) + print() + return best_gene_name From 5c2177ac8c0297153b64956b3cf9500f93ec1385 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 20 Feb 2015 08:33:45 +0000 Subject: [PATCH 17/18] Add cc-hit dependency --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 7db33c8b..0d3ce95f 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ Installation ------------ ARIBA has the following dependencies, which need to be installed: + * [cd-hit] [cdhit] version >= 4.6 * [samtools and bcftools] [samtools] version >= 1.2 * [SSPACE-basic scaffolder] [sspace] * [GapFiller] [gapfiller] @@ -39,6 +40,7 @@ Usage Please read the [ARIBA wiki page] [ARIBA wiki] for usage instructions. + [cdhit]: http://weizhongli-lab.org/cd-hit/ [ARIBA wiki]: https://github.com/sanger-pathogens/ariba/wiki [gapfiller]: http://www.baseclear.com/genomics/bioinformatics/basetools/gapfiller [mummer]: http://mummer.sourceforge.net/ From ad4695651f5f23ad2f575eba1f9449ea0f20287f Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 20 Feb 2015 08:34:21 +0000 Subject: [PATCH 18/18] Tweak verbose output --- ariba/clusters.py | 16 ++++++++++------ ariba/common.py | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/ariba/clusters.py b/ariba/clusters.py index 77c59e1b..d3ce2335 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -224,6 +224,8 @@ def _bam_to_clusters_reads(self): self.cluster_to_dir[ref] = new_dir filehandles_1[ref] = pyfastaq.utils.open_file_write(os.path.join(new_dir, 'reads_1.fq')) filehandles_2[ref] = pyfastaq.utils.open_file_write(os.path.join(new_dir, 'reads_2.fq')) + if self.verbose: + print('New cluster with reads that hit:', ref, flush=True) print(read1, file=filehandles_1[ref]) print(read2, file=filehandles_2[ref]) @@ -234,6 +236,8 @@ def _bam_to_clusters_reads(self): pyfastaq.utils.close(filehandles_1[ref]) pyfastaq.utils.close(filehandles_2[ref]) + if self.verbose: + print('Total clusters to perform local assemblies:', len(self.cluster_to_dir), flush=True) def _set_insert_size_data(self): assert len(self.insert_hist) > 0 @@ -365,27 +369,27 @@ def run(self): os.chdir(self.outdir) if self.verbose: - print('{:_^79}'.format(' Running cd-hit ')) + print('{:_^79}'.format(' Running cd-hit '), flush=True) self._run_cdhit() self._write_clusters_info_file() if self.verbose: print('Finished cd-hit\n') - print('{:_^79}'.format(' Mapping reads to clustered genes ')) + print('{:_^79}'.format(' Mapping reads to clustered genes '), flush=True) self._map_reads_to_clustered_genes() if self.verbose: print('Finished mapping\n') - print('{:_^79}'.format(' Generating clusters ')) + print('{:_^79}'.format(' Generating clusters '), flush=True) self._bam_to_clusters_reads() self._set_insert_size_data() if self.verbose: - print('{:_^79}'.format(' Assembling each cluster ')) + print('{:_^79}'.format(' Assembling each cluster '), flush=True) self._init_and_run_clusters() if self.verbose: print('Finished assembling clusters\n') - print('{:_^79}'.format(' Writing report files ')) + print('{:_^79}'.format(' Writing report files '), flush=True) self._write_reports() if self.verbose: - print('Finished writing report files. Cleaning files') + print('Finished writing report files. Cleaning files', flush=True) self._clean() if self.verbose: print('\nAll done!\n') diff --git a/ariba/common.py b/ariba/common.py index a5004a40..c521f95f 100644 --- a/ariba/common.py +++ b/ariba/common.py @@ -13,7 +13,7 @@ def syscall(cmd, allow_fail=False, verbose=False): print('The following command failed with exit code', error.returncode, file=sys.stderr) print(cmd, file=sys.stderr) print('\nThe output was:\n', file=sys.stderr) - print(errors, file=sys.stderr) + print(errors, file=sys.stderr, flush=True) if allow_fail: return False, errors