From 97c5a74af91d34b2f233f263e6ef6776ac04d071 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 25 Nov 2015 09:31:58 +0000 Subject: [PATCH 1/6] Make tree for jscandy --- ariba/summary.py | 56 ++++++++++++++++- ...ary_test_newick_from_dist_matrix.distances | 4 ++ .../summary_test_newick_from_dist_matrix.tre | 1 + ...mmary_test_write_distance_matrix.distances | 4 ++ ariba/tests/summary_test.py | 60 +++++++++++++++++++ 5 files changed, 123 insertions(+), 2 deletions(-) create mode 100644 ariba/tests/data/summary_test_newick_from_dist_matrix.distances create mode 100644 ariba/tests/data/summary_test_newick_from_dist_matrix.tre create mode 100644 ariba/tests/data/summary_test_write_distance_matrix.distances diff --git a/ariba/summary.py b/ariba/summary.py index 935846b9..8865ba90 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -1,7 +1,7 @@ import os import openpyxl import pyfastaq -from ariba import flag +from ariba import flag, common class Error (Exception): pass @@ -49,6 +49,7 @@ def __init__( filenames=None, fofn=None, filter_output=True, + js_candy_prefix=None, min_id=90.0 ): if filenames is None and fofn is None: @@ -65,6 +66,7 @@ def __init__( self.filter_output = filter_output self.min_id = min_id self.outfile = outfile + self.js_candy_prefix = js_candy_prefix def _load_fofn(self, fofn): @@ -204,6 +206,55 @@ def _write_xls(self): workbook.save(self.outfile) + @staticmethod + def _distance_score_between_values(value1, value2): + if value1 != value2 and 0 in [value1, value2]: + return 1 + else: + return 0 + + + @classmethod + def _distance_score_between_lists(cls, scores1, scores2): + assert len(scores1) == len(scores2) + return sum([cls._distance_score_between_values(scores1[i], scores2[i]) for i in range(1, len(scores1))]) + + + def _write_distance_matrix(self, outfile): + if len(self.rows_out) < 3: + raise Error('Cannot calculate distance matrix to make tree for js_candy.\n' + + 'Only one sample present.') + + if len(self.rows_out[0]) < 2: + raise Error('Cannot calculate distance matrix to make tree for js_candy.\n' + + 'No genes present in output') + + with open(outfile, 'w') as f: + sample_names = [x[0] for x in self.rows_out] + print(*sample_names[1:], sep='\t', file=f) + + for i in range(1,len(self.rows_out)): + scores = [] + for j in range(2, len(self.rows_out)): + scores.append(self._distance_score_between_lists(self.rows_out[i], self.rows_out[j])) + print(self.rows_out[i][0], *scores, sep='\t', file=f) + + + @staticmethod + def _newick_from_dist_matrix(distance_file, outfile): + r_script = outfile + '.tmp.R' + + with open(r_script, 'w') as f: + print('library(ape)', file=f) + print('a=read.table("', distance_file, '", header=TRUE, row.names=1)', sep='', file=f) + print('h=hclust(dist(a))', file=f) + print('write.tree(as.phylo(h), file="', outfile, '")', sep='', file=f) + + common.syscall('R CMD BATCH ' + r_script) + os.unlink(r_script + 'out') + os.unlink(r_script) + + def run(self): self._check_files_exist() self._gather_output_rows() @@ -213,4 +264,5 @@ def run(self): else: self._write_tsv() - + if self.js_candy_prefix is not None: + self._write_js_candy_files() diff --git a/ariba/tests/data/summary_test_newick_from_dist_matrix.distances b/ariba/tests/data/summary_test_newick_from_dist_matrix.distances new file mode 100644 index 00000000..c3ffce3c --- /dev/null +++ b/ariba/tests/data/summary_test_newick_from_dist_matrix.distances @@ -0,0 +1,4 @@ +file1 file2 file3 +file1 3 2 +file2 0 1 +file3 1 0 diff --git a/ariba/tests/data/summary_test_newick_from_dist_matrix.tre b/ariba/tests/data/summary_test_newick_from_dist_matrix.tre new file mode 100644 index 00000000..a3f5ac3a --- /dev/null +++ b/ariba/tests/data/summary_test_newick_from_dist_matrix.tre @@ -0,0 +1 @@ +(file1:1.58113883,(file2:0.7071067812,file3:0.7071067812):0.8740320489); diff --git a/ariba/tests/data/summary_test_write_distance_matrix.distances b/ariba/tests/data/summary_test_write_distance_matrix.distances new file mode 100644 index 00000000..c3ffce3c --- /dev/null +++ b/ariba/tests/data/summary_test_write_distance_matrix.distances @@ -0,0 +1,4 @@ +file1 file2 file3 +file1 3 2 +file2 0 1 +file3 1 0 diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 5e274162..05f44b23 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -153,3 +153,63 @@ def test_write_tsv(self): expected = os.path.join(data_dir, 'summary_test_write_tsv.out.tsv') self.assertTrue(filecmp.cmp(tmp_out, expected, shallow=False)) os.unlink(tmp_out) + + + def test_distance_score_bewteen_values(self): + '''Test _distance_score_bewteen_values''' + tests = [ + ((0, 0), 0), + ((0, 1), 1), + ((0, 2), 1), + ((1, 0), 1), + ((1, 1), 0), + ((1, 2), 0), + ((2, 0), 1), + ((2, 1), 0), + ((2, 2), 0), + ] + + for (val1, val2), expected in tests: + self.assertEqual(expected, summary.Summary._distance_score_between_values(val1, val2)) + + # check distance calculation is commutative + for val1 in range(5): + for val2 in range(5): + d1 = summary.Summary._distance_score_between_values(val1, val2) + d2 = summary.Summary._distance_score_between_values(val2, val1) + self.assertEqual(d1, d2) + + + def test_distance_score_between_lists(self): + '''Test _distance_score_between_lists''' + list1 = ['na', 0, 0, 0, 1, 1, 1, 2, 2, 2] + list2 = ['na', 0, 1, 2, 0, 1, 2, 1, 2, 2] + self.assertEqual(3, summary.Summary._distance_score_between_lists(list1, list2)) + + + def test_write_distance_matrix(self): + '''Test _write_distance_matrix''' + s = summary.Summary('out', filenames=['spam', 'eggs']) + s.rows_out = [ + ['filename', 'gene1', 'gene2', 'gene3'], + ['file1', 0, 1, 0], + ['file2', 1, 0, 3], + ['file3', 0, 0, 4], + ] + + tmp_distances = 'tmp.test.write_distance_matrix.distances' + s._write_distance_matrix(tmp_distances) + expected = os.path.join(data_dir, 'summary_test_write_distance_matrix.distances') + self.assertTrue(filecmp.cmp(expected, tmp_distances, shallow=False)) + os.unlink(tmp_distances) + + + def test_newick_from_dist_matrix(self): + '''Test _newick_from_dist_matrix''' + tmp_tree = 'tmp.test.newick_from_dist_matrix.tre' + dist_file = os.path.join(data_dir, 'summary_test_newick_from_dist_matrix.distances') + summary.Summary._newick_from_dist_matrix(dist_file, tmp_tree) + expected = os.path.join(data_dir, 'summary_test_newick_from_dist_matrix.tre') + self.assertTrue(filecmp.cmp(expected, tmp_tree, shallow=False)) + os.unlink(tmp_tree) + From a8e0e232483a838666e48f9e4a6ec51dab439c59 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 25 Nov 2015 09:34:16 +0000 Subject: [PATCH 2/6] Do not write .RData file --- ariba/summary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/summary.py b/ariba/summary.py index 8865ba90..108f6d39 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -250,7 +250,7 @@ def _newick_from_dist_matrix(distance_file, outfile): print('h=hclust(dist(a))', file=f) print('write.tree(as.phylo(h), file="', outfile, '")', sep='', file=f) - common.syscall('R CMD BATCH ' + r_script) + common.syscall('R CMD BATCH --no-save ' + r_script) os.unlink(r_script + 'out') os.unlink(r_script) From eff1cf7f757acb1414ee568906d9a8a1be45df1c Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 25 Nov 2015 10:08:35 +0000 Subject: [PATCH 3/6] Add function _write_js_candy_csv --- ariba/summary.py | 10 ++++++++++ .../data/summary_test_write_js_candy_csv.csv | 3 +++ ariba/tests/summary_test.py | 15 +++++++++++++++ 3 files changed, 28 insertions(+) create mode 100644 ariba/tests/data/summary_test_write_js_candy_csv.csv diff --git a/ariba/summary.py b/ariba/summary.py index 108f6d39..c7db6662 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -197,6 +197,16 @@ def _write_tsv(self): pyfastaq.utils.close(f) + def _write_js_candy_csv(self, outfile): + f = pyfastaq.utils.open_file_write(outfile) + # js candy needs the "name" column. + # Names must match those used in the tree file + print('name', *self.rows_out[0][1:], sep=',', file=f) + for row in self.rows_out[1:]: + print(*row, sep=',', file=f) + pyfastaq.utils.close(f) + + def _write_xls(self): workbook = openpyxl.Workbook() worksheet = workbook.worksheets[0] diff --git a/ariba/tests/data/summary_test_write_js_candy_csv.csv b/ariba/tests/data/summary_test_write_js_candy_csv.csv new file mode 100644 index 00000000..d50d116f --- /dev/null +++ b/ariba/tests/data/summary_test_write_js_candy_csv.csv @@ -0,0 +1,3 @@ +name,gene1,gene3 +file1,1,3 +file2,2,4 diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 05f44b23..0a89dc8d 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -155,6 +155,21 @@ def test_write_tsv(self): os.unlink(tmp_out) + def test_write_js_candy_csv(self): + '''Test _write_js_candy_csv''' + tmp_out = 'tmp.test_write_js_candy.csv' + s = summary.Summary(tmp_out, filenames=['spam', 'eggs']) + s.rows_out = [ + ['filename', 'gene1', 'gene3'], + ['file1', 1, 3], + ['file2', 2, 4], + ] + s._write_js_candy_csv(tmp_out) + expected = os.path.join(data_dir, 'summary_test_write_js_candy_csv.csv') + self.assertTrue(filecmp.cmp(expected, tmp_out, shallow=False)) + os.unlink(tmp_out) + + def test_distance_score_bewteen_values(self): '''Test _distance_score_bewteen_values''' tests = [ From 124a1f70d1513d5d77738eff0097644dc677ab6b Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 25 Nov 2015 11:33:16 +0000 Subject: [PATCH 4/6] Add --js_candy option to summary --- ariba/summary.py | 12 +++++++++++- ariba/tasks/summary.py | 4 +++- .../summary_test_write_js_candy_files.csv | 4 ++++ .../summary_test_write_js_candy_files.tre | 1 + ariba/tests/summary_test.py | 19 +++++++++++++++++++ 5 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 ariba/tests/data/summary_test_write_js_candy_files.csv create mode 100644 ariba/tests/data/summary_test_write_js_candy_files.tre diff --git a/ariba/summary.py b/ariba/summary.py index c7db6662..d6625956 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -265,6 +265,16 @@ def _newick_from_dist_matrix(distance_file, outfile): os.unlink(r_script) + def _write_js_candy_files(self, outprefix): + distance_file = outprefix + '.distance_matrix' + tree_file = outprefix + '.tre' + csv_file = outprefix + '.csv' + self._write_distance_matrix(distance_file) + self._newick_from_dist_matrix(distance_file, tree_file) + os.unlink(distance_file) + self._write_js_candy_csv(csv_file) + + def run(self): self._check_files_exist() self._gather_output_rows() @@ -275,4 +285,4 @@ def run(self): self._write_tsv() if self.js_candy_prefix is not None: - self._write_js_candy_files() + self._write_js_candy_files(self.js_candy_prefix) diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py index 02cf4eda..494f446e 100644 --- a/ariba/tasks/summary.py +++ b/ariba/tasks/summary.py @@ -7,6 +7,7 @@ def run(): usage = 'ariba summary [options] [report1.tsv report2.tsv ...]', epilog = 'Files must be listed after the output file and/or the option --fofn must be used. If both used, all files in the filename specified by --fofn AND the files listed after the output file will be used as input. The input report files must be in tsv format, not xls.') parser.add_argument('-f', '--fofn', help='File of filenames of ariba reports in tsv format (not xls) to be summarised. Must be used if no input files listed after the outfile.', metavar='FILENAME') + parser.add_argument('--js_candy', help='Write JS Candy output files, named with the given prefix. Writes a tree and csv file, which can be drag and dropped straight into JS Candy', metavar='files_prefix') parser.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT') parser.add_argument('--no_filter', action='store_true', help='Do not filter rows or columns of output that are all 0 (by deafult, they are removed from the output)') parser.add_argument('outfile', help='Name of output file. If file ends with ".xls", then an excel spreadsheet is written. Otherwise a tsv file is written') @@ -20,6 +21,7 @@ def run(): fofn=options.fofn, filenames=options.infiles, filter_output=(not options.no_filter), - min_id=options.min_id + min_id=options.min_id, + js_candy_prefix=options.js_candy ) s.run() diff --git a/ariba/tests/data/summary_test_write_js_candy_files.csv b/ariba/tests/data/summary_test_write_js_candy_files.csv new file mode 100644 index 00000000..5bbee36c --- /dev/null +++ b/ariba/tests/data/summary_test_write_js_candy_files.csv @@ -0,0 +1,4 @@ +name,gene1,gene2,gene3 +file1,0,1,0 +file2,1,0,3 +file3,0,0,4 diff --git a/ariba/tests/data/summary_test_write_js_candy_files.tre b/ariba/tests/data/summary_test_write_js_candy_files.tre new file mode 100644 index 00000000..a3f5ac3a --- /dev/null +++ b/ariba/tests/data/summary_test_write_js_candy_files.tre @@ -0,0 +1 @@ +(file1:1.58113883,(file2:0.7071067812,file3:0.7071067812):0.8740320489); diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 0a89dc8d..fc5e4302 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -228,3 +228,22 @@ def test_newick_from_dist_matrix(self): self.assertTrue(filecmp.cmp(expected, tmp_tree, shallow=False)) os.unlink(tmp_tree) + + def test_write_js_candy_files(self): + '''Test _write_js_candy_files''' + tmp_prefix = 'tmp.test.write_js_candy_files' + s = summary.Summary('out', filenames=['spam', 'eggs']) + s.rows_out = [ + ['filename', 'gene1', 'gene2', 'gene3'], + ['file1', 0, 1, 0], + ['file2', 1, 0, 3], + ['file3', 0, 0, 4], + ] + s._write_js_candy_files(tmp_prefix) + expected_csv = os.path.join(data_dir, 'summary_test_write_js_candy_files.csv') + expected_tre = os.path.join(data_dir, 'summary_test_write_js_candy_files.tre') + self.assertTrue(filecmp.cmp(expected_csv, tmp_prefix + '.csv', shallow=False)) + self.assertTrue(filecmp.cmp(expected_tre, tmp_prefix + '.tre', shallow=False)) + os.unlink(tmp_prefix + '.csv') + os.unlink(tmp_prefix + '.tre') + From 161e8a1c12519fdfef486ec7a95e111c109c2ffd Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 25 Nov 2015 13:16:15 +0000 Subject: [PATCH 5/6] Always run refcheck --- ariba/tasks/run.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/ariba/tasks/run.py b/ariba/tasks/run.py index c52ccba6..800407e0 100644 --- a/ariba/tasks/run.py +++ b/ariba/tasks/run.py @@ -1,4 +1,5 @@ import argparse +import sys import pyfastaq import ariba @@ -29,6 +30,10 @@ def run(): assembly_group.add_argument('--assembler_k', type=int, help='kmer size to use with assembler. You can use 0 to set kmer to 2/3 of the read length. Warning - lower kmers are usually better. [%(default)s]', metavar='INT', default=21) assembly_group.add_argument('--spades_other', help='Put options string to be used with spades in quotes. This will NOT be sanity checked. Do not use -k or -t: for these options you should use the ariba run options --assembler_k and --threads [%(default)s]', default="--only-assembler", metavar="OPTIONS") + refcheck_group = parser.add_argument_group('refcheck options') + refcheck_group.add_argument('--refcheck_min_length', type=int, help='Minimum allowed length in nucleotides of reference gene [%(default)s]', metavar='INT', default=6) + refcheck_group.add_argument('--refcheck_max_length', type=int, help='Maximum allowed length in nucleotides of reference gene [%(default)s]', metavar='INT', default=10000) + other_group = parser.add_argument_group('Other options') other_group.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT') other_group.add_argument('--threads', type=int, help='Number of threads for bowtie2 and spades [%(default)s]', default=1, metavar='INT') @@ -58,6 +63,24 @@ def run(): ariba.external_progs.check_versions(options, verbose=options.verbose, not_required=set(['sspace', 'gapfiller'])) pyfastaq.sequences.genetic_code = options.genetic_code + + checker = ariba.refcheck.Checker( + options.db_fasta, + min_length=options.refcheck_min_length, + max_length=options.refcheck_max_length, + ) + ok, reason, seq = checker.run() + + if not ok: + print('\nInput reference file of genes failed refcheck! Cannot continue.', file=sys.stderr) + print('The first failed sequence is:\n', file=sys.stderr) + print(seq, file=sys.stderr) + print('\nIt failed for the reason:', reason, file=sys.stderr) + print('\nTo make a new fasta file called new_genes.fa, with the bad genes fixed/removed run:\n', file=sys.stderr) + print(' ariba refcheck -o new_genes', options.db_fasta, file=sys.stderr) + sys.exit(1) + + c = ariba.clusters.Clusters( options.db_fasta, options.reads_1, From 1bf542ec6bec4033345682ae291c3527a8311e6b Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 25 Nov 2015 13:24:26 +0000 Subject: [PATCH 6/6] Tweak js_candy help description --- ariba/tasks/summary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py index 494f446e..0fce90ba 100644 --- a/ariba/tasks/summary.py +++ b/ariba/tasks/summary.py @@ -7,7 +7,7 @@ def run(): usage = 'ariba summary [options] [report1.tsv report2.tsv ...]', epilog = 'Files must be listed after the output file and/or the option --fofn must be used. If both used, all files in the filename specified by --fofn AND the files listed after the output file will be used as input. The input report files must be in tsv format, not xls.') parser.add_argument('-f', '--fofn', help='File of filenames of ariba reports in tsv format (not xls) to be summarised. Must be used if no input files listed after the outfile.', metavar='FILENAME') - parser.add_argument('--js_candy', help='Write JS Candy output files, named with the given prefix. Writes a tree and csv file, which can be drag and dropped straight into JS Candy', metavar='files_prefix') + parser.add_argument('--js_candy', help='Write files that can be used as input to JS Candy, named with the given prefix. Writes a tree and csv file, which can be drag and dropped straight into JS Candy', metavar='files_prefix') parser.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT') parser.add_argument('--no_filter', action='store_true', help='Do not filter rows or columns of output that are all 0 (by deafult, they are removed from the output)') parser.add_argument('outfile', help='Name of output file. If file ends with ".xls", then an excel spreadsheet is written. Otherwise a tsv file is written')