From 97c5a74af91d34b2f233f263e6ef6776ac04d071 Mon Sep 17 00:00:00 2001
From: Martin Hunt <mh12@sanger.ac.uk>
Date: Wed, 25 Nov 2015 09:31:58 +0000
Subject: [PATCH 1/6] Make tree for jscandy

---
 ariba/summary.py                              | 56 ++++++++++++++++-
 ...ary_test_newick_from_dist_matrix.distances |  4 ++
 .../summary_test_newick_from_dist_matrix.tre  |  1 +
 ...mmary_test_write_distance_matrix.distances |  4 ++
 ariba/tests/summary_test.py                   | 60 +++++++++++++++++++
 5 files changed, 123 insertions(+), 2 deletions(-)
 create mode 100644 ariba/tests/data/summary_test_newick_from_dist_matrix.distances
 create mode 100644 ariba/tests/data/summary_test_newick_from_dist_matrix.tre
 create mode 100644 ariba/tests/data/summary_test_write_distance_matrix.distances

diff --git a/ariba/summary.py b/ariba/summary.py
index 935846b9..8865ba90 100644
--- a/ariba/summary.py
+++ b/ariba/summary.py
@@ -1,7 +1,7 @@
 import os
 import openpyxl
 import pyfastaq
-from ariba import flag
+from ariba import flag, common
 
 class Error (Exception): pass
 
@@ -49,6 +49,7 @@ def __init__(
       filenames=None,
       fofn=None,
       filter_output=True,
+      js_candy_prefix=None,
       min_id=90.0
     ):
         if filenames is None and fofn is None:
@@ -65,6 +66,7 @@ def __init__(
         self.filter_output = filter_output
         self.min_id = min_id
         self.outfile = outfile
+        self.js_candy_prefix = js_candy_prefix
 
 
     def _load_fofn(self, fofn):
@@ -204,6 +206,55 @@ def _write_xls(self):
         workbook.save(self.outfile)
 
 
+    @staticmethod
+    def _distance_score_between_values(value1, value2):
+        if value1 != value2 and 0 in [value1, value2]:
+            return 1
+        else:
+            return 0
+
+
+    @classmethod
+    def _distance_score_between_lists(cls, scores1, scores2):
+        assert len(scores1) == len(scores2)
+        return sum([cls._distance_score_between_values(scores1[i], scores2[i]) for i in range(1, len(scores1))])
+
+
+    def _write_distance_matrix(self, outfile):
+        if len(self.rows_out) < 3:
+            raise Error('Cannot calculate distance matrix to make tree for js_candy.\n' +
+                        'Only one sample present.')
+
+        if len(self.rows_out[0]) < 2:
+            raise Error('Cannot calculate distance matrix to make tree for js_candy.\n' +
+                        'No genes present in output')
+
+        with open(outfile, 'w') as f:
+            sample_names = [x[0] for x in self.rows_out]
+            print(*sample_names[1:], sep='\t', file=f)
+
+            for i in range(1,len(self.rows_out)):
+                scores = []
+                for j in range(2, len(self.rows_out)):
+                    scores.append(self._distance_score_between_lists(self.rows_out[i], self.rows_out[j]))
+                print(self.rows_out[i][0], *scores, sep='\t', file=f)
+
+
+    @staticmethod
+    def _newick_from_dist_matrix(distance_file, outfile):
+        r_script = outfile + '.tmp.R'
+
+        with open(r_script, 'w') as f:
+            print('library(ape)', file=f)
+            print('a=read.table("', distance_file, '", header=TRUE, row.names=1)', sep='', file=f)
+            print('h=hclust(dist(a))', file=f)
+            print('write.tree(as.phylo(h), file="', outfile, '")', sep='', file=f)
+
+        common.syscall('R CMD BATCH ' + r_script)
+        os.unlink(r_script + 'out')
+        os.unlink(r_script)
+
+
     def run(self):
         self._check_files_exist()
         self._gather_output_rows()
@@ -213,4 +264,5 @@ def run(self):
         else:
             self._write_tsv()
 
-
+        if self.js_candy_prefix is not None:
+            self._write_js_candy_files()
diff --git a/ariba/tests/data/summary_test_newick_from_dist_matrix.distances b/ariba/tests/data/summary_test_newick_from_dist_matrix.distances
new file mode 100644
index 00000000..c3ffce3c
--- /dev/null
+++ b/ariba/tests/data/summary_test_newick_from_dist_matrix.distances
@@ -0,0 +1,4 @@
+file1	file2	file3
+file1	3	2
+file2	0	1
+file3	1	0
diff --git a/ariba/tests/data/summary_test_newick_from_dist_matrix.tre b/ariba/tests/data/summary_test_newick_from_dist_matrix.tre
new file mode 100644
index 00000000..a3f5ac3a
--- /dev/null
+++ b/ariba/tests/data/summary_test_newick_from_dist_matrix.tre
@@ -0,0 +1 @@
+(file1:1.58113883,(file2:0.7071067812,file3:0.7071067812):0.8740320489);
diff --git a/ariba/tests/data/summary_test_write_distance_matrix.distances b/ariba/tests/data/summary_test_write_distance_matrix.distances
new file mode 100644
index 00000000..c3ffce3c
--- /dev/null
+++ b/ariba/tests/data/summary_test_write_distance_matrix.distances
@@ -0,0 +1,4 @@
+file1	file2	file3
+file1	3	2
+file2	0	1
+file3	1	0
diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py
index 5e274162..05f44b23 100644
--- a/ariba/tests/summary_test.py
+++ b/ariba/tests/summary_test.py
@@ -153,3 +153,63 @@ def test_write_tsv(self):
         expected = os.path.join(data_dir, 'summary_test_write_tsv.out.tsv')
         self.assertTrue(filecmp.cmp(tmp_out, expected, shallow=False))
         os.unlink(tmp_out)
+
+
+    def test_distance_score_bewteen_values(self):
+        '''Test _distance_score_bewteen_values'''
+        tests = [
+            ((0, 0), 0),
+            ((0, 1), 1),
+            ((0, 2), 1),
+            ((1, 0), 1),
+            ((1, 1), 0),
+            ((1, 2), 0),
+            ((2, 0), 1),
+            ((2, 1), 0),
+            ((2, 2), 0),
+        ]
+
+        for (val1, val2), expected in tests:
+            self.assertEqual(expected, summary.Summary._distance_score_between_values(val1, val2))
+
+        # check distance calculation is commutative
+        for val1 in range(5):
+            for val2 in range(5):
+                d1 = summary.Summary._distance_score_between_values(val1, val2)
+                d2 = summary.Summary._distance_score_between_values(val2, val1)
+                self.assertEqual(d1, d2)
+
+
+    def test_distance_score_between_lists(self):
+        '''Test _distance_score_between_lists'''
+        list1 = ['na', 0, 0, 0, 1, 1, 1, 2, 2, 2]
+        list2 = ['na', 0, 1, 2, 0, 1, 2, 1, 2, 2]
+        self.assertEqual(3, summary.Summary._distance_score_between_lists(list1, list2))
+
+
+    def test_write_distance_matrix(self):
+        '''Test _write_distance_matrix'''
+        s = summary.Summary('out', filenames=['spam', 'eggs'])
+        s.rows_out = [
+            ['filename', 'gene1', 'gene2', 'gene3'],
+            ['file1', 0, 1, 0],
+            ['file2', 1, 0, 3],
+            ['file3', 0, 0, 4],
+        ]
+
+        tmp_distances = 'tmp.test.write_distance_matrix.distances'
+        s._write_distance_matrix(tmp_distances)
+        expected = os.path.join(data_dir, 'summary_test_write_distance_matrix.distances')
+        self.assertTrue(filecmp.cmp(expected, tmp_distances, shallow=False))
+        os.unlink(tmp_distances)
+
+
+    def test_newick_from_dist_matrix(self):
+        '''Test _newick_from_dist_matrix'''
+        tmp_tree = 'tmp.test.newick_from_dist_matrix.tre'
+        dist_file = os.path.join(data_dir, 'summary_test_newick_from_dist_matrix.distances')
+        summary.Summary._newick_from_dist_matrix(dist_file, tmp_tree)
+        expected = os.path.join(data_dir, 'summary_test_newick_from_dist_matrix.tre')
+        self.assertTrue(filecmp.cmp(expected, tmp_tree, shallow=False))
+        os.unlink(tmp_tree)
+

From a8e0e232483a838666e48f9e4a6ec51dab439c59 Mon Sep 17 00:00:00 2001
From: Martin Hunt <mh12@sanger.ac.uk>
Date: Wed, 25 Nov 2015 09:34:16 +0000
Subject: [PATCH 2/6] Do not write .RData file

---
 ariba/summary.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ariba/summary.py b/ariba/summary.py
index 8865ba90..108f6d39 100644
--- a/ariba/summary.py
+++ b/ariba/summary.py
@@ -250,7 +250,7 @@ def _newick_from_dist_matrix(distance_file, outfile):
             print('h=hclust(dist(a))', file=f)
             print('write.tree(as.phylo(h), file="', outfile, '")', sep='', file=f)
 
-        common.syscall('R CMD BATCH ' + r_script)
+        common.syscall('R CMD BATCH --no-save ' + r_script)
         os.unlink(r_script + 'out')
         os.unlink(r_script)
 

From eff1cf7f757acb1414ee568906d9a8a1be45df1c Mon Sep 17 00:00:00 2001
From: Martin Hunt <mh12@sanger.ac.uk>
Date: Wed, 25 Nov 2015 10:08:35 +0000
Subject: [PATCH 3/6] Add function _write_js_candy_csv

---
 ariba/summary.py                                  | 10 ++++++++++
 .../data/summary_test_write_js_candy_csv.csv      |  3 +++
 ariba/tests/summary_test.py                       | 15 +++++++++++++++
 3 files changed, 28 insertions(+)
 create mode 100644 ariba/tests/data/summary_test_write_js_candy_csv.csv

diff --git a/ariba/summary.py b/ariba/summary.py
index 108f6d39..c7db6662 100644
--- a/ariba/summary.py
+++ b/ariba/summary.py
@@ -197,6 +197,16 @@ def _write_tsv(self):
         pyfastaq.utils.close(f)
 
 
+    def _write_js_candy_csv(self, outfile):
+        f = pyfastaq.utils.open_file_write(outfile)
+        # js candy needs the "name" column.
+        # Names must match those used in the tree file
+        print('name', *self.rows_out[0][1:], sep=',', file=f)
+        for row in self.rows_out[1:]:
+            print(*row, sep=',', file=f)
+        pyfastaq.utils.close(f)
+
+
     def _write_xls(self):
         workbook = openpyxl.Workbook()
         worksheet = workbook.worksheets[0]
diff --git a/ariba/tests/data/summary_test_write_js_candy_csv.csv b/ariba/tests/data/summary_test_write_js_candy_csv.csv
new file mode 100644
index 00000000..d50d116f
--- /dev/null
+++ b/ariba/tests/data/summary_test_write_js_candy_csv.csv
@@ -0,0 +1,3 @@
+name,gene1,gene3
+file1,1,3
+file2,2,4
diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py
index 05f44b23..0a89dc8d 100644
--- a/ariba/tests/summary_test.py
+++ b/ariba/tests/summary_test.py
@@ -155,6 +155,21 @@ def test_write_tsv(self):
         os.unlink(tmp_out)
 
 
+    def test_write_js_candy_csv(self):
+        '''Test _write_js_candy_csv'''
+        tmp_out = 'tmp.test_write_js_candy.csv'
+        s = summary.Summary(tmp_out, filenames=['spam', 'eggs'])
+        s.rows_out = [
+            ['filename', 'gene1', 'gene3'],
+            ['file1', 1, 3],
+            ['file2', 2, 4],
+        ]
+        s._write_js_candy_csv(tmp_out)
+        expected = os.path.join(data_dir, 'summary_test_write_js_candy_csv.csv')
+        self.assertTrue(filecmp.cmp(expected, tmp_out, shallow=False))
+        os.unlink(tmp_out)
+
+
     def test_distance_score_bewteen_values(self):
         '''Test _distance_score_bewteen_values'''
         tests = [

From 124a1f70d1513d5d77738eff0097644dc677ab6b Mon Sep 17 00:00:00 2001
From: Martin Hunt <mh12@sanger.ac.uk>
Date: Wed, 25 Nov 2015 11:33:16 +0000
Subject: [PATCH 4/6] Add --js_candy option to summary

---
 ariba/summary.py                              | 12 +++++++++++-
 ariba/tasks/summary.py                        |  4 +++-
 .../summary_test_write_js_candy_files.csv     |  4 ++++
 .../summary_test_write_js_candy_files.tre     |  1 +
 ariba/tests/summary_test.py                   | 19 +++++++++++++++++++
 5 files changed, 38 insertions(+), 2 deletions(-)
 create mode 100644 ariba/tests/data/summary_test_write_js_candy_files.csv
 create mode 100644 ariba/tests/data/summary_test_write_js_candy_files.tre

diff --git a/ariba/summary.py b/ariba/summary.py
index c7db6662..d6625956 100644
--- a/ariba/summary.py
+++ b/ariba/summary.py
@@ -265,6 +265,16 @@ def _newick_from_dist_matrix(distance_file, outfile):
         os.unlink(r_script)
 
 
+    def _write_js_candy_files(self, outprefix):
+        distance_file = outprefix + '.distance_matrix'
+        tree_file = outprefix + '.tre'
+        csv_file = outprefix + '.csv'
+        self._write_distance_matrix(distance_file)
+        self._newick_from_dist_matrix(distance_file, tree_file)
+        os.unlink(distance_file)
+        self._write_js_candy_csv(csv_file)
+
+
     def run(self):
         self._check_files_exist()
         self._gather_output_rows()
@@ -275,4 +285,4 @@ def run(self):
             self._write_tsv()
 
         if self.js_candy_prefix is not None:
-            self._write_js_candy_files()
+            self._write_js_candy_files(self.js_candy_prefix)
diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py
index 02cf4eda..494f446e 100644
--- a/ariba/tasks/summary.py
+++ b/ariba/tasks/summary.py
@@ -7,6 +7,7 @@ def run():
         usage = 'ariba summary [options] <outfile> [report1.tsv report2.tsv ...]',
         epilog = 'Files must be listed after the output file and/or the option --fofn must be used. If both used, all files in the filename specified by --fofn AND the files listed after the output file will be used as input. The input report files must be in tsv format, not xls.')
     parser.add_argument('-f', '--fofn', help='File of filenames of ariba reports in tsv format (not xls) to be summarised. Must be used if no input files listed after the outfile.', metavar='FILENAME')
+    parser.add_argument('--js_candy', help='Write JS Candy output files, named with the given prefix. Writes a tree and csv file, which can be drag and dropped straight into JS Candy', metavar='files_prefix')
     parser.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT')
     parser.add_argument('--no_filter', action='store_true', help='Do not filter rows or columns of output that are all 0 (by deafult, they are removed from the output)')
     parser.add_argument('outfile', help='Name of output file. If file ends with ".xls", then an excel spreadsheet is written. Otherwise a tsv file is written')
@@ -20,6 +21,7 @@ def run():
         fofn=options.fofn,
         filenames=options.infiles,
         filter_output=(not options.no_filter),
-        min_id=options.min_id
+        min_id=options.min_id,
+        js_candy_prefix=options.js_candy
     )
     s.run()
diff --git a/ariba/tests/data/summary_test_write_js_candy_files.csv b/ariba/tests/data/summary_test_write_js_candy_files.csv
new file mode 100644
index 00000000..5bbee36c
--- /dev/null
+++ b/ariba/tests/data/summary_test_write_js_candy_files.csv
@@ -0,0 +1,4 @@
+name,gene1,gene2,gene3
+file1,0,1,0
+file2,1,0,3
+file3,0,0,4
diff --git a/ariba/tests/data/summary_test_write_js_candy_files.tre b/ariba/tests/data/summary_test_write_js_candy_files.tre
new file mode 100644
index 00000000..a3f5ac3a
--- /dev/null
+++ b/ariba/tests/data/summary_test_write_js_candy_files.tre
@@ -0,0 +1 @@
+(file1:1.58113883,(file2:0.7071067812,file3:0.7071067812):0.8740320489);
diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py
index 0a89dc8d..fc5e4302 100644
--- a/ariba/tests/summary_test.py
+++ b/ariba/tests/summary_test.py
@@ -228,3 +228,22 @@ def test_newick_from_dist_matrix(self):
         self.assertTrue(filecmp.cmp(expected, tmp_tree, shallow=False))
         os.unlink(tmp_tree)
 
+
+    def test_write_js_candy_files(self):
+        '''Test _write_js_candy_files'''
+        tmp_prefix = 'tmp.test.write_js_candy_files'
+        s = summary.Summary('out', filenames=['spam', 'eggs'])
+        s.rows_out = [
+            ['filename', 'gene1', 'gene2', 'gene3'],
+            ['file1', 0, 1, 0],
+            ['file2', 1, 0, 3],
+            ['file3', 0, 0, 4],
+        ]
+        s._write_js_candy_files(tmp_prefix)
+        expected_csv = os.path.join(data_dir, 'summary_test_write_js_candy_files.csv')
+        expected_tre = os.path.join(data_dir, 'summary_test_write_js_candy_files.tre')
+        self.assertTrue(filecmp.cmp(expected_csv, tmp_prefix + '.csv', shallow=False))
+        self.assertTrue(filecmp.cmp(expected_tre, tmp_prefix + '.tre', shallow=False))
+        os.unlink(tmp_prefix + '.csv')
+        os.unlink(tmp_prefix + '.tre')
+

From 161e8a1c12519fdfef486ec7a95e111c109c2ffd Mon Sep 17 00:00:00 2001
From: Martin Hunt <mh12@sanger.ac.uk>
Date: Wed, 25 Nov 2015 13:16:15 +0000
Subject: [PATCH 5/6] Always run refcheck

---
 ariba/tasks/run.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/ariba/tasks/run.py b/ariba/tasks/run.py
index c52ccba6..800407e0 100644
--- a/ariba/tasks/run.py
+++ b/ariba/tasks/run.py
@@ -1,4 +1,5 @@
 import argparse
+import sys
 import pyfastaq
 import ariba
 
@@ -29,6 +30,10 @@ def run():
     assembly_group.add_argument('--assembler_k', type=int, help='kmer size to use with assembler. You can use 0 to set kmer to 2/3 of the read length. Warning - lower kmers are usually better. [%(default)s]', metavar='INT', default=21)
     assembly_group.add_argument('--spades_other', help='Put options string to be used with spades in quotes. This will NOT be sanity checked. Do not use -k or -t: for these options you should use the ariba run options --assembler_k and --threads [%(default)s]', default="--only-assembler", metavar="OPTIONS")
 
+    refcheck_group = parser.add_argument_group('refcheck options')
+    refcheck_group.add_argument('--refcheck_min_length', type=int, help='Minimum allowed length in nucleotides of reference gene [%(default)s]', metavar='INT', default=6)
+    refcheck_group.add_argument('--refcheck_max_length', type=int, help='Maximum allowed length in nucleotides of reference gene [%(default)s]', metavar='INT', default=10000)
+
     other_group = parser.add_argument_group('Other options')
     other_group.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
     other_group.add_argument('--threads', type=int, help='Number of threads for bowtie2 and spades [%(default)s]', default=1, metavar='INT')
@@ -58,6 +63,24 @@ def run():
     ariba.external_progs.check_versions(options, verbose=options.verbose, not_required=set(['sspace', 'gapfiller']))
     pyfastaq.sequences.genetic_code = options.genetic_code
 
+
+    checker = ariba.refcheck.Checker(
+        options.db_fasta,
+        min_length=options.refcheck_min_length,
+        max_length=options.refcheck_max_length,
+    )
+    ok, reason, seq = checker.run()
+
+    if not ok:
+        print('\nInput reference file of genes failed refcheck! Cannot continue.', file=sys.stderr)
+        print('The first failed sequence is:\n', file=sys.stderr)
+        print(seq, file=sys.stderr)
+        print('\nIt failed for the reason:', reason, file=sys.stderr)
+        print('\nTo make a new fasta file called new_genes.fa, with the bad genes fixed/removed run:\n', file=sys.stderr)
+        print('    ariba refcheck -o new_genes', options.db_fasta, file=sys.stderr)
+        sys.exit(1)
+
+
     c = ariba.clusters.Clusters(
           options.db_fasta,
           options.reads_1,

From 1bf542ec6bec4033345682ae291c3527a8311e6b Mon Sep 17 00:00:00 2001
From: Martin Hunt <mh12@sanger.ac.uk>
Date: Wed, 25 Nov 2015 13:24:26 +0000
Subject: [PATCH 6/6] Tweak js_candy help description

---
 ariba/tasks/summary.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py
index 494f446e..0fce90ba 100644
--- a/ariba/tasks/summary.py
+++ b/ariba/tasks/summary.py
@@ -7,7 +7,7 @@ def run():
         usage = 'ariba summary [options] <outfile> [report1.tsv report2.tsv ...]',
         epilog = 'Files must be listed after the output file and/or the option --fofn must be used. If both used, all files in the filename specified by --fofn AND the files listed after the output file will be used as input. The input report files must be in tsv format, not xls.')
     parser.add_argument('-f', '--fofn', help='File of filenames of ariba reports in tsv format (not xls) to be summarised. Must be used if no input files listed after the outfile.', metavar='FILENAME')
-    parser.add_argument('--js_candy', help='Write JS Candy output files, named with the given prefix. Writes a tree and csv file, which can be drag and dropped straight into JS Candy', metavar='files_prefix')
+    parser.add_argument('--js_candy', help='Write files that can be used as input to JS Candy, named with the given prefix. Writes a tree and csv file, which can be drag and dropped straight into JS Candy', metavar='files_prefix')
     parser.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT')
     parser.add_argument('--no_filter', action='store_true', help='Do not filter rows or columns of output that are all 0 (by deafult, they are removed from the output)')
     parser.add_argument('outfile', help='Name of output file. If file ends with ".xls", then an excel spreadsheet is written. Otherwise a tsv file is written')