Merge pull request #37 from martinghunt/always_run_refcheck

Always run refcheck; Add JS candy output
sanger-pathogens · Nov 25, 2015 · ac94d94 · ac94d94
2 parents dbc9f44 + 1bf542e
commit ac94d94
Show file tree

Hide file tree

Showing 10 changed files with 211 additions and 3 deletions.
diff --git a/ariba/summary.py b/ariba/summary.py
@@ -1,7 +1,7 @@
 import os
 import openpyxl
 import pyfastaq
-from ariba import flag
+from ariba import flag, common
 
 class Error (Exception): pass
 
@@ -49,6 +49,7 @@ def __init__(
       filenames=None,
       fofn=None,
       filter_output=True,
+      js_candy_prefix=None,
       min_id=90.0
     ):
         if filenames is None and fofn is None:
@@ -65,6 +66,7 @@ def __init__(
         self.filter_output = filter_output
         self.min_id = min_id
         self.outfile = outfile
+        self.js_candy_prefix = js_candy_prefix
 
 
     def _load_fofn(self, fofn):
@@ -195,6 +197,16 @@ def _write_tsv(self):
         pyfastaq.utils.close(f)
 
 
+    def _write_js_candy_csv(self, outfile):
+        f = pyfastaq.utils.open_file_write(outfile)
+        # js candy needs the "name" column.
+        # Names must match those used in the tree file
+        print('name', *self.rows_out[0][1:], sep=',', file=f)
+        for row in self.rows_out[1:]:
+            print(*row, sep=',', file=f)
+        pyfastaq.utils.close(f)
+
+
     def _write_xls(self):
         workbook = openpyxl.Workbook()
         worksheet = workbook.worksheets[0]
@@ -204,6 +216,65 @@ def _write_xls(self):
         workbook.save(self.outfile)
 
 
+    @staticmethod
+    def _distance_score_between_values(value1, value2):
+        if value1 != value2 and 0 in [value1, value2]:
+            return 1
+        else:
+            return 0
+
+
+    @classmethod
+    def _distance_score_between_lists(cls, scores1, scores2):
+        assert len(scores1) == len(scores2)
+        return sum([cls._distance_score_between_values(scores1[i], scores2[i]) for i in range(1, len(scores1))])
+
+
+    def _write_distance_matrix(self, outfile):
+        if len(self.rows_out) < 3:
+            raise Error('Cannot calculate distance matrix to make tree for js_candy.\n' +
+                        'Only one sample present.')
+
+        if len(self.rows_out[0]) < 2:
+            raise Error('Cannot calculate distance matrix to make tree for js_candy.\n' +
+                        'No genes present in output')
+
+        with open(outfile, 'w') as f:
+            sample_names = [x[0] for x in self.rows_out]
+            print(*sample_names[1:], sep='\t', file=f)
+
+            for i in range(1,len(self.rows_out)):
+                scores = []
+                for j in range(2, len(self.rows_out)):
+                    scores.append(self._distance_score_between_lists(self.rows_out[i], self.rows_out[j]))
+                print(self.rows_out[i][0], *scores, sep='\t', file=f)
+
+
+    @staticmethod
+    def _newick_from_dist_matrix(distance_file, outfile):
+        r_script = outfile + '.tmp.R'
+
+        with open(r_script, 'w') as f:
+            print('library(ape)', file=f)
+            print('a=read.table("', distance_file, '", header=TRUE, row.names=1)', sep='', file=f)
+            print('h=hclust(dist(a))', file=f)
+            print('write.tree(as.phylo(h), file="', outfile, '")', sep='', file=f)
+
+        common.syscall('R CMD BATCH --no-save ' + r_script)
+        os.unlink(r_script + 'out')
+        os.unlink(r_script)
+
+
+    def _write_js_candy_files(self, outprefix):
+        distance_file = outprefix + '.distance_matrix'
+        tree_file = outprefix + '.tre'
+        csv_file = outprefix + '.csv'
+        self._write_distance_matrix(distance_file)
+        self._newick_from_dist_matrix(distance_file, tree_file)
+        os.unlink(distance_file)
+        self._write_js_candy_csv(csv_file)
+
+
     def run(self):
         self._check_files_exist()
         self._gather_output_rows()
@@ -213,4 +284,5 @@ def run(self):
         else:
             self._write_tsv()
 
-
+        if self.js_candy_prefix is not None:
+            self._write_js_candy_files(self.js_candy_prefix)
diff --git a/ariba/tasks/run.py b/ariba/tasks/run.py
@@ -1,4 +1,5 @@
 import argparse
+import sys
 import pyfastaq
 import ariba
 
@@ -29,6 +30,10 @@ def run():
     assembly_group.add_argument('--assembler_k', type=int, help='kmer size to use with assembler. You can use 0 to set kmer to 2/3 of the read length. Warning - lower kmers are usually better. [%(default)s]', metavar='INT', default=21)
     assembly_group.add_argument('--spades_other', help='Put options string to be used with spades in quotes. This will NOT be sanity checked. Do not use -k or -t: for these options you should use the ariba run options --assembler_k and --threads [%(default)s]', default="--only-assembler", metavar="OPTIONS")
 
+    refcheck_group = parser.add_argument_group('refcheck options')
+    refcheck_group.add_argument('--refcheck_min_length', type=int, help='Minimum allowed length in nucleotides of reference gene [%(default)s]', metavar='INT', default=6)
+    refcheck_group.add_argument('--refcheck_max_length', type=int, help='Maximum allowed length in nucleotides of reference gene [%(default)s]', metavar='INT', default=10000)
+
     other_group = parser.add_argument_group('Other options')
     other_group.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
     other_group.add_argument('--threads', type=int, help='Number of threads for bowtie2 and spades [%(default)s]', default=1, metavar='INT')
@@ -58,6 +63,24 @@ def run():
     ariba.external_progs.check_versions(options, verbose=options.verbose, not_required=set(['sspace', 'gapfiller']))
     pyfastaq.sequences.genetic_code = options.genetic_code
 
+
+    checker = ariba.refcheck.Checker(
+        options.db_fasta,
+        min_length=options.refcheck_min_length,
+        max_length=options.refcheck_max_length,
+    )
+    ok, reason, seq = checker.run()
+
+    if not ok:
+        print('\nInput reference file of genes failed refcheck! Cannot continue.', file=sys.stderr)
+        print('The first failed sequence is:\n', file=sys.stderr)
+        print(seq, file=sys.stderr)
+        print('\nIt failed for the reason:', reason, file=sys.stderr)
+        print('\nTo make a new fasta file called new_genes.fa, with the bad genes fixed/removed run:\n', file=sys.stderr)
+        print('    ariba refcheck -o new_genes', options.db_fasta, file=sys.stderr)
+        sys.exit(1)
+
+
     c = ariba.clusters.Clusters(
           options.db_fasta,
           options.reads_1,

diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py
@@ -7,6 +7,7 @@ def run():
         usage = 'ariba summary [options] <outfile> [report1.tsv report2.tsv ...]',
         epilog = 'Files must be listed after the output file and/or the option --fofn must be used. If both used, all files in the filename specified by --fofn AND the files listed after the output file will be used as input. The input report files must be in tsv format, not xls.')
     parser.add_argument('-f', '--fofn', help='File of filenames of ariba reports in tsv format (not xls) to be summarised. Must be used if no input files listed after the outfile.', metavar='FILENAME')
+    parser.add_argument('--js_candy', help='Write files that can be used as input to JS Candy, named with the given prefix. Writes a tree and csv file, which can be drag and dropped straight into JS Candy', metavar='files_prefix')
     parser.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT')
     parser.add_argument('--no_filter', action='store_true', help='Do not filter rows or columns of output that are all 0 (by deafult, they are removed from the output)')
     parser.add_argument('outfile', help='Name of output file. If file ends with ".xls", then an excel spreadsheet is written. Otherwise a tsv file is written')
@@ -20,6 +21,7 @@ def run():
         fofn=options.fofn,
         filenames=options.infiles,
         filter_output=(not options.no_filter),
-        min_id=options.min_id
+        min_id=options.min_id,
+        js_candy_prefix=options.js_candy
     )
     s.run()
diff --git a/ariba/tests/data/summary_test_newick_from_dist_matrix.distances b/ariba/tests/data/summary_test_newick_from_dist_matrix.distances
@@ -0,0 +1,4 @@
+file1	file2	file3
+file1	3	2
+file2	0	1
+file3	1	0
diff --git a/ariba/tests/data/summary_test_newick_from_dist_matrix.tre b/ariba/tests/data/summary_test_newick_from_dist_matrix.tre
@@ -0,0 +1 @@
+(file1:1.58113883,(file2:0.7071067812,file3:0.7071067812):0.8740320489);
diff --git a/ariba/tests/data/summary_test_write_distance_matrix.distances b/ariba/tests/data/summary_test_write_distance_matrix.distances
@@ -0,0 +1,4 @@
+file1	file2	file3
+file1	3	2
+file2	0	1
+file3	1	0
diff --git a/ariba/tests/data/summary_test_write_js_candy_csv.csv b/ariba/tests/data/summary_test_write_js_candy_csv.csv
@@ -0,0 +1,3 @@
+name,gene1,gene3
+file1,1,3
+file2,2,4
diff --git a/ariba/tests/data/summary_test_write_js_candy_files.csv b/ariba/tests/data/summary_test_write_js_candy_files.csv
@@ -0,0 +1,4 @@
+name,gene1,gene2,gene3
+file1,0,1,0
+file2,1,0,3
+file3,0,0,4
diff --git a/ariba/tests/data/summary_test_write_js_candy_files.tre b/ariba/tests/data/summary_test_write_js_candy_files.tre
@@ -0,0 +1 @@
+(file1:1.58113883,(file2:0.7071067812,file3:0.7071067812):0.8740320489);
diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py
@@ -153,3 +153,97 @@ def test_write_tsv(self):
         expected = os.path.join(data_dir, 'summary_test_write_tsv.out.tsv')
         self.assertTrue(filecmp.cmp(tmp_out, expected, shallow=False))
         os.unlink(tmp_out)
+
+
+    def test_write_js_candy_csv(self):
+        '''Test _write_js_candy_csv'''
+        tmp_out = 'tmp.test_write_js_candy.csv'
+        s = summary.Summary(tmp_out, filenames=['spam', 'eggs'])
+        s.rows_out = [
+            ['filename', 'gene1', 'gene3'],
+            ['file1', 1, 3],
+            ['file2', 2, 4],
+        ]
+        s._write_js_candy_csv(tmp_out)
+        expected = os.path.join(data_dir, 'summary_test_write_js_candy_csv.csv')
+        self.assertTrue(filecmp.cmp(expected, tmp_out, shallow=False))
+        os.unlink(tmp_out)
+
+
+    def test_distance_score_bewteen_values(self):
+        '''Test _distance_score_bewteen_values'''
+        tests = [
+            ((0, 0), 0),
+            ((0, 1), 1),
+            ((0, 2), 1),
+            ((1, 0), 1),
+            ((1, 1), 0),
+            ((1, 2), 0),
+            ((2, 0), 1),
+            ((2, 1), 0),
+            ((2, 2), 0),
+        ]
+
+        for (val1, val2), expected in tests:
+            self.assertEqual(expected, summary.Summary._distance_score_between_values(val1, val2))
+
+        # check distance calculation is commutative
+        for val1 in range(5):
+            for val2 in range(5):
+                d1 = summary.Summary._distance_score_between_values(val1, val2)
+                d2 = summary.Summary._distance_score_between_values(val2, val1)
+                self.assertEqual(d1, d2)
+
+
+    def test_distance_score_between_lists(self):
+        '''Test _distance_score_between_lists'''
+        list1 = ['na', 0, 0, 0, 1, 1, 1, 2, 2, 2]
+        list2 = ['na', 0, 1, 2, 0, 1, 2, 1, 2, 2]
+        self.assertEqual(3, summary.Summary._distance_score_between_lists(list1, list2))
+
+
+    def test_write_distance_matrix(self):
+        '''Test _write_distance_matrix'''
+        s = summary.Summary('out', filenames=['spam', 'eggs'])
+        s.rows_out = [
+            ['filename', 'gene1', 'gene2', 'gene3'],
+            ['file1', 0, 1, 0],
+            ['file2', 1, 0, 3],
+            ['file3', 0, 0, 4],
+        ]
+
+        tmp_distances = 'tmp.test.write_distance_matrix.distances'
+        s._write_distance_matrix(tmp_distances)
+        expected = os.path.join(data_dir, 'summary_test_write_distance_matrix.distances')
+        self.assertTrue(filecmp.cmp(expected, tmp_distances, shallow=False))
+        os.unlink(tmp_distances)
+
+
+    def test_newick_from_dist_matrix(self):
+        '''Test _newick_from_dist_matrix'''
+        tmp_tree = 'tmp.test.newick_from_dist_matrix.tre'
+        dist_file = os.path.join(data_dir, 'summary_test_newick_from_dist_matrix.distances')
+        summary.Summary._newick_from_dist_matrix(dist_file, tmp_tree)
+        expected = os.path.join(data_dir, 'summary_test_newick_from_dist_matrix.tre')
+        self.assertTrue(filecmp.cmp(expected, tmp_tree, shallow=False))
+        os.unlink(tmp_tree)
+
+
+    def test_write_js_candy_files(self):
+        '''Test _write_js_candy_files'''
+        tmp_prefix = 'tmp.test.write_js_candy_files'
+        s = summary.Summary('out', filenames=['spam', 'eggs'])
+        s.rows_out = [
+            ['filename', 'gene1', 'gene2', 'gene3'],
+            ['file1', 0, 1, 0],
+            ['file2', 1, 0, 3],
+            ['file3', 0, 0, 4],
+        ]
+        s._write_js_candy_files(tmp_prefix)
+        expected_csv = os.path.join(data_dir, 'summary_test_write_js_candy_files.csv')
+        expected_tre = os.path.join(data_dir, 'summary_test_write_js_candy_files.tre')
+        self.assertTrue(filecmp.cmp(expected_csv, tmp_prefix + '.csv', shallow=False))
+        self.assertTrue(filecmp.cmp(expected_tre, tmp_prefix + '.tre', shallow=False))
+        os.unlink(tmp_prefix + '.csv')
+        os.unlink(tmp_prefix + '.tre')
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		(file1:1.58113883,(file2:0.7071067812,file3:0.7071067812):0.8740320489);