From c1f34eac9611a742f0ccae16c0ca1c11722c07dd Mon Sep 17 00:00:00 2001
From: Martin Hunt
Date: Fri, 5 Aug 2016 15:12:04 +0100
Subject: [PATCH 01/40] Bug fix getting snp group

---
 ariba/summary_cluster.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py
index 7a53f556..39bf9fee 100644
--- a/ariba/summary_cluster.py
+++ b/ariba/summary_cluster.py
@@ -56,7 +56,7 @@ def line2dict(cls, line):
             d['var_group'] = '.'
         else:
             try:
-                d['var_group'] = d['var_description'].split(':')[3]
+                d['var_group'] = d['var_description'].split(':')[4]
             except:
                 raise Error('Error getting variant group from the following line:\n' + line)
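Note on the fix above: var_group is pulled out of the report's var_description column, which is the reference metadata joined on ':'. With the six-field description format used by the updated tests later in this series (apparently name:gene:var_only:change:group:description - the field meanings are inferred from the test data, not stated in the patch), the group ID is element 4 of the split rather than element 3. A minimal stand-alone sketch, not ARIBA code, using a value taken from those tests:

    # illustration only: split a var_description string the same way the patched code does
    var_description = 'noncoding1:0:0:A14T:id1:ref has wild type, foo bar'
    fields = var_description.split(':')
    print(fields[3])  # 'A14T' -- the variant change, which index 3 now lands on
    print(fields[4])  # 'id1'  -- the variant group that should be reported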
From c123ce4a82a9fdb6569f47ecffa1c24031c0a33a Mon Sep 17 00:00:00 2001
From: Martin Hunt
Date: Fri, 5 Aug 2016 15:13:21 +0100
Subject: [PATCH 02/40] Add option --no_tree

---
 ariba/summary.py       | 24 +++++++++++++++---------
 ariba/tasks/summary.py |  1 +
 scripts/ariba          |  1 +
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/ariba/summary.py b/ariba/summary.py
index ecf7d304..77b003ab 100644
--- a/ariba/summary.py
+++ b/ariba/summary.py
@@ -21,6 +21,7 @@ def __init__(
         show_known_het=False,
         cluster_cols='assembled,match,ref_seq,pct_id,known_var,novel_var',
         variant_cols='groups,grouped,ungrouped,novel',
+        make_phandango_tree=True,
         verbose=False,
     ):
         if filenames is None and fofn is None:
@@ -41,6 +42,7 @@ def __init__(
         self.filter_columns = filter_columns
         self.min_id = min_id
         self.outprefix = outprefix
+        self.make_phandango_tree = make_phandango_tree
         self.verbose = verbose

@@ -416,17 +418,21 @@ def run(self):
             csv_file = self.outprefix + '.phandango.csv'
             phandango_header, phandango_matrix = Summary._add_phandango_colour_columns(phandango_header, matrix)
             Summary._matrix_to_csv(phandango_matrix, phandango_header, csv_file)
-            dist_matrix_file = self.outprefix + '.phandango.distance_matrix'
-            tree_file = self.outprefix + '.phandango.tre'
-            if self.verbose:
-                print('Making Phandango distance matrix', dist_matrix_file, flush=True)
-            Summary._write_distance_matrix(matrix, dist_matrix_file)
+            if self.make_phandango_tree:
+                dist_matrix_file = self.outprefix + '.phandango.distance_matrix'
+                tree_file = self.outprefix + '.phandango.tre'

-            if self.verbose:
-                print('Making Phandango tree file', tree_file, flush=True)
-            Summary._newick_from_dist_matrix(dist_matrix_file, tree_file)
-            os.unlink(dist_matrix_file)
+                if self.verbose:
+                    print('Making Phandango distance matrix', dist_matrix_file, flush=True)
+                Summary._write_distance_matrix(matrix, dist_matrix_file)
+
+                if self.verbose:
+                    print('Making Phandango tree file', tree_file, flush=True)
+                Summary._newick_from_dist_matrix(dist_matrix_file, tree_file)
+                os.unlink(dist_matrix_file)
+            elif self.verbose:
+                print('Skipping making tree because you asked me not to make it', flush=True)
         else:
             print('Made csv file. Not making Phandango files because only one sample remains after filtering', file=sys.stderr)
diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py
index 782c9056..b722ce8e 100644
--- a/ariba/tasks/summary.py
+++ b/ariba/tasks/summary.py
@@ -96,6 +96,7 @@ def run(options):
         show_known_het=options.het,
         cluster_cols=options.cluster_cols,
         variant_cols=options.var_cols,
+        make_phandango_tree=(not options.no_tree),
         verbose=options.verbose
     )
     s.run()
diff --git a/scripts/ariba b/scripts/ariba
index bb6627e3..63fc5d93 100755
--- a/scripts/ariba
+++ b/scripts/ariba
@@ -166,6 +166,7 @@ subparser_summary.add_argument('--preset', choices=summary_presets, help='Shorth
 subparser_summary.add_argument('--cluster_cols', help='Comma separated list of cluster columns to include. Choose from: assembled, match, ref_seq, pct_id, known_var, novel_var [%(default)s]', default='match', metavar='col1,col2,...')
 subparser_summary.add_argument('--col_filter', choices=['y', 'n'], default='y', help='Choose whether columns where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n')
 subparser_summary.add_argument('--het', action='store_true', help='For known noncoding SNPs, report if they are heterozygous or not, and the percent of reads supporting the variant type')
+subparser_summary.add_argument('--no_tree', action='store_true', help='Do not make phandango tree')
 subparser_summary.add_argument('--row_filter', choices=['y', 'n'], default='y', help='Choose whether rows where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n')
 subparser_summary.add_argument('--var_cols', help='Comma separated list of variant columns to include. Choose from: groups, grouped, ungrouped, novel [none by default]', metavar='col1,col2,...', default='')
 subparser_summary.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT')

From 0cae6a3a8fa70fc5b27cb344194c6befab0368be Mon Sep 17 00:00:00 2001
From: Martin Hunt
Date: Fri, 5 Aug 2016 15:26:37 +0100
Subject: [PATCH 03/40] Fix snp info format in penultimate column

---
 ariba/tests/summary_cluster_test.py | 136 ++++++++++++++--------
 1 file changed, 68 insertions(+), 68 deletions(-)

diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py
index 6220dcfb..9ee7c458 100644
--- a/ariba/tests/summary_cluster_test.py
+++ b/ariba/tests/summary_cluster_test.py
@@ -8,7 +8,7 @@ class TestSummaryCluster(unittest.TestCase):
     def test_line2dict(self):
         '''Test _line2dict'''
-        line = 'refname\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:var_group1:ref has wild type, foo bar\tsome free text'
+        line = 'refname\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:var_group1:ref has wild type, foo bar\tsome free text'

         expected = {
             'ref_name': 'refname',
@@ -39,7 +39,7 @@ def test_line2dict(self):
             'smtls_total_depth': '17',
             'smtls_alt_nt': '.',
             'smtls_alt_depth': '17',
-            'var_description': 'noncoding1:n:A14T:var_group1:ref has wild type, foo bar',
+            'var_description': 'noncoding1:1:0:A14T:var_group1:ref has wild type, foo bar',
             'var_group': 'var_group1',
             'free_text': 'some free text'
         }
@@ -51,9 +51,9 @@ def test_add_data_dict(self):
         '''Test add_data_dict'''
         cluster = summary_cluster.SummaryCluster()
         self.assertTrue(cluster.name is None)
        line1 =
'refname\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text' - line2 = 'refname\t1\t0\t19\t78\tcluster2\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id2:ref has wild type, foo bar\tsome free text' - line3 = 'refname2\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id3:ref has wild type, foo bar\tsome free text' + line1 = 'refname\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text' + line2 = 'refname\t1\t0\t19\t78\tcluster2\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id2:ref has wild type, foo bar\tsome free text' + line3 = 'refname2\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id3:ref has wild type, foo bar\tsome free text' data_dict1 = summary_cluster.SummaryCluster.line2dict(line1) data_dict2 = summary_cluster.SummaryCluster.line2dict(line2) data_dict3 = summary_cluster.SummaryCluster.line2dict(line3) @@ -71,9 +71,9 @@ def test_pc_id_of_longest(self): '''Test pc_id_of_longest''' cluster = summary_cluster.SummaryCluster() self.assertTrue(cluster.name is None) - line1 = 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text' - line2 = 'refname\t1\t0\t19\t78\tcluster\t120\t119\t98.20\tctg_name2\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text' - line3 = 'refname\t1\t0\t19\t78\tcluster\t120\t114\t98.32\tctg_name3\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text' + line1 = 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text' + line2 = 'refname\t1\t0\t19\t78\tcluster\t120\t119\t98.20\tctg_name2\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text' + line3 = 'refname\t1\t0\t19\t78\tcluster\t120\t114\t98.32\tctg_name3\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text' data_dict1 = summary_cluster.SummaryCluster.line2dict(line1) data_dict2 = summary_cluster.SummaryCluster.line2dict(line2) data_dict3 = summary_cluster.SummaryCluster.line2dict(line3) @@ -85,7 +85,7 @@ def test_pc_id_of_longest(self): def test_to_cluster_summary_number(self): '''Test _to_cluster_summary_assembled''' - line = 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text' + line = 
'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text' data_dict = summary_cluster.SummaryCluster.line2dict(line) tests = [ @@ -122,9 +122,9 @@ def test_to_cluster_summary_number(self): def test_has_known_variant(self): '''Test _has_known_variant''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.' 
] @@ -139,9 +139,9 @@ def test_has_known_variant(self): def test_has_any_known_variant(self): lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.' ] @@ -159,10 +159,10 @@ def test_has_any_known_variant(self): def test_has_nonsynonymous(self): '''Test _has_nonsynonymous''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', 
'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.' ] @@ -178,11 +178,11 @@ def test_has_nonsynonymous(self): def test_has_any_nonsynonymous(self): '''Test _has_any_nonsynonymous''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:N_ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:N_ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', ] expected = ['no', 'yes', 'no', 'yes', 'yes'] @@ -198,9 +198,9 @@ def test_has_any_nonsynonymous(self): def test_has_novel_nonsynonymous(self): '''Test _has_novel_nonsynonymous''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 
'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.' ] @@ -216,9 +216,9 @@ def test_has_novel_nonsynonymous(self): def test_has_any_novel_nonsynonymous(self): '''Test _has_any_novel_nonsynonymous''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.' 
] @@ -236,11 +236,11 @@ def test_has_any_novel_nonsynonymous(self): def test_to_cluster_summary_has_known_nonsynonymous(self): '''Test _to_cluster_summary_has_known_nonsynonymous''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', ] expected = ['yes', 'yes', 'no', 'no', 'no'] @@ -257,11 +257,11 @@ def test_to_cluster_summary_has_known_nonsynonymous(self): def test_to_cluster_summary_has_novel_nonsynonymous(self): '''Test _to_cluster_summary_has_novel_nonsynonymous''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 
'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', ] expected = ['no', 'no', 'no', 'yes', 'yes'] @@ -278,11 +278,11 @@ def test_to_cluster_summary_has_novel_nonsynonymous(self): def test_to_cluster_summary_has_nonsynonymous(self): '''Test _to_cluster_summary_has_nonsynonymous''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', ] expected = ['no', 'yes', 'no', 'yes', 'yes'] @@ -369,16 +369,16 @@ def test_get_nonsynonymous_var(self): def test_has_match(self): '''Test _has_match''' lines = [ - 
'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:1:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:1:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 
'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id1:ref has wild type, foo bar\tsome free text', ] expected = ['yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'no', 'no'] @@ -396,14 +396,14 @@ def test_has_match(self): def test_has_var_groups(self): '''Test has_var_groups''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id2:ref has wild type, foo bar\tsome free text', - 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id3:ref has wild type, foo bar\tsome free text', - 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id4:ref has wild type, foo bar\tsome free text', - 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id5:ref has wild type, foo bar\tsome free text', - 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id6:ref has wild type, foo bar\tsome free text', - 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id7:ref has wild type, foo bar\tsome free text', - 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id7:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id2:ref has wild type, foo bar\tsome free text', + 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id3:ref has wild type, foo bar\tsome free text', + 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id4:ref has wild type, foo bar\tsome free text', + 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id5:ref has wild type, foo bar\tsome free text', + 
'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id6:ref has wild type, foo bar\tsome free text', + 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id7:ref has wild type, foo bar\tsome free text', + 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id7:ref has wild type, foo bar\tsome free text', ] dicts = [summary_cluster.SummaryCluster.line2dict(line) for line in lines] cluster = summary_cluster.SummaryCluster() @@ -438,7 +438,7 @@ def test_column_summary_data(self): def test_non_synon_variants(self): '''Test non_synon_variants''' - line1 = 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs' + line1 = 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs' line2 = 'ref1\t0\t0\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text' data_dict1 = summary_cluster.SummaryCluster.line2dict(line1) @@ -454,10 +454,10 @@ def test_non_synon_variants(self): def test_known_noncoding_het_snps(self): '''test known_noncoding_het_snps''' lines = [ - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs' + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs' ] cluster = summary_cluster.SummaryCluster() From 001e308e7bd17cafc9b3d56fa3b0a2864af82445 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 5 Aug 2016 15:31:50 +0100 Subject: [PATCH 04/40] Fix snp info format in penultimate column of sample test files --- ...mmary_sample_test_column_names_tuples_and_het_snps.tsv | 8 ++++---- .../data/summary_sample_test_column_summary_data.tsv | 8 ++++---- ariba/tests/data/summary_sample_test_var_groups.tsv | 8 ++++---- 3 files changed, 12 
insertions(+), 12 deletions(-) diff --git a/ariba/tests/data/summary_sample_test_column_names_tuples_and_het_snps.tsv b/ariba/tests/data/summary_sample_test_column_names_tuples_and_het_snps.tsv index 5e12e4a9..159949c8 100644 --- a/ariba/tests/data/summary_sample_test_column_names_tuples_and_het_snps.tsv +++ b/ariba/tests/data/summary_sample_test_column_names_tuples_and_het_snps.tsv @@ -1,8 +1,8 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:n:A14T:.:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:0:0:A14T:.:ref has wild type, reads have variant so should report generic description of noncoding1 noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 0 SNP . . . G15T SNP 15 15 G 85 85 T 17 . 17 . generic description of noncoding1 -noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 +noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id2:variant in ref and reads so should report generic description of noncoding1 noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding2.scaffold.1 279 35.4 . . . . . . . . . . . . . . . . . generic description of noncoding2 -presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 +presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.2 267 35.1 . . . . . . . . . . . . . . . . . Generic description of presence_absence2 -variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 12;13;13 variants_only1:p:S5T:.:Ref and reads have variant so report Generic description of variants_only1 +variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 
12;13;13 variants_only1:1:0:S5T:.:Ref and reads have variant so report Generic description of variants_only1 diff --git a/ariba/tests/data/summary_sample_test_column_summary_data.tsv b/ariba/tests/data/summary_sample_test_column_summary_data.tsv index 22a42b54..9c495ecd 100644 --- a/ariba/tests/data/summary_sample_test_column_summary_data.tsv +++ b/ariba/tests/data/summary_sample_test_column_summary_data.tsv @@ -1,8 +1,8 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1_n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 0 SNP . . . G15T SNP 15 15 G 85 85 T 17 . 17 . generic description of noncoding1 -noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1_n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 +noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id2:variant in ref and reads so should report generic description of noncoding1 noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding2.scaffold.1 279 35.4 . . . . . . . . . . . . . . . . . generic description of noncoding2 -presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 +presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.2 267 35.1 . . . . . . . . . . . . . . . . . Generic description of presence_absence2 -variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 12;13;13 variants_only1:p:S5T:id4:Ref and reads have variant so report Generic description of variants_only1 +variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 
12;13;13 variants_only1:1:0:S5T:id4:Ref and reads have variant so report Generic description of variants_only1 diff --git a/ariba/tests/data/summary_sample_test_var_groups.tsv b/ariba/tests/data/summary_sample_test_var_groups.tsv index a1252110..33526608 100644 --- a/ariba/tests/data/summary_sample_test_var_groups.tsv +++ b/ariba/tests/data/summary_sample_test_var_groups.tsv @@ -1,7 +1,7 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 +noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id2:variant in ref and reads so should report generic description of noncoding1 noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding2.scaffold.1 279 35.4 . . . . . . . . . . . . . . . . . generic description of noncoding2 -presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 +presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.2 267 35.1 . . . . . . . . . . . . . . . . . Generic description of presence_absence2 -variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 12;13;13 variants_only1:p:S5T:id4:Ref and reads have variant so report Generic description of variants_only1 +variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 
12;13;13 variants_only1:1:0:S5T:id4:Ref and reads have variant so report Generic description of variants_only1 From 13d630192fbfac585d9b747308e91d5b04659fb1 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 5 Aug 2016 15:36:29 +0100 Subject: [PATCH 05/40] Fix snp info format in penultimate column of summary test files --- ariba/tests/data/summary_test_gather_output_rows.in.1.tsv | 4 ++-- ariba/tests/data/summary_test_gather_output_rows.in.2.tsv | 6 +++--- ariba/tests/data/summary_test_get_all_cluster_names.1.tsv | 4 ++-- ariba/tests/data/summary_test_get_all_cluster_names.2.tsv | 6 +++--- ariba/tests/data/summary_test_get_all_var_groups.1.tsv | 4 ++-- ariba/tests/data/summary_test_get_all_var_groups.2.tsv | 6 +++--- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv index d1f5f70b..3e67eeb1 100644 --- a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv +++ b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv @@ -1,3 +1,3 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv index 6507d5fd..398aedbc 100644 --- a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv +++ b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv @@ -1,5 +1,5 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 
18 noncoding1:n:A6G:id3:variant in ref and reads so should report generic description of noncoding1 -presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id3:variant in ref and reads so should report generic description of noncoding1 +presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 variants_only1 1 1 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . . diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv index 9e8e9a2a..f35590e2 100644 --- a/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv +++ b/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv @@ -1,3 +1,3 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 1 0 27 88 cluster.p.1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id2:Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence1 1 0 27 88 cluster.p.1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id2:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv index d4cd028c..2bddc3d6 100644 --- a/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv +++ b/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv @@ -1,5 +1,5 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 
17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 -presence_absence1 1 0 27 88 cluster.p.2 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id2:variant in ref and reads so should report generic description of noncoding1 +presence_absence1 1 0 27 88 cluster.p.2 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 variants_only1 1 1 64 12 cluster.v.1 90 . . . . . . . . . . . . . . . . . . . . . . . diff --git a/ariba/tests/data/summary_test_get_all_var_groups.1.tsv b/ariba/tests/data/summary_test_get_all_var_groups.1.tsv index 62394c08..c4db58da 100644 --- a/ariba/tests/data/summary_test_get_all_var_groups.1.tsv +++ b/ariba/tests/data/summary_test_get_all_var_groups.1.tsv @@ -1,3 +1,3 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 1 0 27 88 cluster.p.1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id4:Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence1 1 0 27 88 cluster.p.1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 
29 presence_absence1:1:0:A10V:id4:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_get_all_var_groups.2.tsv b/ariba/tests/data/summary_test_get_all_var_groups.2.tsv index d4cd028c..2bddc3d6 100644 --- a/ariba/tests/data/summary_test_get_all_var_groups.2.tsv +++ b/ariba/tests/data/summary_test_get_all_var_groups.2.tsv @@ -1,5 +1,5 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 -presence_absence1 1 0 27 88 cluster.p.2 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id2:variant in ref and reads so should report generic description of noncoding1 +presence_absence1 1 0 27 88 cluster.p.2 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 variants_only1 1 1 64 12 cluster.v.1 90 . . . . . . . . . . . . . . . . . . . . . . . From 94fabc2c70955f463c6628f4fb344a7bff97ac41 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Mon, 8 Aug 2016 15:27:55 +0100 Subject: [PATCH 06/40] Report het snps in groups --- ariba/summary.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/ariba/summary.py b/ariba/summary.py index 77b003ab..cabe3c01 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -135,6 +135,7 @@ def _get_all_var_groups(cls, samples_dict): if name not in groups: groups[name] = set() groups[name].update(name_set) + return groups @@ -170,9 +171,29 @@ def _gather_output_rows(self): if self.var_columns['groups']: for group_name in var_groups[cluster]: if cluster in sample.var_groups and group_name in sample.var_groups[cluster]: - rows[filename][cluster]['vgroup.' + group_name] = 'yes' + if self.show_known_het: + if cluster in sample.het_snps: + if len(sample.het_snps[cluster]) == 0: + rows[filename][cluster]['vgroup.' + group_name] = 'no' + rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' + elif len(sample.het_snps[cluster]) == 1: + rows[filename][cluster]['vgroup.' 
+ group_name] = 'het' + snp_name = list(sample.het_snps[cluster].keys())[0] + percent = -1 + for v in sample.variant_column_names_tuples[cluster]: + if v[1] == snp_name and snp_name in sample.het_snps[cluster]: + percent = sample.het_snps[cluster][snp_name] + + rows[filename][cluster]['vgroup.' + group_name + '.%'] = percent + else: + rows[filename][cluster]['vgroup.' + group_name] = 'multi_het' + rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' + else: + rows[filename][cluster]['vgroup.' + group_name] = 'yes' else: rows[filename][cluster]['vgroup.' + group_name] = 'no' + if self.show_known_het: + rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' if cluster in all_var_columns: for (ref_name, variant, grouped_or_novel, group_name) in all_var_columns[cluster]: From 23033e50aa73d02b1e52492fba29866768c3c433 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 10:19:04 +0100 Subject: [PATCH 07/40] Bug fix passing all preset var_cols option --- ariba/tasks/summary.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py index b722ce8e..ec1cd879 100644 --- a/ariba/tasks/summary.py +++ b/ariba/tasks/summary.py @@ -9,7 +9,7 @@ def use_preset(options): preset_to_vals = { 'minimal': { 'cluster_cols': 'match', - 'variant_cols': '', + 'var_cols': '', 'col_filter': 'y', 'row_filter': 'y', 'var_groups': 'n', @@ -18,7 +18,7 @@ def use_preset(options): }, 'cluster_small': { 'cluster_cols': 'assembled,match,ref_seq,known_var', - 'variant_cols': '', + 'var_cols': '', 'col_filter': 'y', 'row_filter': 'y', 'var_groups': 'n', @@ -27,7 +27,7 @@ def use_preset(options): }, 'cluster_all': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'variant_cols': '', + 'var_cols': '', 'col_filter': 'y', 'row_filter': 'y', 'var_groups': 'n', @@ -36,7 +36,7 @@ def use_preset(options): }, 'cluster_var_groups': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'variant_cols': 'groups', + 'var_cols': 'groups', 'col_filter': 'y', 'row_filter': 'y', 'var_groups': 'y', @@ -45,7 +45,7 @@ def use_preset(options): }, 'cluster_known_vars': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'variant_cols': 'groups,grouped,ungrouped', + 'var_cols': 'groups,grouped,ungrouped', 'col_filter': 'y', 'row_filter': 'y', 'var_groups': 'y', @@ -54,7 +54,7 @@ def use_preset(options): }, 'all': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'variant_cols': 'groups,grouped,ungrouped,novel', + 'var_cols': 'groups,grouped,ungrouped,novel', 'col_filter': 'y', 'row_filter': 'y', 'var_groups': 'y', @@ -63,7 +63,7 @@ def use_preset(options): }, 'all_no_filter': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'variant_cols': 'groups,grouped,ungrouped,novel', + 'var_cols': 'groups,grouped,ungrouped,novel', 'col_filter': 'n', 'row_filter': 'n', 'var_groups': 'y', From 59a212ddffce2910adc7953043de5219d6d9dc70 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 20:02:47 +0100 Subject: [PATCH 08/40] Add 'interrupted' as option in assembled column --- ariba/summary_cluster.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py index 39bf9fee..2fd5b07f 100644 --- a/ariba/summary_cluster.py +++ b/ariba/summary_cluster.py @@ -118,6 +118,8 @@ def _to_cluster_summary_assembled(self): return 'yes' else: return 'yes_nonunique' + elif 
self.flag.has('assembled_into_one_contig'): + return 'interrupted' else: return 'fragmented' From efc129122fe4731df9a5e96d77c6e9c80abc4e20 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:00:53 +0100 Subject: [PATCH 09/40] track variant groups, if present --- ariba/summary_cluster.py | 5 ++++- ariba/tests/summary_cluster_test.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py index 2fd5b07f..3b1daeb1 100644 --- a/ariba/summary_cluster.py +++ b/ariba/summary_cluster.py @@ -297,5 +297,8 @@ def known_noncoding_het_snps(self): for d in self.data: snp_tuple = self._get_known_noncoding_het_snp(d) if snp_tuple is not None: - snps[snp_tuple[0]] = snp_tuple[1] + snp_id = d['var_description'].split(':')[4] + if snp_id not in snps: + snps[snp_id] = {} + snps[snp_id][snp_tuple[0]] = snp_tuple[1] return snps diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py index 9ee7c458..44727544 100644 --- a/ariba/tests/summary_cluster_test.py +++ b/ariba/tests/summary_cluster_test.py @@ -455,15 +455,19 @@ def test_known_noncoding_het_snps(self): '''test known_noncoding_het_snps''' lines = [ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs' + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs' ] cluster = summary_cluster.SummaryCluster() for line in lines: cluster.add_data_dict(summary_cluster.SummaryCluster.line2dict(line)) got = cluster.known_noncoding_het_snps() - expected = {'A42T': 25.0, 'A62T': 75.0, 'A82T': 40.0} + expected = { + '.': {'A82T': 40.0}, + 'id1': {'A42T': 25.0}, + 'id2': {'A62T': 75.0}, + } self.assertEqual(expected, got) From 116c45386130876684d68d085af0052cf77bee00 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:01:35 +0100 Subject: [PATCH 10/40] Format of dict changed: now has groups ids as well --- ariba/tests/summary_sample_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/tests/summary_sample_test.py b/ariba/tests/summary_sample_test.py index 091e8c09..32156c26 100644 --- a/ariba/tests/summary_sample_test.py +++ b/ariba/tests/summary_sample_test.py @@ -104,7 +104,7 @@ def test_variant_column_names_tuples_and_het_snps(self): expected_het_snps = { 'cluster.v': {}, - 'cluster.n': {'A14T': 80.0}, + 'cluster.n': {'.': {'A14T': 80.0}}, 
'cluster.p': {}, } self.assertEqual(expected_het_snps, got_het_snps) From fa680a3b08a7ab79c1df8f994221771a1195c126 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:02:14 +0100 Subject: [PATCH 11/40] Fix format of variant string --- ariba/tests/data/summary_test_get_all_het_snps.1.tsv | 4 ++-- ariba/tests/data/summary_test_get_all_het_snps.2.tsv | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ariba/tests/data/summary_test_get_all_het_snps.1.tsv b/ariba/tests/data/summary_test_get_all_het_snps.1.tsv index d1f5f70b..3e67eeb1 100644 --- a/ariba/tests/data/summary_test_get_all_het_snps.1.tsv +++ b/ariba/tests/data/summary_test_get_all_het_snps.1.tsv @@ -1,3 +1,3 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_get_all_het_snps.2.tsv b/ariba/tests/data/summary_test_get_all_het_snps.2.tsv index 6507d5fd..398aedbc 100644 --- a/ariba/tests/data/summary_test_get_all_het_snps.2.tsv +++ b/ariba/tests/data/summary_test_get_all_het_snps.2.tsv @@ -1,5 +1,5 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id3:variant in ref and reads so should report generic description of noncoding1 -presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 
29 presence_absence1:p:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id3:variant in ref and reads so should report generic description of noncoding1 +presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 variants_only1 1 1 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . . From 1cbf9d890c16b7b076d8105401ea9ebb80fb6d68 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:02:49 +0100 Subject: [PATCH 12/40] Report percents for snp groups. Update phandango colours --- ariba/summary.py | 57 +++++++++++++++++++------------------ ariba/tests/summary_test.py | 16 ++++++++--- 2 files changed, 41 insertions(+), 32 deletions(-) diff --git a/ariba/summary.py b/ariba/summary.py index cabe3c01..a829d1ea 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -121,8 +121,8 @@ def _get_all_het_snps(cls, samples_dict): snps = set() for filename, sample in samples_dict.items(): for cluster, snp_dict in sample.het_snps.items(): - if len(snp_dict): - for snp in snp_dict: + for snp_id in snp_dict: + for snp in snp_dict[snp_id]: snps.add((cluster, snp)) return snps @@ -143,7 +143,6 @@ def _gather_output_rows(self): all_cluster_names = Summary._get_all_cluster_names(self.samples) all_var_columns = Summary._get_all_variant_columns(self.samples) all_het_snps = Summary._get_all_het_snps(self.samples) - if self.var_columns['groups']: var_groups = Summary._get_all_var_groups(self.samples) else: @@ -156,7 +155,7 @@ def _gather_output_rows(self): for cluster in all_cluster_names: rows[filename][cluster] = {} - if cluster in sample.column_summary_data and sample.column_summary_data[cluster]['assembled'].startswith('yes'): + if cluster in sample.column_summary_data and sample.column_summary_data[cluster]['assembled'] not in {'no'}: rows[filename][cluster] = sample.column_summary_data[cluster] else: rows[filename][cluster] = { @@ -171,25 +170,20 @@ def _gather_output_rows(self): if self.var_columns['groups']: for group_name in var_groups[cluster]: if cluster in sample.var_groups and group_name in sample.var_groups[cluster]: + rows[filename][cluster]['vgroup.' + group_name] = 'yes' if self.show_known_het: if cluster in sample.het_snps: - if len(sample.het_snps[cluster]) == 0: - rows[filename][cluster]['vgroup.' + group_name] = 'no' - rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' - elif len(sample.het_snps[cluster]) == 1: - rows[filename][cluster]['vgroup.' + group_name] = 'het' - snp_name = list(sample.het_snps[cluster].keys())[0] - percent = -1 - for v in sample.variant_column_names_tuples[cluster]: - if v[1] == snp_name and snp_name in sample.het_snps[cluster]: - percent = sample.het_snps[cluster][snp_name] - - rows[filename][cluster]['vgroup.' + group_name + '.%'] = percent + if group_name in sample.het_snps[cluster]: + if len(sample.het_snps[cluster][group_name]) == 1: + rows[filename][cluster]['vgroup.' 
+ group_name] = 'het' + percent = list(sample.het_snps[cluster][group_name].values())[0] + rows[filename][cluster]['vgroup.' + group_name + '.%'] = percent + else: + assert len(sample.het_snps[cluster][group_name]) > 1 + rows[filename][cluster]['vgroup.' + group_name] = 'multi_het' + rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' else: - rows[filename][cluster]['vgroup.' + group_name] = 'multi_het' rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' - else: - rows[filename][cluster]['vgroup.' + group_name] = 'yes' else: rows[filename][cluster]['vgroup.' + group_name] = 'no' if self.show_known_het: @@ -201,15 +195,20 @@ def _gather_output_rows(self): continue key = ref_name + '.' + variant - if rows[filename][cluster]['assembled'] == 'no': rows[filename][cluster][key] = 'NA' elif cluster in sample.variant_column_names_tuples and (ref_name, variant, grouped_or_novel, group_name) in sample.variant_column_names_tuples[cluster]: rows[filename][cluster][key] = 'yes' if self.show_known_het: - if cluster in sample.het_snps and variant in sample.het_snps[cluster]: - rows[filename][cluster][key] = 'het' - rows[filename][cluster][key + '.%'] = sample.het_snps[cluster][variant] + if cluster in sample.het_snps: + if grouped_or_novel == 'grouped' and group_name in sample.het_snps[cluster]: + rows[filename][cluster][key] = 'het' + rows[filename][cluster][key + '.%'] = sample.het_snps[cluster][group_name].get(variant, "NA") + elif grouped_or_novel == 'novel' and '.' in sample.het_snps[cluster]: + rows[filename][cluster][key] = 'het' + rows[filename][cluster][key + '.%'] = sample.het_snps['.'].get(variant, "NA") + else: + percent = 'NA' else: rows[filename][cluster][key] = 'no' if self.show_known_het and (cluster, variant) in all_het_snps: @@ -315,11 +314,13 @@ def _add_phandango_colour_columns(cls, header, matrix): matrix = copy.deepcopy(matrix) cols_to_add_colour_col = [i for i in range(len(header)) if header[i].endswith(':o1')] field_to_col = { - 'yes': '#1f78b4', - 'yes_nonunique': '#a6cee3', - 'no': '#33a02c', - 'NA': '#b2df8a', - 'het': '#fb9a99', + 'yes': '#33a02c', + 'yes_nonunique': '#b2df8a', + 'no': '#fb9a99', + 'NA': '#d3d3d3', + 'het': '#fdbf6f', + 'fragmented': '#1f78b4', + 'interrupted': '#a6cee3', } cols_to_add_colour_col.reverse() diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 6b615ee6..6eb790e8 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -225,6 +225,12 @@ def test_gather_output_rows(self): self.assertEqual(expected, got) s.show_known_het = True + expected[infiles[0]]['noncoding1']['vgroup.id1.%'] = 'NA' + expected[infiles[0]]['noncoding1']['vgroup.id3.%'] = 'NA' + expected[infiles[1]]['noncoding1']['vgroup.id1'] = 'het' + expected[infiles[1]]['noncoding1']['vgroup.id1.%'] = 80.0 + expected[infiles[1]]['noncoding1']['vgroup.id3'] = 'yes' + expected[infiles[1]]['noncoding1']['vgroup.id3.%'] = 'NA' expected[infiles[0]]['noncoding1']['noncoding1.A14T.%'] = 'NA' expected[infiles[1]]['noncoding1']['noncoding1.A14T'] = 'het' expected[infiles[1]]['noncoding1']['noncoding1.A14T.%'] = 80.0 @@ -234,6 +240,8 @@ def test_gather_output_rows(self): for filename in expected: del expected[filename]['noncoding1']['vgroup.id1'] del expected[filename]['noncoding1']['vgroup.id3'] + del expected[filename]['noncoding1']['vgroup.id1.%'] + del expected[filename]['noncoding1']['vgroup.id3.%'] for gene_type in expected[filename]: del expected[filename][gene_type]['ref_seq'] @@ -373,10 +381,10 @@ def 
test_add_phandango_colour_columns(self): expected_header = ['head1', 'head2', 'head2:colour', 'head3', 'head3:colour', 'head4', 'head5', 'head5:colour'] expected_matrix = [ - ['yes', 'yes', '#1f78b4', 'yes_nonunique', '#a6cee3', 'yes', 'no', '#33a02c'], - ['yes', 'yes_nonunique', '#a6cee3', 'no', '#33a02c', 'yes', 'NA', '#b2df8a'], - ['yes', 'no', '#33a02c', 'NA', '#b2df8a', 'yes', 'yes', '#1f78b4'], - ['yes', 'NA', '#b2df8a', 'yes', '#1f78b4', 'yes', 'yes_nonunique', '#a6cee3'], + ['yes', 'yes', '#33a02c', 'yes_nonunique', '#b2df8a', 'yes', 'no', '#fb9a99'], + ['yes', 'yes_nonunique', '#b2df8a', 'no', '#fb9a99', 'yes', 'NA', '#d3d3d3'], + ['yes', 'no', '#fb9a99', 'NA', '#d3d3d3', 'yes', 'yes', '#33a02c'], + ['yes', 'NA', '#d3d3d3', 'yes', '#33a02c', 'yes', 'yes_nonunique', '#b2df8a'] ] got_header, got_matrix = summary.Summary._add_phandango_colour_columns(header, matrix) self.assertEqual(expected_header, got_header) From 65f69f071144f5338415226662ddcc62c83d4798 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:21:12 +0100 Subject: [PATCH 13/40] add only_clusters option --- ariba/summary_sample.py | 5 ++++- ariba/tests/summary_sample_test.py | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ariba/summary_sample.py b/ariba/summary_sample.py index c5349f41..ea9575ae 100644 --- a/ariba/summary_sample.py +++ b/ariba/summary_sample.py @@ -15,7 +15,7 @@ def __eq__(self, other): @staticmethod - def _load_file(filename, min_pc_id): + def _load_file(filename, min_pc_id, only_clusters=None): f = pyfastaq.utils.open_file_read(filename) clusters = {} @@ -28,6 +28,9 @@ def _load_file(filename, min_pc_id): data_dict = summary_cluster.SummaryCluster.line2dict(line) cluster = data_dict['cluster'] + if only_clusters is not None and cluster not in only_clusters: + continue + if cluster not in clusters: clusters[cluster] = summary_cluster.SummaryCluster(min_pc_id=min_pc_id) clusters[cluster].add_data_dict(data_dict) diff --git a/ariba/tests/summary_sample_test.py b/ariba/tests/summary_sample_test.py index 32156c26..f097883a 100644 --- a/ariba/tests/summary_sample_test.py +++ b/ariba/tests/summary_sample_test.py @@ -33,6 +33,9 @@ def test_load_file(self): got = summary_sample.SummarySample._load_file(infile, 90) self.assertEqual(expected, got) + got = summary_sample.SummarySample._load_file(infile, 90, only_clusters={'cluster.n'}) + expected = {'cluster.n': cluster1} + self.assertEqual(expected, got) def test_column_summary_data(self): '''Test _column_summary_data''' From d2ffaeb00554d9b56eb3c4c74121b905e8e8b9ab Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:47:25 +0100 Subject: [PATCH 14/40] add only_clusters option --- ariba/summary_sample.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ariba/summary_sample.py b/ariba/summary_sample.py index ea9575ae..5a5b397b 100644 --- a/ariba/summary_sample.py +++ b/ariba/summary_sample.py @@ -4,9 +4,10 @@ class Error (Exception): pass class SummarySample: - def __init__(self, report_tsv, min_pc_id=90): + def __init__(self, report_tsv, min_pc_id=90, only_clusters=None): self.report_tsv = report_tsv self.min_pc_id = min_pc_id + self.only_clusters = only_clusters self.clusters = {} @@ -61,7 +62,7 @@ def _variant_column_names_tuples_and_het_snps(self): def run(self): - self.clusters = self._load_file(self.report_tsv, self.min_pc_id) + self.clusters = self._load_file(self.report_tsv, self.min_pc_id, only_clusters=self.only_clusters) self.column_summary_data = self._column_summary_data() 
self.variant_column_names_tuples, self.het_snps = self._variant_column_names_tuples_and_het_snps() self.var_groups = self._var_groups() From 5f86cee559f904aedfc6647636e35306416ca637 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:48:06 +0100 Subject: [PATCH 15/40] Add only_clusters option --- ariba/summary.py | 4 ++-- ariba/tests/summary_test.py | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/ariba/summary.py b/ariba/summary.py index a829d1ea..5caf97b5 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -82,10 +82,10 @@ def _check_files_exist(self): @classmethod - def _load_input_files(cls, filenames, min_id, verbose=False): + def _load_input_files(cls, filenames, min_id, verbose=False, only_clusters=None): samples = {} for filename in filenames: - samples[filename] = summary_sample.SummarySample(filename, min_pc_id=min_id) + samples[filename] = summary_sample.SummarySample(filename, min_pc_id=min_id, only_clusters=only_clusters) samples[filename].run() if verbose: print('Loaded file', filename, flush=True) diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 6eb790e8..45f01dc1 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -84,6 +84,15 @@ def test_load_input_files(self): expected = {file1: sample1, file2: sample2} self.assertEqual(expected, got) + sample1 = summary_sample.SummarySample(file1, only_clusters={'noncoding1'}) + sample2 = summary_sample.SummarySample(file2, only_clusters={'noncoding1'}) + sample1.run() + sample2.run() + expected = {file1: sample1, file2: sample2} + got = summary.Summary._load_input_files([file1, file2], 90, only_clusters={'noncoding1'}) + self.assertEqual(expected, got) + + def test_get_all_cluster_names(self): '''Test _get_all_cluster_names''' From a78cd3a1c60d4c775c2a781ce14fb5ba507d9208 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:52:03 +0100 Subject: [PATCH 16/40] add only_clusters option --- ariba/summary.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ariba/summary.py b/ariba/summary.py index 5caf97b5..7c7774c9 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -22,6 +22,7 @@ def __init__( cluster_cols='assembled,match,ref_seq,pct_id,known_var,novel_var', variant_cols='groups,grouped,ungrouped,novel', make_phandango_tree=True, + only_clusters=None, verbose=False, ): if filenames is None and fofn is None: @@ -43,6 +44,7 @@ def __init__( self.min_id = min_id self.outprefix = outprefix self.make_phandango_tree = make_phandango_tree + self.only_clusters = only_clusters self.verbose = verbose @@ -396,7 +398,7 @@ def run(self): if self.verbose: print('Loading input files...', flush=True) self._check_files_exist() - self.samples = self._load_input_files(self.filenames, self.min_id, verbose=self.verbose) + self.samples = self._load_input_files(self.filenames, self.min_id, verbose=self.verbose, only_clusters=self.only_clusters) if self.verbose: print('Generating output rows', flush=True) self.rows = self._gather_output_rows() From 898a6337f40e6611675330a09d345bbf00175c29 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:58:50 +0100 Subject: [PATCH 17/40] Add only_cluster option --- ariba/tasks/summary.py | 1 + scripts/ariba | 1 + 2 files changed, 2 insertions(+) diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py index ec1cd879..c674df3a 100644 --- a/ariba/tasks/summary.py +++ b/ariba/tasks/summary.py @@ -97,6 +97,7 @@ def run(options): cluster_cols=options.cluster_cols, 
variant_cols=options.var_cols, make_phandango_tree=(not options.no_tree), + only_clusters=None if options.only_cluster is None else {options.only_cluster}, verbose=options.verbose ) s.run() diff --git a/scripts/ariba b/scripts/ariba index 63fc5d93..f0b91a86 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -170,6 +170,7 @@ subparser_summary.add_argument('--no_tree', action='store_true', help='Do not ma subparser_summary.add_argument('--row_filter', choices=['y', 'n'], default='y', help='Choose whether rows where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n') subparser_summary.add_argument('--var_cols', help='Comma separated list of variant columns to include. Choose from: groups, grouped, ungrouped, novel [none by default]', metavar='col1,col2,...', default='') subparser_summary.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT') +subparser_summary.add_argument('--only_cluster', help='Only report data for the given cluster name', metavar='Cluster_name') subparser_summary.add_argument('--verbose', action='store_true', help='Be verbose') subparser_summary.add_argument('outprefix', help='Prefix of output files') subparser_summary.add_argument('infiles', nargs='*', help='Files to be summarised') From d94178214c4529149cd766212b940af4fe46807f Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 10:49:40 +0100 Subject: [PATCH 18/40] New method _get_het_percent --- ariba/summary_cluster.py | 24 ++++++++++++++++++++++ ariba/tests/summary_cluster_test.py | 32 +++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py index 3b1daeb1..4def11cb 100644 --- a/ariba/summary_cluster.py +++ b/ariba/summary_cluster.py @@ -220,6 +220,7 @@ def _get_known_noncoding_het_snp(data_dict): return None + @staticmethod def _get_nonsynonymous_var(data_dict): '''if data_dict has a non synonymous variant, return string: @@ -302,3 +303,26 @@ def known_noncoding_het_snps(self): snps[snp_id] = {} snps[snp_id][snp_tuple[0]] = snp_tuple[1] return snps + + + @classmethod + def _get_het_percent(cls, data_dict): + if data_dict['gene'] == '1' or data_dict['ref_ctg_effect'] != 'SNP' or data_dict['smtls_alt_nt'] == '.' or ';' in data_dict['smtls_alt_nt']: + return None + else: + nucleotides = [data_dict['ctg_nt']] + data_dict['smtls_alt_nt'].split(',') + depths = data_dict['smtls_alt_depth'].split(',') + + if len(nucleotides) != len(depths): + raise Error('Mismatch in number of inferred nucleotides from ctg_nt, smtls_alt_nt, smtls_alt_depth columns. 
Cannot continue\n' + str(data_dict)) + + try: + var_nucleotide = data_dict['known_var_change'][-1] + depths = [int(x) for x in depths] + nuc_to_depth = dict(zip(nucleotides, depths)) + total_depth = sum(depths) + var_depth = nuc_to_depth.get(var_nucleotide, 0) + return round(100 * var_depth / total_depth, 1) + except: + return None + diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py index 44727544..e46440df 100644 --- a/ariba/tests/summary_cluster_test.py +++ b/ariba/tests/summary_cluster_test.py @@ -471,3 +471,35 @@ def test_known_noncoding_het_snps(self): } self.assertEqual(expected, got) + + def test_get_het_percent(self): + '''test _get_het_percent''' + #FIXME + lines = [ + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs' + ] + + expected = [None, 25.0, 75.0, 40.0] + assert len(lines) == len(expected) + + for i in range(len(lines)): + data_dict = summary_cluster.SummaryCluster.line2dict(lines[i]) + got = summary_cluster.SummaryCluster._get_het_percent(data_dict) + self.assertEqual(expected[i], got) + + + def test_get_nonsynon_variant_data(self): + '''test _get_nonsynon_variant_data''' + #FIXME + pass + + + def test_get_all_nonsynon_variants(self): + '''test _get_all_nonsynon_variants''' + #FIXME + pass + + From 09c393e611f69d59651facfbe2b355138ebc95f3 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 11:03:04 +0100 Subject: [PATCH 19/40] New method _get_nonsynon_variant_data --- ariba/summary_cluster.py | 28 ++++++++++++++++++++++++++++ ariba/tests/summary_cluster_test.py | 25 +++++++++++++++++-------- 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py index 4def11cb..4027e38e 100644 --- a/ariba/summary_cluster.py +++ b/ariba/summary_cluster.py @@ -326,3 +326,31 @@ def _get_het_percent(cls, data_dict): except: return None + + @classmethod + def _get_nonsynon_variant_data(cls, data_dict): + if not SummaryCluster._has_nonsynonymous(data_dict): + return None + + if data_dict['known_var_change'] == data_dict['ref_ctg_change'] == '.' == data_dict['ref_ctg_effect']: + raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change, ref_ctg_change, ref_ctg_effect all equal to ".", but has a non synonymous change. Something is inconsistent. Cannot continue') + elif '.' not in [data_dict['known_var_change'], data_dict['ref_ctg_change']] and \ + data_dict['known_var_change'] != data_dict['ref_ctg_change']: + raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change != ref_ctg_change. 
Cannot continue') + + var_data = { + 'known': data_dict['known_var'] == '1', + 'var_group': data_dict['var_group'], + 'coding': data_dict['gene'] == '1' + } + + if data_dict['known_var'] == '1' and data_dict['known_var_change'] != '.': + var_data['var_string'] = data_dict['known_var_change'] + elif data_dict['ref_ctg_change'] != '.': + var_data['var_string'] = data_dict['ref_ctg_change'] + else: + var_data['var_string'] = data_dict['ref_ctg_effect'] + + var_data['het_percent'] = SummaryCluster._get_het_percent(data_dict) + return var_data + diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py index e46440df..502c770c 100644 --- a/ariba/tests/summary_cluster_test.py +++ b/ariba/tests/summary_cluster_test.py @@ -474,7 +474,6 @@ def test_known_noncoding_het_snps(self): def test_get_het_percent(self): '''test _get_het_percent''' - #FIXME lines = [ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs', @@ -493,13 +492,23 @@ def test_get_het_percent(self): def test_get_nonsynon_variant_data(self): '''test _get_nonsynon_variant_data''' - #FIXME - pass - + lines = [ + 'ref1\t1\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tI14L\t1\tI14L\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:I14L:.:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA,G\t20,10,10\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + ] - def test_get_all_nonsynon_variants(self): - '''test _get_all_nonsynon_variants''' - #FIXME - pass + expected = [ + {'coding': True, 'known': True, 'var_string': 'I14L', 'var_group': '.', 'het_percent': None}, + {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': None}, + {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 25.0}, + {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 50.0}, + ] + assert len(lines) == len(expected) + for i in range(len(lines)): + data_dict = summary_cluster.SummaryCluster.line2dict(lines[i]) + got = summary_cluster.SummaryCluster._get_nonsynon_variant_data(data_dict) + self.assertEqual(expected[i], got) From d9ccdd04cc7bc227ac02dfe026ddbc080ba7d6f6 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 11:19:28 +0100 Subject: [PATCH 20/40] New class SummaryClusterVariant --- ariba/__init__.py | 1 + ariba/summary_cluster.py | 50 --------------- ariba/summary_cluster_variant.py | 66 ++++++++++++++++++++ ariba/tests/summary_cluster_test.py | 41 ------------- ariba/tests/summary_cluster_variant_test.py | 67 +++++++++++++++++++++ 5 files changed, 134 insertions(+), 91 deletions(-) create mode 100644 ariba/summary_cluster_variant.py create mode 100644 ariba/tests/summary_cluster_variant_test.py diff --git a/ariba/__init__.py 
b/ariba/__init__.py index 0c36b1a4..1d589dc3 100644 --- a/ariba/__init__.py +++ b/ariba/__init__.py @@ -39,6 +39,7 @@ 'sequence_variant', 'summary', 'summary_cluster', + 'summary_cluster_variant', 'summary_sample', 'tasks', 'versions', diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py index 4027e38e..b0a6a03e 100644 --- a/ariba/summary_cluster.py +++ b/ariba/summary_cluster.py @@ -304,53 +304,3 @@ def known_noncoding_het_snps(self): snps[snp_id][snp_tuple[0]] = snp_tuple[1] return snps - - @classmethod - def _get_het_percent(cls, data_dict): - if data_dict['gene'] == '1' or data_dict['ref_ctg_effect'] != 'SNP' or data_dict['smtls_alt_nt'] == '.' or ';' in data_dict['smtls_alt_nt']: - return None - else: - nucleotides = [data_dict['ctg_nt']] + data_dict['smtls_alt_nt'].split(',') - depths = data_dict['smtls_alt_depth'].split(',') - - if len(nucleotides) != len(depths): - raise Error('Mismatch in number of inferred nucleotides from ctg_nt, smtls_alt_nt, smtls_alt_depth columns. Cannot continue\n' + str(data_dict)) - - try: - var_nucleotide = data_dict['known_var_change'][-1] - depths = [int(x) for x in depths] - nuc_to_depth = dict(zip(nucleotides, depths)) - total_depth = sum(depths) - var_depth = nuc_to_depth.get(var_nucleotide, 0) - return round(100 * var_depth / total_depth, 1) - except: - return None - - - @classmethod - def _get_nonsynon_variant_data(cls, data_dict): - if not SummaryCluster._has_nonsynonymous(data_dict): - return None - - if data_dict['known_var_change'] == data_dict['ref_ctg_change'] == '.' == data_dict['ref_ctg_effect']: - raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change, ref_ctg_change, ref_ctg_effect all equal to ".", but has a non synonymous change. Something is inconsistent. Cannot continue') - elif '.' not in [data_dict['known_var_change'], data_dict['ref_ctg_change']] and \ - data_dict['known_var_change'] != data_dict['ref_ctg_change']: - raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change != ref_ctg_change. Cannot continue') - - var_data = { - 'known': data_dict['known_var'] == '1', - 'var_group': data_dict['var_group'], - 'coding': data_dict['gene'] == '1' - } - - if data_dict['known_var'] == '1' and data_dict['known_var_change'] != '.': - var_data['var_string'] = data_dict['known_var_change'] - elif data_dict['ref_ctg_change'] != '.': - var_data['var_string'] = data_dict['ref_ctg_change'] - else: - var_data['var_string'] = data_dict['ref_ctg_effect'] - - var_data['het_percent'] = SummaryCluster._get_het_percent(data_dict) - return var_data - diff --git a/ariba/summary_cluster_variant.py b/ariba/summary_cluster_variant.py new file mode 100644 index 00000000..4043a922 --- /dev/null +++ b/ariba/summary_cluster_variant.py @@ -0,0 +1,66 @@ +from ariba import flag, report + +class Error (Exception): pass + +class SummaryClusterVariant: + def __init__(self, data_dict): + self._get_nonsynon_variant_data(data_dict) + + + @classmethod + def _has_nonsynonymous(cls, data_dict): + return data_dict['ref_ctg_effect'] != 'SYN' and \ + ( + data_dict['has_known_var'] == '1' or \ + (data_dict['known_var'] != '1' and (data_dict['ref_ctg_change'] != '.' or data_dict['ref_ctg_effect'] != '.')) + ) + + + @classmethod + def _get_het_percent(cls, data_dict): + if data_dict['gene'] == '1' or data_dict['ref_ctg_effect'] != 'SNP' or data_dict['smtls_alt_nt'] == '.' 
or ';' in data_dict['smtls_alt_nt']: + return None + else: + nucleotides = [data_dict['ctg_nt']] + data_dict['smtls_alt_nt'].split(',') + depths = data_dict['smtls_alt_depth'].split(',') + + if len(nucleotides) != len(depths): + raise Error('Mismatch in number of inferred nucleotides from ctg_nt, smtls_alt_nt, smtls_alt_depth columns. Cannot continue\n' + str(data_dict)) + + try: + var_nucleotide = data_dict['known_var_change'][-1] + depths = [int(x) for x in depths] + nuc_to_depth = dict(zip(nucleotides, depths)) + total_depth = sum(depths) + var_depth = nuc_to_depth.get(var_nucleotide, 0) + return round(100 * var_depth / total_depth, 1) + except: + return None + + + def _get_nonsynon_variant_data(self, data_dict): + if not SummaryClusterVariant._has_nonsynonymous(data_dict): + self.has_nonsynon = False + return + + self.has_nonsynon = True + + if data_dict['known_var_change'] == data_dict['ref_ctg_change'] == '.' == data_dict['ref_ctg_effect']: + raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change, ref_ctg_change, ref_ctg_effect all equal to ".", but has a non synonymous change. Something is inconsistent. Cannot continue') + elif '.' not in [data_dict['known_var_change'], data_dict['ref_ctg_change']] and \ + data_dict['known_var_change'] != data_dict['ref_ctg_change']: + raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change != ref_ctg_change. Cannot continue') + + self.known = data_dict['known_var'] == '1' + self.var_group = data_dict['var_group'] + self.coding = data_dict['gene'] == '1' + + if data_dict['known_var'] == '1' and data_dict['known_var_change'] != '.': + self.var_string = data_dict['known_var_change'] + elif data_dict['ref_ctg_change'] != '.': + self.var_string = data_dict['ref_ctg_change'] + else: + self.var_string = data_dict['ref_ctg_effect'] + + self.het_percent = SummaryClusterVariant._get_het_percent(data_dict) + diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py index 502c770c..44727544 100644 --- a/ariba/tests/summary_cluster_test.py +++ b/ariba/tests/summary_cluster_test.py @@ -471,44 +471,3 @@ def test_known_noncoding_het_snps(self): } self.assertEqual(expected, got) - - def test_get_het_percent(self): - '''test _get_het_percent''' - lines = [ - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs' - ] - - expected = [None, 25.0, 75.0, 40.0] - assert len(lines) == len(expected) - - for i in range(len(lines)): - data_dict = summary_cluster.SummaryCluster.line2dict(lines[i]) - got = summary_cluster.SummaryCluster._get_het_percent(data_dict) - self.assertEqual(expected[i], got) - - - def test_get_nonsynon_variant_data(self): - '''test _get_nonsynon_variant_data''' - lines = [ - 
'ref1\t1\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tI14L\t1\tI14L\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:I14L:.:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA,G\t20,10,10\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', - ] - - expected = [ - {'coding': True, 'known': True, 'var_string': 'I14L', 'var_group': '.', 'het_percent': None}, - {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': None}, - {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 25.0}, - {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 50.0}, - ] - assert len(lines) == len(expected) - - for i in range(len(lines)): - data_dict = summary_cluster.SummaryCluster.line2dict(lines[i]) - got = summary_cluster.SummaryCluster._get_nonsynon_variant_data(data_dict) - self.assertEqual(expected[i], got) - diff --git a/ariba/tests/summary_cluster_variant_test.py b/ariba/tests/summary_cluster_variant_test.py new file mode 100644 index 00000000..ec099422 --- /dev/null +++ b/ariba/tests/summary_cluster_variant_test.py @@ -0,0 +1,67 @@ +import unittest +import os +from ariba import summary_cluster, summary_cluster_variant + + +class TestSummaryClusterVariant(unittest.TestCase): + def test_has_nonsynonymous(self): + '''Test _has_nonsynonymous''' + lines = [ + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.', + 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.' 
+ ] + + dicts = [summary_cluster.SummaryCluster.line2dict(x) for x in lines] + expected = [False, True, False, True, True, True] + assert len(dicts) == len(expected) + + for i in range(len(dicts)): + self.assertEqual(expected[i], summary_cluster_variant.SummaryClusterVariant._has_nonsynonymous(dicts[i])) + + + def test_get_het_percent(self): + '''test _get_het_percent''' + lines = [ + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs' + ] + + expected = [None, 25.0, 75.0, 40.0] + assert len(lines) == len(expected) + + for i in range(len(lines)): + data_dict = summary_cluster.SummaryCluster.line2dict(lines[i]) + got = summary_cluster_variant.SummaryClusterVariant._get_het_percent(data_dict) + self.assertEqual(expected[i], got) + + + def test_init(self): + '''test __init__''' + lines = [ + 'ref1\t1\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tI14L\t1\tI14L\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:I14L:.:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA,G\t20,10,10\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + ] + + expected = [ + {'coding': True, 'known': True, 'var_string': 'I14L', 'var_group': '.', 'het_percent': None}, + {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': None}, + {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 25.0}, + {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 50.0}, + ] + assert len(lines) == len(expected) + + for i in range(len(lines)): + data_dict = summary_cluster.SummaryCluster.line2dict(lines[i]) + cluster_var = summary_cluster_variant.SummaryClusterVariant(data_dict) + for key in expected[i]: + got_value = eval('cluster_var.' 
+ key) + self.assertEqual(expected[i][key], got_value) + From 3ef9b21d91e01068ad91be13290c3906f158d283 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 11:53:57 +0100 Subject: [PATCH 21/40] Make hashable and add __str__ method --- ariba/summary_cluster_variant.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ariba/summary_cluster_variant.py b/ariba/summary_cluster_variant.py index 4043a922..b9c7c6db 100644 --- a/ariba/summary_cluster_variant.py +++ b/ariba/summary_cluster_variant.py @@ -7,6 +7,21 @@ def __init__(self, data_dict): self._get_nonsynon_variant_data(data_dict) + def __eq__(self, other): + return type(other) is type(self) and self.__dict__ == other.__dict__ + + + def __hash__(self): + return hash(tuple([self.__dict__[x] for x in sorted(self.__dict__.keys())])) + + + def __str__(self): + if self.has_nonsynon: + return ', '.join((str(self.known), self.var_group, str(self.coding), self.var_string, str(self.het_percent))) + else: + return 'None' + + @classmethod def _has_nonsynonymous(cls, data_dict): return data_dict['ref_ctg_effect'] != 'SYN' and \ From ba94cc105773d1ddbfdec4a259018da61be98863 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 11:54:20 +0100 Subject: [PATCH 22/40] New method _get_all_nonsynon_variants_set --- ariba/summary_cluster.py | 14 +++++++++++++- ariba/tests/summary_cluster_test.py | 19 ++++++++++++++++++- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py index b0a6a03e..8bf07f4a 100644 --- a/ariba/summary_cluster.py +++ b/ariba/summary_cluster.py @@ -1,4 +1,4 @@ -from ariba import flag, report +from ariba import flag, report, summary_cluster_variant class Error (Exception): pass @@ -304,3 +304,15 @@ def known_noncoding_het_snps(self): snps[snp_id][snp_tuple[0]] = snp_tuple[1] return snps + + @classmethod + def _get_all_nonsynon_variants_set(cls, data_dicts): + variants = set() + + for data_dict in data_dicts: + cluster_var = summary_cluster_variant.SummaryClusterVariant(data_dict) + if cluster_var.has_nonsynon: + variants.add(cluster_var) + + return variants + diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py index 44727544..d3bfffa6 100644 --- a/ariba/tests/summary_cluster_test.py +++ b/ariba/tests/summary_cluster_test.py @@ -1,6 +1,6 @@ import unittest import os -from ariba import flag, summary_cluster +from ariba import flag, summary_cluster, summary_cluster_variant modules_dir = os.path.dirname(os.path.abspath(summary_cluster.__file__)) data_dir = os.path.join(modules_dir, 'tests', 'data') @@ -471,3 +471,20 @@ def test_known_noncoding_het_snps(self): } self.assertEqual(expected, got) + + def test_get_all_nonsynon_variants_set(self): + '''test _get_all_nonsynon_variants_set''' + lines = [ + 'ref1\t0\t0\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text', + 'ref1\t1\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tI14L\t1\tI14L\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:I14L:.:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA,G\t20,10,10\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + ] + + data_dicts = 
[summary_cluster.SummaryCluster.line2dict(x) for x in lines] + + cluster_vars = [summary_cluster_variant.SummaryClusterVariant(x) for x in data_dicts] + expected = {x for x in cluster_vars if x.has_nonsynon} + got = summary_cluster.SummaryCluster._get_all_nonsynon_variants_set(data_dicts) + self.assertEqual(expected, got) + From b63b57496ff2d2333d85ddbea458f5a0e2b89d1c Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 12:52:34 +0100 Subject: [PATCH 23/40] New method gather_data --- ariba/summary_cluster.py | 5 +++++ ariba/tests/summary_cluster_test.py | 30 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py index 8bf07f4a..efc4cf8c 100644 --- a/ariba/summary_cluster.py +++ b/ariba/summary_cluster.py @@ -316,3 +316,8 @@ def _get_all_nonsynon_variants_set(cls, data_dicts): return variants + + def gather_data(self): + self.summary = self.column_summary_data() + self.variants = self._get_all_nonsynon_variants_set(self.data) + diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py index d3bfffa6..f5022fce 100644 --- a/ariba/tests/summary_cluster_test.py +++ b/ariba/tests/summary_cluster_test.py @@ -488,3 +488,33 @@ def test_get_all_nonsynon_variants_set(self): got = summary_cluster.SummaryCluster._get_all_nonsynon_variants_set(data_dicts) self.assertEqual(expected, got) + + def test_gather_data(self): + '''test gather_data''' + lines = [ + 'ref1\t0\t0\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text', + 'ref1\t1\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tI14L\t1\tI14L\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:I14L:.:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA,G\t20,10,10\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + ] + + data_dicts = [summary_cluster.SummaryCluster.line2dict(x) for x in lines] + cluster = summary_cluster.SummaryCluster() + for data_dict in data_dicts: + cluster.add_data_dict(data_dict) + + cluster.gather_data() + expected_summary = { + 'assembled': 'yes', + 'match': 'yes', + 'ref_seq': 'ref1', + 'pct_id': '98.33', + 'known_var': 'yes', + 'novel_var': 'no', + } + self.assertEqual(expected_summary, cluster.summary) + + cluster_vars = [summary_cluster_variant.SummaryClusterVariant(x) for x in data_dicts] + expected_variants = {x for x in cluster_vars if x.has_nonsynon} + self.assertEqual(expected_variants, cluster.variants) + From 2e3713f2fed38f0eec91770b7821069c7257a0cf Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 12:58:03 +0100 Subject: [PATCH 24/40] Use new summary_cluster that stores variant info --- ariba/summary_sample.py | 4 ++++ ariba/tests/summary_sample_test.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/ariba/summary_sample.py b/ariba/summary_sample.py index 5a5b397b..df4dbaea 100644 --- a/ariba/summary_sample.py +++ b/ariba/summary_sample.py @@ -37,6 +37,10 @@ def _load_file(filename, min_pc_id, only_clusters=None): clusters[cluster].add_data_dict(data_dict) pyfastaq.utils.close(f) + + for cluster_name, cluster in clusters.items(): + cluster.gather_data() + return clusters diff --git 
a/ariba/tests/summary_sample_test.py b/ariba/tests/summary_sample_test.py index f097883a..67ca2bc9 100644 --- a/ariba/tests/summary_sample_test.py +++ b/ariba/tests/summary_sample_test.py @@ -18,11 +18,14 @@ def test_load_file(self): cluster1.add_data_dict(dicts[0]) cluster1.add_data_dict(dicts[1]) cluster1.add_data_dict(dicts[2]) + cluster1.gather_data() cluster2 = summary_cluster.SummaryCluster() cluster2.add_data_dict(dicts[3]) cluster2.add_data_dict(dicts[4]) + cluster2.gather_data() cluster3 = summary_cluster.SummaryCluster() cluster3.add_data_dict(dicts[5]) + cluster3.gather_data() expected = { 'cluster.n': cluster1, From 09b4d7b8072a9dd34f3fc180d88b6e611fbea7d7 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 19:08:49 +0100 Subject: [PATCH 25/40] Do not use when not present (usually unassembled) --- ariba/summary_sample.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ariba/summary_sample.py b/ariba/summary_sample.py index df4dbaea..bc1ea25f 100644 --- a/ariba/summary_sample.py +++ b/ariba/summary_sample.py @@ -38,8 +38,15 @@ def _load_file(filename, min_pc_id, only_clusters=None): pyfastaq.utils.close(f) + to_delete = set() + for cluster_name, cluster in clusters.items(): cluster.gather_data() + if cluster.name is None: + to_delete.add(cluster_name) + + for name in to_delete: + del clusters[name] return clusters From 4215634b3853c325e666fcdbd5b8881106272c48 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 19:10:54 +0100 Subject: [PATCH 26/40] New method _gather_unfiltered_output_data --- ariba/summary.py | 49 +++++++++++ ariba/tests/summary_test.py | 160 ++++++++++++++++++++++++++++++++++++ 2 files changed, 209 insertions(+) diff --git a/ariba/summary.py b/ariba/summary.py index 7c7774c9..bdad93a3 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -23,6 +23,8 @@ def __init__( variant_cols='groups,grouped,ungrouped,novel', make_phandango_tree=True, only_clusters=None, + show_var_groups=False, + show_vars=False, verbose=False, ): if filenames is None and fofn is None: @@ -45,6 +47,8 @@ def __init__( self.outprefix = outprefix self.make_phandango_tree = make_phandango_tree self.only_clusters = only_clusters + self.show_var_groups = show_var_groups + self.show_vars = show_vars self.verbose = verbose @@ -462,3 +466,48 @@ def run(self): if self.verbose: print('Finished', flush=True) + + + def _gather_unfiltered_output_data(self): + self.all_potential_columns = {} + self.all_data = {} + + for filename in sorted(self.samples): + self.all_data[filename] = {} + for cluster in self.samples[filename].clusters.values(): + self.all_data[filename][cluster.name] = {} + if cluster.name not in self.all_potential_columns: + self.all_potential_columns[cluster.name] = {'summary' : set(), 'groups': set(), 'vars': set()} + + this_cluster_dict = {'summary': copy.copy(cluster.summary), 'groups': {}, 'vars': {}} + seen_groups = {} + + for variant in cluster.variants: + if self.show_vars: + this_cluster_dict['vars'][variant.var_string] = 'yes' if variant.het_percent is None else 'het' + if variant.het_percent is not None: + this_cluster_dict['vars'][variant.var_string + '.%'] = variant.het_percent + + if self.show_var_groups and variant.var_group != '.': + if variant.var_group not in seen_groups: + seen_groups[variant.var_group] = {'yes': 0, 'het': 0} + + if variant.het_percent is None: + seen_groups[variant.var_group]['yes'] += 1 + this_cluster_dict['groups'][variant.var_group] = 'yes' + else: + seen_groups[variant.var_group]['het'] += 1 + 
this_cluster_dict['groups'][variant.var_group] = 'het' + this_cluster_dict['groups'][variant.var_group + '.%'] = variant.het_percent + + for group, d in seen_groups.items(): + if d['het'] > 0 and d['het'] + d['yes'] > 1: + this_cluster_dict['groups'][group] = 'yes_multi_het' + this_cluster_dict['groups'][group + '.%'] = 'NA' + + for x in this_cluster_dict: + self.all_potential_columns[cluster.name][x].update(set(this_cluster_dict[x].keys())) + + self.all_data[filename][cluster.name] = this_cluster_dict + + return self.all_data, self.all_potential_columns diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 45f01dc1..b6a4d007 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -262,6 +262,166 @@ def test_gather_output_rows(self): self.assertEqual(expected, got) + def test_gather_unfiltered_output_data(self): + '''test gather_output_rows_new''' + infiles = [ + os.path.join(data_dir, 'summary_gather_unfiltered_output_data.in.1.tsv'), + os.path.join(data_dir, 'summary_gather_unfiltered_output_data.in.2.tsv') + ] + s = summary.Summary('out', filenames=infiles, variant_cols=None) + s.samples = summary.Summary._load_input_files(infiles, 90) + got_all, got_potential_cols = s._gather_unfiltered_output_data() + + expected_all = { + infiles[0]: { + 'noncoding1': { + 'summary': { + 'assembled': 'yes', + 'known_var': 'yes', + 'match': 'yes', + 'novel_var': 'no', + 'pct_id': '98.33', + 'ref_seq': 'noncoding_ref1' + }, + 'groups': {}, + 'vars': {}, + }, + 'noncoding2': { + 'summary': { + 'assembled': 'yes', + 'known_var': 'yes', + 'match': 'yes', + 'novel_var': 'no', + 'pct_id': '98.33', + 'ref_seq': 'noncoding_ref2' + }, + 'groups': {}, + 'vars': {}, + }, + 'presence_absence1': { + 'summary': { + 'assembled': 'yes', + 'known_var': 'no', + 'match': 'yes', + 'novel_var': 'yes', + 'pct_id': '98.96', + 'ref_seq': 'presence_absence_ref1' + }, + 'groups': {}, + 'vars': {}, + } + }, + infiles[1]: { + 'noncoding1': { + 'summary': {'assembled': 'yes', + 'known_var': 'yes', + 'match': 'yes', + 'novel_var': 'no', + 'pct_id': '98.33', + 'ref_seq': 'noncoding_ref1' + }, + 'groups': {}, + 'vars': {}, + }, + 'noncoding2': { + 'summary': { + 'assembled': 'yes', + 'known_var': 'yes', + 'match': 'yes', + 'novel_var': 'no', + 'pct_id': '98.33', + 'ref_seq': 'noncoding_ref2' + }, + 'groups': {}, + 'vars': {}, + }, + 'presence_absence1': { + 'summary': { + 'assembled': 'yes', + 'known_var': 'no', + 'match': 'yes', + 'novel_var': 'yes', + 'pct_id': '98.96', + 'ref_seq': 'presence_absence1' + }, + 'groups': {}, + 'vars': {} + } + } + } + + expected_potential_cols = { + 'noncoding1': { + 'summary': { + 'assembled', + 'known_var', + 'match', + 'novel_var', + 'pct_id', + 'ref_seq' + }, + 'groups': set(), + 'vars': set() + }, + 'noncoding2': { + 'summary': { + 'assembled', + 'known_var', + 'match', + 'novel_var', + 'pct_id', + 'ref_seq' + }, + 'groups': set(), + 'vars': set() + }, + 'presence_absence1': { + 'summary': { + 'assembled', + 'known_var', + 'match', + 'novel_var', + 'pct_id', + 'ref_seq' + }, + 'groups': set(), + 'vars': set() + } + } + + self.assertEqual(expected_potential_cols, got_potential_cols) + self.assertEqual(expected_all, got_all) + + expected_potential_cols['noncoding1']['groups'] = {'id3', 'id1', 'id1.%'} + expected_potential_cols['noncoding2']['groups'] = {'id2.%', 'id2'} + expected_all[infiles[0]]['noncoding1']['groups'] = {'id1': 'yes'} + expected_all[infiles[0]]['noncoding2']['groups'] = {'id2': 'yes_multi_het', 'id2.%': 'NA'} + 
expected_all[infiles[1]]['noncoding1']['groups'] = {'id1': 'het', 'id1.%': 80.0, 'id3': 'yes'} + expected_all[infiles[1]]['noncoding2']['groups'] = {'id2': 'het', 'id2.%': 40.0} + s = summary.Summary('out', filenames=infiles, variant_cols=None, show_var_groups=True) + s.samples = summary.Summary._load_input_files(infiles, 90) + got_all, got_potential_cols = s._gather_unfiltered_output_data() + self.assertEqual(expected_potential_cols, got_potential_cols) + self.assertEqual(expected_all, got_all) + + expected_potential_cols['noncoding1']['vars'] = {'A14T.%', 'A6G', 'A14T'} + expected_potential_cols['noncoding2']['vars'] = {'A52T', 'A52T.%', 'A42T'} + expected_potential_cols['presence_absence1']['vars'] = {'A10V'} + + expected_all[infiles[0]]['noncoding1']['vars'] = {'A14T': 'yes'} + expected_all[infiles[0]]['noncoding2']['vars'] = {'A42T': 'yes', 'A52T': 'het', 'A52T.%': 40.0} + expected_all[infiles[0]]['presence_absence1']['vars'] = {'A10V': 'yes'} + expected_all[infiles[1]]['noncoding1']['vars'] = {'A14T': 'het', 'A14T.%': 80.0, 'A6G': 'yes'} + expected_all[infiles[1]]['noncoding2']['vars'] = {'A52T': 'het', 'A52T.%': 40.0} + expected_all[infiles[1]]['presence_absence1']['vars'] = {'A10V': 'yes'} + s = summary.Summary('out', filenames=infiles, variant_cols=None, show_var_groups=True, show_vars=True) + s.samples = summary.Summary._load_input_files(infiles, 90) + got_all, got_potential_cols = s._gather_unfiltered_output_data() + self.maxDiff = None + self.assertEqual(expected_potential_cols, got_potential_cols) + self.assertEqual(expected_all, got_all) + + def test_to_matrix(self): '''Test _to_matrix''' rows = { From 2af126d6a5d4e83c1a240eea0d562967d24ae0d6 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 19:11:43 +0100 Subject: [PATCH 27/40] Remove maxDiff=None --- ariba/tests/summary_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index b6a4d007..74a2f77d 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -417,7 +417,6 @@ def test_gather_unfiltered_output_data(self): s = summary.Summary('out', filenames=infiles, variant_cols=None, show_var_groups=True, show_vars=True) s.samples = summary.Summary._load_input_files(infiles, 90) got_all, got_potential_cols = s._gather_unfiltered_output_data() - self.maxDiff = None self.assertEqual(expected_potential_cols, got_potential_cols) self.assertEqual(expected_all, got_all) From edd503e543dd369a398c05f3cf0f2bd4a41cf370 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 19:36:18 +0100 Subject: [PATCH 28/40] Add test files for summary_gather_unfiltered_output_data --- .../data/summary_gather_unfiltered_output_data.in.1.tsv | 5 +++++ .../data/summary_gather_unfiltered_output_data.in.2.tsv | 6 ++++++ 2 files changed, 11 insertions(+) create mode 100644 ariba/tests/data/summary_gather_unfiltered_output_data.in.1.tsv create mode 100644 ariba/tests/data/summary_gather_unfiltered_output_data.in.2.tsv diff --git a/ariba/tests/data/summary_gather_unfiltered_output_data.in.1.tsv b/ariba/tests/data/summary_gather_unfiltered_output_data.in.1.tsv new file mode 100644 index 00000000..1957349c --- /dev/null +++ b/ariba/tests/data/summary_gather_unfiltered_output_data.in.1.tsv @@ -0,0 +1,5 @@ +#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt 
smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text +noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding_ref1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A42T 1 A42T SNP 42 42 A 84 84 T 17 . 17 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A52T 1 A52T SNP 42 42 A 84 84 T 17 G 20,30 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence_ref1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence_ref1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_gather_unfiltered_output_data.in.2.tsv b/ariba/tests/data/summary_gather_unfiltered_output_data.in.2.tsv new file mode 100644 index 00000000..4a23ebc4 --- /dev/null +++ b/ariba/tests/data/summary_gather_unfiltered_output_data.in.2.tsv @@ -0,0 +1,6 @@ +#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text +noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id3:variant in ref and reads so should report generic description of noncoding1 +noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A52T 1 A52T SNP 42 42 A 84 84 T 17 G 20,30 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 +variants_only1 1 1 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . . 
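
Illustrative sketch (not part of the patch series): a minimal example of how the pieces added in patches 21 to 26 fit together, namely SummaryCluster.line2dict(), add_data_dict(), the new gather_data() method, and the hashable SummaryClusterVariant objects it collects. It assumes this branch of ariba is importable; the report line is copied verbatim from the tests above, and the printed values depend on that line's flag and depth fields.

    from ariba import summary_cluster

    # One report line in the same tab-separated format used by the tests above.
    line = 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs'

    cluster = summary_cluster.SummaryCluster()
    cluster.add_data_dict(summary_cluster.SummaryCluster.line2dict(line))

    # gather_data() (patch 23) fills in cluster.summary, the column_summary_data()
    # dict (keys: assembled, match, ref_seq, pct_id, known_var, novel_var), and
    # cluster.variants, a set of SummaryClusterVariant objects. The set works
    # because of the __eq__/__hash__ methods added in patch 21.
    cluster.gather_data()
    print(cluster.summary)
    for variant in cluster.variants:
        print(variant)  # __str__ added in patch 21
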
From 5e74a72eea347c39a3d21e8f9836038066066371 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 19:36:42 +0100 Subject: [PATCH 29/40] Do not return dicts --- ariba/summary.py | 1 - ariba/tests/summary_test.py | 22 +++++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/ariba/summary.py b/ariba/summary.py index bdad93a3..21bf2770 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -510,4 +510,3 @@ def _gather_unfiltered_output_data(self): self.all_data[filename][cluster.name] = this_cluster_dict - return self.all_data, self.all_potential_columns diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 74a2f77d..b9aefec1 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -268,9 +268,6 @@ def test_gather_unfiltered_output_data(self): os.path.join(data_dir, 'summary_gather_unfiltered_output_data.in.1.tsv'), os.path.join(data_dir, 'summary_gather_unfiltered_output_data.in.2.tsv') ] - s = summary.Summary('out', filenames=infiles, variant_cols=None) - s.samples = summary.Summary._load_input_files(infiles, 90) - got_all, got_potential_cols = s._gather_unfiltered_output_data() expected_all = { infiles[0]: { @@ -389,8 +386,11 @@ def test_gather_unfiltered_output_data(self): } } - self.assertEqual(expected_potential_cols, got_potential_cols) - self.assertEqual(expected_all, got_all) + s = summary.Summary('out', filenames=infiles, variant_cols=None) + s.samples = summary.Summary._load_input_files(infiles, 90) + s._gather_unfiltered_output_data() + self.assertEqual(expected_potential_cols, s.all_potential_columns) + self.assertEqual(expected_all, s.all_data) expected_potential_cols['noncoding1']['groups'] = {'id3', 'id1', 'id1.%'} expected_potential_cols['noncoding2']['groups'] = {'id2.%', 'id2'} @@ -400,9 +400,9 @@ def test_gather_unfiltered_output_data(self): expected_all[infiles[1]]['noncoding2']['groups'] = {'id2': 'het', 'id2.%': 40.0} s = summary.Summary('out', filenames=infiles, variant_cols=None, show_var_groups=True) s.samples = summary.Summary._load_input_files(infiles, 90) - got_all, got_potential_cols = s._gather_unfiltered_output_data() - self.assertEqual(expected_potential_cols, got_potential_cols) - self.assertEqual(expected_all, got_all) + s._gather_unfiltered_output_data() + self.assertEqual(expected_potential_cols, s.all_potential_columns) + self.assertEqual(expected_all, s.all_data) expected_potential_cols['noncoding1']['vars'] = {'A14T.%', 'A6G', 'A14T'} expected_potential_cols['noncoding2']['vars'] = {'A52T', 'A52T.%', 'A42T'} @@ -416,9 +416,9 @@ def test_gather_unfiltered_output_data(self): expected_all[infiles[1]]['presence_absence1']['vars'] = {'A10V': 'yes'} s = summary.Summary('out', filenames=infiles, variant_cols=None, show_var_groups=True, show_vars=True) s.samples = summary.Summary._load_input_files(infiles, 90) - got_all, got_potential_cols = s._gather_unfiltered_output_data() - self.assertEqual(expected_potential_cols, got_potential_cols) - self.assertEqual(expected_all, got_all) + s._gather_unfiltered_output_data() + self.assertEqual(expected_potential_cols, s.all_potential_columns) + self.assertEqual(expected_all, s.all_data) def test_to_matrix(self): From 4e4c24acd45ebed0c4ae184e4e85dc6750ac6da1 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 23:31:47 +0100 Subject: [PATCH 30/40] Rewrite to_matrix --- ariba/summary.py | 57 +++---- ariba/tests/data/summary_to_matrix.1.tsv | 5 + ariba/tests/data/summary_to_matrix.2.tsv | 6 + ariba/tests/summary_test.py | 
186 ++++++++++++++--------- 4 files changed, 158 insertions(+), 96 deletions(-) create mode 100644 ariba/tests/data/summary_to_matrix.1.tsv create mode 100644 ariba/tests/data/summary_to_matrix.2.tsv diff --git a/ariba/summary.py b/ariba/summary.py index 21bf2770..2645b4bc 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -231,49 +231,54 @@ def _gather_output_rows(self): @classmethod - def _to_matrix(cls, filenames, rows, cluster_cols): - '''rows = output from _gather_output_rows(). - filenames = self.filenames - cluster_cols = self.cluster_columns''' + def _to_matrix(cls, filenames, all_data, all_potential_columns, cluster_cols): matrix = [] making_header_lines = True phandango_header = ['name'] - phandago_suffixes = {'assembled': ':o1', 'match': ':o1', 'ref_seq': ':o2', 'pct_id': ':c1', 'known_var': ':o1', 'novel_var': ':o1'} + phandango_suffixes = {'assembled': ':o1', 'match': ':o1', 'ref_seq': ':o2', 'pct_id': ':c1', 'known_var': ':o1', 'novel_var': ':o1'} ref_seq_counter = 2 csv_header = ['name'] - all_cluster_cols_in_order = ['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'] - all_cluster_cols_in_order_set = set(['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var']) - cluster_cols_in_order = [x for x in all_cluster_cols_in_order if cluster_cols[x]] + summary_cols_in_order = ['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'] + summary_cols_set = set(['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var']) + summary_cols_in_order = [x for x in summary_cols_in_order if cluster_cols[x]] for filename in filenames: - assert filename in rows line = [filename] - for cluster_name in sorted(rows[filename]): - for col in cluster_cols_in_order: + for cluster_name in sorted(all_potential_columns): + group_cols = sorted(list(all_potential_columns[cluster_name]['groups'])) + var_cols = sorted(list(all_potential_columns[cluster_name]['vars'])) + + for col in summary_cols_in_order + group_cols + var_cols: if making_header_lines: csv_header.append(cluster_name + '.' + col) if col == 'ref_seq': - phandago_suffixes[col] = ':o' + str(ref_seq_counter) + phandango_suffixes[col] = ':o' + str(ref_seq_counter) ref_seq_counter += 1 - phandango_header.append(cluster_name + '.' + col + phandago_suffixes[col]) - - line.append(rows[filename][cluster_name][col]) - - for col in sorted(rows[filename][cluster_name]): - if col in all_cluster_cols_in_order_set: - continue - - if making_header_lines: - csv_header.append(cluster_name + '.' + col) - suffix = ':c2' if col.endswith('.%') else ':o1' - phandango_header.append(cluster_name + '.' + col + suffix) - - line.append(rows[filename][cluster_name][col]) + phandango_header.append(cluster_name + '.' + col + phandango_suffixes[col]) + elif col in phandango_suffixes: + phandango_header.append(cluster_name + '.' + col + phandango_suffixes[col]) + elif col.endswith('.%'): + phandango_header.append(cluster_name + '.' + col + ':c2') + else: + phandango_header.append(cluster_name + '.' 
+ col + ':o1') + + for col_type in ['summary', 'groups', 'vars']: + if col in all_data[filename][cluster_name][col_type]: + line.append(all_data[filename][cluster_name][col_type][col]) + break + else: + if col == 'assembled' or not col.endswith('.%'): + line.append('no') + else: + line.append('NA') making_header_lines = False matrix.append(line) + assert len(phandango_header) == len(csv_header) + for line in matrix: + assert len(line) == len(csv_header) return phandango_header, csv_header, matrix diff --git a/ariba/tests/data/summary_to_matrix.1.tsv b/ariba/tests/data/summary_to_matrix.1.tsv new file mode 100644 index 00000000..1957349c --- /dev/null +++ b/ariba/tests/data/summary_to_matrix.1.tsv @@ -0,0 +1,5 @@ +#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text +noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding_ref1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A42T 1 A42T SNP 42 42 A 84 84 T 17 . 17 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A52T 1 A52T SNP 42 42 A 84 84 T 17 G 20,30 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence_ref1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence_ref1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_to_matrix.2.tsv b/ariba/tests/data/summary_to_matrix.2.tsv new file mode 100644 index 00000000..4a23ebc4 --- /dev/null +++ b/ariba/tests/data/summary_to_matrix.2.tsv @@ -0,0 +1,6 @@ +#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text +noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id3:variant in ref and reads so should report generic description of noncoding1 +noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A52T 1 A52T SNP 42 42 A 84 84 T 17 G 20,30 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 
29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 +variants_only1 1 1 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . . diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index b9aefec1..73343f8b 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -421,78 +421,124 @@ def test_gather_unfiltered_output_data(self): self.assertEqual(expected_all, s.all_data) - def test_to_matrix(self): - '''Test _to_matrix''' - rows = { - 'file1': { - 'cluster.n.1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'noncoding1', - 'known_var': 'yes', - 'novel_var': 'no', - 'pct_id': '98.33', - 'noncoding1.A14T': 'yes' - }, - 'cluster.p.1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'presence_absence1', - 'known_var': 'yes', - 'novel_var': 'no', - 'pct_id': '98.96', - 'presence_absence1.I42L': 'yes' - }, - 'cluster.v.1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'varonly1', - 'known_var': 'no', - 'novel_var': 'no', - 'pct_id': '99.42', - } - }, - 'file2': { - 'cluster.n.1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'noncoding1', - 'known_var': 'no', - 'novel_var': 'no', - 'pct_id': '98.33', - 'noncoding1.A14T': 'no' - }, - 'cluster.p.1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'presence_absence1', - 'pct_id': '98.96', - 'known_var': 'no', - 'novel_var': 'no', - 'presence_absence1.I42L': 'no' - }, - 'cluster.v.1': { - 'assembled': 'no', - 'match': 'NA', - 'ref_seq': 'NA', - 'known_var': 'NA', - 'novel_var': 'NA', - 'pct_id': 'NA', - } - }, - } - filenames = ['file1', 'file2'] - cluster_cols = {'assembled': True, 'match': True, 'ref_seq': False, 'pct_id': False, 'known_var': False, 'novel_var': False} - got_phandago_header, got_csv_header, got_lines = summary.Summary._to_matrix(filenames, rows, cluster_cols) - expected_phandango_header = ['name', 'cluster.n.1.assembled:o1', 'cluster.n.1.match:o1', 'cluster.n.1.noncoding1.A14T:o1', 'cluster.p.1.assembled:o1', 'cluster.p.1.match:o1', 'cluster.p.1.presence_absence1.I42L:o1', 'cluster.v.1.assembled:o1', 'cluster.v.1.match:o1'] - expected_csv_header = ['name', 'cluster.n.1.assembled', 'cluster.n.1.match', 'cluster.n.1.noncoding1.A14T', 'cluster.p.1.assembled', 'cluster.p.1.match', 'cluster.p.1.presence_absence1.I42L', 'cluster.v.1.assembled', 'cluster.v.1.match'] - expected_lines = [ - ['file1', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes'], - ['file2', 'yes', 'yes', 'no', 'yes', 'yes', 'no', 'no', 'NA'] + def test_to_matrix_all_cols(self): + '''Test _to_matrix all columns''' + infiles = [ + os.path.join(data_dir, 'summary_to_matrix.1.tsv'), + os.path.join(data_dir, 'summary_to_matrix.2.tsv') ] - self.assertEqual(expected_phandango_header, got_phandago_header) + + s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_vars=True) + s.samples = summary.Summary._load_input_files(infiles, 90) + s._gather_unfiltered_output_data() + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + + expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding1.A14T:o1', 'noncoding1.A14T.%:c2', 'noncoding1.A6G:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 
'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.id2:o1', 'noncoding2.id2.%:c2', 'noncoding2.A42T:o1', 'noncoding2.A52T:o1', 'noncoding2.A52T.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1', 'presence_absence1.A10V:o1'] + expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding1.A14T', 'noncoding1.A14T.%', 'noncoding1.A6G', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2.id2.%', 'noncoding2.A42T', 'noncoding2.A52T', 'noncoding2.A52T.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var', 'presence_absence1.A10V'] + expected_matrix = [ + [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'NA', 'no', 'yes', 'NA', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'], + [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 'het', 80.0, 'yes', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'het', 40.0, 'no', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes'] + ] + + self.assertEqual(expected_phandango_header, got_phandango_header) + self.assertEqual(expected_csv_header, got_csv_header) + self.assertEqual(expected_matrix, got_matrix) + + + def test_to_matrix_with_groups(self): + '''Test _to_matrix with groups''' + infiles = [ + os.path.join(data_dir, 'summary_to_matrix.1.tsv'), + os.path.join(data_dir, 'summary_to_matrix.2.tsv') + ] + + s = summary.Summary('out', filenames=infiles, show_var_groups=True) + s.samples = summary.Summary._load_input_files(infiles, 90) + s._gather_unfiltered_output_data() + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + + expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.id2:o1', 'noncoding2.id2.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1'] + expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2.id2.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 
'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var'] + expected_matrix = [ + [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'NA', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes'], + [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes'] + ] + + self.assertEqual(expected_phandango_header, got_phandango_header) + self.assertEqual(expected_csv_header, got_csv_header) + self.assertEqual(expected_matrix, got_matrix) + + + def test_to_matrix_with_vars(self): + '''Test _to_matrix with vars''' + infiles = [ + os.path.join(data_dir, 'summary_to_matrix.1.tsv'), + os.path.join(data_dir, 'summary_to_matrix.2.tsv') + ] + + s = summary.Summary('out', filenames=infiles, show_vars=True) + s.samples = summary.Summary._load_input_files(infiles, 90) + s._gather_unfiltered_output_data() + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + + expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.A14T:o1', 'noncoding1.A14T.%:c2', 'noncoding1.A6G:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.A42T:o1', 'noncoding2.A52T:o1', 'noncoding2.A52T.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1', 'presence_absence1.A10V:o1'] + expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.A14T', 'noncoding1.A14T.%', 'noncoding1.A6G', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.A42T', 'noncoding2.A52T', 'noncoding2.A52T.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var', 'presence_absence1.A10V'] + expected_matrix = [ + [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'NA', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'], + [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'no', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes'] + ] + + self.assertEqual(expected_phandango_header, got_phandango_header) + self.assertEqual(expected_csv_header, got_csv_header) + self.assertEqual(expected_matrix, got_matrix) + + + def test_to_matrix_cluster_only(self): + '''Test _to_matrix with cluster columns only''' + infiles = [ + os.path.join(data_dir, 'summary_to_matrix.1.tsv'), + os.path.join(data_dir, 'summary_to_matrix.2.tsv') + ] + + s = summary.Summary('out', filenames=infiles) + s.samples = summary.Summary._load_input_files(infiles, 90) + 
s._gather_unfiltered_output_data() + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + + expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1'] + expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var'] + expected_matrix = [ + [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes'], + [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes'] + ] + + self.assertEqual(expected_phandango_header, got_phandango_header) + self.assertEqual(expected_csv_header, got_csv_header) + self.assertEqual(expected_matrix, got_matrix) + + + def test_to_matrix_assembled_only(self): + '''Test _to_matrix with assembled column only''' + infiles = [ + os.path.join(data_dir, 'summary_to_matrix.1.tsv'), + os.path.join(data_dir, 'summary_to_matrix.2.tsv') + ] + + s = summary.Summary('out', filenames=infiles, cluster_cols='assembled') + s.samples = summary.Summary._load_input_files(infiles, 90) + s._gather_unfiltered_output_data() + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + + expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding2.assembled:o1', 'presence_absence1.assembled:o1'] + expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding2.assembled', 'presence_absence1.assembled'] + expected_matrix = [ + [infiles[0], 'yes', 'yes', 'yes'], + [infiles[1], 'yes', 'yes', 'yes'] + ] + + self.assertEqual(expected_phandango_header, got_phandango_header) self.assertEqual(expected_csv_header, got_csv_header) - self.assertEqual(expected_lines, got_lines) + self.assertEqual(expected_matrix, got_matrix) def test_filter_matrix_rows(self): From 831a664250ea090be6bb0e1c7303a13b02588430 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 08:15:17 +0100 Subject: [PATCH 31/40] Remove old _gather_output_rows method --- ariba/summary.py | 85 ------------- .../summary_test_gather_output_rows.in.1.tsv | 3 - .../summary_test_gather_output_rows.in.2.tsv | 5 - ariba/tests/summary_test.py | 119 ------------------ 4 files changed, 212 deletions(-) delete mode 100644 ariba/tests/data/summary_test_gather_output_rows.in.1.tsv delete mode 100644 ariba/tests/data/summary_test_gather_output_rows.in.2.tsv diff --git a/ariba/summary.py b/ariba/summary.py index 2645b4bc..9ce36050 100644 
--- a/ariba/summary.py +++ b/ariba/summary.py @@ -145,91 +145,6 @@ def _get_all_var_groups(cls, samples_dict): return groups - def _gather_output_rows(self): - all_cluster_names = Summary._get_all_cluster_names(self.samples) - all_var_columns = Summary._get_all_variant_columns(self.samples) - all_het_snps = Summary._get_all_het_snps(self.samples) - if self.var_columns['groups']: - var_groups = Summary._get_all_var_groups(self.samples) - else: - var_groups = set() - rows = {} - - for filename, sample in self.samples.items(): - rows[filename] = {} - - for cluster in all_cluster_names: - rows[filename][cluster] = {} - - if cluster in sample.column_summary_data and sample.column_summary_data[cluster]['assembled'] not in {'no'}: - rows[filename][cluster] = sample.column_summary_data[cluster] - else: - rows[filename][cluster] = { - 'assembled': 'no', - 'match': 'no', - 'ref_seq': 'NA', - 'known_var': 'NA', - 'novel_var': 'NA', - 'pct_id': 'NA' - } - - if self.var_columns['groups']: - for group_name in var_groups[cluster]: - if cluster in sample.var_groups and group_name in sample.var_groups[cluster]: - rows[filename][cluster]['vgroup.' + group_name] = 'yes' - if self.show_known_het: - if cluster in sample.het_snps: - if group_name in sample.het_snps[cluster]: - if len(sample.het_snps[cluster][group_name]) == 1: - rows[filename][cluster]['vgroup.' + group_name] = 'het' - percent = list(sample.het_snps[cluster][group_name].values())[0] - rows[filename][cluster]['vgroup.' + group_name + '.%'] = percent - else: - assert len(sample.het_snps[cluster][group_name]) > 1 - rows[filename][cluster]['vgroup.' + group_name] = 'multi_het' - rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' - else: - rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' - else: - rows[filename][cluster]['vgroup.' + group_name] = 'no' - if self.show_known_het: - rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' - - if cluster in all_var_columns: - for (ref_name, variant, grouped_or_novel, group_name) in all_var_columns[cluster]: - if not self.var_columns[grouped_or_novel]: - continue - - key = ref_name + '.' + variant - if rows[filename][cluster]['assembled'] == 'no': - rows[filename][cluster][key] = 'NA' - elif cluster in sample.variant_column_names_tuples and (ref_name, variant, grouped_or_novel, group_name) in sample.variant_column_names_tuples[cluster]: - rows[filename][cluster][key] = 'yes' - if self.show_known_het: - if cluster in sample.het_snps: - if grouped_or_novel == 'grouped' and group_name in sample.het_snps[cluster]: - rows[filename][cluster][key] = 'het' - rows[filename][cluster][key + '.%'] = sample.het_snps[cluster][group_name].get(variant, "NA") - elif grouped_or_novel == 'novel' and '.' 
in sample.het_snps[cluster]: - rows[filename][cluster][key] = 'het' - rows[filename][cluster][key + '.%'] = sample.het_snps['.'].get(variant, "NA") - else: - percent = 'NA' - else: - rows[filename][cluster][key] = 'no' - if self.show_known_het and (cluster, variant) in all_het_snps: - rows[filename][cluster][key + '.%'] = 'NA' - - if self.show_known_het and (cluster, variant) in all_het_snps and key + '.%' not in rows[filename][cluster]: - rows[filename][cluster][key + '.%'] = 'NA' - - for key, wanted in self.cluster_columns.items(): - if not wanted: - del rows[filename][cluster][key] - - return rows - - @classmethod def _to_matrix(cls, filenames, all_data, all_potential_columns, cluster_cols): matrix = [] diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv deleted file mode 100644 index 3e67eeb1..00000000 --- a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv +++ /dev/null @@ -1,3 +0,0 @@ -#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv deleted file mode 100644 index 398aedbc..00000000 --- a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv +++ /dev/null @@ -1,5 +0,0 @@ -#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id3:variant in ref and reads so should report generic description of noncoding1 -presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 -variants_only1 1 1 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . . 
diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 73343f8b..56ceb82d 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -143,125 +143,6 @@ def test_get_all_var_groups(self): self.assertEqual(expected, got) - def test_gather_output_rows(self): - '''Test _gather_output_rows''' - infiles = [ - os.path.join(data_dir, 'summary_test_gather_output_rows.in.1.tsv'), - os.path.join(data_dir, 'summary_test_gather_output_rows.in.2.tsv') - ] - s = summary.Summary('out', filenames=infiles, variant_cols=None) - s.samples = summary.Summary._load_input_files(infiles, 90) - expected = { - infiles[0]: { - 'noncoding1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'noncoding1', - 'known_var': 'yes', - 'novel_var': 'no', - 'pct_id': '98.33', - }, - 'presence_absence1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'presence_absence1', - 'known_var': 'no', - 'novel_var': 'yes', - 'pct_id': '98.96', - }, - 'variants_only1': { - 'assembled': 'no', - 'match': 'no', - 'ref_seq': 'NA', - 'known_var': 'NA', - 'novel_var': 'NA', - 'pct_id': 'NA', - } - }, - infiles[1]: { - 'noncoding1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'noncoding1', - 'known_var': 'yes', - 'novel_var': 'no', - 'pct_id': '98.33', - }, - 'presence_absence1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'presence_absence1', - 'pct_id': '98.96', - 'known_var': 'no', - 'novel_var': 'yes', - }, - 'variants_only1': { - 'assembled': 'no', - 'match': 'no', - 'ref_seq': 'NA', - 'known_var': 'NA', - 'novel_var': 'NA', - 'pct_id': 'NA', - } - }, - } - got = s._gather_output_rows() - self.assertEqual(expected, got) - - s.var_columns['groups'] = True - expected[infiles[0]]['noncoding1']['vgroup.id1'] = 'yes' - expected[infiles[0]]['noncoding1']['vgroup.id3'] = 'no' - expected[infiles[1]]['noncoding1']['vgroup.id1'] = 'yes' - expected[infiles[1]]['noncoding1']['vgroup.id3'] = 'yes' - got = s._gather_output_rows() - self.assertEqual(expected, got) - - - s.var_columns['grouped'] = True - s.var_columns['ungrouped'] = True - expected[infiles[0]]['noncoding1']['noncoding1.A14T'] = 'yes' - expected[infiles[0]]['noncoding1']['noncoding1.A6G'] = 'no' - expected[infiles[1]]['noncoding1']['noncoding1.A14T'] = 'yes' - expected[infiles[1]]['noncoding1']['noncoding1.A6G'] = 'yes' - self.maxDiff = None - got = s._gather_output_rows() - self.assertEqual(expected, got) - - s.var_columns['novel'] = True - expected[infiles[0]]['presence_absence1']['presence_absence1.A10V'] = 'yes' - expected[infiles[1]]['presence_absence1']['presence_absence1.A10V'] = 'yes' - got = s._gather_output_rows() - self.assertEqual(expected, got) - - s.show_known_het = True - expected[infiles[0]]['noncoding1']['vgroup.id1.%'] = 'NA' - expected[infiles[0]]['noncoding1']['vgroup.id3.%'] = 'NA' - expected[infiles[1]]['noncoding1']['vgroup.id1'] = 'het' - expected[infiles[1]]['noncoding1']['vgroup.id1.%'] = 80.0 - expected[infiles[1]]['noncoding1']['vgroup.id3'] = 'yes' - expected[infiles[1]]['noncoding1']['vgroup.id3.%'] = 'NA' - expected[infiles[0]]['noncoding1']['noncoding1.A14T.%'] = 'NA' - expected[infiles[1]]['noncoding1']['noncoding1.A14T'] = 'het' - expected[infiles[1]]['noncoding1']['noncoding1.A14T.%'] = 80.0 - got = s._gather_output_rows() - self.assertEqual(expected, got) - - for filename in expected: - del expected[filename]['noncoding1']['vgroup.id1'] - del expected[filename]['noncoding1']['vgroup.id3'] - del expected[filename]['noncoding1']['vgroup.id1.%'] - del 
expected[filename]['noncoding1']['vgroup.id3.%'] - for gene_type in expected[filename]: - del expected[filename][gene_type]['ref_seq'] - - s = summary.Summary('out', filenames=infiles, cluster_cols='assembled,match,pct_id,known_var,novel_var', variant_cols='ungrouped,grouped,novel') - s.samples = summary.Summary._load_input_files(infiles, 90) - s.include_all_variant_columns = True - s.show_known_het = True - got = s._gather_output_rows() - self.assertEqual(expected, got) - - def test_gather_unfiltered_output_data(self): '''test gather_output_rows_new''' infiles = [ From 5accff81163594432bd1e2bd92afe733a3a4b9e1 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 08:18:24 +0100 Subject: [PATCH 32/40] Remove var_columns option --- ariba/summary.py | 8 -------- ariba/tests/summary_test.py | 35 +++-------------------------------- 2 files changed, 3 insertions(+), 40 deletions(-) diff --git a/ariba/summary.py b/ariba/summary.py index 9ce36050..af6f1e57 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -20,7 +20,6 @@ def __init__( min_id=90.0, show_known_het=False, cluster_cols='assembled,match,ref_seq,pct_id,known_var,novel_var', - variant_cols='groups,grouped,ungrouped,novel', make_phandango_tree=True, only_clusters=None, show_var_groups=False, @@ -40,7 +39,6 @@ def __init__( self.show_known_het = show_known_het self.cluster_columns = self._determine_cluster_cols(cluster_cols) - self.var_columns = self._determine_var_cols(variant_cols) self.filter_rows = filter_rows self.filter_columns = filter_columns self.min_id = min_id @@ -68,12 +66,6 @@ def _determine_cluster_cols(cols_string): return Summary._determine_cols(cols_string, allowed_cols, 'cluster columns') - @staticmethod - def _determine_var_cols(cols_string): - allowed_cols = {'groups', 'grouped', 'ungrouped', 'novel'} - return Summary._determine_cols(cols_string, allowed_cols, 'variant columns') - - def _load_fofn(self, fofn): f = pyfastaq.utils.open_file_read(fofn) filenames = [x.rstrip() for x in f.readlines()] diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 56ceb82d..40447235 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -43,35 +43,6 @@ def test_determine_cluster_cols(self): self.assertEqual(expected[i], summary.Summary._determine_cluster_cols(col_strings[i])) - def test_determine_var_cols(self): - col_strings = [ - 'groups,grouped,ungrouped,novel', - 'groups,grouped,ungrouped', - 'grouped,novel', - 'ungrouped,novel', - 'grouped', - 'ungrouped', - 'novel', - '' - ] - - expected = [ - {'groups': True, 'grouped': True, 'ungrouped': True, 'novel': True}, - {'groups': True, 'grouped': True, 'ungrouped': True, 'novel': False}, - {'groups': False, 'grouped': True, 'ungrouped': False, 'novel': True}, - {'groups': False, 'grouped': False, 'ungrouped': True, 'novel': True}, - {'groups': False, 'grouped': True, 'ungrouped': False, 'novel': False}, - {'groups': False, 'grouped': False, 'ungrouped': True, 'novel': False}, - {'groups': False, 'grouped': False, 'ungrouped': False, 'novel': True}, - {'groups': False, 'grouped': False, 'ungrouped': False, 'novel': False}, - ] - - assert len(col_strings) == len(expected) - - for i in range(len(col_strings)): - self.assertEqual(expected[i], summary.Summary._determine_var_cols(col_strings[i])) - - def test_load_input_files(self): '''Test _load_input_files''' file1 = os.path.join(data_dir, 'summary_test_load_input_files.1.tsv') @@ -267,7 +238,7 @@ def test_gather_unfiltered_output_data(self): } } - s = summary.Summary('out', 
filenames=infiles, variant_cols=None) + s = summary.Summary('out', filenames=infiles) s.samples = summary.Summary._load_input_files(infiles, 90) s._gather_unfiltered_output_data() self.assertEqual(expected_potential_cols, s.all_potential_columns) @@ -279,7 +250,7 @@ def test_gather_unfiltered_output_data(self): expected_all[infiles[0]]['noncoding2']['groups'] = {'id2': 'yes_multi_het', 'id2.%': 'NA'} expected_all[infiles[1]]['noncoding1']['groups'] = {'id1': 'het', 'id1.%': 80.0, 'id3': 'yes'} expected_all[infiles[1]]['noncoding2']['groups'] = {'id2': 'het', 'id2.%': 40.0} - s = summary.Summary('out', filenames=infiles, variant_cols=None, show_var_groups=True) + s = summary.Summary('out', filenames=infiles, show_var_groups=True) s.samples = summary.Summary._load_input_files(infiles, 90) s._gather_unfiltered_output_data() self.assertEqual(expected_potential_cols, s.all_potential_columns) @@ -295,7 +266,7 @@ def test_gather_unfiltered_output_data(self): expected_all[infiles[1]]['noncoding1']['vars'] = {'A14T': 'het', 'A14T.%': 80.0, 'A6G': 'yes'} expected_all[infiles[1]]['noncoding2']['vars'] = {'A52T': 'het', 'A52T.%': 40.0} expected_all[infiles[1]]['presence_absence1']['vars'] = {'A10V': 'yes'} - s = summary.Summary('out', filenames=infiles, variant_cols=None, show_var_groups=True, show_vars=True) + s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_vars=True) s.samples = summary.Summary._load_input_files(infiles, 90) s._gather_unfiltered_output_data() self.assertEqual(expected_potential_cols, s.all_potential_columns) From ed96e6347e09dd2d02cf6f456454799f37f0e720 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 08:20:21 +0100 Subject: [PATCH 33/40] remove methods _get_all_cluster_names _get_all_variant_columns --- ariba/summary.py | 24 ------------------ .../summary_test_get_all_cluster_names.1.tsv | 3 --- .../summary_test_get_all_cluster_names.2.tsv | 5 ---- ariba/tests/summary_test.py | 25 ------------------- 4 files changed, 57 deletions(-) delete mode 100644 ariba/tests/data/summary_test_get_all_cluster_names.1.tsv delete mode 100644 ariba/tests/data/summary_test_get_all_cluster_names.2.tsv diff --git a/ariba/summary.py b/ariba/summary.py index af6f1e57..069bbc95 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -90,30 +90,6 @@ def _load_input_files(cls, filenames, min_id, verbose=False, only_clusters=None) return samples - @classmethod - def _get_all_cluster_names(cls, samples_dict): - '''Input should be output of _load_input_files''' - cluster_names = set() - for filename, sample in samples_dict.items(): - cluster_names.update(set(sample.clusters.keys())) - return cluster_names - - - @classmethod - def _get_all_variant_columns(cls, samples_dict): - '''Input should be output of _load_input_files''' - columns = {} - for filename, sample in samples_dict.items(): - for cluster in sample.column_summary_data: - if sample.column_summary_data[cluster]['assembled'] == 'yes': - for key, tuple_set in sample.variant_column_names_tuples.items(): - for t in tuple_set: - if key not in columns: - columns[key] = set() - columns[key].add(t) - return columns - - @classmethod def _get_all_het_snps(cls, samples_dict): snps = set() diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv deleted file mode 100644 index f35590e2..00000000 --- a/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv +++ /dev/null @@ -1,3 +0,0 @@ -#ref_name gene var_only flag reads cluster ref_len 
ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 1 0 27 88 cluster.p.1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id2:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv deleted file mode 100644 index 2bddc3d6..00000000 --- a/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv +++ /dev/null @@ -1,5 +0,0 @@ -#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id2:variant in ref and reads so should report generic description of noncoding1 -presence_absence1 1 0 27 88 cluster.p.2 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 -variants_only1 1 1 64 12 cluster.v.1 90 . . . . . . . . . . . . . . . . . . . . . . . 
diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 40447235..d4f39530 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -64,31 +64,6 @@ def test_load_input_files(self): self.assertEqual(expected, got) - - def test_get_all_cluster_names(self): - '''Test _get_all_cluster_names''' - file1 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.1.tsv') - file2 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.2.tsv') - samples = summary.Summary._load_input_files([file1, file2], 90) - got = summary.Summary._get_all_cluster_names(samples) - expected = {'cluster.n.1', 'cluster.v.1', 'cluster.p.1', 'cluster.p.2'} - self.assertEqual(expected, got) - - - def test_get_all_variant_columns(self): - '''Test _get_all_variant_columns''' - file1 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.1.tsv') - file2 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.2.tsv') - samples = summary.Summary._load_input_files([file1, file2], 90) - got = summary.Summary._get_all_variant_columns(samples) - expected = { - 'cluster.p.2': {('presence_absence1', 'A10V', 'grouped', 'id3')}, - 'cluster.n.1': {('noncoding1', 'A6G', 'grouped', 'id2'), ('noncoding1', 'A14T', 'grouped', 'id1')}, - 'cluster.p.1': {('presence_absence1', 'A10V', 'grouped', 'id2')}, - } - self.assertEqual(expected, got) - - def test_get_all_het_snps(self): '''test _get_all_het_snps''' file1 = os.path.join(data_dir, 'summary_test_get_all_het_snps.1.tsv') From 287348a5ddc52fa1292b27f2cbf29ca01ed27586 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 08:21:49 +0100 Subject: [PATCH 34/40] Remove method _get_all_het_snps --- ariba/summary.py | 11 ----------- ariba/tests/data/summary_test_get_all_het_snps.1.tsv | 3 --- ariba/tests/data/summary_test_get_all_het_snps.2.tsv | 5 ----- ariba/tests/summary_test.py | 10 ---------- 4 files changed, 29 deletions(-) delete mode 100644 ariba/tests/data/summary_test_get_all_het_snps.1.tsv delete mode 100644 ariba/tests/data/summary_test_get_all_het_snps.2.tsv diff --git a/ariba/summary.py b/ariba/summary.py index 069bbc95..105d5daf 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -90,17 +90,6 @@ def _load_input_files(cls, filenames, min_id, verbose=False, only_clusters=None) return samples - @classmethod - def _get_all_het_snps(cls, samples_dict): - snps = set() - for filename, sample in samples_dict.items(): - for cluster, snp_dict in sample.het_snps.items(): - for snp_id in snp_dict: - for snp in snp_dict[snp_id]: - snps.add((cluster, snp)) - - return snps - @classmethod def _get_all_var_groups(cls, samples_dict): groups = {} diff --git a/ariba/tests/data/summary_test_get_all_het_snps.1.tsv b/ariba/tests/data/summary_test_get_all_het_snps.1.tsv deleted file mode 100644 index 3e67eeb1..00000000 --- a/ariba/tests/data/summary_test_get_all_het_snps.1.tsv +++ /dev/null @@ -1,3 +0,0 @@ -#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 
17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_get_all_het_snps.2.tsv b/ariba/tests/data/summary_test_get_all_het_snps.2.tsv deleted file mode 100644 index 398aedbc..00000000 --- a/ariba/tests/data/summary_test_get_all_het_snps.2.tsv +++ /dev/null @@ -1,5 +0,0 @@ -#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id3:variant in ref and reads so should report generic description of noncoding1 -presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 -variants_only1 1 1 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . . diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index d4f39530..3b7ecee6 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -64,16 +64,6 @@ def test_load_input_files(self): self.assertEqual(expected, got) - def test_get_all_het_snps(self): - '''test _get_all_het_snps''' - file1 = os.path.join(data_dir, 'summary_test_get_all_het_snps.1.tsv') - file2 = os.path.join(data_dir, 'summary_test_get_all_het_snps.2.tsv') - samples = summary.Summary._load_input_files([file1, file2], 90) - got = summary.Summary._get_all_het_snps(samples) - expected = {('noncoding1', 'A14T')} - self.assertEqual(expected, got) - - def test_get_all_var_groups(self): '''test _get_all_var_groups''' file1 = os.path.join(data_dir, 'summary_test_get_all_var_groups.1.tsv') From 8ab83fce410f8dbabc1f3c60e58f6170de7de52f Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 08:22:48 +0100 Subject: [PATCH 35/40] Remove method _get_all_var_groups --- ariba/summary.py | 12 ------------ .../data/summary_test_get_all_var_groups.1.tsv | 3 --- .../data/summary_test_get_all_var_groups.2.tsv | 5 ----- ariba/tests/summary_test.py | 15 --------------- 4 files changed, 35 deletions(-) delete mode 100644 ariba/tests/data/summary_test_get_all_var_groups.1.tsv delete mode 100644 ariba/tests/data/summary_test_get_all_var_groups.2.tsv diff --git a/ariba/summary.py b/ariba/summary.py index 105d5daf..030d16f7 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -90,18 +90,6 @@ def _load_input_files(cls, filenames, min_id, verbose=False, only_clusters=None) return samples - @classmethod - def _get_all_var_groups(cls, samples_dict): - groups = {} - for filename, sample in samples_dict.items(): - for name, name_set in sample.var_groups.items(): - if name 
not in groups: - groups[name] = set() - groups[name].update(name_set) - - return groups - - @classmethod def _to_matrix(cls, filenames, all_data, all_potential_columns, cluster_cols): matrix = [] diff --git a/ariba/tests/data/summary_test_get_all_var_groups.1.tsv b/ariba/tests/data/summary_test_get_all_var_groups.1.tsv deleted file mode 100644 index c4db58da..00000000 --- a/ariba/tests/data/summary_test_get_all_var_groups.1.tsv +++ /dev/null @@ -1,3 +0,0 @@ -#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 1 0 27 88 cluster.p.1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id4:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_get_all_var_groups.2.tsv b/ariba/tests/data/summary_test_get_all_var_groups.2.tsv deleted file mode 100644 index 2bddc3d6..00000000 --- a/ariba/tests/data/summary_test_get_all_var_groups.2.tsv +++ /dev/null @@ -1,5 +0,0 @@ -#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id2:variant in ref and reads so should report generic description of noncoding1 -presence_absence1 1 0 27 88 cluster.p.2 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 -variants_only1 1 1 64 12 cluster.v.1 90 . . . . . . . . . . . . . . . . . . . . . . . 
diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 3b7ecee6..9c4931c9 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -64,21 +64,6 @@ def test_load_input_files(self): self.assertEqual(expected, got) - def test_get_all_var_groups(self): - '''test _get_all_var_groups''' - file1 = os.path.join(data_dir, 'summary_test_get_all_var_groups.1.tsv') - file2 = os.path.join(data_dir, 'summary_test_get_all_var_groups.2.tsv') - samples = summary.Summary._load_input_files([file1, file2], 90) - got = summary.Summary._get_all_var_groups(samples) - expected = { - 'cluster.p.1': {'id4'}, - 'cluster.p.2': {'id3'}, - 'cluster.v.1': set(), - 'cluster.n.1': {'id1', 'id2'} - } - self.assertEqual(expected, got) - - def test_gather_unfiltered_output_data(self): '''test gather_output_rows_new''' infiles = [ From e4a645dcf9daa89468f5e54b04130540a2680234 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 08:24:05 +0100 Subject: [PATCH 36/40] Move _gather_unfiltered_output_data higher up in file --- ariba/summary.py | 86 ++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/ariba/summary.py b/ariba/summary.py index 030d16f7..1621534a 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -90,6 +90,49 @@ def _load_input_files(cls, filenames, min_id, verbose=False, only_clusters=None) return samples + def _gather_unfiltered_output_data(self): + self.all_potential_columns = {} + self.all_data = {} + + for filename in sorted(self.samples): + self.all_data[filename] = {} + for cluster in self.samples[filename].clusters.values(): + self.all_data[filename][cluster.name] = {} + if cluster.name not in self.all_potential_columns: + self.all_potential_columns[cluster.name] = {'summary' : set(), 'groups': set(), 'vars': set()} + + this_cluster_dict = {'summary': copy.copy(cluster.summary), 'groups': {}, 'vars': {}} + seen_groups = {} + + for variant in cluster.variants: + if self.show_vars: + this_cluster_dict['vars'][variant.var_string] = 'yes' if variant.het_percent is None else 'het' + if variant.het_percent is not None: + this_cluster_dict['vars'][variant.var_string + '.%'] = variant.het_percent + + if self.show_var_groups and variant.var_group != '.': + if variant.var_group not in seen_groups: + seen_groups[variant.var_group] = {'yes': 0, 'het': 0} + + if variant.het_percent is None: + seen_groups[variant.var_group]['yes'] += 1 + this_cluster_dict['groups'][variant.var_group] = 'yes' + else: + seen_groups[variant.var_group]['het'] += 1 + this_cluster_dict['groups'][variant.var_group] = 'het' + this_cluster_dict['groups'][variant.var_group + '.%'] = variant.het_percent + + for group, d in seen_groups.items(): + if d['het'] > 0 and d['het'] + d['yes'] > 1: + this_cluster_dict['groups'][group] = 'yes_multi_het' + this_cluster_dict['groups'][group + '.%'] = 'NA' + + for x in this_cluster_dict: + self.all_potential_columns[cluster.name][x].update(set(this_cluster_dict[x].keys())) + + self.all_data[filename][cluster.name] = this_cluster_dict + + @classmethod def _to_matrix(cls, filenames, all_data, all_potential_columns, cluster_cols): matrix = [] @@ -332,46 +375,3 @@ def run(self): if self.verbose: print('Finished', flush=True) - - def _gather_unfiltered_output_data(self): - self.all_potential_columns = {} - self.all_data = {} - - for filename in sorted(self.samples): - self.all_data[filename] = {} - for cluster in self.samples[filename].clusters.values(): - self.all_data[filename][cluster.name] = {} - 
if cluster.name not in self.all_potential_columns: - self.all_potential_columns[cluster.name] = {'summary' : set(), 'groups': set(), 'vars': set()} - - this_cluster_dict = {'summary': copy.copy(cluster.summary), 'groups': {}, 'vars': {}} - seen_groups = {} - - for variant in cluster.variants: - if self.show_vars: - this_cluster_dict['vars'][variant.var_string] = 'yes' if variant.het_percent is None else 'het' - if variant.het_percent is not None: - this_cluster_dict['vars'][variant.var_string + '.%'] = variant.het_percent - - if self.show_var_groups and variant.var_group != '.': - if variant.var_group not in seen_groups: - seen_groups[variant.var_group] = {'yes': 0, 'het': 0} - - if variant.het_percent is None: - seen_groups[variant.var_group]['yes'] += 1 - this_cluster_dict['groups'][variant.var_group] = 'yes' - else: - seen_groups[variant.var_group]['het'] += 1 - this_cluster_dict['groups'][variant.var_group] = 'het' - this_cluster_dict['groups'][variant.var_group + '.%'] = variant.het_percent - - for group, d in seen_groups.items(): - if d['het'] > 0 and d['het'] + d['yes'] > 1: - this_cluster_dict['groups'][group] = 'yes_multi_het' - this_cluster_dict['groups'][group + '.%'] = 'NA' - - for x in this_cluster_dict: - self.all_potential_columns[cluster.name][x].update(set(this_cluster_dict[x].keys())) - - self.all_data[filename][cluster.name] = this_cluster_dict - From 9d7841896f146386141836979d3fa3076ea3cae2 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 08:42:17 +0100 Subject: [PATCH 37/40] Use new refactored code --- ariba/summary.py | 8 +++----- ariba/tasks/summary.py | 32 ++------------------------------ 2 files changed, 5 insertions(+), 35 deletions(-) diff --git a/ariba/summary.py b/ariba/summary.py index 1621534a..0787b935 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -18,7 +18,6 @@ def __init__( filter_rows=True, filter_columns=True, min_id=90.0, - show_known_het=False, cluster_cols='assembled,match,ref_seq,pct_id,known_var,novel_var', make_phandango_tree=True, only_clusters=None, @@ -37,7 +36,6 @@ def __init__( if fofn is not None: self.filenames.extend(self._load_fofn(fofn)) - self.show_known_het = show_known_het self.cluster_columns = self._determine_cluster_cols(cluster_cols) self.filter_rows = filter_rows self.filter_columns = filter_columns @@ -167,7 +165,7 @@ def _to_matrix(cls, filenames, all_data, all_potential_columns, cluster_cols): phandango_header.append(cluster_name + '.' 
+ col + ':o1') for col_type in ['summary', 'groups', 'vars']: - if col in all_data[filename][cluster_name][col_type]: + if cluster_name in all_data[filename] and col in all_data[filename][cluster_name][col_type]: line.append(all_data[filename][cluster_name][col_type][col]) break else: @@ -313,8 +311,8 @@ def run(self): self.samples = self._load_input_files(self.filenames, self.min_id, verbose=self.verbose, only_clusters=self.only_clusters) if self.verbose: print('Generating output rows', flush=True) - self.rows = self._gather_output_rows() - phandango_header, csv_header, matrix = Summary._to_matrix(self.filenames, self.rows, self.cluster_columns) + self._gather_unfiltered_output_data() + phandango_header, csv_header, matrix = Summary._to_matrix(self.filenames, self.all_data, self.all_potential_columns, self.cluster_columns) # sanity check same number of columns in headers and matrix lengths = {len(x) for x in matrix} diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py index c674df3a..d6bfb848 100644 --- a/ariba/tasks/summary.py +++ b/ariba/tasks/summary.py @@ -9,66 +9,38 @@ def use_preset(options): preset_to_vals = { 'minimal': { 'cluster_cols': 'match', - 'var_cols': '', 'col_filter': 'y', 'row_filter': 'y', - 'var_groups': 'n', - 'known_vars': 'n', - 'novel_vars': 'n' }, 'cluster_small': { 'cluster_cols': 'assembled,match,ref_seq,known_var', - 'var_cols': '', 'col_filter': 'y', 'row_filter': 'y', - 'var_groups': 'n', - 'known_vars': 'n', - 'novel_vars': 'n' }, 'cluster_all': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'var_cols': '', 'col_filter': 'y', 'row_filter': 'y', - 'var_groups': 'n', - 'known_vars': 'n', - 'novel_vars': 'n' }, 'cluster_var_groups': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'var_cols': 'groups', 'col_filter': 'y', 'row_filter': 'y', - 'var_groups': 'y', - 'known_vars': 'n', - 'novel_vars': 'n' }, 'cluster_known_vars': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'var_cols': 'groups,grouped,ungrouped', 'col_filter': 'y', 'row_filter': 'y', - 'var_groups': 'y', - 'known_vars': 'y', - 'novel_vars': 'n' }, 'all': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'var_cols': 'groups,grouped,ungrouped,novel', 'col_filter': 'y', 'row_filter': 'y', - 'var_groups': 'y', - 'known_vars': 'y', - 'novel_vars': 'y' }, 'all_no_filter': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'var_cols': 'groups,grouped,ungrouped,novel', 'col_filter': 'n', 'row_filter': 'n', - 'var_groups': 'y', - 'known_vars': 'y', - 'novel_vars': 'y' }, } @@ -93,11 +65,11 @@ def run(options): filter_rows=options.col_filter == 'y', filter_columns=options.row_filter == 'y', min_id=options.min_id, - show_known_het=options.het, cluster_cols=options.cluster_cols, - variant_cols=options.var_cols, make_phandango_tree=(not options.no_tree), only_clusters=None if options.only_cluster is None else {options.only_cluster}, + show_var_groups=options.v_groups, + show_vars=options.variants, verbose=options.verbose ) s.run() From dd0e6d76c23fef53440767415063af45c5d40fb9 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 08:52:49 +0100 Subject: [PATCH 38/40] update preset to use v_groups and variants options --- ariba/tasks/summary.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py index d6bfb848..252f85fa 100644 --- a/ariba/tasks/summary.py +++ b/ariba/tasks/summary.py @@ -27,11 
+27,6 @@ def use_preset(options): 'col_filter': 'y', 'row_filter': 'y', }, - 'cluster_known_vars': { - 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'col_filter': 'y', - 'row_filter': 'y', - }, 'all': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', 'col_filter': 'y', @@ -49,6 +44,12 @@ def use_preset(options): for key, val in preset_to_vals[options.preset].items(): exec('options.' + key + ' = "' + val + '"') + if options.preset in {'cluster_var_groups', 'all', 'all_no_filter'}: + options.v_groups = True + + if options.preset in {'all', 'all_no_filter'}: + options.variants = True + return options From 8a47c6b49befe738c47ad7b1039bcb938148b270 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 09:03:35 +0100 Subject: [PATCH 39/40] Bug fix getting variant nucleotide for novel snp --- ariba/summary_cluster_variant.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ariba/summary_cluster_variant.py b/ariba/summary_cluster_variant.py index b9c7c6db..51e00d95 100644 --- a/ariba/summary_cluster_variant.py +++ b/ariba/summary_cluster_variant.py @@ -43,7 +43,9 @@ def _get_het_percent(cls, data_dict): raise Error('Mismatch in number of inferred nucleotides from ctg_nt, smtls_alt_nt, smtls_alt_depth columns. Cannot continue\n' + str(data_dict)) try: - var_nucleotide = data_dict['known_var_change'][-1] + var_nucleotide = data_dict['known_var_change'][-1] if data_dict['known_var_change'] != '.' else data_dict['ref_ctg_change'][-1] + if var_nucleotide == '.': + return None depths = [int(x) for x in depths] nuc_to_depth = dict(zip(nucleotides, depths)) total_depth = sum(depths) From 553ff0cd111350e886154007e38af2aedcf4dbb7 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 09:14:04 +0100 Subject: [PATCH 40/40] Update options to reflect rewrite of summary code --- scripts/ariba | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/ariba b/scripts/ariba index f0b91a86..696a00a9 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -152,7 +152,7 @@ subparser_run.set_defaults(func=ariba.tasks.run.run) #----------------------------- summary ------------------------------- -summary_presets = ['minimal', 'cluster_small', 'cluster_all', 'cluster_var_groups', 'cluster_known_vars', 'all', 'all_no_filter'] +summary_presets = ['minimal', 'cluster_small', 'cluster_all', 'cluster_var_groups', 'all', 'all_no_filter'] subparser_summary = subparsers.add_parser( 'summary', help='Summarise multiple reports made by "run"', @@ -162,15 +162,15 @@ subparser_summary = subparsers.add_parser( ) subparser_summary.add_argument('-f', '--fofn', help='File of filenames of ariba reports in tsv format (not xls) to be summarised. Must be used if no input files listed after the outfile.', metavar='FILENAME') -subparser_summary.add_argument('--preset', choices=summary_presets, help='Shorthand for setting --cluster_cols,--col_filter,--row_filter,--known_vars,--novel_vars. Using this overrides those options', metavar='|'.join(summary_presets)) +subparser_summary.add_argument('--preset', choices=summary_presets, help='Shorthand for setting --cluster_cols,--col_filter,--row_filter,--v_groups,--variants. Using this overrides those options', metavar='|'.join(summary_presets)) subparser_summary.add_argument('--cluster_cols', help='Comma separated list of cluster columns to include. 
Choose from: assembled, match, ref_seq, pct_id, known_var, novel_var [%(default)s]', default='match', metavar='col1,col2,...') subparser_summary.add_argument('--col_filter', choices=['y', 'n'], default='y', help='Choose whether columns where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n') -subparser_summary.add_argument('--het', action='store_true', help='For known noncoding SNPs, report if they are heterozygous or not, and the percent of reads supporting the variant type') subparser_summary.add_argument('--no_tree', action='store_true', help='Do not make phandango tree') subparser_summary.add_argument('--row_filter', choices=['y', 'n'], default='y', help='Choose whether rows where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n') -subparser_summary.add_argument('--var_cols', help='Comma separated list of variant columns to include. Choose from: groups, grouped, ungrouped, novel [none by default]', metavar='col1,col2,...', default='') subparser_summary.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT') subparser_summary.add_argument('--only_cluster', help='Only report data for the given cluster name', metavar='Cluster_name') +subparser_summary.add_argument('--v_groups', action='store_true', help='Show presence of variants that are in groups') +subparser_summary.add_argument('--variants', action='store_true', help='Report all variants') subparser_summary.add_argument('--verbose', action='store_true', help='Be verbose') subparser_summary.add_argument('outprefix', help='Prefix of output files') subparser_summary.add_argument('infiles', nargs='*', help='Files to be summarised')
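The bug fix in patch 39 changes which column _get_het_percent reads the variant nucleotide from: for a known variant it still takes the last character of known_var_change, but for a novel SNP that column is '.', so the code now falls back to ref_ctg_change. A minimal standalone sketch of that selection logic follows; het_percent_sketch is a hypothetical helper name, and the final division of the variant base's depth by the total depth is an assumption about the part of the method not shown in this hunk.

    def het_percent_sketch(known_var_change, ref_ctg_change, nucleotides, depths):
        '''Illustrative only: mirrors the nucleotide selection fixed in patch 39.

        known_var_change and ref_ctg_change are report strings such as 'A14T' or '.';
        nucleotides and depths are parallel lists, e.g. ['T', 'G'] and ['40', '10'].
        '''
        # Known variant: take the alt base from known_var_change.
        # Novel SNP: known_var_change is '.', so fall back to ref_ctg_change.
        change = known_var_change if known_var_change != '.' else ref_ctg_change
        var_nucleotide = change[-1]
        if var_nucleotide == '.':
            return None  # no usable variant base, so no het percentage

        depths = [int(x) for x in depths]
        nuc_to_depth = dict(zip(nucleotides, depths))
        total_depth = sum(depths)
        if total_depth == 0 or var_nucleotide not in nuc_to_depth:
            return None
        # Assumed final step: percent of reads supporting the variant base.
        return round(100 * nuc_to_depth[var_nucleotide] / total_depth, 1)

For example, the heterozygous A14T row in the deleted summary_test_get_all_het_snps.2.tsv data has alt depths '40,10'; assuming those pair with bases T and G as T:40, G:10, the sketch returns 80.0, i.e. 80% of reads supporting the variant base.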
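Taken together, patches 37, 38 and 40 rework the summary interface: the preset dictionaries now carry only the string-valued settings (cluster_cols, col_filter, row_filter), and variant reporting is driven by the new store_true flags --v_groups and --variants rather than the removed --var_cols and --het options. A rough sketch of the resulting preset handling is below; apply_preset_sketch is a hypothetical helper name, and setattr stands in for the exec() string assignment that the real use_preset() performs.

    import argparse

    def apply_preset_sketch(options, preset_to_vals, preset):
        # String-valued settings still come from the preset dictionary.
        for key, val in preset_to_vals[preset].items():
            setattr(options, key, val)
        # The boolean flags are no longer dictionary entries; they are
        # switched on by preset name, as in patch 38.
        if preset in {'cluster_var_groups', 'all', 'all_no_filter'}:
            options.v_groups = True
        if preset in {'all', 'all_no_filter'}:
            options.variants = True
        return options

    # Example: the 'cluster_var_groups' preset turns on --v_groups but not --variants.
    opts = argparse.Namespace(v_groups=False, variants=False)
    presets = {'cluster_var_groups': {'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', 'col_filter': 'y', 'row_filter': 'y'}}
    opts = apply_preset_sketch(opts, presets, 'cluster_var_groups')
    assert opts.v_groups and not opts.variants and opts.cluster_cols.startswith('assembled')

On the command line, the same reports would be summarised with something like 'ariba summary --v_groups --variants outprefix report1.tsv report2.tsv', the two flags replacing the column list previously given to --var_cols.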