From c1f34eac9611a742f0ccae16c0ca1c11722c07dd Mon Sep 17 00:00:00 2001
From: Martin Hunt
Date: Fri, 5 Aug 2016 15:12:04 +0100
Subject: [PATCH 01/40] Bug fix getting snp group

---
 ariba/summary_cluster.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py
index 7a53f556..39bf9fee 100644
--- a/ariba/summary_cluster.py
+++ b/ariba/summary_cluster.py
@@ -56,7 +56,7 @@ def line2dict(cls, line):
             d['var_group'] = '.'
         else:
             try:
-                d['var_group'] = d['var_description'].split(':')[3]
+                d['var_group'] = d['var_description'].split(':')[4]
             except:
                 raise Error('Error getting variant group from the following line:\n' + line)
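Note on the fix above: var_group is pulled out of the report's var_description column, which is the reference metadata joined on ':'. With the six-field description format used by the updated tests later in this series (apparently name:gene:var_only:change:group:description - the field meanings are inferred from the test data, not stated in the patch), the group ID is element 4 of the split rather than element 3. A minimal stand-alone sketch, not ARIBA code, using a value taken from those tests:

    # illustration only: split a var_description string the same way the patched code does
    var_description = 'noncoding1:0:0:A14T:id1:ref has wild type, foo bar'
    fields = var_description.split(':')
    print(fields[3])  # 'A14T' -- the variant change, which index 3 now lands on
    print(fields[4])  # 'id1'  -- the variant group that should be reported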
From c123ce4a82a9fdb6569f47ecffa1c24031c0a33a Mon Sep 17 00:00:00 2001
From: Martin Hunt
Date: Fri, 5 Aug 2016 15:13:21 +0100
Subject: [PATCH 02/40] Add option --no_tree

---
 ariba/summary.py       | 24 +++++++++++++++---------
 ariba/tasks/summary.py |  1 +
 scripts/ariba          |  1 +
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/ariba/summary.py b/ariba/summary.py
index ecf7d304..77b003ab 100644
--- a/ariba/summary.py
+++ b/ariba/summary.py
@@ -21,6 +21,7 @@ def __init__(
         show_known_het=False,
         cluster_cols='assembled,match,ref_seq,pct_id,known_var,novel_var',
         variant_cols='groups,grouped,ungrouped,novel',
+        make_phandango_tree=True,
         verbose=False,
     ):
         if filenames is None and fofn is None:
@@ -41,6 +42,7 @@ def __init__(
         self.filter_columns = filter_columns
         self.min_id = min_id
         self.outprefix = outprefix
+        self.make_phandango_tree = make_phandango_tree
         self.verbose = verbose

@@ -416,17 +418,21 @@ def run(self):
             csv_file = self.outprefix + '.phandango.csv'
             phandango_header, phandango_matrix = Summary._add_phandango_colour_columns(phandango_header, matrix)
             Summary._matrix_to_csv(phandango_matrix, phandango_header, csv_file)
-            dist_matrix_file = self.outprefix + '.phandango.distance_matrix'
-            tree_file = self.outprefix + '.phandango.tre'
-            if self.verbose:
-                print('Making Phandango distance matrix', dist_matrix_file, flush=True)
-            Summary._write_distance_matrix(matrix, dist_matrix_file)
+            if self.make_phandango_tree:
+                dist_matrix_file = self.outprefix + '.phandango.distance_matrix'
+                tree_file = self.outprefix + '.phandango.tre'

-            if self.verbose:
-                print('Making Phandango tree file', tree_file, flush=True)
-            Summary._newick_from_dist_matrix(dist_matrix_file, tree_file)
-            os.unlink(dist_matrix_file)
+                if self.verbose:
+                    print('Making Phandango distance matrix', dist_matrix_file, flush=True)
+                Summary._write_distance_matrix(matrix, dist_matrix_file)
+
+                if self.verbose:
+                    print('Making Phandango tree file', tree_file, flush=True)
+                Summary._newick_from_dist_matrix(dist_matrix_file, tree_file)
+                os.unlink(dist_matrix_file)
+            elif self.verbose:
+                print('Skipping making tree because you asked me not to make it', flush=True)
         else:
             print('Made csv file. Not making Phandango files because only one sample remains after filtering', file=sys.stderr)
diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py
index 782c9056..b722ce8e 100644
--- a/ariba/tasks/summary.py
+++ b/ariba/tasks/summary.py
@@ -96,6 +96,7 @@ def run(options):
         show_known_het=options.het,
         cluster_cols=options.cluster_cols,
         variant_cols=options.var_cols,
+        make_phandango_tree=(not options.no_tree),
         verbose=options.verbose
     )
     s.run()
diff --git a/scripts/ariba b/scripts/ariba
index bb6627e3..63fc5d93 100755
--- a/scripts/ariba
+++ b/scripts/ariba
@@ -166,6 +166,7 @@ subparser_summary.add_argument('--preset', choices=summary_presets, help='Shorth
 subparser_summary.add_argument('--cluster_cols', help='Comma separated list of cluster columns to include. Choose from: assembled, match, ref_seq, pct_id, known_var, novel_var [%(default)s]', default='match', metavar='col1,col2,...')
 subparser_summary.add_argument('--col_filter', choices=['y', 'n'], default='y', help='Choose whether columns where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n')
 subparser_summary.add_argument('--het', action='store_true', help='For known noncoding SNPs, report if they are heterozygous or not, and the percent of reads supporting the variant type')
+subparser_summary.add_argument('--no_tree', action='store_true', help='Do not make phandango tree')
 subparser_summary.add_argument('--row_filter', choices=['y', 'n'], default='y', help='Choose whether rows where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n')
 subparser_summary.add_argument('--var_cols', help='Comma separated list of variant columns to include. Choose from: groups, grouped, ungrouped, novel [none by default]', metavar='col1,col2,...', default='')
 subparser_summary.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT')

From 0cae6a3a8fa70fc5b27cb344194c6befab0368be Mon Sep 17 00:00:00 2001
From: Martin Hunt
Date: Fri, 5 Aug 2016 15:26:37 +0100
Subject: [PATCH 03/40] Fix snp info format in penultimate column

---
 ariba/tests/summary_cluster_test.py | 136 ++++++++++++++--------
 1 file changed, 68 insertions(+), 68 deletions(-)

diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py
index 6220dcfb..9ee7c458 100644
--- a/ariba/tests/summary_cluster_test.py
+++ b/ariba/tests/summary_cluster_test.py
@@ -8,7 +8,7 @@ class TestSummaryCluster(unittest.TestCase):
     def test_line2dict(self):
         '''Test _line2dict'''
-        line = 'refname\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:var_group1:ref has wild type, foo bar\tsome free text'
+        line = 'refname\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:var_group1:ref has wild type, foo bar\tsome free text'

         expected = {
             'ref_name': 'refname',
@@ -39,7 +39,7 @@ def test_line2dict(self):
             'smtls_total_depth': '17',
             'smtls_alt_nt': '.',
             'smtls_alt_depth': '17',
-            'var_description': 'noncoding1:n:A14T:var_group1:ref has wild type, foo bar',
+            'var_description': 'noncoding1:1:0:A14T:var_group1:ref has wild type, foo bar',
             'var_group': 'var_group1',
             'free_text': 'some free text'
         }
@@ -51,9 +51,9 @@ def test_add_data_dict(self):
         '''Test add_data_dict'''
         cluster = summary_cluster.SummaryCluster()
         self.assertTrue(cluster.name is None)
        line1 =
'refname\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text' - line2 = 'refname\t1\t0\t19\t78\tcluster2\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id2:ref has wild type, foo bar\tsome free text' - line3 = 'refname2\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id3:ref has wild type, foo bar\tsome free text' + line1 = 'refname\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text' + line2 = 'refname\t1\t0\t19\t78\tcluster2\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id2:ref has wild type, foo bar\tsome free text' + line3 = 'refname2\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id3:ref has wild type, foo bar\tsome free text' data_dict1 = summary_cluster.SummaryCluster.line2dict(line1) data_dict2 = summary_cluster.SummaryCluster.line2dict(line2) data_dict3 = summary_cluster.SummaryCluster.line2dict(line3) @@ -71,9 +71,9 @@ def test_pc_id_of_longest(self): '''Test pc_id_of_longest''' cluster = summary_cluster.SummaryCluster() self.assertTrue(cluster.name is None) - line1 = 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text' - line2 = 'refname\t1\t0\t19\t78\tcluster\t120\t119\t98.20\tctg_name2\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text' - line3 = 'refname\t1\t0\t19\t78\tcluster\t120\t114\t98.32\tctg_name3\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text' + line1 = 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text' + line2 = 'refname\t1\t0\t19\t78\tcluster\t120\t119\t98.20\tctg_name2\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text' + line3 = 'refname\t1\t0\t19\t78\tcluster\t120\t114\t98.32\tctg_name3\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text' data_dict1 = summary_cluster.SummaryCluster.line2dict(line1) data_dict2 = summary_cluster.SummaryCluster.line2dict(line2) data_dict3 = summary_cluster.SummaryCluster.line2dict(line3) @@ -85,7 +85,7 @@ def test_pc_id_of_longest(self): def test_to_cluster_summary_number(self): '''Test _to_cluster_summary_assembled''' - line = 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text' + line = 
'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text' data_dict = summary_cluster.SummaryCluster.line2dict(line) tests = [ @@ -122,9 +122,9 @@ def test_to_cluster_summary_number(self): def test_has_known_variant(self): '''Test _has_known_variant''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.' 
] @@ -139,9 +139,9 @@ def test_has_known_variant(self): def test_has_any_known_variant(self): lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.' ] @@ -159,10 +159,10 @@ def test_has_any_known_variant(self): def test_has_nonsynonymous(self): '''Test _has_nonsynonymous''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', 
'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.' ] @@ -178,11 +178,11 @@ def test_has_nonsynonymous(self): def test_has_any_nonsynonymous(self): '''Test _has_any_nonsynonymous''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:N_ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:N_ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', ] expected = ['no', 'yes', 'no', 'yes', 'yes'] @@ -198,9 +198,9 @@ def test_has_any_nonsynonymous(self): def test_has_novel_nonsynonymous(self): '''Test _has_novel_nonsynonymous''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 
'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.' ] @@ -216,9 +216,9 @@ def test_has_novel_nonsynonymous(self): def test_has_any_novel_nonsynonymous(self): '''Test _has_any_novel_nonsynonymous''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.', 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.' 
] @@ -236,11 +236,11 @@ def test_has_any_novel_nonsynonymous(self): def test_to_cluster_summary_has_known_nonsynonymous(self): '''Test _to_cluster_summary_has_known_nonsynonymous''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', ] expected = ['yes', 'yes', 'no', 'no', 'no'] @@ -257,11 +257,11 @@ def test_to_cluster_summary_has_known_nonsynonymous(self): def test_to_cluster_summary_has_novel_nonsynonymous(self): '''Test _to_cluster_summary_has_novel_nonsynonymous''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 
'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', ] expected = ['no', 'no', 'no', 'yes', 'yes'] @@ -278,11 +278,11 @@ def test_to_cluster_summary_has_novel_nonsynonymous(self): def test_to_cluster_summary_has_nonsynonymous(self): '''Test _to_cluster_summary_has_nonsynonymous''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', ] expected = ['no', 'yes', 'no', 'yes', 'yes'] @@ -369,16 +369,16 @@ def test_get_nonsynonymous_var(self): def test_has_match(self): '''Test _has_match''' lines = [ - 
'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:1:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:1:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 
'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id1:ref has wild type, foo bar\tsome free text', ] expected = ['yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'no', 'no'] @@ -396,14 +396,14 @@ def test_has_match(self): def test_has_var_groups(self): '''Test has_var_groups''' lines = [ - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', - 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id2:ref has wild type, foo bar\tsome free text', - 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id3:ref has wild type, foo bar\tsome free text', - 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id4:ref has wild type, foo bar\tsome free text', - 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id5:ref has wild type, foo bar\tsome free text', - 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id6:ref has wild type, foo bar\tsome free text', - 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id7:ref has wild type, foo bar\tsome free text', - 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id7:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id2:ref has wild type, foo bar\tsome free text', + 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id3:ref has wild type, foo bar\tsome free text', + 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id4:ref has wild type, foo bar\tsome free text', + 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id5:ref has wild type, foo bar\tsome free text', + 
'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id6:ref has wild type, foo bar\tsome free text', + 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id7:ref has wild type, foo bar\tsome free text', + 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id7:ref has wild type, foo bar\tsome free text', ] dicts = [summary_cluster.SummaryCluster.line2dict(line) for line in lines] cluster = summary_cluster.SummaryCluster() @@ -438,7 +438,7 @@ def test_column_summary_data(self): def test_non_synon_variants(self): '''Test non_synon_variants''' - line1 = 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs' + line1 = 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs' line2 = 'ref1\t0\t0\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text' data_dict1 = summary_cluster.SummaryCluster.line2dict(line1) @@ -454,10 +454,10 @@ def test_non_synon_variants(self): def test_known_noncoding_het_snps(self): '''test known_noncoding_het_snps''' lines = [ - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs' + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs' ] cluster = summary_cluster.SummaryCluster() From 001e308e7bd17cafc9b3d56fa3b0a2864af82445 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 5 Aug 2016 15:31:50 +0100 Subject: [PATCH 04/40] Fix snp info format in penultimate column of sample test files --- ...mmary_sample_test_column_names_tuples_and_het_snps.tsv | 8 ++++---- .../data/summary_sample_test_column_summary_data.tsv | 8 ++++---- ariba/tests/data/summary_sample_test_var_groups.tsv | 8 ++++---- 3 files changed, 12 
insertions(+), 12 deletions(-) diff --git a/ariba/tests/data/summary_sample_test_column_names_tuples_and_het_snps.tsv b/ariba/tests/data/summary_sample_test_column_names_tuples_and_het_snps.tsv index 5e12e4a9..159949c8 100644 --- a/ariba/tests/data/summary_sample_test_column_names_tuples_and_het_snps.tsv +++ b/ariba/tests/data/summary_sample_test_column_names_tuples_and_het_snps.tsv @@ -1,8 +1,8 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:n:A14T:.:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:0:0:A14T:.:ref has wild type, reads have variant so should report generic description of noncoding1 noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 0 SNP . . . G15T SNP 15 15 G 85 85 T 17 . 17 . generic description of noncoding1 -noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 +noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id2:variant in ref and reads so should report generic description of noncoding1 noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding2.scaffold.1 279 35.4 . . . . . . . . . . . . . . . . . generic description of noncoding2 -presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 +presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.2 267 35.1 . . . . . . . . . . . . . . . . . Generic description of presence_absence2 -variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 12;13;13 variants_only1:p:S5T:.:Ref and reads have variant so report Generic description of variants_only1 +variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 
12;13;13 variants_only1:1:0:S5T:.:Ref and reads have variant so report Generic description of variants_only1 diff --git a/ariba/tests/data/summary_sample_test_column_summary_data.tsv b/ariba/tests/data/summary_sample_test_column_summary_data.tsv index 22a42b54..9c495ecd 100644 --- a/ariba/tests/data/summary_sample_test_column_summary_data.tsv +++ b/ariba/tests/data/summary_sample_test_column_summary_data.tsv @@ -1,8 +1,8 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1_n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 0 SNP . . . G15T SNP 15 15 G 85 85 T 17 . 17 . generic description of noncoding1 -noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1_n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 +noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id2:variant in ref and reads so should report generic description of noncoding1 noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding2.scaffold.1 279 35.4 . . . . . . . . . . . . . . . . . generic description of noncoding2 -presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 +presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.2 267 35.1 . . . . . . . . . . . . . . . . . Generic description of presence_absence2 -variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 12;13;13 variants_only1:p:S5T:id4:Ref and reads have variant so report Generic description of variants_only1 +variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 
12;13;13 variants_only1:1:0:S5T:id4:Ref and reads have variant so report Generic description of variants_only1 diff --git a/ariba/tests/data/summary_sample_test_var_groups.tsv b/ariba/tests/data/summary_sample_test_var_groups.tsv index a1252110..33526608 100644 --- a/ariba/tests/data/summary_sample_test_var_groups.tsv +++ b/ariba/tests/data/summary_sample_test_var_groups.tsv @@ -1,7 +1,7 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 +noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id2:variant in ref and reads so should report generic description of noncoding1 noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding2.scaffold.1 279 35.4 . . . . . . . . . . . . . . . . . generic description of noncoding2 -presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 +presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.2 267 35.1 . . . . . . . . . . . . . . . . . Generic description of presence_absence2 -variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 12;13;13 variants_only1:p:S5T:id4:Ref and reads have variant so report Generic description of variants_only1 +variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 
12;13;13 variants_only1:1:0:S5T:id4:Ref and reads have variant so report Generic description of variants_only1 From 13d630192fbfac585d9b747308e91d5b04659fb1 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 5 Aug 2016 15:36:29 +0100 Subject: [PATCH 05/40] Fix snp info format in penultimate column of summary test files --- ariba/tests/data/summary_test_gather_output_rows.in.1.tsv | 4 ++-- ariba/tests/data/summary_test_gather_output_rows.in.2.tsv | 6 +++--- ariba/tests/data/summary_test_get_all_cluster_names.1.tsv | 4 ++-- ariba/tests/data/summary_test_get_all_cluster_names.2.tsv | 6 +++--- ariba/tests/data/summary_test_get_all_var_groups.1.tsv | 4 ++-- ariba/tests/data/summary_test_get_all_var_groups.2.tsv | 6 +++--- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv index d1f5f70b..3e67eeb1 100644 --- a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv +++ b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv @@ -1,3 +1,3 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv index 6507d5fd..398aedbc 100644 --- a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv +++ b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv @@ -1,5 +1,5 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 
18 noncoding1:n:A6G:id3:variant in ref and reads so should report generic description of noncoding1 -presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id3:variant in ref and reads so should report generic description of noncoding1 +presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 variants_only1 1 1 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . . diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv index 9e8e9a2a..f35590e2 100644 --- a/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv +++ b/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv @@ -1,3 +1,3 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 1 0 27 88 cluster.p.1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id2:Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence1 1 0 27 88 cluster.p.1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id2:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv index d4cd028c..2bddc3d6 100644 --- a/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv +++ b/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv @@ -1,5 +1,5 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 
17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 -presence_absence1 1 0 27 88 cluster.p.2 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id2:variant in ref and reads so should report generic description of noncoding1 +presence_absence1 1 0 27 88 cluster.p.2 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 variants_only1 1 1 64 12 cluster.v.1 90 . . . . . . . . . . . . . . . . . . . . . . . diff --git a/ariba/tests/data/summary_test_get_all_var_groups.1.tsv b/ariba/tests/data/summary_test_get_all_var_groups.1.tsv index 62394c08..c4db58da 100644 --- a/ariba/tests/data/summary_test_get_all_var_groups.1.tsv +++ b/ariba/tests/data/summary_test_get_all_var_groups.1.tsv @@ -1,3 +1,3 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 1 0 27 88 cluster.p.1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id4:Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence1 1 0 27 88 cluster.p.1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 
29 presence_absence1:1:0:A10V:id4:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_get_all_var_groups.2.tsv b/ariba/tests/data/summary_test_get_all_var_groups.2.tsv index d4cd028c..2bddc3d6 100644 --- a/ariba/tests/data/summary_test_get_all_var_groups.2.tsv +++ b/ariba/tests/data/summary_test_get_all_var_groups.2.tsv @@ -1,5 +1,5 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 -presence_absence1 1 0 27 88 cluster.p.2 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id2:variant in ref and reads so should report generic description of noncoding1 +presence_absence1 1 0 27 88 cluster.p.2 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 variants_only1 1 1 64 12 cluster.v.1 90 . . . . . . . . . . . . . . . . . . . . . . . From 94fabc2c70955f463c6628f4fb344a7bff97ac41 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Mon, 8 Aug 2016 15:27:55 +0100 Subject: [PATCH 06/40] Report het snps in groups --- ariba/summary.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/ariba/summary.py b/ariba/summary.py index 77b003ab..cabe3c01 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -135,6 +135,7 @@ def _get_all_var_groups(cls, samples_dict): if name not in groups: groups[name] = set() groups[name].update(name_set) + return groups @@ -170,9 +171,29 @@ def _gather_output_rows(self): if self.var_columns['groups']: for group_name in var_groups[cluster]: if cluster in sample.var_groups and group_name in sample.var_groups[cluster]: - rows[filename][cluster]['vgroup.' + group_name] = 'yes' + if self.show_known_het: + if cluster in sample.het_snps: + if len(sample.het_snps[cluster]) == 0: + rows[filename][cluster]['vgroup.' + group_name] = 'no' + rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' + elif len(sample.het_snps[cluster]) == 1: + rows[filename][cluster]['vgroup.' 
+ group_name] = 'het' + snp_name = list(sample.het_snps[cluster].keys())[0] + percent = -1 + for v in sample.variant_column_names_tuples[cluster]: + if v[1] == snp_name and snp_name in sample.het_snps[cluster]: + percent = sample.het_snps[cluster][snp_name] + + rows[filename][cluster]['vgroup.' + group_name + '.%'] = percent + else: + rows[filename][cluster]['vgroup.' + group_name] = 'multi_het' + rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' + else: + rows[filename][cluster]['vgroup.' + group_name] = 'yes' else: rows[filename][cluster]['vgroup.' + group_name] = 'no' + if self.show_known_het: + rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' if cluster in all_var_columns: for (ref_name, variant, grouped_or_novel, group_name) in all_var_columns[cluster]: From 23033e50aa73d02b1e52492fba29866768c3c433 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 10:19:04 +0100 Subject: [PATCH 07/40] Bug fix passing all preset var_cols option --- ariba/tasks/summary.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py index b722ce8e..ec1cd879 100644 --- a/ariba/tasks/summary.py +++ b/ariba/tasks/summary.py @@ -9,7 +9,7 @@ def use_preset(options): preset_to_vals = { 'minimal': { 'cluster_cols': 'match', - 'variant_cols': '', + 'var_cols': '', 'col_filter': 'y', 'row_filter': 'y', 'var_groups': 'n', @@ -18,7 +18,7 @@ def use_preset(options): }, 'cluster_small': { 'cluster_cols': 'assembled,match,ref_seq,known_var', - 'variant_cols': '', + 'var_cols': '', 'col_filter': 'y', 'row_filter': 'y', 'var_groups': 'n', @@ -27,7 +27,7 @@ def use_preset(options): }, 'cluster_all': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'variant_cols': '', + 'var_cols': '', 'col_filter': 'y', 'row_filter': 'y', 'var_groups': 'n', @@ -36,7 +36,7 @@ def use_preset(options): }, 'cluster_var_groups': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'variant_cols': 'groups', + 'var_cols': 'groups', 'col_filter': 'y', 'row_filter': 'y', 'var_groups': 'y', @@ -45,7 +45,7 @@ def use_preset(options): }, 'cluster_known_vars': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'variant_cols': 'groups,grouped,ungrouped', + 'var_cols': 'groups,grouped,ungrouped', 'col_filter': 'y', 'row_filter': 'y', 'var_groups': 'y', @@ -54,7 +54,7 @@ def use_preset(options): }, 'all': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'variant_cols': 'groups,grouped,ungrouped,novel', + 'var_cols': 'groups,grouped,ungrouped,novel', 'col_filter': 'y', 'row_filter': 'y', 'var_groups': 'y', @@ -63,7 +63,7 @@ def use_preset(options): }, 'all_no_filter': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'variant_cols': 'groups,grouped,ungrouped,novel', + 'var_cols': 'groups,grouped,ungrouped,novel', 'col_filter': 'n', 'row_filter': 'n', 'var_groups': 'y', From 59a212ddffce2910adc7953043de5219d6d9dc70 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 20:02:47 +0100 Subject: [PATCH 08/40] Add 'interrupted' as option in assembled column --- ariba/summary_cluster.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py index 39bf9fee..2fd5b07f 100644 --- a/ariba/summary_cluster.py +++ b/ariba/summary_cluster.py @@ -118,6 +118,8 @@ def _to_cluster_summary_assembled(self): return 'yes' else: return 'yes_nonunique' + elif 
self.flag.has('assembled_into_one_contig'): + return 'interrupted' else: return 'fragmented' From efc129122fe4731df9a5e96d77c6e9c80abc4e20 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:00:53 +0100 Subject: [PATCH 09/40] track variant groups, if present --- ariba/summary_cluster.py | 5 ++++- ariba/tests/summary_cluster_test.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py index 2fd5b07f..3b1daeb1 100644 --- a/ariba/summary_cluster.py +++ b/ariba/summary_cluster.py @@ -297,5 +297,8 @@ def known_noncoding_het_snps(self): for d in self.data: snp_tuple = self._get_known_noncoding_het_snp(d) if snp_tuple is not None: - snps[snp_tuple[0]] = snp_tuple[1] + snp_id = d['var_description'].split(':')[4] + if snp_id not in snps: + snps[snp_id] = {} + snps[snp_id][snp_tuple[0]] = snp_tuple[1] return snps diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py index 9ee7c458..44727544 100644 --- a/ariba/tests/summary_cluster_test.py +++ b/ariba/tests/summary_cluster_test.py @@ -455,15 +455,19 @@ def test_known_noncoding_het_snps(self): '''test known_noncoding_het_snps''' lines = [ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs' + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs' ] cluster = summary_cluster.SummaryCluster() for line in lines: cluster.add_data_dict(summary_cluster.SummaryCluster.line2dict(line)) got = cluster.known_noncoding_het_snps() - expected = {'A42T': 25.0, 'A62T': 75.0, 'A82T': 40.0} + expected = { + '.': {'A82T': 40.0}, + 'id1': {'A42T': 25.0}, + 'id2': {'A62T': 75.0}, + } self.assertEqual(expected, got) From 116c45386130876684d68d085af0052cf77bee00 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:01:35 +0100 Subject: [PATCH 10/40] Format of dict changed: now has groups ids as well --- ariba/tests/summary_sample_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/tests/summary_sample_test.py b/ariba/tests/summary_sample_test.py index 091e8c09..32156c26 100644 --- a/ariba/tests/summary_sample_test.py +++ b/ariba/tests/summary_sample_test.py @@ -104,7 +104,7 @@ def test_variant_column_names_tuples_and_het_snps(self): expected_het_snps = { 'cluster.v': {}, - 'cluster.n': {'A14T': 80.0}, + 'cluster.n': {'.': {'A14T': 80.0}}, 
'cluster.p': {}, } self.assertEqual(expected_het_snps, got_het_snps) From fa680a3b08a7ab79c1df8f994221771a1195c126 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:02:14 +0100 Subject: [PATCH 11/40] Fix format of variant string --- ariba/tests/data/summary_test_get_all_het_snps.1.tsv | 4 ++-- ariba/tests/data/summary_test_get_all_het_snps.2.tsv | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ariba/tests/data/summary_test_get_all_het_snps.1.tsv b/ariba/tests/data/summary_test_get_all_het_snps.1.tsv index d1f5f70b..3e67eeb1 100644 --- a/ariba/tests/data/summary_test_get_all_het_snps.1.tsv +++ b/ariba/tests/data/summary_test_get_all_het_snps.1.tsv @@ -1,3 +1,3 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_get_all_het_snps.2.tsv b/ariba/tests/data/summary_test_get_all_het_snps.2.tsv index 6507d5fd..398aedbc 100644 --- a/ariba/tests/data/summary_test_get_all_het_snps.2.tsv +++ b/ariba/tests/data/summary_test_get_all_het_snps.2.tsv @@ -1,5 +1,5 @@ #ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id3:variant in ref and reads so should report generic description of noncoding1 -presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 
29 presence_absence1:p:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id3:variant in ref and reads so should report generic description of noncoding1 +presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 variants_only1 1 1 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . . From 1cbf9d890c16b7b076d8105401ea9ebb80fb6d68 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:02:49 +0100 Subject: [PATCH 12/40] Report percents for snp groups. Update phandango colours --- ariba/summary.py | 57 +++++++++++++++++++------------------ ariba/tests/summary_test.py | 16 ++++++++--- 2 files changed, 41 insertions(+), 32 deletions(-) diff --git a/ariba/summary.py b/ariba/summary.py index cabe3c01..a829d1ea 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -121,8 +121,8 @@ def _get_all_het_snps(cls, samples_dict): snps = set() for filename, sample in samples_dict.items(): for cluster, snp_dict in sample.het_snps.items(): - if len(snp_dict): - for snp in snp_dict: + for snp_id in snp_dict: + for snp in snp_dict[snp_id]: snps.add((cluster, snp)) return snps @@ -143,7 +143,6 @@ def _gather_output_rows(self): all_cluster_names = Summary._get_all_cluster_names(self.samples) all_var_columns = Summary._get_all_variant_columns(self.samples) all_het_snps = Summary._get_all_het_snps(self.samples) - if self.var_columns['groups']: var_groups = Summary._get_all_var_groups(self.samples) else: @@ -156,7 +155,7 @@ def _gather_output_rows(self): for cluster in all_cluster_names: rows[filename][cluster] = {} - if cluster in sample.column_summary_data and sample.column_summary_data[cluster]['assembled'].startswith('yes'): + if cluster in sample.column_summary_data and sample.column_summary_data[cluster]['assembled'] not in {'no'}: rows[filename][cluster] = sample.column_summary_data[cluster] else: rows[filename][cluster] = { @@ -171,25 +170,20 @@ def _gather_output_rows(self): if self.var_columns['groups']: for group_name in var_groups[cluster]: if cluster in sample.var_groups and group_name in sample.var_groups[cluster]: + rows[filename][cluster]['vgroup.' + group_name] = 'yes' if self.show_known_het: if cluster in sample.het_snps: - if len(sample.het_snps[cluster]) == 0: - rows[filename][cluster]['vgroup.' + group_name] = 'no' - rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' - elif len(sample.het_snps[cluster]) == 1: - rows[filename][cluster]['vgroup.' + group_name] = 'het' - snp_name = list(sample.het_snps[cluster].keys())[0] - percent = -1 - for v in sample.variant_column_names_tuples[cluster]: - if v[1] == snp_name and snp_name in sample.het_snps[cluster]: - percent = sample.het_snps[cluster][snp_name] - - rows[filename][cluster]['vgroup.' + group_name + '.%'] = percent + if group_name in sample.het_snps[cluster]: + if len(sample.het_snps[cluster][group_name]) == 1: + rows[filename][cluster]['vgroup.' 
+ group_name] = 'het' + percent = list(sample.het_snps[cluster][group_name].values())[0] + rows[filename][cluster]['vgroup.' + group_name + '.%'] = percent + else: + assert len(sample.het_snps[cluster][group_name]) > 1 + rows[filename][cluster]['vgroup.' + group_name] = 'multi_het' + rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' else: - rows[filename][cluster]['vgroup.' + group_name] = 'multi_het' rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' - else: - rows[filename][cluster]['vgroup.' + group_name] = 'yes' else: rows[filename][cluster]['vgroup.' + group_name] = 'no' if self.show_known_het: @@ -201,15 +195,20 @@ def _gather_output_rows(self): continue key = ref_name + '.' + variant - if rows[filename][cluster]['assembled'] == 'no': rows[filename][cluster][key] = 'NA' elif cluster in sample.variant_column_names_tuples and (ref_name, variant, grouped_or_novel, group_name) in sample.variant_column_names_tuples[cluster]: rows[filename][cluster][key] = 'yes' if self.show_known_het: - if cluster in sample.het_snps and variant in sample.het_snps[cluster]: - rows[filename][cluster][key] = 'het' - rows[filename][cluster][key + '.%'] = sample.het_snps[cluster][variant] + if cluster in sample.het_snps: + if grouped_or_novel == 'grouped' and group_name in sample.het_snps[cluster]: + rows[filename][cluster][key] = 'het' + rows[filename][cluster][key + '.%'] = sample.het_snps[cluster][group_name].get(variant, "NA") + elif grouped_or_novel == 'novel' and '.' in sample.het_snps[cluster]: + rows[filename][cluster][key] = 'het' + rows[filename][cluster][key + '.%'] = sample.het_snps['.'].get(variant, "NA") + else: + percent = 'NA' else: rows[filename][cluster][key] = 'no' if self.show_known_het and (cluster, variant) in all_het_snps: @@ -315,11 +314,13 @@ def _add_phandango_colour_columns(cls, header, matrix): matrix = copy.deepcopy(matrix) cols_to_add_colour_col = [i for i in range(len(header)) if header[i].endswith(':o1')] field_to_col = { - 'yes': '#1f78b4', - 'yes_nonunique': '#a6cee3', - 'no': '#33a02c', - 'NA': '#b2df8a', - 'het': '#fb9a99', + 'yes': '#33a02c', + 'yes_nonunique': '#b2df8a', + 'no': '#fb9a99', + 'NA': '#d3d3d3', + 'het': '#fdbf6f', + 'fragmented': '#1f78b4', + 'interrupted': '#a6cee3', } cols_to_add_colour_col.reverse() diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 6b615ee6..6eb790e8 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -225,6 +225,12 @@ def test_gather_output_rows(self): self.assertEqual(expected, got) s.show_known_het = True + expected[infiles[0]]['noncoding1']['vgroup.id1.%'] = 'NA' + expected[infiles[0]]['noncoding1']['vgroup.id3.%'] = 'NA' + expected[infiles[1]]['noncoding1']['vgroup.id1'] = 'het' + expected[infiles[1]]['noncoding1']['vgroup.id1.%'] = 80.0 + expected[infiles[1]]['noncoding1']['vgroup.id3'] = 'yes' + expected[infiles[1]]['noncoding1']['vgroup.id3.%'] = 'NA' expected[infiles[0]]['noncoding1']['noncoding1.A14T.%'] = 'NA' expected[infiles[1]]['noncoding1']['noncoding1.A14T'] = 'het' expected[infiles[1]]['noncoding1']['noncoding1.A14T.%'] = 80.0 @@ -234,6 +240,8 @@ def test_gather_output_rows(self): for filename in expected: del expected[filename]['noncoding1']['vgroup.id1'] del expected[filename]['noncoding1']['vgroup.id3'] + del expected[filename]['noncoding1']['vgroup.id1.%'] + del expected[filename]['noncoding1']['vgroup.id3.%'] for gene_type in expected[filename]: del expected[filename][gene_type]['ref_seq'] @@ -373,10 +381,10 @@ def 
test_add_phandango_colour_columns(self): expected_header = ['head1', 'head2', 'head2:colour', 'head3', 'head3:colour', 'head4', 'head5', 'head5:colour'] expected_matrix = [ - ['yes', 'yes', '#1f78b4', 'yes_nonunique', '#a6cee3', 'yes', 'no', '#33a02c'], - ['yes', 'yes_nonunique', '#a6cee3', 'no', '#33a02c', 'yes', 'NA', '#b2df8a'], - ['yes', 'no', '#33a02c', 'NA', '#b2df8a', 'yes', 'yes', '#1f78b4'], - ['yes', 'NA', '#b2df8a', 'yes', '#1f78b4', 'yes', 'yes_nonunique', '#a6cee3'], + ['yes', 'yes', '#33a02c', 'yes_nonunique', '#b2df8a', 'yes', 'no', '#fb9a99'], + ['yes', 'yes_nonunique', '#b2df8a', 'no', '#fb9a99', 'yes', 'NA', '#d3d3d3'], + ['yes', 'no', '#fb9a99', 'NA', '#d3d3d3', 'yes', 'yes', '#33a02c'], + ['yes', 'NA', '#d3d3d3', 'yes', '#33a02c', 'yes', 'yes_nonunique', '#b2df8a'] ] got_header, got_matrix = summary.Summary._add_phandango_colour_columns(header, matrix) self.assertEqual(expected_header, got_header) From 65f69f071144f5338415226662ddcc62c83d4798 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:21:12 +0100 Subject: [PATCH 13/40] add only_clusters option --- ariba/summary_sample.py | 5 ++++- ariba/tests/summary_sample_test.py | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ariba/summary_sample.py b/ariba/summary_sample.py index c5349f41..ea9575ae 100644 --- a/ariba/summary_sample.py +++ b/ariba/summary_sample.py @@ -15,7 +15,7 @@ def __eq__(self, other): @staticmethod - def _load_file(filename, min_pc_id): + def _load_file(filename, min_pc_id, only_clusters=None): f = pyfastaq.utils.open_file_read(filename) clusters = {} @@ -28,6 +28,9 @@ def _load_file(filename, min_pc_id): data_dict = summary_cluster.SummaryCluster.line2dict(line) cluster = data_dict['cluster'] + if only_clusters is not None and cluster not in only_clusters: + continue + if cluster not in clusters: clusters[cluster] = summary_cluster.SummaryCluster(min_pc_id=min_pc_id) clusters[cluster].add_data_dict(data_dict) diff --git a/ariba/tests/summary_sample_test.py b/ariba/tests/summary_sample_test.py index 32156c26..f097883a 100644 --- a/ariba/tests/summary_sample_test.py +++ b/ariba/tests/summary_sample_test.py @@ -33,6 +33,9 @@ def test_load_file(self): got = summary_sample.SummarySample._load_file(infile, 90) self.assertEqual(expected, got) + got = summary_sample.SummarySample._load_file(infile, 90, only_clusters={'cluster.n'}) + expected = {'cluster.n': cluster1} + self.assertEqual(expected, got) def test_column_summary_data(self): '''Test _column_summary_data''' From d2ffaeb00554d9b56eb3c4c74121b905e8e8b9ab Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:47:25 +0100 Subject: [PATCH 14/40] add only_clusters option --- ariba/summary_sample.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ariba/summary_sample.py b/ariba/summary_sample.py index ea9575ae..5a5b397b 100644 --- a/ariba/summary_sample.py +++ b/ariba/summary_sample.py @@ -4,9 +4,10 @@ class Error (Exception): pass class SummarySample: - def __init__(self, report_tsv, min_pc_id=90): + def __init__(self, report_tsv, min_pc_id=90, only_clusters=None): self.report_tsv = report_tsv self.min_pc_id = min_pc_id + self.only_clusters = only_clusters self.clusters = {} @@ -61,7 +62,7 @@ def _variant_column_names_tuples_and_het_snps(self): def run(self): - self.clusters = self._load_file(self.report_tsv, self.min_pc_id) + self.clusters = self._load_file(self.report_tsv, self.min_pc_id, only_clusters=self.only_clusters) self.column_summary_data = self._column_summary_data() 
self.variant_column_names_tuples, self.het_snps = self._variant_column_names_tuples_and_het_snps() self.var_groups = self._var_groups() From 5f86cee559f904aedfc6647636e35306416ca637 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:48:06 +0100 Subject: [PATCH 15/40] Add only_clusters option --- ariba/summary.py | 4 ++-- ariba/tests/summary_test.py | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/ariba/summary.py b/ariba/summary.py index a829d1ea..5caf97b5 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -82,10 +82,10 @@ def _check_files_exist(self): @classmethod - def _load_input_files(cls, filenames, min_id, verbose=False): + def _load_input_files(cls, filenames, min_id, verbose=False, only_clusters=None): samples = {} for filename in filenames: - samples[filename] = summary_sample.SummarySample(filename, min_pc_id=min_id) + samples[filename] = summary_sample.SummarySample(filename, min_pc_id=min_id, only_clusters=only_clusters) samples[filename].run() if verbose: print('Loaded file', filename, flush=True) diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 6eb790e8..45f01dc1 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -84,6 +84,15 @@ def test_load_input_files(self): expected = {file1: sample1, file2: sample2} self.assertEqual(expected, got) + sample1 = summary_sample.SummarySample(file1, only_clusters={'noncoding1'}) + sample2 = summary_sample.SummarySample(file2, only_clusters={'noncoding1'}) + sample1.run() + sample2.run() + expected = {file1: sample1, file2: sample2} + got = summary.Summary._load_input_files([file1, file2], 90, only_clusters={'noncoding1'}) + self.assertEqual(expected, got) + + def test_get_all_cluster_names(self): '''Test _get_all_cluster_names''' From a78cd3a1c60d4c775c2a781ce14fb5ba507d9208 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:52:03 +0100 Subject: [PATCH 16/40] add only_clusters option --- ariba/summary.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ariba/summary.py b/ariba/summary.py index 5caf97b5..7c7774c9 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -22,6 +22,7 @@ def __init__( cluster_cols='assembled,match,ref_seq,pct_id,known_var,novel_var', variant_cols='groups,grouped,ungrouped,novel', make_phandango_tree=True, + only_clusters=None, verbose=False, ): if filenames is None and fofn is None: @@ -43,6 +44,7 @@ def __init__( self.min_id = min_id self.outprefix = outprefix self.make_phandango_tree = make_phandango_tree + self.only_clusters = only_clusters self.verbose = verbose @@ -396,7 +398,7 @@ def run(self): if self.verbose: print('Loading input files...', flush=True) self._check_files_exist() - self.samples = self._load_input_files(self.filenames, self.min_id, verbose=self.verbose) + self.samples = self._load_input_files(self.filenames, self.min_id, verbose=self.verbose, only_clusters=self.only_clusters) if self.verbose: print('Generating output rows', flush=True) self.rows = self._gather_output_rows() From 898a6337f40e6611675330a09d345bbf00175c29 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 10 Aug 2016 22:58:50 +0100 Subject: [PATCH 17/40] Add only_cluster option --- ariba/tasks/summary.py | 1 + scripts/ariba | 1 + 2 files changed, 2 insertions(+) diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py index ec1cd879..c674df3a 100644 --- a/ariba/tasks/summary.py +++ b/ariba/tasks/summary.py @@ -97,6 +97,7 @@ def run(options): cluster_cols=options.cluster_cols, 
variant_cols=options.var_cols, make_phandango_tree=(not options.no_tree), + only_clusters=None if options.only_cluster is None else {options.only_cluster}, verbose=options.verbose ) s.run() diff --git a/scripts/ariba b/scripts/ariba index 63fc5d93..f0b91a86 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -170,6 +170,7 @@ subparser_summary.add_argument('--no_tree', action='store_true', help='Do not ma subparser_summary.add_argument('--row_filter', choices=['y', 'n'], default='y', help='Choose whether rows where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n') subparser_summary.add_argument('--var_cols', help='Comma separated list of variant columns to include. Choose from: groups, grouped, ungrouped, novel [none by default]', metavar='col1,col2,...', default='') subparser_summary.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT') +subparser_summary.add_argument('--only_cluster', help='Only report data for the given cluster name', metavar='Cluster_name') subparser_summary.add_argument('--verbose', action='store_true', help='Be verbose') subparser_summary.add_argument('outprefix', help='Prefix of output files') subparser_summary.add_argument('infiles', nargs='*', help='Files to be summarised') From d94178214c4529149cd766212b940af4fe46807f Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 10:49:40 +0100 Subject: [PATCH 18/40] New method _get_het_percent --- ariba/summary_cluster.py | 24 ++++++++++++++++++++++ ariba/tests/summary_cluster_test.py | 32 +++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py index 3b1daeb1..4def11cb 100644 --- a/ariba/summary_cluster.py +++ b/ariba/summary_cluster.py @@ -220,6 +220,7 @@ def _get_known_noncoding_het_snp(data_dict): return None + @staticmethod def _get_nonsynonymous_var(data_dict): '''if data_dict has a non synonymous variant, return string: @@ -302,3 +303,26 @@ def known_noncoding_het_snps(self): snps[snp_id] = {} snps[snp_id][snp_tuple[0]] = snp_tuple[1] return snps + + + @classmethod + def _get_het_percent(cls, data_dict): + if data_dict['gene'] == '1' or data_dict['ref_ctg_effect'] != 'SNP' or data_dict['smtls_alt_nt'] == '.' or ';' in data_dict['smtls_alt_nt']: + return None + else: + nucleotides = [data_dict['ctg_nt']] + data_dict['smtls_alt_nt'].split(',') + depths = data_dict['smtls_alt_depth'].split(',') + + if len(nucleotides) != len(depths): + raise Error('Mismatch in number of inferred nucleotides from ctg_nt, smtls_alt_nt, smtls_alt_depth columns. 
Cannot continue\n' + str(data_dict)) + + try: + var_nucleotide = data_dict['known_var_change'][-1] + depths = [int(x) for x in depths] + nuc_to_depth = dict(zip(nucleotides, depths)) + total_depth = sum(depths) + var_depth = nuc_to_depth.get(var_nucleotide, 0) + return round(100 * var_depth / total_depth, 1) + except: + return None + diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py index 44727544..e46440df 100644 --- a/ariba/tests/summary_cluster_test.py +++ b/ariba/tests/summary_cluster_test.py @@ -471,3 +471,35 @@ def test_known_noncoding_het_snps(self): } self.assertEqual(expected, got) + + def test_get_het_percent(self): + '''test _get_het_percent''' + #FIXME + lines = [ + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs' + ] + + expected = [None, 25.0, 75.0, 40.0] + assert len(lines) == len(expected) + + for i in range(len(lines)): + data_dict = summary_cluster.SummaryCluster.line2dict(lines[i]) + got = summary_cluster.SummaryCluster._get_het_percent(data_dict) + self.assertEqual(expected[i], got) + + + def test_get_nonsynon_variant_data(self): + '''test _get_nonsynon_variant_data''' + #FIXME + pass + + + def test_get_all_nonsynon_variants(self): + '''test _get_all_nonsynon_variants''' + #FIXME + pass + + From 09c393e611f69d59651facfbe2b355138ebc95f3 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 11:03:04 +0100 Subject: [PATCH 19/40] New method _get_nonsynon_variant_data --- ariba/summary_cluster.py | 28 ++++++++++++++++++++++++++++ ariba/tests/summary_cluster_test.py | 25 +++++++++++++++++-------- 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py index 4def11cb..4027e38e 100644 --- a/ariba/summary_cluster.py +++ b/ariba/summary_cluster.py @@ -326,3 +326,31 @@ def _get_het_percent(cls, data_dict): except: return None + + @classmethod + def _get_nonsynon_variant_data(cls, data_dict): + if not SummaryCluster._has_nonsynonymous(data_dict): + return None + + if data_dict['known_var_change'] == data_dict['ref_ctg_change'] == '.' == data_dict['ref_ctg_effect']: + raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change, ref_ctg_change, ref_ctg_effect all equal to ".", but has a non synonymous change. Something is inconsistent. Cannot continue') + elif '.' not in [data_dict['known_var_change'], data_dict['ref_ctg_change']] and \ + data_dict['known_var_change'] != data_dict['ref_ctg_change']: + raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change != ref_ctg_change. 
Cannot continue') + + var_data = { + 'known': data_dict['known_var'] == '1', + 'var_group': data_dict['var_group'], + 'coding': data_dict['gene'] == '1' + } + + if data_dict['known_var'] == '1' and data_dict['known_var_change'] != '.': + var_data['var_string'] = data_dict['known_var_change'] + elif data_dict['ref_ctg_change'] != '.': + var_data['var_string'] = data_dict['ref_ctg_change'] + else: + var_data['var_string'] = data_dict['ref_ctg_effect'] + + var_data['het_percent'] = SummaryCluster._get_het_percent(data_dict) + return var_data + diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py index e46440df..502c770c 100644 --- a/ariba/tests/summary_cluster_test.py +++ b/ariba/tests/summary_cluster_test.py @@ -474,7 +474,6 @@ def test_known_noncoding_het_snps(self): def test_get_het_percent(self): '''test _get_het_percent''' - #FIXME lines = [ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs', @@ -493,13 +492,23 @@ def test_get_het_percent(self): def test_get_nonsynon_variant_data(self): '''test _get_nonsynon_variant_data''' - #FIXME - pass - + lines = [ + 'ref1\t1\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tI14L\t1\tI14L\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:I14L:.:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA,G\t20,10,10\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + ] - def test_get_all_nonsynon_variants(self): - '''test _get_all_nonsynon_variants''' - #FIXME - pass + expected = [ + {'coding': True, 'known': True, 'var_string': 'I14L', 'var_group': '.', 'het_percent': None}, + {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': None}, + {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 25.0}, + {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 50.0}, + ] + assert len(lines) == len(expected) + for i in range(len(lines)): + data_dict = summary_cluster.SummaryCluster.line2dict(lines[i]) + got = summary_cluster.SummaryCluster._get_nonsynon_variant_data(data_dict) + self.assertEqual(expected[i], got) From d9ccdd04cc7bc227ac02dfe026ddbc080ba7d6f6 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 11:19:28 +0100 Subject: [PATCH 20/40] New class SummaryClusterVariant --- ariba/__init__.py | 1 + ariba/summary_cluster.py | 50 --------------- ariba/summary_cluster_variant.py | 66 ++++++++++++++++++++ ariba/tests/summary_cluster_test.py | 41 ------------- ariba/tests/summary_cluster_variant_test.py | 67 +++++++++++++++++++++ 5 files changed, 134 insertions(+), 91 deletions(-) create mode 100644 ariba/summary_cluster_variant.py create mode 100644 ariba/tests/summary_cluster_variant_test.py diff --git a/ariba/__init__.py 
b/ariba/__init__.py index 0c36b1a4..1d589dc3 100644 --- a/ariba/__init__.py +++ b/ariba/__init__.py @@ -39,6 +39,7 @@ 'sequence_variant', 'summary', 'summary_cluster', + 'summary_cluster_variant', 'summary_sample', 'tasks', 'versions', diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py index 4027e38e..b0a6a03e 100644 --- a/ariba/summary_cluster.py +++ b/ariba/summary_cluster.py @@ -304,53 +304,3 @@ def known_noncoding_het_snps(self): snps[snp_id][snp_tuple[0]] = snp_tuple[1] return snps - - @classmethod - def _get_het_percent(cls, data_dict): - if data_dict['gene'] == '1' or data_dict['ref_ctg_effect'] != 'SNP' or data_dict['smtls_alt_nt'] == '.' or ';' in data_dict['smtls_alt_nt']: - return None - else: - nucleotides = [data_dict['ctg_nt']] + data_dict['smtls_alt_nt'].split(',') - depths = data_dict['smtls_alt_depth'].split(',') - - if len(nucleotides) != len(depths): - raise Error('Mismatch in number of inferred nucleotides from ctg_nt, smtls_alt_nt, smtls_alt_depth columns. Cannot continue\n' + str(data_dict)) - - try: - var_nucleotide = data_dict['known_var_change'][-1] - depths = [int(x) for x in depths] - nuc_to_depth = dict(zip(nucleotides, depths)) - total_depth = sum(depths) - var_depth = nuc_to_depth.get(var_nucleotide, 0) - return round(100 * var_depth / total_depth, 1) - except: - return None - - - @classmethod - def _get_nonsynon_variant_data(cls, data_dict): - if not SummaryCluster._has_nonsynonymous(data_dict): - return None - - if data_dict['known_var_change'] == data_dict['ref_ctg_change'] == '.' == data_dict['ref_ctg_effect']: - raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change, ref_ctg_change, ref_ctg_effect all equal to ".", but has a non synonymous change. Something is inconsistent. Cannot continue') - elif '.' not in [data_dict['known_var_change'], data_dict['ref_ctg_change']] and \ - data_dict['known_var_change'] != data_dict['ref_ctg_change']: - raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change != ref_ctg_change. Cannot continue') - - var_data = { - 'known': data_dict['known_var'] == '1', - 'var_group': data_dict['var_group'], - 'coding': data_dict['gene'] == '1' - } - - if data_dict['known_var'] == '1' and data_dict['known_var_change'] != '.': - var_data['var_string'] = data_dict['known_var_change'] - elif data_dict['ref_ctg_change'] != '.': - var_data['var_string'] = data_dict['ref_ctg_change'] - else: - var_data['var_string'] = data_dict['ref_ctg_effect'] - - var_data['het_percent'] = SummaryCluster._get_het_percent(data_dict) - return var_data - diff --git a/ariba/summary_cluster_variant.py b/ariba/summary_cluster_variant.py new file mode 100644 index 00000000..4043a922 --- /dev/null +++ b/ariba/summary_cluster_variant.py @@ -0,0 +1,66 @@ +from ariba import flag, report + +class Error (Exception): pass + +class SummaryClusterVariant: + def __init__(self, data_dict): + self._get_nonsynon_variant_data(data_dict) + + + @classmethod + def _has_nonsynonymous(cls, data_dict): + return data_dict['ref_ctg_effect'] != 'SYN' and \ + ( + data_dict['has_known_var'] == '1' or \ + (data_dict['known_var'] != '1' and (data_dict['ref_ctg_change'] != '.' or data_dict['ref_ctg_effect'] != '.')) + ) + + + @classmethod + def _get_het_percent(cls, data_dict): + if data_dict['gene'] == '1' or data_dict['ref_ctg_effect'] != 'SNP' or data_dict['smtls_alt_nt'] == '.' 
or ';' in data_dict['smtls_alt_nt']: + return None + else: + nucleotides = [data_dict['ctg_nt']] + data_dict['smtls_alt_nt'].split(',') + depths = data_dict['smtls_alt_depth'].split(',') + + if len(nucleotides) != len(depths): + raise Error('Mismatch in number of inferred nucleotides from ctg_nt, smtls_alt_nt, smtls_alt_depth columns. Cannot continue\n' + str(data_dict)) + + try: + var_nucleotide = data_dict['known_var_change'][-1] + depths = [int(x) for x in depths] + nuc_to_depth = dict(zip(nucleotides, depths)) + total_depth = sum(depths) + var_depth = nuc_to_depth.get(var_nucleotide, 0) + return round(100 * var_depth / total_depth, 1) + except: + return None + + + def _get_nonsynon_variant_data(self, data_dict): + if not SummaryClusterVariant._has_nonsynonymous(data_dict): + self.has_nonsynon = False + return + + self.has_nonsynon = True + + if data_dict['known_var_change'] == data_dict['ref_ctg_change'] == '.' == data_dict['ref_ctg_effect']: + raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change, ref_ctg_change, ref_ctg_effect all equal to ".", but has a non synonymous change. Something is inconsistent. Cannot continue') + elif '.' not in [data_dict['known_var_change'], data_dict['ref_ctg_change']] and \ + data_dict['known_var_change'] != data_dict['ref_ctg_change']: + raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change != ref_ctg_change. Cannot continue') + + self.known = data_dict['known_var'] == '1' + self.var_group = data_dict['var_group'] + self.coding = data_dict['gene'] == '1' + + if data_dict['known_var'] == '1' and data_dict['known_var_change'] != '.': + self.var_string = data_dict['known_var_change'] + elif data_dict['ref_ctg_change'] != '.': + self.var_string = data_dict['ref_ctg_change'] + else: + self.var_string = data_dict['ref_ctg_effect'] + + self.het_percent = SummaryClusterVariant._get_het_percent(data_dict) + diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py index 502c770c..44727544 100644 --- a/ariba/tests/summary_cluster_test.py +++ b/ariba/tests/summary_cluster_test.py @@ -471,44 +471,3 @@ def test_known_noncoding_het_snps(self): } self.assertEqual(expected, got) - - def test_get_het_percent(self): - '''test _get_het_percent''' - lines = [ - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs' - ] - - expected = [None, 25.0, 75.0, 40.0] - assert len(lines) == len(expected) - - for i in range(len(lines)): - data_dict = summary_cluster.SummaryCluster.line2dict(lines[i]) - got = summary_cluster.SummaryCluster._get_het_percent(data_dict) - self.assertEqual(expected[i], got) - - - def test_get_nonsynon_variant_data(self): - '''test _get_nonsynon_variant_data''' - lines = [ - 
'ref1\t1\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tI14L\t1\tI14L\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:I14L:.:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', - 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA,G\t20,10,10\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', - ] - - expected = [ - {'coding': True, 'known': True, 'var_string': 'I14L', 'var_group': '.', 'het_percent': None}, - {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': None}, - {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 25.0}, - {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 50.0}, - ] - assert len(lines) == len(expected) - - for i in range(len(lines)): - data_dict = summary_cluster.SummaryCluster.line2dict(lines[i]) - got = summary_cluster.SummaryCluster._get_nonsynon_variant_data(data_dict) - self.assertEqual(expected[i], got) - diff --git a/ariba/tests/summary_cluster_variant_test.py b/ariba/tests/summary_cluster_variant_test.py new file mode 100644 index 00000000..ec099422 --- /dev/null +++ b/ariba/tests/summary_cluster_variant_test.py @@ -0,0 +1,67 @@ +import unittest +import os +from ariba import summary_cluster, summary_cluster_variant + + +class TestSummaryClusterVariant(unittest.TestCase): + def test_has_nonsynonymous(self): + '''Test _has_nonsynonymous''' + lines = [ + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.', + 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.' 
+ ] + + dicts = [summary_cluster.SummaryCluster.line2dict(x) for x in lines] + expected = [False, True, False, True, True, True] + assert len(dicts) == len(expected) + + for i in range(len(dicts)): + self.assertEqual(expected[i], summary_cluster_variant.SummaryClusterVariant._has_nonsynonymous(dicts[i])) + + + def test_get_het_percent(self): + '''test _get_het_percent''' + lines = [ + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs' + ] + + expected = [None, 25.0, 75.0, 40.0] + assert len(lines) == len(expected) + + for i in range(len(lines)): + data_dict = summary_cluster.SummaryCluster.line2dict(lines[i]) + got = summary_cluster_variant.SummaryClusterVariant._get_het_percent(data_dict) + self.assertEqual(expected[i], got) + + + def test_init(self): + '''test __init__''' + lines = [ + 'ref1\t1\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tI14L\t1\tI14L\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:I14L:.:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA,G\t20,10,10\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + ] + + expected = [ + {'coding': True, 'known': True, 'var_string': 'I14L', 'var_group': '.', 'het_percent': None}, + {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': None}, + {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 25.0}, + {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 50.0}, + ] + assert len(lines) == len(expected) + + for i in range(len(lines)): + data_dict = summary_cluster.SummaryCluster.line2dict(lines[i]) + cluster_var = summary_cluster_variant.SummaryClusterVariant(data_dict) + for key in expected[i]: + got_value = eval('cluster_var.' 
+ key) + self.assertEqual(expected[i][key], got_value) + From 3ef9b21d91e01068ad91be13290c3906f158d283 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 11:53:57 +0100 Subject: [PATCH 21/40] Make hashable and add __str__ method --- ariba/summary_cluster_variant.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ariba/summary_cluster_variant.py b/ariba/summary_cluster_variant.py index 4043a922..b9c7c6db 100644 --- a/ariba/summary_cluster_variant.py +++ b/ariba/summary_cluster_variant.py @@ -7,6 +7,21 @@ def __init__(self, data_dict): self._get_nonsynon_variant_data(data_dict) + def __eq__(self, other): + return type(other) is type(self) and self.__dict__ == other.__dict__ + + + def __hash__(self): + return hash(tuple([self.__dict__[x] for x in sorted(self.__dict__.keys())])) + + + def __str__(self): + if self.has_nonsynon: + return ', '.join((str(self.known), self.var_group, str(self.coding), self.var_string, str(self.het_percent))) + else: + return 'None' + + @classmethod def _has_nonsynonymous(cls, data_dict): return data_dict['ref_ctg_effect'] != 'SYN' and \ From ba94cc105773d1ddbfdec4a259018da61be98863 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 11:54:20 +0100 Subject: [PATCH 22/40] New method _get_all_nonsynon_variants_set --- ariba/summary_cluster.py | 14 +++++++++++++- ariba/tests/summary_cluster_test.py | 19 ++++++++++++++++++- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py index b0a6a03e..8bf07f4a 100644 --- a/ariba/summary_cluster.py +++ b/ariba/summary_cluster.py @@ -1,4 +1,4 @@ -from ariba import flag, report +from ariba import flag, report, summary_cluster_variant class Error (Exception): pass @@ -304,3 +304,15 @@ def known_noncoding_het_snps(self): snps[snp_id][snp_tuple[0]] = snp_tuple[1] return snps + + @classmethod + def _get_all_nonsynon_variants_set(cls, data_dicts): + variants = set() + + for data_dict in data_dicts: + cluster_var = summary_cluster_variant.SummaryClusterVariant(data_dict) + if cluster_var.has_nonsynon: + variants.add(cluster_var) + + return variants + diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py index 44727544..d3bfffa6 100644 --- a/ariba/tests/summary_cluster_test.py +++ b/ariba/tests/summary_cluster_test.py @@ -1,6 +1,6 @@ import unittest import os -from ariba import flag, summary_cluster +from ariba import flag, summary_cluster, summary_cluster_variant modules_dir = os.path.dirname(os.path.abspath(summary_cluster.__file__)) data_dir = os.path.join(modules_dir, 'tests', 'data') @@ -471,3 +471,20 @@ def test_known_noncoding_het_snps(self): } self.assertEqual(expected, got) + + def test_get_all_nonsynon_variants_set(self): + '''test _get_all_nonsynon_variants_set''' + lines = [ + 'ref1\t0\t0\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text', + 'ref1\t1\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tI14L\t1\tI14L\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:I14L:.:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA,G\t20,10,10\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + ] + + data_dicts = 
[summary_cluster.SummaryCluster.line2dict(x) for x in lines] + + cluster_vars = [summary_cluster_variant.SummaryClusterVariant(x) for x in data_dicts] + expected = {x for x in cluster_vars if x.has_nonsynon} + got = summary_cluster.SummaryCluster._get_all_nonsynon_variants_set(data_dicts) + self.assertEqual(expected, got) + From b63b57496ff2d2333d85ddbea458f5a0e2b89d1c Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 12:52:34 +0100 Subject: [PATCH 23/40] New method gather_data --- ariba/summary_cluster.py | 5 +++++ ariba/tests/summary_cluster_test.py | 30 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py index 8bf07f4a..efc4cf8c 100644 --- a/ariba/summary_cluster.py +++ b/ariba/summary_cluster.py @@ -316,3 +316,8 @@ def _get_all_nonsynon_variants_set(cls, data_dicts): return variants + + def gather_data(self): + self.summary = self.column_summary_data() + self.variants = self._get_all_nonsynon_variants_set(self.data) + diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py index d3bfffa6..f5022fce 100644 --- a/ariba/tests/summary_cluster_test.py +++ b/ariba/tests/summary_cluster_test.py @@ -488,3 +488,33 @@ def test_get_all_nonsynon_variants_set(self): got = summary_cluster.SummaryCluster._get_all_nonsynon_variants_set(data_dicts) self.assertEqual(expected, got) + + def test_gather_data(self): + '''test gather_data''' + lines = [ + 'ref1\t0\t0\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text', + 'ref1\t1\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tI14L\t1\tI14L\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:I14L:.:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA,G\t20,10,10\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', + ] + + data_dicts = [summary_cluster.SummaryCluster.line2dict(x) for x in lines] + cluster = summary_cluster.SummaryCluster() + for data_dict in data_dicts: + cluster.add_data_dict(data_dict) + + cluster.gather_data() + expected_summary = { + 'assembled': 'yes', + 'match': 'yes', + 'ref_seq': 'ref1', + 'pct_id': '98.33', + 'known_var': 'yes', + 'novel_var': 'no', + } + self.assertEqual(expected_summary, cluster.summary) + + cluster_vars = [summary_cluster_variant.SummaryClusterVariant(x) for x in data_dicts] + expected_variants = {x for x in cluster_vars if x.has_nonsynon} + self.assertEqual(expected_variants, cluster.variants) + From 2e3713f2fed38f0eec91770b7821069c7257a0cf Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 12:58:03 +0100 Subject: [PATCH 24/40] Use new summary_cluster that stores variant info --- ariba/summary_sample.py | 4 ++++ ariba/tests/summary_sample_test.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/ariba/summary_sample.py b/ariba/summary_sample.py index 5a5b397b..df4dbaea 100644 --- a/ariba/summary_sample.py +++ b/ariba/summary_sample.py @@ -37,6 +37,10 @@ def _load_file(filename, min_pc_id, only_clusters=None): clusters[cluster].add_data_dict(data_dict) pyfastaq.utils.close(f) + + for cluster_name, cluster in clusters.items(): + cluster.gather_data() + return clusters diff --git 
a/ariba/tests/summary_sample_test.py b/ariba/tests/summary_sample_test.py index f097883a..67ca2bc9 100644 --- a/ariba/tests/summary_sample_test.py +++ b/ariba/tests/summary_sample_test.py @@ -18,11 +18,14 @@ def test_load_file(self): cluster1.add_data_dict(dicts[0]) cluster1.add_data_dict(dicts[1]) cluster1.add_data_dict(dicts[2]) + cluster1.gather_data() cluster2 = summary_cluster.SummaryCluster() cluster2.add_data_dict(dicts[3]) cluster2.add_data_dict(dicts[4]) + cluster2.gather_data() cluster3 = summary_cluster.SummaryCluster() cluster3.add_data_dict(dicts[5]) + cluster3.gather_data() expected = { 'cluster.n': cluster1, From 09b4d7b8072a9dd34f3fc180d88b6e611fbea7d7 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 19:08:49 +0100 Subject: [PATCH 25/40] Do not use when not present (usually unassembled) --- ariba/summary_sample.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ariba/summary_sample.py b/ariba/summary_sample.py index df4dbaea..bc1ea25f 100644 --- a/ariba/summary_sample.py +++ b/ariba/summary_sample.py @@ -38,8 +38,15 @@ def _load_file(filename, min_pc_id, only_clusters=None): pyfastaq.utils.close(f) + to_delete = set() + for cluster_name, cluster in clusters.items(): cluster.gather_data() + if cluster.name is None: + to_delete.add(cluster_name) + + for name in to_delete: + del clusters[name] return clusters From 4215634b3853c325e666fcdbd5b8881106272c48 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 19:10:54 +0100 Subject: [PATCH 26/40] New method _gather_unfiltered_output_data --- ariba/summary.py | 49 +++++++++++ ariba/tests/summary_test.py | 160 ++++++++++++++++++++++++++++++++++++ 2 files changed, 209 insertions(+) diff --git a/ariba/summary.py b/ariba/summary.py index 7c7774c9..bdad93a3 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -23,6 +23,8 @@ def __init__( variant_cols='groups,grouped,ungrouped,novel', make_phandango_tree=True, only_clusters=None, + show_var_groups=False, + show_vars=False, verbose=False, ): if filenames is None and fofn is None: @@ -45,6 +47,8 @@ def __init__( self.outprefix = outprefix self.make_phandango_tree = make_phandango_tree self.only_clusters = only_clusters + self.show_var_groups = show_var_groups + self.show_vars = show_vars self.verbose = verbose @@ -462,3 +466,48 @@ def run(self): if self.verbose: print('Finished', flush=True) + + + def _gather_unfiltered_output_data(self): + self.all_potential_columns = {} + self.all_data = {} + + for filename in sorted(self.samples): + self.all_data[filename] = {} + for cluster in self.samples[filename].clusters.values(): + self.all_data[filename][cluster.name] = {} + if cluster.name not in self.all_potential_columns: + self.all_potential_columns[cluster.name] = {'summary' : set(), 'groups': set(), 'vars': set()} + + this_cluster_dict = {'summary': copy.copy(cluster.summary), 'groups': {}, 'vars': {}} + seen_groups = {} + + for variant in cluster.variants: + if self.show_vars: + this_cluster_dict['vars'][variant.var_string] = 'yes' if variant.het_percent is None else 'het' + if variant.het_percent is not None: + this_cluster_dict['vars'][variant.var_string + '.%'] = variant.het_percent + + if self.show_var_groups and variant.var_group != '.': + if variant.var_group not in seen_groups: + seen_groups[variant.var_group] = {'yes': 0, 'het': 0} + + if variant.het_percent is None: + seen_groups[variant.var_group]['yes'] += 1 + this_cluster_dict['groups'][variant.var_group] = 'yes' + else: + seen_groups[variant.var_group]['het'] += 1 + 
this_cluster_dict['groups'][variant.var_group] = 'het' + this_cluster_dict['groups'][variant.var_group + '.%'] = variant.het_percent + + for group, d in seen_groups.items(): + if d['het'] > 0 and d['het'] + d['yes'] > 1: + this_cluster_dict['groups'][group] = 'yes_multi_het' + this_cluster_dict['groups'][group + '.%'] = 'NA' + + for x in this_cluster_dict: + self.all_potential_columns[cluster.name][x].update(set(this_cluster_dict[x].keys())) + + self.all_data[filename][cluster.name] = this_cluster_dict + + return self.all_data, self.all_potential_columns diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 45f01dc1..b6a4d007 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -262,6 +262,166 @@ def test_gather_output_rows(self): self.assertEqual(expected, got) + def test_gather_unfiltered_output_data(self): + '''test gather_output_rows_new''' + infiles = [ + os.path.join(data_dir, 'summary_gather_unfiltered_output_data.in.1.tsv'), + os.path.join(data_dir, 'summary_gather_unfiltered_output_data.in.2.tsv') + ] + s = summary.Summary('out', filenames=infiles, variant_cols=None) + s.samples = summary.Summary._load_input_files(infiles, 90) + got_all, got_potential_cols = s._gather_unfiltered_output_data() + + expected_all = { + infiles[0]: { + 'noncoding1': { + 'summary': { + 'assembled': 'yes', + 'known_var': 'yes', + 'match': 'yes', + 'novel_var': 'no', + 'pct_id': '98.33', + 'ref_seq': 'noncoding_ref1' + }, + 'groups': {}, + 'vars': {}, + }, + 'noncoding2': { + 'summary': { + 'assembled': 'yes', + 'known_var': 'yes', + 'match': 'yes', + 'novel_var': 'no', + 'pct_id': '98.33', + 'ref_seq': 'noncoding_ref2' + }, + 'groups': {}, + 'vars': {}, + }, + 'presence_absence1': { + 'summary': { + 'assembled': 'yes', + 'known_var': 'no', + 'match': 'yes', + 'novel_var': 'yes', + 'pct_id': '98.96', + 'ref_seq': 'presence_absence_ref1' + }, + 'groups': {}, + 'vars': {}, + } + }, + infiles[1]: { + 'noncoding1': { + 'summary': {'assembled': 'yes', + 'known_var': 'yes', + 'match': 'yes', + 'novel_var': 'no', + 'pct_id': '98.33', + 'ref_seq': 'noncoding_ref1' + }, + 'groups': {}, + 'vars': {}, + }, + 'noncoding2': { + 'summary': { + 'assembled': 'yes', + 'known_var': 'yes', + 'match': 'yes', + 'novel_var': 'no', + 'pct_id': '98.33', + 'ref_seq': 'noncoding_ref2' + }, + 'groups': {}, + 'vars': {}, + }, + 'presence_absence1': { + 'summary': { + 'assembled': 'yes', + 'known_var': 'no', + 'match': 'yes', + 'novel_var': 'yes', + 'pct_id': '98.96', + 'ref_seq': 'presence_absence1' + }, + 'groups': {}, + 'vars': {} + } + } + } + + expected_potential_cols = { + 'noncoding1': { + 'summary': { + 'assembled', + 'known_var', + 'match', + 'novel_var', + 'pct_id', + 'ref_seq' + }, + 'groups': set(), + 'vars': set() + }, + 'noncoding2': { + 'summary': { + 'assembled', + 'known_var', + 'match', + 'novel_var', + 'pct_id', + 'ref_seq' + }, + 'groups': set(), + 'vars': set() + }, + 'presence_absence1': { + 'summary': { + 'assembled', + 'known_var', + 'match', + 'novel_var', + 'pct_id', + 'ref_seq' + }, + 'groups': set(), + 'vars': set() + } + } + + self.assertEqual(expected_potential_cols, got_potential_cols) + self.assertEqual(expected_all, got_all) + + expected_potential_cols['noncoding1']['groups'] = {'id3', 'id1', 'id1.%'} + expected_potential_cols['noncoding2']['groups'] = {'id2.%', 'id2'} + expected_all[infiles[0]]['noncoding1']['groups'] = {'id1': 'yes'} + expected_all[infiles[0]]['noncoding2']['groups'] = {'id2': 'yes_multi_het', 'id2.%': 'NA'} + 
expected_all[infiles[1]]['noncoding1']['groups'] = {'id1': 'het', 'id1.%': 80.0, 'id3': 'yes'} + expected_all[infiles[1]]['noncoding2']['groups'] = {'id2': 'het', 'id2.%': 40.0} + s = summary.Summary('out', filenames=infiles, variant_cols=None, show_var_groups=True) + s.samples = summary.Summary._load_input_files(infiles, 90) + got_all, got_potential_cols = s._gather_unfiltered_output_data() + self.assertEqual(expected_potential_cols, got_potential_cols) + self.assertEqual(expected_all, got_all) + + expected_potential_cols['noncoding1']['vars'] = {'A14T.%', 'A6G', 'A14T'} + expected_potential_cols['noncoding2']['vars'] = {'A52T', 'A52T.%', 'A42T'} + expected_potential_cols['presence_absence1']['vars'] = {'A10V'} + + expected_all[infiles[0]]['noncoding1']['vars'] = {'A14T': 'yes'} + expected_all[infiles[0]]['noncoding2']['vars'] = {'A42T': 'yes', 'A52T': 'het', 'A52T.%': 40.0} + expected_all[infiles[0]]['presence_absence1']['vars'] = {'A10V': 'yes'} + expected_all[infiles[1]]['noncoding1']['vars'] = {'A14T': 'het', 'A14T.%': 80.0, 'A6G': 'yes'} + expected_all[infiles[1]]['noncoding2']['vars'] = {'A52T': 'het', 'A52T.%': 40.0} + expected_all[infiles[1]]['presence_absence1']['vars'] = {'A10V': 'yes'} + s = summary.Summary('out', filenames=infiles, variant_cols=None, show_var_groups=True, show_vars=True) + s.samples = summary.Summary._load_input_files(infiles, 90) + got_all, got_potential_cols = s._gather_unfiltered_output_data() + self.maxDiff = None + self.assertEqual(expected_potential_cols, got_potential_cols) + self.assertEqual(expected_all, got_all) + + def test_to_matrix(self): '''Test _to_matrix''' rows = { From 2af126d6a5d4e83c1a240eea0d562967d24ae0d6 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 19:11:43 +0100 Subject: [PATCH 27/40] Remove maxDiff=None --- ariba/tests/summary_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index b6a4d007..74a2f77d 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -417,7 +417,6 @@ def test_gather_unfiltered_output_data(self): s = summary.Summary('out', filenames=infiles, variant_cols=None, show_var_groups=True, show_vars=True) s.samples = summary.Summary._load_input_files(infiles, 90) got_all, got_potential_cols = s._gather_unfiltered_output_data() - self.maxDiff = None self.assertEqual(expected_potential_cols, got_potential_cols) self.assertEqual(expected_all, got_all) From edd503e543dd369a398c05f3cf0f2bd4a41cf370 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 19:36:18 +0100 Subject: [PATCH 28/40] Add test files for summary_gather_unfiltered_output_data --- .../data/summary_gather_unfiltered_output_data.in.1.tsv | 5 +++++ .../data/summary_gather_unfiltered_output_data.in.2.tsv | 6 ++++++ 2 files changed, 11 insertions(+) create mode 100644 ariba/tests/data/summary_gather_unfiltered_output_data.in.1.tsv create mode 100644 ariba/tests/data/summary_gather_unfiltered_output_data.in.2.tsv diff --git a/ariba/tests/data/summary_gather_unfiltered_output_data.in.1.tsv b/ariba/tests/data/summary_gather_unfiltered_output_data.in.1.tsv new file mode 100644 index 00000000..1957349c --- /dev/null +++ b/ariba/tests/data/summary_gather_unfiltered_output_data.in.1.tsv @@ -0,0 +1,5 @@ +#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt 
smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text +noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding_ref1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A42T 1 A42T SNP 42 42 A 84 84 T 17 . 17 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A52T 1 A52T SNP 42 42 A 84 84 T 17 G 20,30 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence_ref1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence_ref1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_gather_unfiltered_output_data.in.2.tsv b/ariba/tests/data/summary_gather_unfiltered_output_data.in.2.tsv new file mode 100644 index 00000000..4a23ebc4 --- /dev/null +++ b/ariba/tests/data/summary_gather_unfiltered_output_data.in.2.tsv @@ -0,0 +1,6 @@ +#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text +noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id3:variant in ref and reads so should report generic description of noncoding1 +noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A52T 1 A52T SNP 42 42 A 84 84 T 17 G 20,30 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 +variants_only1 1 1 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . . 
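
Illustrative sketch (not part of the patch series): a minimal example of how the pieces added in patches 21 to 26 fit together, namely SummaryCluster.line2dict(), add_data_dict(), the new gather_data() method, and the hashable SummaryClusterVariant objects it collects. It assumes this branch of ariba is importable; the report line is copied verbatim from the tests above, and the printed values depend on that line's flag and depth fields.

    from ariba import summary_cluster

    # One report line in the same tab-separated format used by the tests above.
    line = 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs'

    cluster = summary_cluster.SummaryCluster()
    cluster.add_data_dict(summary_cluster.SummaryCluster.line2dict(line))

    # gather_data() (patch 23) fills in cluster.summary, the column_summary_data()
    # dict (keys: assembled, match, ref_seq, pct_id, known_var, novel_var), and
    # cluster.variants, a set of SummaryClusterVariant objects. The set works
    # because of the __eq__/__hash__ methods added in patch 21.
    cluster.gather_data()
    print(cluster.summary)
    for variant in cluster.variants:
        print(variant)  # __str__ added in patch 21
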
From 5e74a72eea347c39a3d21e8f9836038066066371 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 19:36:42 +0100 Subject: [PATCH 29/40] Do not return dicts --- ariba/summary.py | 1 - ariba/tests/summary_test.py | 22 +++++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/ariba/summary.py b/ariba/summary.py index bdad93a3..21bf2770 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -510,4 +510,3 @@ def _gather_unfiltered_output_data(self): self.all_data[filename][cluster.name] = this_cluster_dict - return self.all_data, self.all_potential_columns diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 74a2f77d..b9aefec1 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -268,9 +268,6 @@ def test_gather_unfiltered_output_data(self): os.path.join(data_dir, 'summary_gather_unfiltered_output_data.in.1.tsv'), os.path.join(data_dir, 'summary_gather_unfiltered_output_data.in.2.tsv') ] - s = summary.Summary('out', filenames=infiles, variant_cols=None) - s.samples = summary.Summary._load_input_files(infiles, 90) - got_all, got_potential_cols = s._gather_unfiltered_output_data() expected_all = { infiles[0]: { @@ -389,8 +386,11 @@ def test_gather_unfiltered_output_data(self): } } - self.assertEqual(expected_potential_cols, got_potential_cols) - self.assertEqual(expected_all, got_all) + s = summary.Summary('out', filenames=infiles, variant_cols=None) + s.samples = summary.Summary._load_input_files(infiles, 90) + s._gather_unfiltered_output_data() + self.assertEqual(expected_potential_cols, s.all_potential_columns) + self.assertEqual(expected_all, s.all_data) expected_potential_cols['noncoding1']['groups'] = {'id3', 'id1', 'id1.%'} expected_potential_cols['noncoding2']['groups'] = {'id2.%', 'id2'} @@ -400,9 +400,9 @@ def test_gather_unfiltered_output_data(self): expected_all[infiles[1]]['noncoding2']['groups'] = {'id2': 'het', 'id2.%': 40.0} s = summary.Summary('out', filenames=infiles, variant_cols=None, show_var_groups=True) s.samples = summary.Summary._load_input_files(infiles, 90) - got_all, got_potential_cols = s._gather_unfiltered_output_data() - self.assertEqual(expected_potential_cols, got_potential_cols) - self.assertEqual(expected_all, got_all) + s._gather_unfiltered_output_data() + self.assertEqual(expected_potential_cols, s.all_potential_columns) + self.assertEqual(expected_all, s.all_data) expected_potential_cols['noncoding1']['vars'] = {'A14T.%', 'A6G', 'A14T'} expected_potential_cols['noncoding2']['vars'] = {'A52T', 'A52T.%', 'A42T'} @@ -416,9 +416,9 @@ def test_gather_unfiltered_output_data(self): expected_all[infiles[1]]['presence_absence1']['vars'] = {'A10V': 'yes'} s = summary.Summary('out', filenames=infiles, variant_cols=None, show_var_groups=True, show_vars=True) s.samples = summary.Summary._load_input_files(infiles, 90) - got_all, got_potential_cols = s._gather_unfiltered_output_data() - self.assertEqual(expected_potential_cols, got_potential_cols) - self.assertEqual(expected_all, got_all) + s._gather_unfiltered_output_data() + self.assertEqual(expected_potential_cols, s.all_potential_columns) + self.assertEqual(expected_all, s.all_data) def test_to_matrix(self): From 4e4c24acd45ebed0c4ae184e4e85dc6750ac6da1 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 11 Aug 2016 23:31:47 +0100 Subject: [PATCH 30/40] Rewrite to_matrix --- ariba/summary.py | 57 +++---- ariba/tests/data/summary_to_matrix.1.tsv | 5 + ariba/tests/data/summary_to_matrix.2.tsv | 6 + ariba/tests/summary_test.py | 
186 ++++++++++++++--------- 4 files changed, 158 insertions(+), 96 deletions(-) create mode 100644 ariba/tests/data/summary_to_matrix.1.tsv create mode 100644 ariba/tests/data/summary_to_matrix.2.tsv diff --git a/ariba/summary.py b/ariba/summary.py index 21bf2770..2645b4bc 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -231,49 +231,54 @@ def _gather_output_rows(self): @classmethod - def _to_matrix(cls, filenames, rows, cluster_cols): - '''rows = output from _gather_output_rows(). - filenames = self.filenames - cluster_cols = self.cluster_columns''' + def _to_matrix(cls, filenames, all_data, all_potential_columns, cluster_cols): matrix = [] making_header_lines = True phandango_header = ['name'] - phandago_suffixes = {'assembled': ':o1', 'match': ':o1', 'ref_seq': ':o2', 'pct_id': ':c1', 'known_var': ':o1', 'novel_var': ':o1'} + phandango_suffixes = {'assembled': ':o1', 'match': ':o1', 'ref_seq': ':o2', 'pct_id': ':c1', 'known_var': ':o1', 'novel_var': ':o1'} ref_seq_counter = 2 csv_header = ['name'] - all_cluster_cols_in_order = ['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'] - all_cluster_cols_in_order_set = set(['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var']) - cluster_cols_in_order = [x for x in all_cluster_cols_in_order if cluster_cols[x]] + summary_cols_in_order = ['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'] + summary_cols_set = set(['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var']) + summary_cols_in_order = [x for x in summary_cols_in_order if cluster_cols[x]] for filename in filenames: - assert filename in rows line = [filename] - for cluster_name in sorted(rows[filename]): - for col in cluster_cols_in_order: + for cluster_name in sorted(all_potential_columns): + group_cols = sorted(list(all_potential_columns[cluster_name]['groups'])) + var_cols = sorted(list(all_potential_columns[cluster_name]['vars'])) + + for col in summary_cols_in_order + group_cols + var_cols: if making_header_lines: csv_header.append(cluster_name + '.' + col) if col == 'ref_seq': - phandago_suffixes[col] = ':o' + str(ref_seq_counter) + phandango_suffixes[col] = ':o' + str(ref_seq_counter) ref_seq_counter += 1 - phandango_header.append(cluster_name + '.' + col + phandago_suffixes[col]) - - line.append(rows[filename][cluster_name][col]) - - for col in sorted(rows[filename][cluster_name]): - if col in all_cluster_cols_in_order_set: - continue - - if making_header_lines: - csv_header.append(cluster_name + '.' + col) - suffix = ':c2' if col.endswith('.%') else ':o1' - phandango_header.append(cluster_name + '.' + col + suffix) - - line.append(rows[filename][cluster_name][col]) + phandango_header.append(cluster_name + '.' + col + phandango_suffixes[col]) + elif col in phandango_suffixes: + phandango_header.append(cluster_name + '.' + col + phandango_suffixes[col]) + elif col.endswith('.%'): + phandango_header.append(cluster_name + '.' + col + ':c2') + else: + phandango_header.append(cluster_name + '.' 
+ col + ':o1') + + for col_type in ['summary', 'groups', 'vars']: + if col in all_data[filename][cluster_name][col_type]: + line.append(all_data[filename][cluster_name][col_type][col]) + break + else: + if col == 'assembled' or not col.endswith('.%'): + line.append('no') + else: + line.append('NA') making_header_lines = False matrix.append(line) + assert len(phandango_header) == len(csv_header) + for line in matrix: + assert len(line) == len(csv_header) return phandango_header, csv_header, matrix diff --git a/ariba/tests/data/summary_to_matrix.1.tsv b/ariba/tests/data/summary_to_matrix.1.tsv new file mode 100644 index 00000000..1957349c --- /dev/null +++ b/ariba/tests/data/summary_to_matrix.1.tsv @@ -0,0 +1,5 @@ +#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text +noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding_ref1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A42T 1 A42T SNP 42 42 A 84 84 T 17 . 17 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A52T 1 A52T SNP 42 42 A 84 84 T 17 G 20,30 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence_ref1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence_ref1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_to_matrix.2.tsv b/ariba/tests/data/summary_to_matrix.2.tsv new file mode 100644 index 00000000..4a23ebc4 --- /dev/null +++ b/ariba/tests/data/summary_to_matrix.2.tsv @@ -0,0 +1,6 @@ +#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text +noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id3:variant in ref and reads so should report generic description of noncoding1 +noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A52T 1 A52T SNP 42 42 A 84 84 T 17 G 20,30 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 
29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 +variants_only1 1 1 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . . diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index b9aefec1..73343f8b 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -421,78 +421,124 @@ def test_gather_unfiltered_output_data(self): self.assertEqual(expected_all, s.all_data) - def test_to_matrix(self): - '''Test _to_matrix''' - rows = { - 'file1': { - 'cluster.n.1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'noncoding1', - 'known_var': 'yes', - 'novel_var': 'no', - 'pct_id': '98.33', - 'noncoding1.A14T': 'yes' - }, - 'cluster.p.1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'presence_absence1', - 'known_var': 'yes', - 'novel_var': 'no', - 'pct_id': '98.96', - 'presence_absence1.I42L': 'yes' - }, - 'cluster.v.1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'varonly1', - 'known_var': 'no', - 'novel_var': 'no', - 'pct_id': '99.42', - } - }, - 'file2': { - 'cluster.n.1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'noncoding1', - 'known_var': 'no', - 'novel_var': 'no', - 'pct_id': '98.33', - 'noncoding1.A14T': 'no' - }, - 'cluster.p.1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'presence_absence1', - 'pct_id': '98.96', - 'known_var': 'no', - 'novel_var': 'no', - 'presence_absence1.I42L': 'no' - }, - 'cluster.v.1': { - 'assembled': 'no', - 'match': 'NA', - 'ref_seq': 'NA', - 'known_var': 'NA', - 'novel_var': 'NA', - 'pct_id': 'NA', - } - }, - } - filenames = ['file1', 'file2'] - cluster_cols = {'assembled': True, 'match': True, 'ref_seq': False, 'pct_id': False, 'known_var': False, 'novel_var': False} - got_phandago_header, got_csv_header, got_lines = summary.Summary._to_matrix(filenames, rows, cluster_cols) - expected_phandango_header = ['name', 'cluster.n.1.assembled:o1', 'cluster.n.1.match:o1', 'cluster.n.1.noncoding1.A14T:o1', 'cluster.p.1.assembled:o1', 'cluster.p.1.match:o1', 'cluster.p.1.presence_absence1.I42L:o1', 'cluster.v.1.assembled:o1', 'cluster.v.1.match:o1'] - expected_csv_header = ['name', 'cluster.n.1.assembled', 'cluster.n.1.match', 'cluster.n.1.noncoding1.A14T', 'cluster.p.1.assembled', 'cluster.p.1.match', 'cluster.p.1.presence_absence1.I42L', 'cluster.v.1.assembled', 'cluster.v.1.match'] - expected_lines = [ - ['file1', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes'], - ['file2', 'yes', 'yes', 'no', 'yes', 'yes', 'no', 'no', 'NA'] + def test_to_matrix_all_cols(self): + '''Test _to_matrix all columns''' + infiles = [ + os.path.join(data_dir, 'summary_to_matrix.1.tsv'), + os.path.join(data_dir, 'summary_to_matrix.2.tsv') ] - self.assertEqual(expected_phandango_header, got_phandago_header) + + s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_vars=True) + s.samples = summary.Summary._load_input_files(infiles, 90) + s._gather_unfiltered_output_data() + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + + expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding1.A14T:o1', 'noncoding1.A14T.%:c2', 'noncoding1.A6G:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 
'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.id2:o1', 'noncoding2.id2.%:c2', 'noncoding2.A42T:o1', 'noncoding2.A52T:o1', 'noncoding2.A52T.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1', 'presence_absence1.A10V:o1'] + expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding1.A14T', 'noncoding1.A14T.%', 'noncoding1.A6G', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2.id2.%', 'noncoding2.A42T', 'noncoding2.A52T', 'noncoding2.A52T.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var', 'presence_absence1.A10V'] + expected_matrix = [ + [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'NA', 'no', 'yes', 'NA', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'], + [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 'het', 80.0, 'yes', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'het', 40.0, 'no', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes'] + ] + + self.assertEqual(expected_phandango_header, got_phandango_header) + self.assertEqual(expected_csv_header, got_csv_header) + self.assertEqual(expected_matrix, got_matrix) + + + def test_to_matrix_with_groups(self): + '''Test _to_matrix with groups''' + infiles = [ + os.path.join(data_dir, 'summary_to_matrix.1.tsv'), + os.path.join(data_dir, 'summary_to_matrix.2.tsv') + ] + + s = summary.Summary('out', filenames=infiles, show_var_groups=True) + s.samples = summary.Summary._load_input_files(infiles, 90) + s._gather_unfiltered_output_data() + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + + expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.id2:o1', 'noncoding2.id2.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1'] + expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2.id2.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 
'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var'] + expected_matrix = [ + [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'NA', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes'], + [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes'] + ] + + self.assertEqual(expected_phandango_header, got_phandango_header) + self.assertEqual(expected_csv_header, got_csv_header) + self.assertEqual(expected_matrix, got_matrix) + + + def test_to_matrix_with_vars(self): + '''Test _to_matrix with vars''' + infiles = [ + os.path.join(data_dir, 'summary_to_matrix.1.tsv'), + os.path.join(data_dir, 'summary_to_matrix.2.tsv') + ] + + s = summary.Summary('out', filenames=infiles, show_vars=True) + s.samples = summary.Summary._load_input_files(infiles, 90) + s._gather_unfiltered_output_data() + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + + expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.A14T:o1', 'noncoding1.A14T.%:c2', 'noncoding1.A6G:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.A42T:o1', 'noncoding2.A52T:o1', 'noncoding2.A52T.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1', 'presence_absence1.A10V:o1'] + expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.A14T', 'noncoding1.A14T.%', 'noncoding1.A6G', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.A42T', 'noncoding2.A52T', 'noncoding2.A52T.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var', 'presence_absence1.A10V'] + expected_matrix = [ + [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'NA', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'], + [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'no', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes'] + ] + + self.assertEqual(expected_phandango_header, got_phandango_header) + self.assertEqual(expected_csv_header, got_csv_header) + self.assertEqual(expected_matrix, got_matrix) + + + def test_to_matrix_cluster_only(self): + '''Test _to_matrix with cluster columns only''' + infiles = [ + os.path.join(data_dir, 'summary_to_matrix.1.tsv'), + os.path.join(data_dir, 'summary_to_matrix.2.tsv') + ] + + s = summary.Summary('out', filenames=infiles) + s.samples = summary.Summary._load_input_files(infiles, 90) + 
s._gather_unfiltered_output_data() + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + + expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1'] + expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var'] + expected_matrix = [ + [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes'], + [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes'] + ] + + self.assertEqual(expected_phandango_header, got_phandango_header) + self.assertEqual(expected_csv_header, got_csv_header) + self.assertEqual(expected_matrix, got_matrix) + + + def test_to_matrix_assembled_only(self): + '''Test _to_matrix with assembled column only''' + infiles = [ + os.path.join(data_dir, 'summary_to_matrix.1.tsv'), + os.path.join(data_dir, 'summary_to_matrix.2.tsv') + ] + + s = summary.Summary('out', filenames=infiles, cluster_cols='assembled') + s.samples = summary.Summary._load_input_files(infiles, 90) + s._gather_unfiltered_output_data() + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + + expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding2.assembled:o1', 'presence_absence1.assembled:o1'] + expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding2.assembled', 'presence_absence1.assembled'] + expected_matrix = [ + [infiles[0], 'yes', 'yes', 'yes'], + [infiles[1], 'yes', 'yes', 'yes'] + ] + + self.assertEqual(expected_phandango_header, got_phandango_header) self.assertEqual(expected_csv_header, got_csv_header) - self.assertEqual(expected_lines, got_lines) + self.assertEqual(expected_matrix, got_matrix) def test_filter_matrix_rows(self): From 831a664250ea090be6bb0e1c7303a13b02588430 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 08:15:17 +0100 Subject: [PATCH 31/40] Remove old _gather_output_rows method --- ariba/summary.py | 85 ------------- .../summary_test_gather_output_rows.in.1.tsv | 3 - .../summary_test_gather_output_rows.in.2.tsv | 5 - ariba/tests/summary_test.py | 119 ------------------ 4 files changed, 212 deletions(-) delete mode 100644 ariba/tests/data/summary_test_gather_output_rows.in.1.tsv delete mode 100644 ariba/tests/data/summary_test_gather_output_rows.in.2.tsv diff --git a/ariba/summary.py b/ariba/summary.py index 2645b4bc..9ce36050 100644 
--- a/ariba/summary.py +++ b/ariba/summary.py @@ -145,91 +145,6 @@ def _get_all_var_groups(cls, samples_dict): return groups - def _gather_output_rows(self): - all_cluster_names = Summary._get_all_cluster_names(self.samples) - all_var_columns = Summary._get_all_variant_columns(self.samples) - all_het_snps = Summary._get_all_het_snps(self.samples) - if self.var_columns['groups']: - var_groups = Summary._get_all_var_groups(self.samples) - else: - var_groups = set() - rows = {} - - for filename, sample in self.samples.items(): - rows[filename] = {} - - for cluster in all_cluster_names: - rows[filename][cluster] = {} - - if cluster in sample.column_summary_data and sample.column_summary_data[cluster]['assembled'] not in {'no'}: - rows[filename][cluster] = sample.column_summary_data[cluster] - else: - rows[filename][cluster] = { - 'assembled': 'no', - 'match': 'no', - 'ref_seq': 'NA', - 'known_var': 'NA', - 'novel_var': 'NA', - 'pct_id': 'NA' - } - - if self.var_columns['groups']: - for group_name in var_groups[cluster]: - if cluster in sample.var_groups and group_name in sample.var_groups[cluster]: - rows[filename][cluster]['vgroup.' + group_name] = 'yes' - if self.show_known_het: - if cluster in sample.het_snps: - if group_name in sample.het_snps[cluster]: - if len(sample.het_snps[cluster][group_name]) == 1: - rows[filename][cluster]['vgroup.' + group_name] = 'het' - percent = list(sample.het_snps[cluster][group_name].values())[0] - rows[filename][cluster]['vgroup.' + group_name + '.%'] = percent - else: - assert len(sample.het_snps[cluster][group_name]) > 1 - rows[filename][cluster]['vgroup.' + group_name] = 'multi_het' - rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' - else: - rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' - else: - rows[filename][cluster]['vgroup.' + group_name] = 'no' - if self.show_known_het: - rows[filename][cluster]['vgroup.' + group_name + '.%'] = 'NA' - - if cluster in all_var_columns: - for (ref_name, variant, grouped_or_novel, group_name) in all_var_columns[cluster]: - if not self.var_columns[grouped_or_novel]: - continue - - key = ref_name + '.' + variant - if rows[filename][cluster]['assembled'] == 'no': - rows[filename][cluster][key] = 'NA' - elif cluster in sample.variant_column_names_tuples and (ref_name, variant, grouped_or_novel, group_name) in sample.variant_column_names_tuples[cluster]: - rows[filename][cluster][key] = 'yes' - if self.show_known_het: - if cluster in sample.het_snps: - if grouped_or_novel == 'grouped' and group_name in sample.het_snps[cluster]: - rows[filename][cluster][key] = 'het' - rows[filename][cluster][key + '.%'] = sample.het_snps[cluster][group_name].get(variant, "NA") - elif grouped_or_novel == 'novel' and '.' 
in sample.het_snps[cluster]: - rows[filename][cluster][key] = 'het' - rows[filename][cluster][key + '.%'] = sample.het_snps['.'].get(variant, "NA") - else: - percent = 'NA' - else: - rows[filename][cluster][key] = 'no' - if self.show_known_het and (cluster, variant) in all_het_snps: - rows[filename][cluster][key + '.%'] = 'NA' - - if self.show_known_het and (cluster, variant) in all_het_snps and key + '.%' not in rows[filename][cluster]: - rows[filename][cluster][key + '.%'] = 'NA' - - for key, wanted in self.cluster_columns.items(): - if not wanted: - del rows[filename][cluster][key] - - return rows - - @classmethod def _to_matrix(cls, filenames, all_data, all_potential_columns, cluster_cols): matrix = [] diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv deleted file mode 100644 index 3e67eeb1..00000000 --- a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv +++ /dev/null @@ -1,3 +0,0 @@ -#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv deleted file mode 100644 index 398aedbc..00000000 --- a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv +++ /dev/null @@ -1,5 +0,0 @@ -#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id3:variant in ref and reads so should report generic description of noncoding1 -presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 -variants_only1 1 1 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . . 
diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 73343f8b..56ceb82d 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -143,125 +143,6 @@ def test_get_all_var_groups(self): self.assertEqual(expected, got) - def test_gather_output_rows(self): - '''Test _gather_output_rows''' - infiles = [ - os.path.join(data_dir, 'summary_test_gather_output_rows.in.1.tsv'), - os.path.join(data_dir, 'summary_test_gather_output_rows.in.2.tsv') - ] - s = summary.Summary('out', filenames=infiles, variant_cols=None) - s.samples = summary.Summary._load_input_files(infiles, 90) - expected = { - infiles[0]: { - 'noncoding1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'noncoding1', - 'known_var': 'yes', - 'novel_var': 'no', - 'pct_id': '98.33', - }, - 'presence_absence1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'presence_absence1', - 'known_var': 'no', - 'novel_var': 'yes', - 'pct_id': '98.96', - }, - 'variants_only1': { - 'assembled': 'no', - 'match': 'no', - 'ref_seq': 'NA', - 'known_var': 'NA', - 'novel_var': 'NA', - 'pct_id': 'NA', - } - }, - infiles[1]: { - 'noncoding1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'noncoding1', - 'known_var': 'yes', - 'novel_var': 'no', - 'pct_id': '98.33', - }, - 'presence_absence1': { - 'assembled': 'yes', - 'match': 'yes', - 'ref_seq': 'presence_absence1', - 'pct_id': '98.96', - 'known_var': 'no', - 'novel_var': 'yes', - }, - 'variants_only1': { - 'assembled': 'no', - 'match': 'no', - 'ref_seq': 'NA', - 'known_var': 'NA', - 'novel_var': 'NA', - 'pct_id': 'NA', - } - }, - } - got = s._gather_output_rows() - self.assertEqual(expected, got) - - s.var_columns['groups'] = True - expected[infiles[0]]['noncoding1']['vgroup.id1'] = 'yes' - expected[infiles[0]]['noncoding1']['vgroup.id3'] = 'no' - expected[infiles[1]]['noncoding1']['vgroup.id1'] = 'yes' - expected[infiles[1]]['noncoding1']['vgroup.id3'] = 'yes' - got = s._gather_output_rows() - self.assertEqual(expected, got) - - - s.var_columns['grouped'] = True - s.var_columns['ungrouped'] = True - expected[infiles[0]]['noncoding1']['noncoding1.A14T'] = 'yes' - expected[infiles[0]]['noncoding1']['noncoding1.A6G'] = 'no' - expected[infiles[1]]['noncoding1']['noncoding1.A14T'] = 'yes' - expected[infiles[1]]['noncoding1']['noncoding1.A6G'] = 'yes' - self.maxDiff = None - got = s._gather_output_rows() - self.assertEqual(expected, got) - - s.var_columns['novel'] = True - expected[infiles[0]]['presence_absence1']['presence_absence1.A10V'] = 'yes' - expected[infiles[1]]['presence_absence1']['presence_absence1.A10V'] = 'yes' - got = s._gather_output_rows() - self.assertEqual(expected, got) - - s.show_known_het = True - expected[infiles[0]]['noncoding1']['vgroup.id1.%'] = 'NA' - expected[infiles[0]]['noncoding1']['vgroup.id3.%'] = 'NA' - expected[infiles[1]]['noncoding1']['vgroup.id1'] = 'het' - expected[infiles[1]]['noncoding1']['vgroup.id1.%'] = 80.0 - expected[infiles[1]]['noncoding1']['vgroup.id3'] = 'yes' - expected[infiles[1]]['noncoding1']['vgroup.id3.%'] = 'NA' - expected[infiles[0]]['noncoding1']['noncoding1.A14T.%'] = 'NA' - expected[infiles[1]]['noncoding1']['noncoding1.A14T'] = 'het' - expected[infiles[1]]['noncoding1']['noncoding1.A14T.%'] = 80.0 - got = s._gather_output_rows() - self.assertEqual(expected, got) - - for filename in expected: - del expected[filename]['noncoding1']['vgroup.id1'] - del expected[filename]['noncoding1']['vgroup.id3'] - del expected[filename]['noncoding1']['vgroup.id1.%'] - del 
expected[filename]['noncoding1']['vgroup.id3.%'] - for gene_type in expected[filename]: - del expected[filename][gene_type]['ref_seq'] - - s = summary.Summary('out', filenames=infiles, cluster_cols='assembled,match,pct_id,known_var,novel_var', variant_cols='ungrouped,grouped,novel') - s.samples = summary.Summary._load_input_files(infiles, 90) - s.include_all_variant_columns = True - s.show_known_het = True - got = s._gather_output_rows() - self.assertEqual(expected, got) - - def test_gather_unfiltered_output_data(self): '''test gather_output_rows_new''' infiles = [ From 5accff81163594432bd1e2bd92afe733a3a4b9e1 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 08:18:24 +0100 Subject: [PATCH 32/40] Remove var_columns option --- ariba/summary.py | 8 -------- ariba/tests/summary_test.py | 35 +++-------------------------------- 2 files changed, 3 insertions(+), 40 deletions(-) diff --git a/ariba/summary.py b/ariba/summary.py index 9ce36050..af6f1e57 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -20,7 +20,6 @@ def __init__( min_id=90.0, show_known_het=False, cluster_cols='assembled,match,ref_seq,pct_id,known_var,novel_var', - variant_cols='groups,grouped,ungrouped,novel', make_phandango_tree=True, only_clusters=None, show_var_groups=False, @@ -40,7 +39,6 @@ def __init__( self.show_known_het = show_known_het self.cluster_columns = self._determine_cluster_cols(cluster_cols) - self.var_columns = self._determine_var_cols(variant_cols) self.filter_rows = filter_rows self.filter_columns = filter_columns self.min_id = min_id @@ -68,12 +66,6 @@ def _determine_cluster_cols(cols_string): return Summary._determine_cols(cols_string, allowed_cols, 'cluster columns') - @staticmethod - def _determine_var_cols(cols_string): - allowed_cols = {'groups', 'grouped', 'ungrouped', 'novel'} - return Summary._determine_cols(cols_string, allowed_cols, 'variant columns') - - def _load_fofn(self, fofn): f = pyfastaq.utils.open_file_read(fofn) filenames = [x.rstrip() for x in f.readlines()] diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 56ceb82d..40447235 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -43,35 +43,6 @@ def test_determine_cluster_cols(self): self.assertEqual(expected[i], summary.Summary._determine_cluster_cols(col_strings[i])) - def test_determine_var_cols(self): - col_strings = [ - 'groups,grouped,ungrouped,novel', - 'groups,grouped,ungrouped', - 'grouped,novel', - 'ungrouped,novel', - 'grouped', - 'ungrouped', - 'novel', - '' - ] - - expected = [ - {'groups': True, 'grouped': True, 'ungrouped': True, 'novel': True}, - {'groups': True, 'grouped': True, 'ungrouped': True, 'novel': False}, - {'groups': False, 'grouped': True, 'ungrouped': False, 'novel': True}, - {'groups': False, 'grouped': False, 'ungrouped': True, 'novel': True}, - {'groups': False, 'grouped': True, 'ungrouped': False, 'novel': False}, - {'groups': False, 'grouped': False, 'ungrouped': True, 'novel': False}, - {'groups': False, 'grouped': False, 'ungrouped': False, 'novel': True}, - {'groups': False, 'grouped': False, 'ungrouped': False, 'novel': False}, - ] - - assert len(col_strings) == len(expected) - - for i in range(len(col_strings)): - self.assertEqual(expected[i], summary.Summary._determine_var_cols(col_strings[i])) - - def test_load_input_files(self): '''Test _load_input_files''' file1 = os.path.join(data_dir, 'summary_test_load_input_files.1.tsv') @@ -267,7 +238,7 @@ def test_gather_unfiltered_output_data(self): } } - s = summary.Summary('out', 
filenames=infiles, variant_cols=None) + s = summary.Summary('out', filenames=infiles) s.samples = summary.Summary._load_input_files(infiles, 90) s._gather_unfiltered_output_data() self.assertEqual(expected_potential_cols, s.all_potential_columns) @@ -279,7 +250,7 @@ def test_gather_unfiltered_output_data(self): expected_all[infiles[0]]['noncoding2']['groups'] = {'id2': 'yes_multi_het', 'id2.%': 'NA'} expected_all[infiles[1]]['noncoding1']['groups'] = {'id1': 'het', 'id1.%': 80.0, 'id3': 'yes'} expected_all[infiles[1]]['noncoding2']['groups'] = {'id2': 'het', 'id2.%': 40.0} - s = summary.Summary('out', filenames=infiles, variant_cols=None, show_var_groups=True) + s = summary.Summary('out', filenames=infiles, show_var_groups=True) s.samples = summary.Summary._load_input_files(infiles, 90) s._gather_unfiltered_output_data() self.assertEqual(expected_potential_cols, s.all_potential_columns) @@ -295,7 +266,7 @@ def test_gather_unfiltered_output_data(self): expected_all[infiles[1]]['noncoding1']['vars'] = {'A14T': 'het', 'A14T.%': 80.0, 'A6G': 'yes'} expected_all[infiles[1]]['noncoding2']['vars'] = {'A52T': 'het', 'A52T.%': 40.0} expected_all[infiles[1]]['presence_absence1']['vars'] = {'A10V': 'yes'} - s = summary.Summary('out', filenames=infiles, variant_cols=None, show_var_groups=True, show_vars=True) + s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_vars=True) s.samples = summary.Summary._load_input_files(infiles, 90) s._gather_unfiltered_output_data() self.assertEqual(expected_potential_cols, s.all_potential_columns) From ed96e6347e09dd2d02cf6f456454799f37f0e720 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 08:20:21 +0100 Subject: [PATCH 33/40] remove methods _get_all_cluster_names _get_all_variant_columns --- ariba/summary.py | 24 ------------------ .../summary_test_get_all_cluster_names.1.tsv | 3 --- .../summary_test_get_all_cluster_names.2.tsv | 5 ---- ariba/tests/summary_test.py | 25 ------------------- 4 files changed, 57 deletions(-) delete mode 100644 ariba/tests/data/summary_test_get_all_cluster_names.1.tsv delete mode 100644 ariba/tests/data/summary_test_get_all_cluster_names.2.tsv diff --git a/ariba/summary.py b/ariba/summary.py index af6f1e57..069bbc95 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -90,30 +90,6 @@ def _load_input_files(cls, filenames, min_id, verbose=False, only_clusters=None) return samples - @classmethod - def _get_all_cluster_names(cls, samples_dict): - '''Input should be output of _load_input_files''' - cluster_names = set() - for filename, sample in samples_dict.items(): - cluster_names.update(set(sample.clusters.keys())) - return cluster_names - - - @classmethod - def _get_all_variant_columns(cls, samples_dict): - '''Input should be output of _load_input_files''' - columns = {} - for filename, sample in samples_dict.items(): - for cluster in sample.column_summary_data: - if sample.column_summary_data[cluster]['assembled'] == 'yes': - for key, tuple_set in sample.variant_column_names_tuples.items(): - for t in tuple_set: - if key not in columns: - columns[key] = set() - columns[key].add(t) - return columns - - @classmethod def _get_all_het_snps(cls, samples_dict): snps = set() diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv deleted file mode 100644 index f35590e2..00000000 --- a/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv +++ /dev/null @@ -1,3 +0,0 @@ -#ref_name gene var_only flag reads cluster ref_len 
ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 1 0 27 88 cluster.p.1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id2:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv deleted file mode 100644 index 2bddc3d6..00000000 --- a/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv +++ /dev/null @@ -1,5 +0,0 @@ -#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id2:variant in ref and reads so should report generic description of noncoding1 -presence_absence1 1 0 27 88 cluster.p.2 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 -variants_only1 1 1 64 12 cluster.v.1 90 . . . . . . . . . . . . . . . . . . . . . . . 
diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 40447235..d4f39530 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -64,31 +64,6 @@ def test_load_input_files(self): self.assertEqual(expected, got) - - def test_get_all_cluster_names(self): - '''Test _get_all_cluster_names''' - file1 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.1.tsv') - file2 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.2.tsv') - samples = summary.Summary._load_input_files([file1, file2], 90) - got = summary.Summary._get_all_cluster_names(samples) - expected = {'cluster.n.1', 'cluster.v.1', 'cluster.p.1', 'cluster.p.2'} - self.assertEqual(expected, got) - - - def test_get_all_variant_columns(self): - '''Test _get_all_variant_columns''' - file1 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.1.tsv') - file2 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.2.tsv') - samples = summary.Summary._load_input_files([file1, file2], 90) - got = summary.Summary._get_all_variant_columns(samples) - expected = { - 'cluster.p.2': {('presence_absence1', 'A10V', 'grouped', 'id3')}, - 'cluster.n.1': {('noncoding1', 'A6G', 'grouped', 'id2'), ('noncoding1', 'A14T', 'grouped', 'id1')}, - 'cluster.p.1': {('presence_absence1', 'A10V', 'grouped', 'id2')}, - } - self.assertEqual(expected, got) - - def test_get_all_het_snps(self): '''test _get_all_het_snps''' file1 = os.path.join(data_dir, 'summary_test_get_all_het_snps.1.tsv') From 287348a5ddc52fa1292b27f2cbf29ca01ed27586 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 08:21:49 +0100 Subject: [PATCH 34/40] Remove method _get_all_het_snps --- ariba/summary.py | 11 ----------- ariba/tests/data/summary_test_get_all_het_snps.1.tsv | 3 --- ariba/tests/data/summary_test_get_all_het_snps.2.tsv | 5 ----- ariba/tests/summary_test.py | 10 ---------- 4 files changed, 29 deletions(-) delete mode 100644 ariba/tests/data/summary_test_get_all_het_snps.1.tsv delete mode 100644 ariba/tests/data/summary_test_get_all_het_snps.2.tsv diff --git a/ariba/summary.py b/ariba/summary.py index 069bbc95..105d5daf 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -90,17 +90,6 @@ def _load_input_files(cls, filenames, min_id, verbose=False, only_clusters=None) return samples - @classmethod - def _get_all_het_snps(cls, samples_dict): - snps = set() - for filename, sample in samples_dict.items(): - for cluster, snp_dict in sample.het_snps.items(): - for snp_id in snp_dict: - for snp in snp_dict[snp_id]: - snps.add((cluster, snp)) - - return snps - @classmethod def _get_all_var_groups(cls, samples_dict): groups = {} diff --git a/ariba/tests/data/summary_test_get_all_het_snps.1.tsv b/ariba/tests/data/summary_test_get_all_het_snps.1.tsv deleted file mode 100644 index 3e67eeb1..00000000 --- a/ariba/tests/data/summary_test_get_all_het_snps.1.tsv +++ /dev/null @@ -1,3 +0,0 @@ -#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 
17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_get_all_het_snps.2.tsv b/ariba/tests/data/summary_test_get_all_het_snps.2.tsv deleted file mode 100644 index 398aedbc..00000000 --- a/ariba/tests/data/summary_test_get_all_het_snps.2.tsv +++ /dev/null @@ -1,5 +0,0 @@ -#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id3:variant in ref and reads so should report generic description of noncoding1 -presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 -variants_only1 1 1 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . . diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index d4f39530..3b7ecee6 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -64,16 +64,6 @@ def test_load_input_files(self): self.assertEqual(expected, got) - def test_get_all_het_snps(self): - '''test _get_all_het_snps''' - file1 = os.path.join(data_dir, 'summary_test_get_all_het_snps.1.tsv') - file2 = os.path.join(data_dir, 'summary_test_get_all_het_snps.2.tsv') - samples = summary.Summary._load_input_files([file1, file2], 90) - got = summary.Summary._get_all_het_snps(samples) - expected = {('noncoding1', 'A14T')} - self.assertEqual(expected, got) - - def test_get_all_var_groups(self): '''test _get_all_var_groups''' file1 = os.path.join(data_dir, 'summary_test_get_all_var_groups.1.tsv') From 8ab83fce410f8dbabc1f3c60e58f6170de7de52f Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 08:22:48 +0100 Subject: [PATCH 35/40] Remove method _get_all_var_groups --- ariba/summary.py | 12 ------------ .../data/summary_test_get_all_var_groups.1.tsv | 3 --- .../data/summary_test_get_all_var_groups.2.tsv | 5 ----- ariba/tests/summary_test.py | 15 --------------- 4 files changed, 35 deletions(-) delete mode 100644 ariba/tests/data/summary_test_get_all_var_groups.1.tsv delete mode 100644 ariba/tests/data/summary_test_get_all_var_groups.2.tsv diff --git a/ariba/summary.py b/ariba/summary.py index 105d5daf..030d16f7 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -90,18 +90,6 @@ def _load_input_files(cls, filenames, min_id, verbose=False, only_clusters=None) return samples - @classmethod - def _get_all_var_groups(cls, samples_dict): - groups = {} - for filename, sample in samples_dict.items(): - for name, name_set in sample.var_groups.items(): - if name 
not in groups: - groups[name] = set() - groups[name].update(name_set) - - return groups - - @classmethod def _to_matrix(cls, filenames, all_data, all_potential_columns, cluster_cols): matrix = [] diff --git a/ariba/tests/data/summary_test_get_all_var_groups.1.tsv b/ariba/tests/data/summary_test_get_all_var_groups.1.tsv deleted file mode 100644 index c4db58da..00000000 --- a/ariba/tests/data/summary_test_get_all_var_groups.1.tsv +++ /dev/null @@ -1,3 +0,0 @@ -#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 1 0 27 88 cluster.p.1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id4:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_get_all_var_groups.2.tsv b/ariba/tests/data/summary_test_get_all_var_groups.2.tsv deleted file mode 100644 index 2bddc3d6..00000000 --- a/ariba/tests/data/summary_test_get_all_var_groups.2.tsv +++ /dev/null @@ -1,5 +0,0 @@ -#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id2:variant in ref and reads so should report generic description of noncoding1 -presence_absence1 1 0 27 88 cluster.p.2 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 -variants_only1 1 1 64 12 cluster.v.1 90 . . . . . . . . . . . . . . . . . . . . . . . 
diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 3b7ecee6..9c4931c9 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -64,21 +64,6 @@ def test_load_input_files(self): self.assertEqual(expected, got) - def test_get_all_var_groups(self): - '''test _get_all_var_groups''' - file1 = os.path.join(data_dir, 'summary_test_get_all_var_groups.1.tsv') - file2 = os.path.join(data_dir, 'summary_test_get_all_var_groups.2.tsv') - samples = summary.Summary._load_input_files([file1, file2], 90) - got = summary.Summary._get_all_var_groups(samples) - expected = { - 'cluster.p.1': {'id4'}, - 'cluster.p.2': {'id3'}, - 'cluster.v.1': set(), - 'cluster.n.1': {'id1', 'id2'} - } - self.assertEqual(expected, got) - - def test_gather_unfiltered_output_data(self): '''test gather_output_rows_new''' infiles = [ From e4a645dcf9daa89468f5e54b04130540a2680234 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 08:24:05 +0100 Subject: [PATCH 36/40] Move _gather_unfiltered_output_data higher up in file --- ariba/summary.py | 86 ++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/ariba/summary.py b/ariba/summary.py index 030d16f7..1621534a 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -90,6 +90,49 @@ def _load_input_files(cls, filenames, min_id, verbose=False, only_clusters=None) return samples + def _gather_unfiltered_output_data(self): + self.all_potential_columns = {} + self.all_data = {} + + for filename in sorted(self.samples): + self.all_data[filename] = {} + for cluster in self.samples[filename].clusters.values(): + self.all_data[filename][cluster.name] = {} + if cluster.name not in self.all_potential_columns: + self.all_potential_columns[cluster.name] = {'summary' : set(), 'groups': set(), 'vars': set()} + + this_cluster_dict = {'summary': copy.copy(cluster.summary), 'groups': {}, 'vars': {}} + seen_groups = {} + + for variant in cluster.variants: + if self.show_vars: + this_cluster_dict['vars'][variant.var_string] = 'yes' if variant.het_percent is None else 'het' + if variant.het_percent is not None: + this_cluster_dict['vars'][variant.var_string + '.%'] = variant.het_percent + + if self.show_var_groups and variant.var_group != '.': + if variant.var_group not in seen_groups: + seen_groups[variant.var_group] = {'yes': 0, 'het': 0} + + if variant.het_percent is None: + seen_groups[variant.var_group]['yes'] += 1 + this_cluster_dict['groups'][variant.var_group] = 'yes' + else: + seen_groups[variant.var_group]['het'] += 1 + this_cluster_dict['groups'][variant.var_group] = 'het' + this_cluster_dict['groups'][variant.var_group + '.%'] = variant.het_percent + + for group, d in seen_groups.items(): + if d['het'] > 0 and d['het'] + d['yes'] > 1: + this_cluster_dict['groups'][group] = 'yes_multi_het' + this_cluster_dict['groups'][group + '.%'] = 'NA' + + for x in this_cluster_dict: + self.all_potential_columns[cluster.name][x].update(set(this_cluster_dict[x].keys())) + + self.all_data[filename][cluster.name] = this_cluster_dict + + @classmethod def _to_matrix(cls, filenames, all_data, all_potential_columns, cluster_cols): matrix = [] @@ -332,46 +375,3 @@ def run(self): if self.verbose: print('Finished', flush=True) - - def _gather_unfiltered_output_data(self): - self.all_potential_columns = {} - self.all_data = {} - - for filename in sorted(self.samples): - self.all_data[filename] = {} - for cluster in self.samples[filename].clusters.values(): - self.all_data[filename][cluster.name] = {} - 
if cluster.name not in self.all_potential_columns: - self.all_potential_columns[cluster.name] = {'summary' : set(), 'groups': set(), 'vars': set()} - - this_cluster_dict = {'summary': copy.copy(cluster.summary), 'groups': {}, 'vars': {}} - seen_groups = {} - - for variant in cluster.variants: - if self.show_vars: - this_cluster_dict['vars'][variant.var_string] = 'yes' if variant.het_percent is None else 'het' - if variant.het_percent is not None: - this_cluster_dict['vars'][variant.var_string + '.%'] = variant.het_percent - - if self.show_var_groups and variant.var_group != '.': - if variant.var_group not in seen_groups: - seen_groups[variant.var_group] = {'yes': 0, 'het': 0} - - if variant.het_percent is None: - seen_groups[variant.var_group]['yes'] += 1 - this_cluster_dict['groups'][variant.var_group] = 'yes' - else: - seen_groups[variant.var_group]['het'] += 1 - this_cluster_dict['groups'][variant.var_group] = 'het' - this_cluster_dict['groups'][variant.var_group + '.%'] = variant.het_percent - - for group, d in seen_groups.items(): - if d['het'] > 0 and d['het'] + d['yes'] > 1: - this_cluster_dict['groups'][group] = 'yes_multi_het' - this_cluster_dict['groups'][group + '.%'] = 'NA' - - for x in this_cluster_dict: - self.all_potential_columns[cluster.name][x].update(set(this_cluster_dict[x].keys())) - - self.all_data[filename][cluster.name] = this_cluster_dict - From 9d7841896f146386141836979d3fa3076ea3cae2 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 08:42:17 +0100 Subject: [PATCH 37/40] Use new refactored code --- ariba/summary.py | 8 +++----- ariba/tasks/summary.py | 32 ++------------------------------ 2 files changed, 5 insertions(+), 35 deletions(-) diff --git a/ariba/summary.py b/ariba/summary.py index 1621534a..0787b935 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -18,7 +18,6 @@ def __init__( filter_rows=True, filter_columns=True, min_id=90.0, - show_known_het=False, cluster_cols='assembled,match,ref_seq,pct_id,known_var,novel_var', make_phandango_tree=True, only_clusters=None, @@ -37,7 +36,6 @@ def __init__( if fofn is not None: self.filenames.extend(self._load_fofn(fofn)) - self.show_known_het = show_known_het self.cluster_columns = self._determine_cluster_cols(cluster_cols) self.filter_rows = filter_rows self.filter_columns = filter_columns @@ -167,7 +165,7 @@ def _to_matrix(cls, filenames, all_data, all_potential_columns, cluster_cols): phandango_header.append(cluster_name + '.' 
+ col + ':o1') for col_type in ['summary', 'groups', 'vars']: - if col in all_data[filename][cluster_name][col_type]: + if cluster_name in all_data[filename] and col in all_data[filename][cluster_name][col_type]: line.append(all_data[filename][cluster_name][col_type][col]) break else: @@ -313,8 +311,8 @@ def run(self): self.samples = self._load_input_files(self.filenames, self.min_id, verbose=self.verbose, only_clusters=self.only_clusters) if self.verbose: print('Generating output rows', flush=True) - self.rows = self._gather_output_rows() - phandango_header, csv_header, matrix = Summary._to_matrix(self.filenames, self.rows, self.cluster_columns) + self._gather_unfiltered_output_data() + phandango_header, csv_header, matrix = Summary._to_matrix(self.filenames, self.all_data, self.all_potential_columns, self.cluster_columns) # sanity check same number of columns in headers and matrix lengths = {len(x) for x in matrix} diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py index c674df3a..d6bfb848 100644 --- a/ariba/tasks/summary.py +++ b/ariba/tasks/summary.py @@ -9,66 +9,38 @@ def use_preset(options): preset_to_vals = { 'minimal': { 'cluster_cols': 'match', - 'var_cols': '', 'col_filter': 'y', 'row_filter': 'y', - 'var_groups': 'n', - 'known_vars': 'n', - 'novel_vars': 'n' }, 'cluster_small': { 'cluster_cols': 'assembled,match,ref_seq,known_var', - 'var_cols': '', 'col_filter': 'y', 'row_filter': 'y', - 'var_groups': 'n', - 'known_vars': 'n', - 'novel_vars': 'n' }, 'cluster_all': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'var_cols': '', 'col_filter': 'y', 'row_filter': 'y', - 'var_groups': 'n', - 'known_vars': 'n', - 'novel_vars': 'n' }, 'cluster_var_groups': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'var_cols': 'groups', 'col_filter': 'y', 'row_filter': 'y', - 'var_groups': 'y', - 'known_vars': 'n', - 'novel_vars': 'n' }, 'cluster_known_vars': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'var_cols': 'groups,grouped,ungrouped', 'col_filter': 'y', 'row_filter': 'y', - 'var_groups': 'y', - 'known_vars': 'y', - 'novel_vars': 'n' }, 'all': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'var_cols': 'groups,grouped,ungrouped,novel', 'col_filter': 'y', 'row_filter': 'y', - 'var_groups': 'y', - 'known_vars': 'y', - 'novel_vars': 'y' }, 'all_no_filter': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'var_cols': 'groups,grouped,ungrouped,novel', 'col_filter': 'n', 'row_filter': 'n', - 'var_groups': 'y', - 'known_vars': 'y', - 'novel_vars': 'y' }, } @@ -93,11 +65,11 @@ def run(options): filter_rows=options.col_filter == 'y', filter_columns=options.row_filter == 'y', min_id=options.min_id, - show_known_het=options.het, cluster_cols=options.cluster_cols, - variant_cols=options.var_cols, make_phandango_tree=(not options.no_tree), only_clusters=None if options.only_cluster is None else {options.only_cluster}, + show_var_groups=options.v_groups, + show_vars=options.variants, verbose=options.verbose ) s.run() From dd0e6d76c23fef53440767415063af45c5d40fb9 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 08:52:49 +0100 Subject: [PATCH 38/40] update preset to use v_groups and variants options --- ariba/tasks/summary.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py index d6bfb848..252f85fa 100644 --- a/ariba/tasks/summary.py +++ b/ariba/tasks/summary.py @@ -27,11 
+27,6 @@ def use_preset(options): 'col_filter': 'y', 'row_filter': 'y', }, - 'cluster_known_vars': { - 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', - 'col_filter': 'y', - 'row_filter': 'y', - }, 'all': { 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', 'col_filter': 'y', @@ -49,6 +44,12 @@ def use_preset(options): for key, val in preset_to_vals[options.preset].items(): exec('options.' + key + ' = "' + val + '"') + if options.preset in {'cluster_var_groups', 'all', 'all_no_filter'}: + options.v_groups = True + + if options.preset in {'all', 'all_no_filter'}: + options.variants = True + return options From 8a47c6b49befe738c47ad7b1039bcb938148b270 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 09:03:35 +0100 Subject: [PATCH 39/40] Bug fix getting variant nucleotide for novel snp --- ariba/summary_cluster_variant.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ariba/summary_cluster_variant.py b/ariba/summary_cluster_variant.py index b9c7c6db..51e00d95 100644 --- a/ariba/summary_cluster_variant.py +++ b/ariba/summary_cluster_variant.py @@ -43,7 +43,9 @@ def _get_het_percent(cls, data_dict): raise Error('Mismatch in number of inferred nucleotides from ctg_nt, smtls_alt_nt, smtls_alt_depth columns. Cannot continue\n' + str(data_dict)) try: - var_nucleotide = data_dict['known_var_change'][-1] + var_nucleotide = data_dict['known_var_change'][-1] if data_dict['known_var_change'] != '.' else data_dict['ref_ctg_change'][-1] + if var_nucleotide == '.': + return None depths = [int(x) for x in depths] nuc_to_depth = dict(zip(nucleotides, depths)) total_depth = sum(depths) From 553ff0cd111350e886154007e38af2aedcf4dbb7 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Fri, 12 Aug 2016 09:14:04 +0100 Subject: [PATCH 40/40] Update options to reflect rewrite of summary code --- scripts/ariba | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/ariba b/scripts/ariba index f0b91a86..696a00a9 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -152,7 +152,7 @@ subparser_run.set_defaults(func=ariba.tasks.run.run) #----------------------------- summary ------------------------------- -summary_presets = ['minimal', 'cluster_small', 'cluster_all', 'cluster_var_groups', 'cluster_known_vars', 'all', 'all_no_filter'] +summary_presets = ['minimal', 'cluster_small', 'cluster_all', 'cluster_var_groups', 'all', 'all_no_filter'] subparser_summary = subparsers.add_parser( 'summary', help='Summarise multiple reports made by "run"', @@ -162,15 +162,15 @@ subparser_summary = subparsers.add_parser( ) subparser_summary.add_argument('-f', '--fofn', help='File of filenames of ariba reports in tsv format (not xls) to be summarised. Must be used if no input files listed after the outfile.', metavar='FILENAME') -subparser_summary.add_argument('--preset', choices=summary_presets, help='Shorthand for setting --cluster_cols,--col_filter,--row_filter,--known_vars,--novel_vars. Using this overrides those options', metavar='|'.join(summary_presets)) +subparser_summary.add_argument('--preset', choices=summary_presets, help='Shorthand for setting --cluster_cols,--col_filter,--row_filter,--v_groups,--variants. Using this overrides those options', metavar='|'.join(summary_presets)) subparser_summary.add_argument('--cluster_cols', help='Comma separated list of cluster columns to include. 
Choose from: assembled, match, ref_seq, pct_id, known_var, novel_var [%(default)s]', default='match', metavar='col1,col2,...') subparser_summary.add_argument('--col_filter', choices=['y', 'n'], default='y', help='Choose whether columns where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n') -subparser_summary.add_argument('--het', action='store_true', help='For known noncoding SNPs, report if they are heterozygous or not, and the percent of reads supporting the variant type') subparser_summary.add_argument('--no_tree', action='store_true', help='Do not make phandango tree') subparser_summary.add_argument('--row_filter', choices=['y', 'n'], default='y', help='Choose whether rows where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n') -subparser_summary.add_argument('--var_cols', help='Comma separated list of variant columns to include. Choose from: groups, grouped, ungrouped, novel [none by default]', metavar='col1,col2,...', default='') subparser_summary.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT') subparser_summary.add_argument('--only_cluster', help='Only report data for the given cluster name', metavar='Cluster_name') +subparser_summary.add_argument('--v_groups', action='store_true', help='Show presence of variants that are in groups') +subparser_summary.add_argument('--variants', action='store_true', help='Report all variants') subparser_summary.add_argument('--verbose', action='store_true', help='Be verbose') subparser_summary.add_argument('outprefix', help='Prefix of output files') subparser_summary.add_argument('infiles', nargs='*', help='Files to be summarised')
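The bug fix in patch 39 changes which column _get_het_percent reads the variant nucleotide from: for a known variant it still takes the last character of known_var_change, but for a novel SNP that column is '.', so the code now falls back to ref_ctg_change. A minimal standalone sketch of that selection logic follows; het_percent_sketch is a hypothetical helper name, and the final division of the variant base's depth by the total depth is an assumption about the part of the method not shown in this hunk.

    def het_percent_sketch(known_var_change, ref_ctg_change, nucleotides, depths):
        '''Illustrative only: mirrors the nucleotide selection fixed in patch 39.

        known_var_change and ref_ctg_change are report strings such as 'A14T' or '.';
        nucleotides and depths are parallel lists, e.g. ['T', 'G'] and ['40', '10'].
        '''
        # Known variant: take the alt base from known_var_change.
        # Novel SNP: known_var_change is '.', so fall back to ref_ctg_change.
        change = known_var_change if known_var_change != '.' else ref_ctg_change
        var_nucleotide = change[-1]
        if var_nucleotide == '.':
            return None  # no usable variant base, so no het percentage

        depths = [int(x) for x in depths]
        nuc_to_depth = dict(zip(nucleotides, depths))
        total_depth = sum(depths)
        if total_depth == 0 or var_nucleotide not in nuc_to_depth:
            return None
        # Assumed final step: percent of reads supporting the variant base.
        return round(100 * nuc_to_depth[var_nucleotide] / total_depth, 1)

For example, the heterozygous A14T row in the deleted summary_test_get_all_het_snps.2.tsv data has alt depths '40,10'; assuming those pair with bases T and G as T:40, G:10, the sketch returns 80.0, i.e. 80% of reads supporting the variant base.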
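Taken together, patches 37, 38 and 40 rework the summary interface: the preset dictionaries now carry only the string-valued settings (cluster_cols, col_filter, row_filter), and variant reporting is driven by the new store_true flags --v_groups and --variants rather than the removed --var_cols and --het options. A rough sketch of the resulting preset handling is below; apply_preset_sketch is a hypothetical helper name, and setattr stands in for the exec() string assignment that the real use_preset() performs.

    import argparse

    def apply_preset_sketch(options, preset_to_vals, preset):
        # String-valued settings still come from the preset dictionary.
        for key, val in preset_to_vals[preset].items():
            setattr(options, key, val)
        # The boolean flags are no longer dictionary entries; they are
        # switched on by preset name, as in patch 38.
        if preset in {'cluster_var_groups', 'all', 'all_no_filter'}:
            options.v_groups = True
        if preset in {'all', 'all_no_filter'}:
            options.variants = True
        return options

    # Example: the 'cluster_var_groups' preset turns on --v_groups but not --variants.
    opts = argparse.Namespace(v_groups=False, variants=False)
    presets = {'cluster_var_groups': {'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var', 'col_filter': 'y', 'row_filter': 'y'}}
    opts = apply_preset_sketch(opts, presets, 'cluster_var_groups')
    assert opts.v_groups and not opts.variants and opts.cluster_cols.startswith('assembled')

On the command line, the same reports would be summarised with something like 'ariba summary --v_groups --variants outprefix report1.tsv report2.tsv', the two flags replacing the column list previously given to --var_cols.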