Skip to content

Commit

Permalink
Merge pull request #167 from martinghunt/summary_read_depth_column
Browse files Browse the repository at this point in the history
Summary read depth column
  • Loading branch information
martinghunt authored Mar 31, 2017
2 parents 881828b + 8e604ec commit 95102b6
Show file tree
Hide file tree
Showing 9 changed files with 75 additions and 51 deletions.
11 changes: 6 additions & 5 deletions ariba/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def __init__(
filter_rows=True,
filter_columns=True,
min_id=90.0,
cluster_cols='assembled,match,ref_seq,pct_id,known_var,novel_var',
cluster_cols='assembled,match,ref_seq,pct_id,ctg_cov,known_var,novel_var',
make_phandango_tree=True,
only_clusters=None,
show_var_groups=False,
Expand Down Expand Up @@ -62,7 +62,7 @@ def _determine_cols(cls, cols_string, allowed_cols, error_string):

@staticmethod
def _determine_cluster_cols(cols_string):
allowed_cols = {'assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'}
allowed_cols = {'assembled', 'match', 'ref_seq', 'pct_id', 'ctg_cov', 'known_var', 'novel_var'}
return Summary._determine_cols(cols_string, allowed_cols, 'cluster columns')


Expand Down Expand Up @@ -122,6 +122,7 @@ def _gather_unfiltered_output_data(self):
'match': 'no',
'novel_var': 'NA',
'pct_id': 'NA',
'ctg_cov': 'NA',
'ref_seq': 'NA'
}
else:
Expand Down Expand Up @@ -164,11 +165,11 @@ def _to_matrix(cls, filenames, all_data, all_potential_columns, cluster_cols):
matrix = []
making_header_lines = True
phandango_header = ['name']
phandango_suffixes = {'assembled': ':o1', 'match': ':o1', 'ref_seq': ':o2', 'pct_id': ':c1', 'known_var': ':o1', 'novel_var': ':o1'}
phandango_suffixes = {'assembled': ':o1', 'match': ':o1', 'ref_seq': ':o2', 'pct_id': ':c1', 'ctg_cov': ':c3', 'known_var': ':o1', 'novel_var': ':o1'}
ref_seq_counter = 2
csv_header = ['name']
summary_cols_in_order = ['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var']
summary_cols_set = set(['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'])
summary_cols_in_order = ['assembled', 'match', 'ref_seq', 'pct_id', 'ctg_cov', 'known_var', 'novel_var']
summary_cols_set = set(['assembled', 'match', 'ref_seq', 'pct_id', 'ctg_cov', 'known_var', 'novel_var'])
summary_cols_in_order = [x for x in summary_cols_in_order if cluster_cols[x]]

for filename in sorted(filenames):
Expand Down
13 changes: 9 additions & 4 deletions ariba/summary_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ class Error (Exception): pass
]


float_columns = ['pc_ident']
float_columns = ['pc_ident', 'ctg_cov']


class SummaryCluster:
def __init__(self, min_pc_id=90):
Expand Down Expand Up @@ -90,16 +91,18 @@ def add_data_dict(self, data_dict):
self.data.append(data_dict)


def pc_id_of_longest(self):
def _pc_id_and_read_depth_of_longest(self):
longest = 0
identity = 0
depth = 0

for d in self.data:
if d['ref_base_assembled'] > longest:
longest = d['ref_base_assembled']
identity = d['pc_ident']
depth = d['ctg_cov']

return identity
return identity, depth


def _has_any_part_of_ref_assembled(self):
Expand Down Expand Up @@ -316,12 +319,14 @@ def has_var_groups(self):
def column_summary_data(self):
'''Returns a dictionary of column name -> value, for cluster-level results'''
assembled_summary = self._to_cluster_summary_assembled()
pct_id, read_depth = self._pc_id_and_read_depth_of_longest()

columns = {
'assembled': self._to_cluster_summary_assembled(),
'match': self._has_match(assembled_summary),
'ref_seq': self.ref_name,
'pct_id': str(self.pc_id_of_longest()),
'pct_id': str(pct_id),
'ctg_cov': str(read_depth),
'known_var': self._to_cluster_summary_has_known_nonsynonymous(assembled_summary),
'novel_var': self._to_cluster_summary_has_novel_nonsynonymous(assembled_summary)
}
Expand Down
8 changes: 4 additions & 4 deletions ariba/tasks/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,22 @@ def use_preset(options):
'row_filter': 'y',
},
'cluster_all': {
'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
'cluster_cols': 'assembled,match,ref_seq,pct_id,ctg_cov,known_var,novel_var',
'col_filter': 'y',
'row_filter': 'y',
},
'cluster_var_groups': {
'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
'cluster_cols': 'assembled,match,ref_seq,pct_id,ctg_cov,known_var,novel_var',
'col_filter': 'y',
'row_filter': 'y',
},
'all': {
'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
'cluster_cols': 'assembled,match,ref_seq,pct_id,ctg_cov,known_var,novel_var',
'col_filter': 'y',
'row_filter': 'y',
},
'all_no_filter': {
'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
'cluster_cols': 'assembled,match,ref_seq,pct_id,ctg_cov,known_var,novel_var',
'col_filter': 'n',
'row_filter': 'n',
},
Expand Down
6 changes: 3 additions & 3 deletions ariba/tests/data/summary_test_whole_run.out.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
name,23S.assembled,23S.match,23S.ref_seq,23S.pct_id,23S.known_var,23S.2597CT,23S.2597CT.%,23S.2597TC,23S.2597TC.%,coding1.assembled,coding1.match,coding1.ref_seq,coding1.pct_id,coding2.assembled,coding2.match,coding2.ref_seq,coding2.pct_id,coding3.assembled,coding3.match,coding3.ref_seq,coding3.pct_id,coding5.assembled,coding5.match,coding5.ref_seq,coding5.pct_id,coding5.known_var,coding5.A42S,coding6.assembled,coding6.match,coding6.ref_seq,coding6.pct_id,coding6.known_var,coding6.A52S,coding7.assembled,coding7.ref_seq,coding7.pct_id,coding8.assembled,coding8.match,coding8.ref_seq,coding8.pct_id,coding8.novel_var,coding8.A53S,mdfA.assembled,mdfA.ref_seq,mdfA.pct_id,mdfA.novel_var,noncoding1.assembled,noncoding1.match,noncoding1.ref_seq,noncoding1.pct_id,noncoding10.assembled,noncoding10.match,noncoding10.ref_seq,noncoding10.pct_id,noncoding10.novel_var,noncoding10.100T,noncoding10.100T.%,noncoding11.assembled,noncoding11.match,noncoding11.ref_seq,noncoding11.pct_id,noncoding11.novel_var,noncoding11.101AG,noncoding11.101AG.%,noncoding2.assembled,noncoding2.match,noncoding2.ref_seq,noncoding2.pct_id,noncoding3.assembled,noncoding3.match,noncoding3.ref_seq,noncoding3.pct_id,noncoding5.assembled,noncoding5.match,noncoding5.ref_seq,noncoding5.pct_id,noncoding5.known_var,noncoding5.42T,noncoding5.42T.%,noncoding6.assembled,noncoding6.match,noncoding6.ref_seq,noncoding6.pct_id,noncoding6.known_var,noncoding6.52CT,noncoding6.52CT.%,noncoding7.assembled,noncoding7.match,noncoding7.ref_seq,noncoding7.pct_id,noncoding7.known_var,noncoding7.53T,noncoding7.53T.%,noncoding8.assembled,noncoding8.match,noncoding8.ref_seq,noncoding8.pct_id,noncoding9.assembled,noncoding9.ref_seq,noncoding9.pct_id
summary_test_whole_run.in.1.tsv,yes,yes,23S.rDNA_WHO_F_01358c,99.86,yes,no,NA,yes,100.0,interrupted,no,coding1_ref1,99.1,yes,yes,coding2_ref1,98.2,no,no,NA,NA,yes,no,coding5_ref1,97.4,no,no,yes,yes,coding6_ref1,95.5,yes,yes,yes,coding7_ref1,95.4,yes,yes,coding8_ref1,95.3,yes,yes,interrupted,mdfA.3001328.JQ394987.0_1233.561,97.0,yes,yes,yes,noncoding1_ref1,99.1,yes,yes,noncoding10_ref1,95.1,yes,yes,99.0,yes,yes,noncoding11_ref1,95.05,het,het,30.0,yes,yes,noncoding2_ref1,98.2,no,no,NA,NA,yes,yes,noncoding5_ref1,97.4,yes,yes,100.0,yes,yes,noncoding6_ref1,95.5,yes,het,70.0,yes,yes,noncoding7_ref1,95.4,yes,yes,98.6,yes,yes,noncoding8_ref1,95.3,yes,noncoding9_ref1,95.2
summary_test_whole_run.in.2.tsv,yes_nonunique,no,23S.rDNA_WHO_F_01358c,99.84,het,het,12.8,no,NA,yes,yes,coding1_ref2,99.2,no,no,NA,NA,yes,yes,coding3_ref1,97.6,yes,yes,coding5_ref1,97.4,yes,yes,no,no,NA,NA,NA,NA,no,NA,NA,no,no,NA,NA,NA,NA,no,NA,NA,NA,yes,yes,noncoding1_ref2,99.2,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,yes,yes,noncoding3_ref1,97.6,yes,no,noncoding5_ref1,99.42,no,no,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,no,NA,NA
name,23S.assembled,23S.match,23S.ref_seq,23S.pct_id,23S.ctg_cov,23S.known_var,23S.2597CT,23S.2597CT.%,23S.2597TC,23S.2597TC.%,coding1.assembled,coding1.match,coding1.ref_seq,coding1.pct_id,coding1.ctg_cov,coding2.assembled,coding2.match,coding2.ref_seq,coding2.pct_id,coding2.ctg_cov,coding3.assembled,coding3.match,coding3.ref_seq,coding3.pct_id,coding3.ctg_cov,coding5.assembled,coding5.match,coding5.ref_seq,coding5.pct_id,coding5.ctg_cov,coding5.known_var,coding5.A42S,coding6.assembled,coding6.match,coding6.ref_seq,coding6.pct_id,coding6.ctg_cov,coding6.known_var,coding6.A52S,coding7.assembled,coding7.ref_seq,coding7.pct_id,coding7.ctg_cov,coding8.assembled,coding8.match,coding8.ref_seq,coding8.pct_id,coding8.ctg_cov,coding8.novel_var,coding8.A53S,mdfA.assembled,mdfA.ref_seq,mdfA.pct_id,mdfA.ctg_cov,mdfA.novel_var,noncoding1.assembled,noncoding1.match,noncoding1.ref_seq,noncoding1.pct_id,noncoding1.ctg_cov,noncoding10.assembled,noncoding10.match,noncoding10.ref_seq,noncoding10.pct_id,noncoding10.ctg_cov,noncoding10.novel_var,noncoding10.100T,noncoding10.100T.%,noncoding11.assembled,noncoding11.match,noncoding11.ref_seq,noncoding11.pct_id,noncoding11.ctg_cov,noncoding11.novel_var,noncoding11.101AG,noncoding11.101AG.%,noncoding2.assembled,noncoding2.match,noncoding2.ref_seq,noncoding2.pct_id,noncoding2.ctg_cov,noncoding3.assembled,noncoding3.match,noncoding3.ref_seq,noncoding3.pct_id,noncoding3.ctg_cov,noncoding5.assembled,noncoding5.match,noncoding5.ref_seq,noncoding5.pct_id,noncoding5.ctg_cov,noncoding5.known_var,noncoding5.42T,noncoding5.42T.%,noncoding6.assembled,noncoding6.match,noncoding6.ref_seq,noncoding6.pct_id,noncoding6.ctg_cov,noncoding6.known_var,noncoding6.52CT,noncoding6.52CT.%,noncoding7.assembled,noncoding7.match,noncoding7.ref_seq,noncoding7.pct_id,noncoding7.ctg_cov,noncoding7.known_var,noncoding7.53T,noncoding7.53T.%,noncoding8.assembled,noncoding8.match,noncoding8.ref_seq,noncoding8.pct_id,noncoding8.ctg_cov,noncoding9.assembled,noncoding9.ref_seq,noncoding9.pct_id,noncoding9.ctg_cov
summary_test_whole_run.in.1.tsv,yes,yes,23S.rDNA_WHO_F_01358c,99.86,744.8,yes,no,NA,yes,100.0,interrupted,no,coding1_ref1,99.1,10.1,yes,yes,coding2_ref1,98.2,42.42,no,no,NA,NA,NA,yes,no,coding5_ref1,97.4,14.1,no,no,yes,yes,coding6_ref1,95.5,24.32,yes,yes,yes,coding7_ref1,95.4,24.32,yes,yes,coding8_ref1,95.3,24.31,yes,yes,interrupted,mdfA.3001328.JQ394987.0_1233.561,97.0,16.2,yes,yes,yes,noncoding1_ref1,99.1,10.1,yes,yes,noncoding10_ref1,95.1,24.27,yes,yes,99.0,yes,yes,noncoding11_ref1,95.05,24.26,het,het,30.0,yes,yes,noncoding2_ref1,98.2,42.42,no,no,NA,NA,NA,yes,yes,noncoding5_ref1,97.4,14.1,yes,yes,100.0,yes,yes,noncoding6_ref1,95.5,24.32,yes,het,70.0,yes,yes,noncoding7_ref1,95.4,24.31,yes,yes,98.6,yes,yes,noncoding8_ref1,95.3,24.29,yes,noncoding9_ref1,95.2,24.28
summary_test_whole_run.in.2.tsv,yes_nonunique,no,23S.rDNA_WHO_F_01358c,99.84,344.0,het,het,12.8,no,NA,yes,yes,coding1_ref2,99.2,10.1,no,no,NA,NA,NA,yes,yes,coding3_ref1,97.6,37.6,yes,yes,coding5_ref1,97.4,14.1,yes,yes,no,no,NA,NA,NA,NA,NA,no,NA,NA,NA,no,no,NA,NA,NA,NA,NA,no,NA,NA,NA,NA,yes,yes,noncoding1_ref2,99.2,10.1,no,no,NA,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,NA,no,no,NA,NA,NA,yes,yes,noncoding3_ref1,97.6,37.6,yes,no,noncoding5_ref1,99.42,14.1,no,no,NA,no,no,NA,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,NA,no,no,NA,NA,NA,no,NA,NA,NA
16 changes: 9 additions & 7 deletions ariba/tests/summary_cluster_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def test_line2dict(self):
'pc_ident': 98.33,
'ctg': 'ctg_name',
'ctg_len': 279,
'ctg_cov': '24.4',
'ctg_cov': 24.4,
'known_var': '1',
'var_type': 'SNP',
'var_seq_type': 'n',
Expand Down Expand Up @@ -85,20 +85,20 @@ def test_has_any_part_of_ref_assembled(self):
self.assertTrue(cluster._has_any_part_of_ref_assembled())


def test_pc_id_of_longest(self):
'''Test pc_id_of_longest'''
def test_pc_id_and_read_depth_of_longest(self):
'''Test _pc_id_and_read_depth_of_longest'''
cluster = summary_cluster.SummaryCluster()
self.assertTrue(cluster.name is None)
line1 = 'ariba_refname\trefname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
line2 = 'ariba_refname\trefname\t1\t0\t19\t78\tcluster\t120\t119\t98.20\tctg_name2\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
line3 = 'ariba_refname\trefname\t1\t0\t19\t78\tcluster\t120\t114\t98.32\tctg_name3\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
line1 = 'ariba_refname\trefname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t42.2\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
line2 = 'ariba_refname\trefname\t1\t0\t19\t78\tcluster\t120\t119\t98.20\tctg_name2\t279\t42.42\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
line3 = 'ariba_refname\trefname\t1\t0\t19\t78\tcluster\t120\t114\t98.32\tctg_name3\t279\t42.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
data_dict1 = summary_cluster.SummaryCluster.line2dict(line1)
data_dict2 = summary_cluster.SummaryCluster.line2dict(line2)
data_dict3 = summary_cluster.SummaryCluster.line2dict(line3)
cluster.add_data_dict(data_dict1)
cluster.add_data_dict(data_dict2)
cluster.add_data_dict(data_dict3)
self.assertEqual(98.2, cluster.pc_id_of_longest())
self.assertEqual((98.2, 42.42), cluster._pc_id_and_read_depth_of_longest())


def test_to_cluster_summary_number(self):
Expand Down Expand Up @@ -465,6 +465,7 @@ def test_column_summary_data(self):
'novel_var': 'no',
'known_var': 'yes',
'pct_id': '98.33',
'ctg_cov': '24.4',
}
got = cluster.column_summary_data()
self.assertEqual(expected, got)
Expand Down Expand Up @@ -543,6 +544,7 @@ def test_gather_data(self):
'match': 'yes',
'ref_seq': 'ref1',
'pct_id': '98.33',
'ctg_cov': '24.4',
'known_var': 'yes',
'novel_var': 'no',
}
Expand Down
9 changes: 6 additions & 3 deletions ariba/tests/summary_sample_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,23 +52,26 @@ def test_column_summary_data(self):
'ref_seq': 'noncoding1',
'known_var': 'yes',
'novel_var': 'yes',
'pct_id': '98.33'
'pct_id': '98.33',
'ctg_cov': '35.4',
},
'cluster.p': {
'assembled': 'yes',
'match': 'yes',
'ref_seq': 'presence_absence1',
'known_var': 'yes',
'novel_var': 'no',
'pct_id': '98.96'
'pct_id': '98.96',
'ctg_cov': '35.1',
},
'cluster.v': {
'assembled': 'yes',
'match': 'yes',
'ref_seq': 'variants_only1',
'known_var': 'yes',
'novel_var': 'no',
'pct_id': '100.0'
'pct_id': '100.0',
'ctg_cov': '42.4',
}
}
self.maxDiff = None
Expand Down
Loading

0 comments on commit 95102b6

Please sign in to comment.