From 996d60a505264e2f9f629efba9ef98fa90c418d8 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 17 Feb 2017 14:13:46 +0000 Subject: [PATCH 01/88] Start of MicPlotter class --- ariba/__init__.py | 1 + ariba/mic_plotter.py | 52 +++++++++++++++++++ .../tests/data/mic_plotter_load_mic_file.tsv | 7 +++ ariba/tests/mic_plotter_test.py | 43 +++++++++++++++ 4 files changed, 103 insertions(+) create mode 100644 ariba/mic_plotter.py create mode 100644 ariba/tests/data/mic_plotter_load_mic_file.tsv create mode 100644 ariba/tests/mic_plotter_test.py diff --git a/ariba/__init__.py b/ariba/__init__.py index 1ca0437e..ca6fc157 100644 --- a/ariba/__init__.py +++ b/ariba/__init__.py @@ -25,6 +25,7 @@ 'mapping', 'megares_data_finder', 'megares_zip_parser', + 'mic_plotter', 'mlst_profile', 'mlst_reporter', 'pubmlst_getter', diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py new file mode 100644 index 00000000..30844596 --- /dev/null +++ b/ariba/mic_plotter.py @@ -0,0 +1,52 @@ +import csv +import re +import os + +class Error (Exception): pass + +regex_string_to_float = re.compile(r'\s*(?P[<>]?)\s*(?P=?)\s*(?P[0-9.]+)\s*$') + +class MicPlotter: + def __init__(self, mic_file, summary_file): + self.mic_file = mic_file + self.summary_file = summary_file + + + @classmethod + def _mic_string_to_float(cls, s): + regex_match = regex_string_to_float.match(s) + + if regex_match is None or regex_match.group('number') == '.': + if s.strip() in {'NA', 'na', '', '.'}: + return 'NA' + else: + return None + + try: + flt = float(regex_match.group('number')) + except: + return None + + if regex_match.group('equals') == '': + if regex_match.group('lt_or_gt') == '<': + return 0.5 * flt + elif regex_match.group('lt_or_gt') == '>': + return 2 * flt + + return flt + + + @classmethod + def _load_mic_file(cls, infile): + mic_data = {} + + with open(infile) as f: + reader = csv.DictReader(f, delimiter='\t') + if reader.fieldnames[0] != 'Sample': + raise Error('Error. Expected first column of MIC file "' + self.infile + '" to be "Sample"') + + for row in reader: + mic_data[row['Sample']] = {x: MicPlotter._mic_string_to_float(row[x]) for x in reader.fieldnames[1:]} + + return mic_data + diff --git a/ariba/tests/data/mic_plotter_load_mic_file.tsv b/ariba/tests/data/mic_plotter_load_mic_file.tsv new file mode 100644 index 00000000..70aa315f --- /dev/null +++ b/ariba/tests/data/mic_plotter_load_mic_file.tsv @@ -0,0 +1,7 @@ +Sample antibio1 antibio2 +sample1 0.25 0.004 +sample2 <0.25 <=0.004 +sample3 < 0.25 <= 0.004 +sample4 >256 >=256 +sample5 > 256 >= 256 +sample6 NA 1 diff --git a/ariba/tests/mic_plotter_test.py b/ariba/tests/mic_plotter_test.py new file mode 100644 index 00000000..46324878 --- /dev/null +++ b/ariba/tests/mic_plotter_test.py @@ -0,0 +1,43 @@ +import unittest +import os +from ariba import mic_plotter + +modules_dir = os.path.dirname(os.path.abspath(mic_plotter.__file__)) +data_dir = os.path.join(modules_dir, 'tests', 'data') + + +class TestMicPlotter(unittest.TestCase): + def test_mic_string_to_float(self): + '''Test _mic_string_to_float''' + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('42.42'), 42.42) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('42'), 42.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('>42'), 84.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('> 42'), 84.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('>=42'), 42.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('>= 42'), 42.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('<42'), 21.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('< 42'), 21.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('<=42'), 42.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('<= 42'), 42.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float(' <= 42.0 '), 42.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('na'), 'NA') + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('NA'), 'NA') + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('.'), 'NA') + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float(' '), 'NA') + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float(''), 'NA') + + + def test_load_mic_file(self): + '''Test _load_mic_file''' + infile = os.path.join(data_dir, 'mic_plotter_load_mic_file.tsv') + got = mic_plotter.MicPlotter._load_mic_file(infile) + expected = { + 'sample1': {'antibio1': 0.25, 'antibio2': 0.004}, + 'sample2': {'antibio1': 0.125, 'antibio2': 0.004}, + 'sample3': {'antibio1': 0.125, 'antibio2': 0.004}, + 'sample4': {'antibio1': 512.0, 'antibio2': 256.0}, + 'sample5': {'antibio1': 512.0, 'antibio2': 256.0}, + 'sample6': {'antibio1': 'NA', 'antibio2': 1.0}, + } + + self.assertEqual(got, expected) From 82ef5c3cdede009e1a8a87932b12968f67677d61 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 17 Feb 2017 14:28:35 +0000 Subject: [PATCH 02/88] Option --only_clusters instead of --noly_cluster --- ariba/tasks/summary.py | 2 +- scripts/ariba | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py index c91e9829..778facb6 100644 --- a/ariba/tasks/summary.py +++ b/ariba/tasks/summary.py @@ -69,7 +69,7 @@ def run(options): min_id=options.min_id, cluster_cols=options.cluster_cols, make_phandango_tree=(not options.no_tree), - only_clusters=None if options.only_cluster is None else {options.only_cluster}, + only_clusters=None if options.only_clusters is None else set(options.only_clusters.split(',')), show_var_groups=options.v_groups, show_known_vars=options.known_variants, show_novel_vars=options.novel_variants, diff --git a/scripts/ariba b/scripts/ariba index 2ba73d82..beab2716 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -194,7 +194,7 @@ subparser_summary.add_argument('--col_filter', choices=['y', 'n'], default='y', subparser_summary.add_argument('--no_tree', action='store_true', help='Do not make phandango tree') subparser_summary.add_argument('--row_filter', choices=['y', 'n'], default='y', help='Choose whether rows where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n') subparser_summary.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT') -subparser_summary.add_argument('--only_cluster', help='Only report data for the given cluster name', metavar='Cluster_name') +subparser_summary.add_argument('--only_clusters', help='Only report data for the given comma-separated list of cluster names, eg: cluster1,cluster2,cluster42', metavar='Cluster_names') subparser_summary.add_argument('--v_groups', action='store_true', help='Show a group column for each group of variants') subparser_summary.add_argument('--known_variants', action='store_true', help='Report all known variants') subparser_summary.add_argument('--novel_variants', action='store_true', help='Report all novel variants') From 4495b0f994dba4382e2df8c75724a99c7e32102f Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 17 Feb 2017 15:45:57 +0000 Subject: [PATCH 03/88] new method _load_summary_file --- ariba/mic_plotter.py | 35 ++++++++++++++++++- .../data/mic_plotter_load_summary_file.tsv | 3 ++ ariba/tests/mic_plotter_test.py | 19 ++++++++++ 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 ariba/tests/data/mic_plotter_load_summary_file.tsv diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 30844596..68bf4c79 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -43,10 +43,43 @@ def _load_mic_file(cls, infile): with open(infile) as f: reader = csv.DictReader(f, delimiter='\t') if reader.fieldnames[0] != 'Sample': - raise Error('Error. Expected first column of MIC file "' + self.infile + '" to be "Sample"') + raise Error('Error. Expected first column of MIC file "' + infile + '" to be "Sample"') for row in reader: mic_data[row['Sample']] = {x: MicPlotter._mic_string_to_float(row[x]) for x in reader.fieldnames[1:]} return mic_data + + @classmethod + def _load_summary_file(cls, infile): + data = {} + + with open(infile) as f: + reader = csv.DictReader(f, delimiter='\t') + if reader.fieldnames[0] != 'name': + raise Error('Error. Expected first column of summary file "' + infile + '" to be "name"') + + clusters = [x.split('.', maxsplit=1)[0] for x in reader.fieldnames[1:]] + + for row in reader: + data[row['name']] = {} + + for field in row: + if field == 'name': + continue + + cluster, col = field.split('.', maxsplit=1) + if cluster not in clusters: + raise Error('Cluster "' + cluster + '" not recognised. Cannot continue') + if cluster not in data[row['name']]: + data[row['name']][cluster] = {} + + try: + value = float(row[field]) + except: + value = row[field] + data[row['name']][cluster][col] = value + + return data + diff --git a/ariba/tests/data/mic_plotter_load_summary_file.tsv b/ariba/tests/data/mic_plotter_load_summary_file.tsv new file mode 100644 index 00000000..c0a7e75c --- /dev/null +++ b/ariba/tests/data/mic_plotter_load_summary_file.tsv @@ -0,0 +1,3 @@ +name cluster1.assembled cluster1.match cluster1.ref_seq cluster1.pct_id cluster1.known_var cluster1.novel_var cluster1.group1.A42T cluster1.group1.A42T.% cluster2.assembled cluster2.match cluster2.ref_seq cluster2.pct_id cluster2.known_var cluster2.novel_var cluster2.group1.A42T cluster2.group1.A42T.% +name1 yes yes ref1 100.0 no no no NA yes yes ref2 99.0 yes no yes 95.42 +name2 yes yes_nonunique ref3 99.0 yes no yes 90.90 no no NA NA NA NA NA NA diff --git a/ariba/tests/mic_plotter_test.py b/ariba/tests/mic_plotter_test.py index 46324878..830f5da8 100644 --- a/ariba/tests/mic_plotter_test.py +++ b/ariba/tests/mic_plotter_test.py @@ -41,3 +41,22 @@ def test_load_mic_file(self): } self.assertEqual(got, expected) + + + def test_load_summary_file(self): + '''Test _load_summary_file''' + infile = os.path.join(data_dir, 'mic_plotter_load_summary_file.tsv') + got = mic_plotter.MicPlotter._load_summary_file(infile) + expected = { + 'name1': { + 'cluster1': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref1', 'pct_id': 100.0, 'known_var': 'no', 'novel_var': 'no', 'group1.A42T': 'no', 'group1.A42T.%': 'NA'}, + 'cluster2': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref2', 'pct_id': 99.0, 'known_var': 'yes', 'novel_var': 'no', 'group1.A42T': 'yes', 'group1.A42T.%': 95.42}, + }, + 'name2': { + 'cluster1': {'assembled': 'yes', 'match': 'yes_nonunique', 'ref_seq': 'ref3', 'pct_id': 99.0, 'known_var': 'yes', 'novel_var': 'no', 'group1.A42T': 'yes', 'group1.A42T.%': 90.90}, + 'cluster2': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'NA', 'novel_var': 'NA', 'group1.A42T': 'NA', 'group1.A42T.%': 'NA'}, + }, + } + self.maxDiff = None + self.assertEqual(got, expected) + From 89b2a1fe9b460ed321736df3ac91801ee26001c8 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 17 Feb 2017 16:24:34 +0000 Subject: [PATCH 04/88] New method to_boxplot_tsv --- ariba/mic_plotter.py | 39 +++++++++++++++++++ .../mic_plotter_to_boxplot_tsv.antibio1.tsv | 3 ++ .../mic_plotter_to_boxplot_tsv.antibio2.tsv | 3 ++ ariba/tests/mic_plotter_test.py | 35 +++++++++++++++++ 4 files changed, 80 insertions(+) create mode 100644 ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.tsv create mode 100644 ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.tsv diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 68bf4c79..de703162 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -83,3 +83,42 @@ def _load_summary_file(cls, infile): return data + + @classmethod + def _to_boxplot_tsv(cls, summary_data, mic_data, antibiotic, outfile): + ignore_columns = {'assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'} + with open(outfile, 'w') as f: + print('Sample\tMIC\tMutations', file=f) + + for sample in sorted(summary_data): + if sample not in mic_data: + raise Error('No MIC data found for sample "' + sample + '". Cannot continue') + + if antibiotic not in mic_data[sample]: + raise Error('Antibiotic "' + antibiotic + '" not found. Cannot continue') + + if mic_data[sample][antibiotic] == 'NA': + continue + + mutations = set() + + for cluster in summary_data[sample]: + if summary_data[sample][cluster]['assembled'] == 'interrupted': + mutations.add(cluster + '.interrupted') + + for column, value in summary_data[sample][cluster].items(): + if column in ignore_columns or column.endswith('.%'): + continue + + if value == 'yes': + mutations.add(cluster + '.' + column) + + if len(mutations): + mutations = list(mutations) + mutations.sort() + mutations = '+'.join(mutations) + else: + mutations = 'without_mutation' + + print(sample, mic_data[sample][antibiotic], mutations, sep='\t', file=f) + diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.tsv new file mode 100644 index 00000000..da14dad8 --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.tsv @@ -0,0 +1,3 @@ +Sample MIC Mutations +name1 0.25 cluster2.group2.A43T+cluster3.interrupted +name2 0.125 cluster1.group1.A42T diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.tsv new file mode 100644 index 00000000..ec781d6a --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.tsv @@ -0,0 +1,3 @@ +Sample MIC Mutations +name1 0.004 cluster2.group2.A43T+cluster3.interrupted +name3 0.002 without_mutation diff --git a/ariba/tests/mic_plotter_test.py b/ariba/tests/mic_plotter_test.py index 830f5da8..299b9893 100644 --- a/ariba/tests/mic_plotter_test.py +++ b/ariba/tests/mic_plotter_test.py @@ -1,4 +1,5 @@ import unittest +import filecmp import os from ariba import mic_plotter @@ -60,3 +61,37 @@ def test_load_summary_file(self): self.maxDiff = None self.assertEqual(got, expected) + + def test_to_boxplot_tsv(self): + '''Test _to_boxplot_tsv''' + mic_data = { + 'name1': {'antibio1': 0.25, 'antibio2': 0.004}, + 'name2': {'antibio1': 0.125, 'antibio2': 'NA'}, + 'name3': {'antibio1': 'NA', 'antibio2': 0.002}, + } + + summary_data = { + 'name1': { + 'cluster1': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref1', 'pct_id': 100.0, 'known_var': 'no', 'novel_var': 'no', 'group1.A42T': 'no', 'group1.A42T.%': 'NA'}, + 'cluster2': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref2', 'pct_id': 99.0, 'known_var': 'yes', 'novel_var': 'no', 'group2.A43T': 'yes', 'group2.A43T.%': 95.42}, + 'cluster3': {'assembled': 'interrupted', 'match': 'no', 'ref_seq': 'ref3', 'pct_id': 99.0, 'known_var': 'no', 'novel_var': 'yes', 'A42T': 'no', 'A44T.%': 'NA'}, + }, + 'name2': { + 'cluster1': {'assembled': 'yes', 'match': 'yes_nonunique', 'ref_seq': 'ref3', 'pct_id': 99.0, 'known_var': 'yes', 'novel_var': 'no', 'group1.A42T': 'yes', 'group1.A42T.%': 90.90}, + 'cluster2': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'NA', 'novel_var': 'NA', 'group2.A43T': 'NA', 'group2.A43T.%': 'NA'}, + 'cluster3': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'no', 'novel_var': 'no', 'A42T': 'no', 'A44T.%': 'NA'}, + }, + 'name3': { + 'cluster1': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref_seq42', 'pct_id': 100.0, 'known_var': 'no', 'novel_var': 'no', 'group1.A42T': 'no', 'group1.A42T.%': 'NA'}, + 'cluster2': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'NA', 'novel_var': 'NA', 'group2.A43T': 'NA', 'group2.A43T.%': 'NA'}, + 'cluster3': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'no', 'novel_var': 'no', 'A42T': 'no', 'A44T.%': 'NA'}, + }, + } + + tmp_tsv = 'tmp.mic_plotter_test.to_boxplot.tsv' + for i in ['1', '2']: + mic_plotter.MicPlotter._to_boxplot_tsv(summary_data, mic_data, 'antibio' + i, tmp_tsv) + expected_antibio1 = os.path.join(data_dir, 'mic_plotter_to_boxplot_tsv.antibio' + i + '.tsv') + self.assertTrue(filecmp.cmp(tmp_tsv, expected_antibio1, shallow=False)) + os.unlink(tmp_tsv) + From 50994ac27293649f9d9219c58ca376c875cb4d83 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 17 Feb 2017 16:37:29 +0000 Subject: [PATCH 05/88] Return mutation info --- ariba/mic_plotter.py | 18 ++++++++++++------ ariba/tests/mic_plotter_test.py | 20 ++++++++++++++++---- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index de703162..21dde3b2 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -87,6 +87,9 @@ def _load_summary_file(cls, infile): @classmethod def _to_boxplot_tsv(cls, summary_data, mic_data, antibiotic, outfile): ignore_columns = {'assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'} + all_mutations = set() + all_mutations_seen_combinations = set() + with open(outfile, 'w') as f: print('Sample\tMIC\tMutations', file=f) @@ -113,12 +116,15 @@ def _to_boxplot_tsv(cls, summary_data, mic_data, antibiotic, outfile): if value == 'yes': mutations.add(cluster + '.' + column) - if len(mutations): - mutations = list(mutations) - mutations.sort() - mutations = '+'.join(mutations) - else: - mutations = 'without_mutation' + if len(mutations) == 0: + mutations.add('without_mutation') + all_mutations.update(mutations) + mutations = list(mutations) + mutations.sort() + all_mutations_seen_combinations.add(tuple(mutations)) + mutations = '+'.join(mutations) print(sample, mic_data[sample][antibiotic], mutations, sep='\t', file=f) + return all_mutations, all_mutations_seen_combinations + diff --git a/ariba/tests/mic_plotter_test.py b/ariba/tests/mic_plotter_test.py index 299b9893..2f392975 100644 --- a/ariba/tests/mic_plotter_test.py +++ b/ariba/tests/mic_plotter_test.py @@ -89,9 +89,21 @@ def test_to_boxplot_tsv(self): } tmp_tsv = 'tmp.mic_plotter_test.to_boxplot.tsv' - for i in ['1', '2']: - mic_plotter.MicPlotter._to_boxplot_tsv(summary_data, mic_data, 'antibio' + i, tmp_tsv) - expected_antibio1 = os.path.join(data_dir, 'mic_plotter_to_boxplot_tsv.antibio' + i + '.tsv') - self.assertTrue(filecmp.cmp(tmp_tsv, expected_antibio1, shallow=False)) + expected_mutations = [ + {'cluster1.group1.A42T', 'cluster2.group2.A43T', 'cluster3.interrupted'}, + {'without_mutation', 'cluster2.group2.A43T', 'cluster3.interrupted'}, + ] + + expected_combs = [ + {('cluster2.group2.A43T', 'cluster3.interrupted'), ('cluster1.group1.A42T',)}, + {('without_mutation',), ('cluster2.group2.A43T', 'cluster3.interrupted')} + ] + + for i in [1, 2]: + got_muts, got_combs = mic_plotter.MicPlotter._to_boxplot_tsv(summary_data, mic_data, 'antibio' + str(i), tmp_tsv) + expected_tsv = os.path.join(data_dir, 'mic_plotter_to_boxplot_tsv.antibio' + str(i) + '.tsv') + self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False)) + self.assertEqual(got_muts, expected_mutations[i-1]) + self.assertEqual(got_combs, expected_combs[i-1]) os.unlink(tmp_tsv) From c940627ab792cbc390db091b81831b0e78a7683f Mon Sep 17 00:00:00 2001 From: martinghunt Date: Mon, 20 Feb 2017 11:53:49 +0000 Subject: [PATCH 06/88] New method to_dots_tsv --- ariba/mic_plotter.py | 33 +++++++++++++++++++ ariba/tests/data/mic_plotter_to_dots.tsv | 4 +++ .../mic_plotter_to_dots_without_mutation.tsv | 6 ++++ ariba/tests/mic_plotter_test.py | 24 ++++++++++++++ 4 files changed, 67 insertions(+) create mode 100644 ariba/tests/data/mic_plotter_to_dots.tsv create mode 100644 ariba/tests/data/mic_plotter_to_dots_without_mutation.tsv diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 21dde3b2..2d8fc843 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -128,3 +128,36 @@ def _to_boxplot_tsv(cls, summary_data, mic_data, antibiotic, outfile): return all_mutations, all_mutations_seen_combinations + + @classmethod + def _to_dots_tsv(cls, all_mutations, combinations, outfile): + if 'without_mutation' in all_mutations: + all_mutations.remove('without_mutation') + combinations.remove(('without_mutation',)) + has_without_mutation = True + else: + has_without_mutation = False + + all_mutations = list(all_mutations) + all_mutations.sort() + combinations = list(combinations) + combinations.sort() + + if has_without_mutation: + all_mutations.append('without_mutation') + combinations.append(('without_mutation',)) + + output_columns = {} + for combination in combinations: + output_columns[combination] = [(1 if x in combination else 0) for x in all_mutations] + + with open(outfile, 'w') as f: + print('Mutation\t', end='', file=f) + for x in combinations: + print('\t', '+'.join(x), sep='', end='', file=f) + print('', file=f) + + for i in range(len(all_mutations)): + row = [all_mutations[i]] + [output_columns[x][i] for x in combinations] + print(*row, sep='\t', file=f) + diff --git a/ariba/tests/data/mic_plotter_to_dots.tsv b/ariba/tests/data/mic_plotter_to_dots.tsv new file mode 100644 index 00000000..307ba9c6 --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_dots.tsv @@ -0,0 +1,4 @@ +Mutation m1 m1+m3 m2+m3 +m1 1 1 0 +m2 0 0 1 +m3 0 1 1 diff --git a/ariba/tests/data/mic_plotter_to_dots_without_mutation.tsv b/ariba/tests/data/mic_plotter_to_dots_without_mutation.tsv new file mode 100644 index 00000000..a50841fa --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_dots_without_mutation.tsv @@ -0,0 +1,6 @@ +Mutation m1 m1+m3 m1+z1 m2+m3 without_mutation +m1 1 1 1 0 0 +m2 0 0 0 1 0 +m3 0 1 0 1 0 +z1 0 0 1 0 0 +without_mutation 0 0 0 0 1 diff --git a/ariba/tests/mic_plotter_test.py b/ariba/tests/mic_plotter_test.py index 2f392975..da5ec535 100644 --- a/ariba/tests/mic_plotter_test.py +++ b/ariba/tests/mic_plotter_test.py @@ -107,3 +107,27 @@ def test_to_boxplot_tsv(self): self.assertEqual(got_combs, expected_combs[i-1]) os.unlink(tmp_tsv) + + def test_to_dots_tsv(self): + '''test _to_dots_tsv''' + all_mutations = {'m1', 'm2', 'm3'} + combinations = { + ('m1',), + ('m1', 'm3'), + ('m2', 'm3'), + } + + tmp_tsv = 'tmp.test.mic_plotter_to_dots.tsv' + expected_tsv = os.path.join(data_dir, 'mic_plotter_to_dots.tsv') + mic_plotter.MicPlotter._to_dots_tsv(all_mutations, combinations, tmp_tsv) + self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False)) + os.unlink(tmp_tsv) + + all_mutations.update({'without_mutation', 'z1'}) + combinations.add(('without_mutation',)) + combinations.add(('m1', 'z1')) + expected_tsv = os.path.join(data_dir, 'mic_plotter_to_dots_without_mutation.tsv') + mic_plotter.MicPlotter._to_dots_tsv(all_mutations, combinations, tmp_tsv) + self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False)) + os.unlink(tmp_tsv) + From 1ff9338da4912915c5977832e6b4c00ad9768cc4 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Mon, 20 Feb 2017 14:54:43 +0000 Subject: [PATCH 07/88] Separate with dots instead of plus for R happiness --- ariba/mic_plotter.py | 5 +++-- ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.tsv | 2 +- ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.tsv | 2 +- ariba/tests/data/mic_plotter_to_dots.tsv | 2 +- ariba/tests/data/mic_plotter_to_dots_without_mutation.tsv | 2 +- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 2d8fc843..583ec9b2 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -1,6 +1,7 @@ import csv import re import os +from ariba import common class Error (Exception): pass @@ -123,7 +124,7 @@ def _to_boxplot_tsv(cls, summary_data, mic_data, antibiotic, outfile): mutations = list(mutations) mutations.sort() all_mutations_seen_combinations.add(tuple(mutations)) - mutations = '+'.join(mutations) + mutations = '.'.join(mutations) print(sample, mic_data[sample][antibiotic], mutations, sep='\t', file=f) return all_mutations, all_mutations_seen_combinations @@ -154,7 +155,7 @@ def _to_dots_tsv(cls, all_mutations, combinations, outfile): with open(outfile, 'w') as f: print('Mutation\t', end='', file=f) for x in combinations: - print('\t', '+'.join(x), sep='', end='', file=f) + print('\t', '.'.join(x), sep='', end='', file=f) print('', file=f) for i in range(len(all_mutations)): diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.tsv index da14dad8..4806ebc3 100644 --- a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.tsv +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.tsv @@ -1,3 +1,3 @@ Sample MIC Mutations -name1 0.25 cluster2.group2.A43T+cluster3.interrupted +name1 0.25 cluster2.group2.A43T.cluster3.interrupted name2 0.125 cluster1.group1.A42T diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.tsv index ec781d6a..37b20976 100644 --- a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.tsv +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.tsv @@ -1,3 +1,3 @@ Sample MIC Mutations -name1 0.004 cluster2.group2.A43T+cluster3.interrupted +name1 0.004 cluster2.group2.A43T.cluster3.interrupted name3 0.002 without_mutation diff --git a/ariba/tests/data/mic_plotter_to_dots.tsv b/ariba/tests/data/mic_plotter_to_dots.tsv index 307ba9c6..7631ca35 100644 --- a/ariba/tests/data/mic_plotter_to_dots.tsv +++ b/ariba/tests/data/mic_plotter_to_dots.tsv @@ -1,4 +1,4 @@ -Mutation m1 m1+m3 m2+m3 +Mutation m1 m1.m3 m2.m3 m1 1 1 0 m2 0 0 1 m3 0 1 1 diff --git a/ariba/tests/data/mic_plotter_to_dots_without_mutation.tsv b/ariba/tests/data/mic_plotter_to_dots_without_mutation.tsv index a50841fa..497bad15 100644 --- a/ariba/tests/data/mic_plotter_to_dots_without_mutation.tsv +++ b/ariba/tests/data/mic_plotter_to_dots_without_mutation.tsv @@ -1,4 +1,4 @@ -Mutation m1 m1+m3 m1+z1 m2+m3 without_mutation +Mutation m1 m1.m3 m1.z1 m2.m3 without_mutation m1 1 1 1 0 0 m2 0 0 0 1 0 m3 0 1 0 1 0 From f0a50e9716d5cecb87a2c0d07cfc8762b08fddcd Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 21 Feb 2017 13:08:43 +0000 Subject: [PATCH 08/88] Add method _make_plot, fix some file format issues --- ariba/mic_plotter.py | 133 +++++++++++++++++- .../data/mic_plotter_load_summary_file.tsv | 6 +- ariba/tests/data/mic_plotter_to_dots.tsv | 2 +- .../mic_plotter_to_dots_without_mutation.tsv | 2 +- 4 files changed, 135 insertions(+), 8 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 583ec9b2..6741a9f1 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -8,9 +8,24 @@ class Error (Exception): pass regex_string_to_float = re.compile(r'\s*(?P[<>]?)\s*(?P=?)\s*(?P[0-9.]+)\s*$') class MicPlotter: - def __init__(self, mic_file, summary_file): + def __init__(self, + antibiotic, + mic_file, + summary_file, + outprefix, + main_title=None, + plot_height=15, + plot_width=15, + log_y=True, + ): + self.antibiotic = antibiotic self.mic_file = mic_file self.summary_file = summary_file + self.outprefix = outprefix + self.main_title = self.antibiotic if main_title is None else main_title + self.plot_height = plot_height + self.plot_width = plot_width + self.log_y = log_y @classmethod @@ -57,7 +72,8 @@ def _load_summary_file(cls, infile): data = {} with open(infile) as f: - reader = csv.DictReader(f, delimiter='\t') + reader = csv.DictReader(f, delimiter=',') + if reader.fieldnames[0] != 'name': raise Error('Error. Expected first column of summary file "' + infile + '" to be "name"') @@ -153,7 +169,7 @@ def _to_dots_tsv(cls, all_mutations, combinations, outfile): output_columns[combination] = [(1 if x in combination else 0) for x in all_mutations] with open(outfile, 'w') as f: - print('Mutation\t', end='', file=f) + print('Mutation', end='', file=f) for x in combinations: print('\t', '.'.join(x), sep='', end='', file=f) print('', file=f) @@ -162,3 +178,114 @@ def _to_dots_tsv(cls, all_mutations, combinations, outfile): row = [all_mutations[i]] + [output_columns[x][i] for x in combinations] print(*row, sep='\t', file=f) + + def _make_plot(self, + samples_file, + dots_file, + ): + r_script = self.outprefix + '.R' + + try: + f = open(r_script, 'w') + except: + raise Error('Error opening R script for writing "' + r_script + '"') + + libraries = ['ggplot2', 'RColorBrewer', 'reshape2', 'cowplot', 'latex2exp'] + for lib in libraries: + print('library(', lib, ')', sep='', file=f) + + print('samples = read.csv(file="', samples_file, r'''", header=TRUE, sep="\t")''', sep='', file=f) + print('dots = read.csv(file="', dots_file, r'''", header=TRUE, sep="\t")''', sep='', file=f) + + if self.log_y: + print('use.log = TRUE', file=f) + else: + print('use.log = FALSE', file=f) + + print(r''' +dots.melt = melt(dots) +colnames(dots.melt) <- c("var1", "var2", "value") + +accent <- brewer.pal(8, 'Accent') + accentPalette <- colorRampPalette(accent) + ncols <- length(as.vector(unique(samples$Mutations))) + cols <- accentPalette(ncols) + + +names(cols) <- sort(as.vector(unique(samples$Mutations))) +setcols <- c() + +for (i in 1:nrow(dots.melt)){ + if (dots.melt[i,3]==1){ + setcols <- c(setcols, cols[as.vector(dots.melt[i,2])]) + } + else{ + setcols <- c(setcols, NA) + } +} +dots.melt <- cbind(dots.melt, setcols) + +mutations <- levels(dots.melt$var1) +i <- match("without_mutation", mutations) +if (!is.na(i)) { + mutations <- c(mutations[-i], "without_mutation") +} + + +dotplot <- ggplot(dots.melt, aes(x=var2, y=var1)) + + geom_point(aes(fill=setcols, colour=setcols), size=8) + + scale_fill_identity()+ + scale_colour_identity()+ + ylim(rev(mutations)) + + theme_bw() + + theme(axis.text.x = element_blank(), + axis.text.y = element_text(size=18), + axis.title.x = element_blank(), + axis.title.y = element_blank(), + axis.ticks = element_blank(), + panel.border = element_blank(), + panel.grid.minor = element_blank(), + panel.grid.major = element_blank(), + legend.position="none") + +range.mics <- c(0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024) +if (use.log){ final.mics <- log(range.mics) }else{ final.mics <- range.mics } + +violinplot <- ggplot(data=samples, aes(x=Mutations, y=log(MIC))) + + geom_point(aes(color=Mutations), position = position_jitter(width=0.01, height=0.01), size=4, alpha=.5) + + geom_violin(aes(color=Mutations),alpha=.10, show.legend = FALSE) + + ylab(expression(paste("MIC ", mu, "g/mL"))) + + scale_colour_manual(values = cols) + + ggtitle("''' + self.main_title + r'''") + + scale_y_continuous(breaks=final.mics, labels=range.mics) + + theme_bw() + + theme(panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_blank(), + axis.line.x = element_line(color="black"), + axis.line.y = element_line(color="black"), + axis.title.x = element_blank(), + axis.title.y = element_text(size=22), + axis.text.x = element_blank(), + axis.text.y = element_text(size=24), + axis.title = element_text(size=20), + plot.title = element_text(lineheight=.6, size = 24, hjust=.5, face="bold"), + legend.position="none") + +plot_grid(violinplot, dotplot, ncol=1, align="v", rel_heights=c(3,1)) +''', file=f) + + print('ggsave("', self.outprefix, '.pdf", height=', self.plot_height, ', width=', self.plot_width, ')', sep='', file=f) + f.close() + common.syscall('R CMD BATCH ' + r_script) + + + def run(self): + mic_data = MicPlotter._load_mic_file(self.mic_file) + summary_data = MicPlotter._load_summary_file(self.summary_file) + boxplot_tsv = self.outprefix + '.boxplot.tsv' + all_mutations, combinations = MicPlotter._to_boxplot_tsv(summary_data, mic_data, self.antibiotic, boxplot_tsv) + dots_tsv = self.outprefix + '.dots.tsv' + MicPlotter._to_dots_tsv(all_mutations, combinations, dots_tsv) + self._make_plot(boxplot_tsv, dots_tsv) + diff --git a/ariba/tests/data/mic_plotter_load_summary_file.tsv b/ariba/tests/data/mic_plotter_load_summary_file.tsv index c0a7e75c..05c56772 100644 --- a/ariba/tests/data/mic_plotter_load_summary_file.tsv +++ b/ariba/tests/data/mic_plotter_load_summary_file.tsv @@ -1,3 +1,3 @@ -name cluster1.assembled cluster1.match cluster1.ref_seq cluster1.pct_id cluster1.known_var cluster1.novel_var cluster1.group1.A42T cluster1.group1.A42T.% cluster2.assembled cluster2.match cluster2.ref_seq cluster2.pct_id cluster2.known_var cluster2.novel_var cluster2.group1.A42T cluster2.group1.A42T.% -name1 yes yes ref1 100.0 no no no NA yes yes ref2 99.0 yes no yes 95.42 -name2 yes yes_nonunique ref3 99.0 yes no yes 90.90 no no NA NA NA NA NA NA +name,cluster1.assembled,cluster1.match,cluster1.ref_seq,cluster1.pct_id,cluster1.known_var,cluster1.novel_var,cluster1.group1.A42T,cluster1.group1.A42T.%,cluster2.assembled,cluster2.match,cluster2.ref_seq,cluster2.pct_id,cluster2.known_var,cluster2.novel_var,cluster2.group1.A42T,cluster2.group1.A42T.% +name1,yes,yes,ref1,100.0,no,no,no,NA,yes,yes,ref2,99.0,yes,no,yes,95.42 +name2,yes,yes_nonunique,ref3,99.0,yes,no,yes,90.90,no,no,NA,NA,NA,NA,NA,NA diff --git a/ariba/tests/data/mic_plotter_to_dots.tsv b/ariba/tests/data/mic_plotter_to_dots.tsv index 7631ca35..a188abfb 100644 --- a/ariba/tests/data/mic_plotter_to_dots.tsv +++ b/ariba/tests/data/mic_plotter_to_dots.tsv @@ -1,4 +1,4 @@ -Mutation m1 m1.m3 m2.m3 +Mutation m1 m1.m3 m2.m3 m1 1 1 0 m2 0 0 1 m3 0 1 1 diff --git a/ariba/tests/data/mic_plotter_to_dots_without_mutation.tsv b/ariba/tests/data/mic_plotter_to_dots_without_mutation.tsv index 497bad15..b7b79f00 100644 --- a/ariba/tests/data/mic_plotter_to_dots_without_mutation.tsv +++ b/ariba/tests/data/mic_plotter_to_dots_without_mutation.tsv @@ -1,4 +1,4 @@ -Mutation m1 m1.m3 m1.z1 m2.m3 without_mutation +Mutation m1 m1.m3 m1.z1 m2.m3 without_mutation m1 1 1 1 0 0 m2 0 0 0 1 0 m3 0 1 0 1 0 From b0e50ece0ad968ae25e87aa7ac6238ad4ea466ba Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 21 Feb 2017 13:09:20 +0000 Subject: [PATCH 09/88] Add task micplot --- ariba/tasks/__init__.py | 1 + ariba/tasks/micplot.py | 17 +++++++++++++++++ scripts/ariba | 17 +++++++++++++++++ 3 files changed, 35 insertions(+) create mode 100644 ariba/tasks/micplot.py diff --git a/ariba/tasks/__init__.py b/ariba/tasks/__init__.py index 769af324..299f5181 100644 --- a/ariba/tasks/__init__.py +++ b/ariba/tasks/__init__.py @@ -2,6 +2,7 @@ 'aln2meta', 'flag', 'getref', + 'micplot', 'prepareref', 'pubmlstget', 'pubmlstspecies', diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py new file mode 100644 index 00000000..eef793fc --- /dev/null +++ b/ariba/tasks/micplot.py @@ -0,0 +1,17 @@ +import argparse +import ariba + +def run(options): + print('HELLO') + plotter = ariba.mic_plotter.MicPlotter( + options.antibiotic, + options.mic_file, + options.summary_file, + options.outprefix, + main_title=options.main_title, + plot_height=options.plot_height, + plot_width=options.plot_width, + log_y=not options.no_log_y + ) + + plotter.run() diff --git a/scripts/ariba b/scripts/ariba index beab2716..39b8b955 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -56,6 +56,23 @@ subparser_getref.add_argument('outprefix', help='Prefix of output filenames') subparser_getref.set_defaults(func=ariba.tasks.getref.run) +#----------------------------- micplot ------------------------------- +subparser_micplot = subparsers.add_parser( + 'micplot', + help='Make violin/dot plots using MIC data', + usage='ariba prepareref [options] ', + description='Makes a violin and scatter plot of MIC per variant in the summary file', +) +subparser_micplot.add_argument('antibiotic', help='Antibiotic name. Must exactly match a column from the MIC file') +subparser_micplot.add_argument('mic_file', help='File containing MIC data for each sample and one or more antibiotics') +subparser_micplot.add_argument('summary_file', help='File made by running "ariba summary"') +subparser_micplot.add_argument('outprefix', help='Prefix of output files') +subparser_micplot.add_argument('--main_title', help='Main title of plot. Default is to use the antibiotic name') +subparser_micplot.add_argument('--plot_height', help='Height of plot (used in plot_height=X when running ggsave [%(default)s]', default=15) +subparser_micplot.add_argument('--plot_width', help='Height of plot (used in plot_width=X when running ggsave [%(default)s]', default=15) +subparser_micplot.add_argument('--no_log_y', action='store_true', help='Do not log scale the y axis') +subparser_micplot.set_defaults(func=ariba.tasks.micplot.run) + #----------------------------- prepareref ------------------------------- subparser_prepareref = subparsers.add_parser( 'prepareref', From 2ff0f0cd0a325a95305d57c199ac69ef4390d46e Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 21 Feb 2017 13:15:29 +0000 Subject: [PATCH 10/88] Allow columns starting with digits to stop R doing that annoying thing of adding Xs all over the place. --- ariba/mic_plotter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 6741a9f1..f50b9d7b 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -195,7 +195,7 @@ def _make_plot(self, print('library(', lib, ')', sep='', file=f) print('samples = read.csv(file="', samples_file, r'''", header=TRUE, sep="\t")''', sep='', file=f) - print('dots = read.csv(file="', dots_file, r'''", header=TRUE, sep="\t")''', sep='', file=f) + print('dots = read.csv(file="', dots_file, r'''", header=TRUE, sep="\t", check.names=FALSE)''', sep='', file=f) if self.log_y: print('use.log = TRUE', file=f) From 8ce89bf636cd6dc1f3f0068e2961c3a4514b3eab Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 21 Feb 2017 13:36:05 +0000 Subject: [PATCH 11/88] Add option --plot_types --- ariba/mic_plotter.py | 54 ++++++++++++++++++++++++++---------------- ariba/tasks/micplot.py | 3 ++- scripts/ariba | 1 + 3 files changed, 37 insertions(+), 21 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index f50b9d7b..e7f2aef7 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -17,6 +17,7 @@ def __init__(self, plot_height=15, plot_width=15, log_y=True, + plot_types="points,violin", ): self.antibiotic = antibiotic self.mic_file = mic_file @@ -26,6 +27,11 @@ def __init__(self, self.plot_height = plot_height self.plot_width = plot_width self.log_y = log_y + self.plot_types = set(plot_types.split(',')) + + allowed_plot_types = {'point', 'violin', 'boxplot'} + if not self.plot_types.issubset(allowed_plot_types): + raise Error('Error in plot_types option. Allowed types are: ' + str(allowed_plot_types) + '. Got: ' + str(self.plot_types)) @classmethod @@ -251,26 +257,34 @@ def _make_plot(self, range.mics <- c(0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024) if (use.log){ final.mics <- log(range.mics) }else{ final.mics <- range.mics } -violinplot <- ggplot(data=samples, aes(x=Mutations, y=log(MIC))) + - geom_point(aes(color=Mutations), position = position_jitter(width=0.01, height=0.01), size=4, alpha=.5) + - geom_violin(aes(color=Mutations),alpha=.10, show.legend = FALSE) + - ylab(expression(paste("MIC ", mu, "g/mL"))) + - scale_colour_manual(values = cols) + - ggtitle("''' + self.main_title + r'''") + - scale_y_continuous(breaks=final.mics, labels=range.mics) + - theme_bw() + - theme(panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_blank(), - axis.line.x = element_line(color="black"), - axis.line.y = element_line(color="black"), - axis.title.x = element_blank(), - axis.title.y = element_text(size=22), - axis.text.x = element_blank(), - axis.text.y = element_text(size=24), - axis.title = element_text(size=20), - plot.title = element_text(lineheight=.6, size = 24, hjust=.5, face="bold"), - legend.position="none") +violinplot <- ggplot(data=samples, aes(x=Mutations, y=log(MIC))) + ''', file=f) + + if 'point' in self.plot_types: + print(r''' geom_point(aes(color=Mutations), position = position_jitter(width=0.01, height=0.01), size=4, alpha=.5) +''', file=f) + + if 'violin' in self.plot_types: + print(r''' geom_violin(aes(color=Mutations),alpha=.10, show.legend = FALSE) +''', file=f) + + if 'boxplot' in self.plot_types: + print(r''' geom_boxplot(aes(color=Mutations),alpha=.10, show.legend = FALSE) +''', file=f) + + print(r''' ylab(expression(paste("MIC ", mu, "g/mL"))) + + scale_colour_manual(values = cols) + + ggtitle("''' + self.main_title + r'''") + + scale_y_continuous(breaks=final.mics, labels=range.mics) + + theme_bw() + + theme(panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_blank(), + axis.line.x = element_line(color="black"), + axis.line.y = element_line(color="black"), + axis.title.x = element_blank(), + axis.title.y = element_text(size=22), + axis.text.x = element_blank(), + axis.text.y = element_text(size=24), + axis.title = element_text(size=20), + plot.title = element_text(lineheight=.6, size = 24, hjust=.5, face="bold"), + legend.position="none") plot_grid(violinplot, dotplot, ncol=1, align="v", rel_heights=c(3,1)) ''', file=f) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index eef793fc..d271038b 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -11,7 +11,8 @@ def run(options): main_title=options.main_title, plot_height=options.plot_height, plot_width=options.plot_width, - log_y=not options.no_log_y + log_y=not options.no_log_y, + plot_types=options.plot_types ) plotter.run() diff --git a/scripts/ariba b/scripts/ariba index 39b8b955..346d1230 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -71,6 +71,7 @@ subparser_micplot.add_argument('--main_title', help='Main title of plot. Default subparser_micplot.add_argument('--plot_height', help='Height of plot (used in plot_height=X when running ggsave [%(default)s]', default=15) subparser_micplot.add_argument('--plot_width', help='Height of plot (used in plot_width=X when running ggsave [%(default)s]', default=15) subparser_micplot.add_argument('--no_log_y', action='store_true', help='Do not log scale the y axis') +subparser_micplot.add_argument('--plot_types', help='Types of plots to make, separated by commas. Choose from boxplot,violin,point [%(default)s]', default='violin,point') subparser_micplot.set_defaults(func=ariba.tasks.micplot.run) #----------------------------- prepareref ------------------------------- From d3c499a7f4931daf5cc3b37140c9d22a47acb368 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 21 Feb 2017 13:36:29 +0000 Subject: [PATCH 12/88] Remove debug print --- ariba/tasks/micplot.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index d271038b..dc9b19d3 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -2,7 +2,6 @@ import ariba def run(options): - print('HELLO') plotter = ariba.mic_plotter.MicPlotter( options.antibiotic, options.mic_file, From 6644a239cf676eaa17c1fa637ff29a38cb28f9b0 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 21 Feb 2017 13:49:54 +0000 Subject: [PATCH 13/88] Fix log_y option --- ariba/mic_plotter.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index e7f2aef7..dfd943a5 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -256,8 +256,12 @@ def _make_plot(self, range.mics <- c(0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024) if (use.log){ final.mics <- log(range.mics) }else{ final.mics <- range.mics } +''', file=f) -violinplot <- ggplot(data=samples, aes(x=Mutations, y=log(MIC))) + ''', file=f) + if self.log_y: + print(r'''violinplot <- ggplot(data=samples, aes(x=Mutations, y=log(MIC))) +''', file=f) + else: + print(r'''violinplot <- ggplot(data=samples, aes(x=Mutations, y=MIC)) +''', file=f) if 'point' in self.plot_types: print(r''' geom_point(aes(color=Mutations), position = position_jitter(width=0.01, height=0.01), size=4, alpha=.5) +''', file=f) From 3c4aa3e301e13780eae83f96a2b56be67f083dcf Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 21 Feb 2017 14:01:37 +0000 Subject: [PATCH 14/88] Add options jitter_width jitter_height --- ariba/mic_plotter.py | 7 ++++++- ariba/tasks/micplot.py | 4 +++- scripts/ariba | 2 ++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index dfd943a5..8eb4440a 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -18,6 +18,8 @@ def __init__(self, plot_width=15, log_y=True, plot_types="points,violin", + jitter_width=0.1, + jitter_height=0.01, ): self.antibiotic = antibiotic self.mic_file = mic_file @@ -33,6 +35,9 @@ def __init__(self, if not self.plot_types.issubset(allowed_plot_types): raise Error('Error in plot_types option. Allowed types are: ' + str(allowed_plot_types) + '. Got: ' + str(self.plot_types)) + self.jitter_width = jitter_width + self.jitter_height = jitter_height + @classmethod def _mic_string_to_float(cls, s): @@ -264,7 +269,7 @@ def _make_plot(self, print(r'''violinplot <- ggplot(data=samples, aes(x=Mutations, y=MIC)) +''', file=f) if 'point' in self.plot_types: - print(r''' geom_point(aes(color=Mutations), position = position_jitter(width=0.01, height=0.01), size=4, alpha=.5) +''', file=f) + print(r''' geom_point(aes(color=Mutations), position = position_jitter(width=''', self.jitter_width, ', height=', self.jitter_height, '), size=4, alpha=.5) +', sep='', file=f) if 'violin' in self.plot_types: print(r''' geom_violin(aes(color=Mutations),alpha=.10, show.legend = FALSE) +''', file=f) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index dc9b19d3..3dd8fe1a 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -11,7 +11,9 @@ def run(options): plot_height=options.plot_height, plot_width=options.plot_width, log_y=not options.no_log_y, - plot_types=options.plot_types + plot_types=options.plot_types, + jitter_width=options.jitter_width, + jitter_height=options.jitter_height, ) plotter.run() diff --git a/scripts/ariba b/scripts/ariba index 346d1230..d8dfeac2 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -72,6 +72,8 @@ subparser_micplot.add_argument('--plot_height', help='Height of plot (used in pl subparser_micplot.add_argument('--plot_width', help='Height of plot (used in plot_width=X when running ggsave [%(default)s]', default=15) subparser_micplot.add_argument('--no_log_y', action='store_true', help='Do not log scale the y axis') subparser_micplot.add_argument('--plot_types', help='Types of plots to make, separated by commas. Choose from boxplot,violin,point [%(default)s]', default='violin,point') +subparser_micplot.add_argument('--jitter_width', help='Jitter width option when plotting points [%(default)]', default=0.1, type=float) +subparser_micplot.add_argument('--jitter_height', help='Jitter height option when plotting points [%(default)]', default=0.02, type=float) subparser_micplot.set_defaults(func=ariba.tasks.micplot.run) #----------------------------- prepareref ------------------------------- From 8132972c19e9fa6c7ec91174f73cbebb162589c4 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 21 Feb 2017 14:06:41 +0000 Subject: [PATCH 15/88] Tidy up help for micplot --- scripts/ariba | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/ariba b/scripts/ariba index d8dfeac2..b54519ea 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -67,13 +67,13 @@ subparser_micplot.add_argument('antibiotic', help='Antibiotic name. Must exactly subparser_micplot.add_argument('mic_file', help='File containing MIC data for each sample and one or more antibiotics') subparser_micplot.add_argument('summary_file', help='File made by running "ariba summary"') subparser_micplot.add_argument('outprefix', help='Prefix of output files') -subparser_micplot.add_argument('--main_title', help='Main title of plot. Default is to use the antibiotic name') -subparser_micplot.add_argument('--plot_height', help='Height of plot (used in plot_height=X when running ggsave [%(default)s]', default=15) -subparser_micplot.add_argument('--plot_width', help='Height of plot (used in plot_width=X when running ggsave [%(default)s]', default=15) +subparser_micplot.add_argument('--main_title', help='Main title of plot. Default is to use the antibiotic name', metavar='"title in quotes"') +subparser_micplot.add_argument('--plot_height', help='Height of plot (used in plot_height=X when running ggsave [%(default)s]', default=15, type=float, metavar='FLOAT') +subparser_micplot.add_argument('--plot_width', help='Height of plot (used in plot_width=X when running ggsave [%(default)s]', default=15, type=float, metavar='FLOAT') subparser_micplot.add_argument('--no_log_y', action='store_true', help='Do not log scale the y axis') -subparser_micplot.add_argument('--plot_types', help='Types of plots to make, separated by commas. Choose from boxplot,violin,point [%(default)s]', default='violin,point') -subparser_micplot.add_argument('--jitter_width', help='Jitter width option when plotting points [%(default)]', default=0.1, type=float) -subparser_micplot.add_argument('--jitter_height', help='Jitter height option when plotting points [%(default)]', default=0.02, type=float) +subparser_micplot.add_argument('--plot_types', help='Types of plots to make, separated by commas. Choose from boxplot,violin,point [%(default)s]', default='violin,point', metavar='type1,type2,...') +subparser_micplot.add_argument('--jitter_width', help='Jitter width option when plotting points [%(default)s]', default=0.1, type=float, metavar='FLOAT') +subparser_micplot.add_argument('--jitter_height', help='Jitter height option when plotting points [%(default)s]', default=0.02, type=float, metavar='FLOAT') subparser_micplot.set_defaults(func=ariba.tasks.micplot.run) #----------------------------- prepareref ------------------------------- From 723910d78360b9364ff7f6b3c9157a525e6c2ca0 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Wed, 22 Feb 2017 10:18:37 +0000 Subject: [PATCH 16/88] Optional column 2 in filenames fofn --- ariba/summary.py | 26 ++++++++--- ariba/tests/data/summary_test_load_fofn | 4 ++ ariba/tests/summary_test.py | 62 ++++++++++++++++--------- scripts/ariba | 2 +- 4 files changed, 65 insertions(+), 29 deletions(-) create mode 100644 ariba/tests/data/summary_test_load_fofn diff --git a/ariba/summary.py b/ariba/summary.py index de9f06f3..fa28d444 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -30,12 +30,12 @@ def __init__( raise Error('Error! Must supply filenames or fofn to Summary(). Cannot continue') if filenames is None: - self.filenames = [] + self.filenames = {} else: - self.filenames = filenames + self.filenames = {x: None for x in filenames} if fofn is not None: - self.filenames.extend(self._load_fofn(fofn)) + self.filenames.update(self._load_fofn(fofn)) self.cluster_columns = self._determine_cluster_cols(cluster_cols) self.filter_rows = filter_rows @@ -66,9 +66,21 @@ def _determine_cluster_cols(cols_string): return Summary._determine_cols(cols_string, allowed_cols, 'cluster columns') - def _load_fofn(self, fofn): + @classmethod + def _load_fofn(cls, fofn): + '''Returns dictionary of filename -> short name. Value is None + whenever short name is not provided''' + filenames = {} f = pyfastaq.utils.open_file_read(fofn) - filenames = [x.rstrip() for x in f.readlines()] + for line in f: + fields = line.rstrip().split() + if len(fields) == 1: + filenames[fields[0]] = None + elif len(fields) == 2: + filenames[fields[0]] = fields[1] + else: + raise Error('Error at the following line of file ' + fofn + '. Expected at most 2 fields.\n' + line) + pyfastaq.utils.close(f) return filenames @@ -159,8 +171,8 @@ def _to_matrix(cls, filenames, all_data, all_potential_columns, cluster_cols): summary_cols_set = set(['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var']) summary_cols_in_order = [x for x in summary_cols_in_order if cluster_cols[x]] - for filename in filenames: - line = [filename] + for filename in sorted(filenames): + line = [filename if filenames[filename] is None else filenames[filename]] for cluster_name in sorted(all_potential_columns): group_cols = sorted(list(all_potential_columns[cluster_name]['groups'])) diff --git a/ariba/tests/data/summary_test_load_fofn b/ariba/tests/data/summary_test_load_fofn new file mode 100644 index 00000000..95e7d6df --- /dev/null +++ b/ariba/tests/data/summary_test_load_fofn @@ -0,0 +1,4 @@ +/foo/bar/abc.tsv +/spam/eggs/file1.tsv short_name1 +/spam/eggs/file2.tsv short_name2 +/spam/eggs/file3.tsv diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index f98e6e60..ff427a70 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -11,11 +11,11 @@ def test_init(self): '''Test init''' fofn = os.path.join(data_dir, 'summary_test_init.fofn') s = summary.Summary('out', fofn=fofn) - self.assertEqual(s.filenames, ['file1', 'file2']) - s = summary.Summary('out', filenames=['file42']) - self.assertEqual(s.filenames, ['file42']) - s = summary.Summary('out', fofn=fofn, filenames=['file42']) - self.assertEqual(s.filenames, ['file42', 'file1', 'file2']) + self.assertEqual(s.filenames, {'file1': None, 'file2': None}) + s = summary.Summary('out', filenames={'file42': None}) + self.assertEqual(s.filenames, {'file42': None}) + s = summary.Summary('out', fofn=fofn, filenames={'file42': None}) + self.assertEqual(s.filenames, {'file42': None, 'file1': None, 'file2': None}) def test_determine_cluster_cols(self): @@ -43,6 +43,19 @@ def test_determine_cluster_cols(self): self.assertEqual(expected[i], summary.Summary._determine_cluster_cols(col_strings[i])) + def test_load_fofn(self): + '''Test _load_fofn''' + infile = os.path.join(data_dir, 'summary_test_load_fofn') + got = summary.Summary._load_fofn(infile) + expected = { + '/foo/bar/abc.tsv': None, + '/spam/eggs/file1.tsv': 'short_name1', + '/spam/eggs/file2.tsv': 'short_name2', + '/spam/eggs/file3.tsv': None + } + self.assertEqual(expected, got) + + def test_load_input_files(self): '''Test _load_input_files''' file1 = os.path.join(data_dir, 'summary_test_load_input_files.1.tsv') @@ -214,7 +227,7 @@ def test_gather_unfiltered_output_data(self): self.maxDiff = None s = summary.Summary('out', filenames=infiles) - s.samples = summary.Summary._load_input_files(infiles, 90) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() self.assertEqual(expected_potential_cols, s.all_potential_columns) self.assertEqual(expected_all, s.all_data) @@ -226,7 +239,7 @@ def test_gather_unfiltered_output_data(self): expected_all[infiles[1]]['noncoding1']['groups'] = {'id1': 'het', 'id1.%': 80.0, 'id3': 'yes', 'id3.%': 100.0} expected_all[infiles[1]]['noncoding2']['groups'] = {'id2': 'het', 'id2.%': 40.0} s = summary.Summary('out', filenames=infiles, show_var_groups=True) - s.samples = summary.Summary._load_input_files(infiles, 90) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() self.assertEqual(expected_potential_cols, s.all_potential_columns) self.assertEqual(expected_all, s.all_data) @@ -239,7 +252,7 @@ def test_gather_unfiltered_output_data(self): expected_all[infiles[1]]['noncoding1']['vars'] = {'A14T': 'het', 'A14T.%': 80.0, 'A6G': 'yes', 'A6G.%': 100.0} expected_all[infiles[1]]['noncoding2']['vars'] = {'A52T': 'het', 'A52T.%': 40.0} s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_known_vars=True) - s.samples = summary.Summary._load_input_files(infiles, 90) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() self.assertEqual(expected_potential_cols, s.all_potential_columns) self.assertEqual(expected_all, s.all_data) @@ -250,7 +263,7 @@ def test_gather_unfiltered_output_data(self): expected_all[infiles[0]]['presence_absence2']['vars'] = {'V175L': 'yes'} expected_all[infiles[1]]['presence_absence1']['vars'] = {'A10V': 'yes'} s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_known_vars=True, show_novel_vars=True) - s.samples = summary.Summary._load_input_files(infiles, 90) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() self.assertEqual(expected_potential_cols, s.all_potential_columns) self.assertEqual(expected_all, s.all_data) @@ -263,15 +276,22 @@ def test_to_matrix_all_cols(self): os.path.join(data_dir, 'summary_to_matrix.2.tsv') ] - s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_known_vars=True, show_novel_vars=True) - s.samples = summary.Summary._load_input_files(infiles, 90) + fofn = 'tmp.summary_to_matrix_all_cols' + with open(fofn, 'w') as f: + print(infiles[0], 'sample1', file=f) + print(infiles[1], file=f) + + + s = summary.Summary('out', fofn=fofn, show_var_groups=True, show_known_vars=True, show_novel_vars=True) + os.unlink(fofn) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() - got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(s.filenames, s.all_data, s.all_potential_columns, s.cluster_columns) expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding1.id3.%:c2', 'noncoding1.A14T:o1', 'noncoding1.A14T.%:c2', 'noncoding1.A6G:o1', 'noncoding1.A6G.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.id2:o1', 'noncoding2.id2.%:c2', 'noncoding2.A42T:o1', 'noncoding2.A42T.%:c2', 'noncoding2.A52T:o1', 'noncoding2.A52T.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1', 'presence_absence1.A10V:o1'] expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding1.id3.%', 'noncoding1.A14T', 'noncoding1.A14T.%', 'noncoding1.A6G', 'noncoding1.A6G.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2.id2.%', 'noncoding2.A42T', 'noncoding2.A42T.%', 'noncoding2.A52T', 'noncoding2.A52T.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var', 'presence_absence1.A10V'] expected_matrix = [ - [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 100.0, 'no', 'NA', 'yes', 100.0, 'no', 'NA', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 100.0, 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'], + ['sample1', 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 100.0, 'no', 'NA', 'yes', 100.0, 'no', 'NA', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 100.0, 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'], [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 100.0, 'het', 80.0, 'yes', 100.0, 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'het', 40.0, 'no', 'NA', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes'] ] @@ -288,9 +308,9 @@ def test_to_matrix_with_groups(self): ] s = summary.Summary('out', filenames=infiles, show_var_groups=True) - s.samples = summary.Summary._load_input_files(infiles, 90) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() - got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(s.filenames, s.all_data, s.all_potential_columns, s.cluster_columns) expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding1.id3.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.id2:o1', 'noncoding2.id2.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1'] expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding1.id3.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2.id2.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var'] @@ -312,9 +332,9 @@ def test_to_matrix_with_vars(self): ] s = summary.Summary('out', filenames=infiles, show_known_vars=True, show_novel_vars=True) - s.samples = summary.Summary._load_input_files(infiles, 90) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() - got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(s.filenames, s.all_data, s.all_potential_columns, s.cluster_columns) expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.A14T:o1', 'noncoding1.A14T.%:c2', 'noncoding1.A6G:o1', 'noncoding1.A6G.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.A42T:o1', 'noncoding2.A42T.%:c2', 'noncoding2.A52T:o1', 'noncoding2.A52T.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1', 'presence_absence1.A10V:o1'] expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.A14T', 'noncoding1.A14T.%', 'noncoding1.A6G', 'noncoding1.A6G.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.A42T', 'noncoding2.A42T.%', 'noncoding2.A52T', 'noncoding2.A52T.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var', 'presence_absence1.A10V'] @@ -336,9 +356,9 @@ def test_to_matrix_cluster_only(self): ] s = summary.Summary('out', filenames=infiles) - s.samples = summary.Summary._load_input_files(infiles, 90) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() - got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(s.filenames, s.all_data, s.all_potential_columns, s.cluster_columns) expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1'] expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var'] @@ -360,9 +380,9 @@ def test_to_matrix_assembled_only(self): ] s = summary.Summary('out', filenames=infiles, cluster_cols='assembled') - s.samples = summary.Summary._load_input_files(infiles, 90) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() - got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(s.filenames, s.all_data, s.all_potential_columns, s.cluster_columns) expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding2.assembled:o1', 'presence_absence1.assembled:o1'] expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding2.assembled', 'presence_absence1.assembled'] diff --git a/scripts/ariba b/scripts/ariba index b54519ea..55f92cd1 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -207,7 +207,7 @@ subparser_summary = subparsers.add_parser( epilog='Files must be listed after the output file and/or the option --fofn must be used. If both used, all files in the filename specified by --fofn AND the files listed after the output file will be used as input.' ) -subparser_summary.add_argument('-f', '--fofn', help='File of filenames of ariba reports in tsv format (not xls) to be summarised. Must be used if no input files listed after the outfile.', metavar='FILENAME') +subparser_summary.add_argument('-f', '--fofn', help='File of filenames of ariba reports to be summarised. Must be used if no input files listed after the outfile. The first column should be the filename. An optional second column can be used to specify a sample name for that file, which will be used instead of the filename in output files. Columns separated by whitespace.', metavar='FILENAME') subparser_summary.add_argument('--preset', choices=summary_presets, help='Shorthand for setting --cluster_cols,--col_filter,--row_filter,--v_groups,--variants. Using this overrides those options', metavar='|'.join(summary_presets)) subparser_summary.add_argument('--cluster_cols', help='Comma separated list of cluster columns to include. Choose from: assembled, match, ref_seq, pct_id, known_var, novel_var [%(default)s]', default='match', metavar='col1,col2,...') subparser_summary.add_argument('--col_filter', choices=['y', 'n'], default='y', help='Choose whether columns where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n') From 349710874fd0d7592132a96b1558ac0dbad42172 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Wed, 22 Feb 2017 11:53:50 +0000 Subject: [PATCH 17/88] Add option --no_combinations --- ariba/mic_plotter.py | 61 ++++++++++++------- ariba/tasks/micplot.py | 1 + ..._boxplot_tsv.antibio_no_combinations.1.tsv | 4 ++ ..._boxplot_tsv.antibio_no_combinations.2.tsv | 4 ++ ariba/tests/mic_plotter_test.py | 12 ++++ scripts/ariba | 1 + 6 files changed, 60 insertions(+), 23 deletions(-) create mode 100644 ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio_no_combinations.1.tsv create mode 100644 ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio_no_combinations.2.tsv diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 8eb4440a..48f38ec8 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -20,6 +20,7 @@ def __init__(self, plot_types="points,violin", jitter_width=0.1, jitter_height=0.01, + no_combinations=False ): self.antibiotic = antibiotic self.mic_file = mic_file @@ -37,6 +38,7 @@ def __init__(self, self.jitter_width = jitter_width self.jitter_height = jitter_height + self.no_combinations = no_combinations @classmethod @@ -113,7 +115,7 @@ def _load_summary_file(cls, infile): @classmethod - def _to_boxplot_tsv(cls, summary_data, mic_data, antibiotic, outfile): + def _to_boxplot_tsv(cls, summary_data, mic_data, antibiotic, outfile, no_combinations=False): ignore_columns = {'assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'} all_mutations = set() all_mutations_seen_combinations = set() @@ -150,9 +152,14 @@ def _to_boxplot_tsv(cls, summary_data, mic_data, antibiotic, outfile): all_mutations.update(mutations) mutations = list(mutations) mutations.sort() - all_mutations_seen_combinations.add(tuple(mutations)) - mutations = '.'.join(mutations) - print(sample, mic_data[sample][antibiotic], mutations, sep='\t', file=f) + if no_combinations: + for mutation in mutations: + all_mutations_seen_combinations.add((mutation,)) + print(sample, mic_data[sample][antibiotic], mutation, sep='\t', file=f) + else: + all_mutations_seen_combinations.add(tuple(mutations)) + mutations = '.'.join(mutations) + print(sample, mic_data[sample][antibiotic], mutations, sep='\t', file=f) return all_mutations, all_mutations_seen_combinations @@ -277,27 +284,35 @@ def _make_plot(self, if 'boxplot' in self.plot_types: print(r''' geom_boxplot(aes(color=Mutations),alpha=.10, show.legend = FALSE) +''', file=f) + if self.no_combinations: + axis_text_x = 'element_text(size=24, angle=45, hjust=1)' + else: + axis_text_x = 'element_blank()' + print(r''' ylab(expression(paste("MIC ", mu, "g/mL"))) + - scale_colour_manual(values = cols) + - ggtitle("''' + self.main_title + r'''") + - scale_y_continuous(breaks=final.mics, labels=range.mics) + - theme_bw() + - theme(panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_blank(), - axis.line.x = element_line(color="black"), - axis.line.y = element_line(color="black"), - axis.title.x = element_blank(), - axis.title.y = element_text(size=22), - axis.text.x = element_blank(), - axis.text.y = element_text(size=24), - axis.title = element_text(size=20), - plot.title = element_text(lineheight=.6, size = 24, hjust=.5, face="bold"), - legend.position="none") - -plot_grid(violinplot, dotplot, ncol=1, align="v", rel_heights=c(3,1)) + scale_colour_manual(values = cols) + + ggtitle("''' + self.main_title + r'''") + + scale_y_continuous(breaks=final.mics, labels=range.mics) + + theme_bw() + + theme(panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_blank(), + axis.line.x = element_line(color="black"), + axis.line.y = element_line(color="black"), + axis.title.x = element_blank(), + axis.title.y = element_text(size=22), + axis.text.x = ''' + axis_text_x + r''', + axis.text.y = element_text(size=24), + axis.title = element_text(size=20), + plot.title = element_text(lineheight=.6, size = 24, hjust=.5, face="bold"), + legend.position="none") ''', file=f) + if self.no_combinations: + print('violinplot', file=f) + else: + print('plot_grid(violinplot, dotplot, ncol=1, align="v", rel_heights=c(3,1))', file=f) + print('ggsave("', self.outprefix, '.pdf", height=', self.plot_height, ', width=', self.plot_width, ')', sep='', file=f) f.close() common.syscall('R CMD BATCH ' + r_script) @@ -307,7 +322,7 @@ def run(self): mic_data = MicPlotter._load_mic_file(self.mic_file) summary_data = MicPlotter._load_summary_file(self.summary_file) boxplot_tsv = self.outprefix + '.boxplot.tsv' - all_mutations, combinations = MicPlotter._to_boxplot_tsv(summary_data, mic_data, self.antibiotic, boxplot_tsv) + all_mutations, combinations = MicPlotter._to_boxplot_tsv(summary_data, mic_data, self.antibiotic, boxplot_tsv, no_combinations=self.no_combinations) dots_tsv = self.outprefix + '.dots.tsv' MicPlotter._to_dots_tsv(all_mutations, combinations, dots_tsv) self._make_plot(boxplot_tsv, dots_tsv) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index 3dd8fe1a..fef1800c 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -14,6 +14,7 @@ def run(options): plot_types=options.plot_types, jitter_width=options.jitter_width, jitter_height=options.jitter_height, + no_combinations=options.no_combinations ) plotter.run() diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio_no_combinations.1.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio_no_combinations.1.tsv new file mode 100644 index 00000000..7ef4771f --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio_no_combinations.1.tsv @@ -0,0 +1,4 @@ +Sample MIC Mutations +name1 0.25 cluster2.group2.A43T +name1 0.25 cluster3.interrupted +name2 0.125 cluster1.group1.A42T diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio_no_combinations.2.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio_no_combinations.2.tsv new file mode 100644 index 00000000..1912500e --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio_no_combinations.2.tsv @@ -0,0 +1,4 @@ +Sample MIC Mutations +name1 0.004 cluster2.group2.A43T +name1 0.004 cluster3.interrupted +name3 0.002 without_mutation diff --git a/ariba/tests/mic_plotter_test.py b/ariba/tests/mic_plotter_test.py index da5ec535..e6e1659b 100644 --- a/ariba/tests/mic_plotter_test.py +++ b/ariba/tests/mic_plotter_test.py @@ -99,6 +99,11 @@ def test_to_boxplot_tsv(self): {('without_mutation',), ('cluster2.group2.A43T', 'cluster3.interrupted')} ] + expected_no_combs = [ + {('cluster2.group2.A43T',), ('cluster3.interrupted',), ('cluster1.group1.A42T',)}, + {('without_mutation',), ('cluster2.group2.A43T', ), ('cluster3.interrupted',)} + ] + for i in [1, 2]: got_muts, got_combs = mic_plotter.MicPlotter._to_boxplot_tsv(summary_data, mic_data, 'antibio' + str(i), tmp_tsv) expected_tsv = os.path.join(data_dir, 'mic_plotter_to_boxplot_tsv.antibio' + str(i) + '.tsv') @@ -107,6 +112,13 @@ def test_to_boxplot_tsv(self): self.assertEqual(got_combs, expected_combs[i-1]) os.unlink(tmp_tsv) + got_muts, got_combs = mic_plotter.MicPlotter._to_boxplot_tsv(summary_data, mic_data, 'antibio' + str(i), tmp_tsv, no_combinations=True) + expected_tsv = os.path.join(data_dir, 'mic_plotter_to_boxplot_tsv.antibio_no_combinations.' + str(i) + '.tsv') + self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False)) + self.assertEqual(got_muts, expected_mutations[i-1]) + self.assertEqual(got_combs, expected_no_combs[i-1]) + os.unlink(tmp_tsv) + def test_to_dots_tsv(self): '''test _to_dots_tsv''' diff --git a/scripts/ariba b/scripts/ariba index 55f92cd1..908b9112 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -74,6 +74,7 @@ subparser_micplot.add_argument('--no_log_y', action='store_true', help='Do not l subparser_micplot.add_argument('--plot_types', help='Types of plots to make, separated by commas. Choose from boxplot,violin,point [%(default)s]', default='violin,point', metavar='type1,type2,...') subparser_micplot.add_argument('--jitter_width', help='Jitter width option when plotting points [%(default)s]', default=0.1, type=float, metavar='FLOAT') subparser_micplot.add_argument('--jitter_height', help='Jitter height option when plotting points [%(default)s]', default=0.02, type=float, metavar='FLOAT') +subparser_micplot.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') subparser_micplot.set_defaults(func=ariba.tasks.micplot.run) #----------------------------- prepareref ------------------------------- From b8de5c7623ba3333b5acfd9ca829a7f29af1e30b Mon Sep 17 00:00:00 2001 From: martinghunt Date: Wed, 22 Feb 2017 11:56:19 +0000 Subject: [PATCH 18/88] nodingbats to keep Illustrator happy with the pdf --- ariba/mic_plotter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 48f38ec8..aa5845a1 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -313,7 +313,7 @@ def _make_plot(self, else: print('plot_grid(violinplot, dotplot, ncol=1, align="v", rel_heights=c(3,1))', file=f) - print('ggsave("', self.outprefix, '.pdf", height=', self.plot_height, ', width=', self.plot_width, ')', sep='', file=f) + print('ggsave("', self.outprefix, '.pdf", useDingbats=FALSE, height=', self.plot_height, ', width=', self.plot_width, ')', sep='', file=f) f.close() common.syscall('R CMD BATCH ' + r_script) From f24e8a4b4204f938253d003c39d342c5d638a4a7 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Wed, 22 Feb 2017 13:07:38 +0000 Subject: [PATCH 19/88] Add option --mic_values --- ariba/mic_plotter.py | 10 ++++++++-- ariba/tasks/micplot.py | 3 ++- scripts/ariba | 1 + 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index aa5845a1..1a9ae6d7 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -20,7 +20,8 @@ def __init__(self, plot_types="points,violin", jitter_width=0.1, jitter_height=0.01, - no_combinations=False + no_combinations=False, + mic_values='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024' ): self.antibiotic = antibiotic self.mic_file = mic_file @@ -40,6 +41,11 @@ def __init__(self, self.jitter_height = jitter_height self.no_combinations = no_combinations + try: + self.mic_values = [float(x) for x in mic_values.split(',')] + except: + raise Error('Error in mic_values option. Needs to be a list of numbers separated by commas. Got this:\n' + mic_values) + @classmethod def _mic_string_to_float(cls, s): @@ -266,7 +272,7 @@ def _make_plot(self, panel.grid.major = element_blank(), legend.position="none") -range.mics <- c(0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024) +range.mics <- c(''' + ','.join([str(x) for x in self.mic_values]) + r''') if (use.log){ final.mics <- log(range.mics) }else{ final.mics <- range.mics } ''', file=f) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index fef1800c..4199dc0a 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -14,7 +14,8 @@ def run(options): plot_types=options.plot_types, jitter_width=options.jitter_width, jitter_height=options.jitter_height, - no_combinations=options.no_combinations + no_combinations=options.no_combinations, + mic_values=options.mic_values ) plotter.run() diff --git a/scripts/ariba b/scripts/ariba index 908b9112..3c242387 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -75,6 +75,7 @@ subparser_micplot.add_argument('--plot_types', help='Types of plots to make, sep subparser_micplot.add_argument('--jitter_width', help='Jitter width option when plotting points [%(default)s]', default=0.1, type=float, metavar='FLOAT') subparser_micplot.add_argument('--jitter_height', help='Jitter height option when plotting points [%(default)s]', default=0.02, type=float, metavar='FLOAT') subparser_micplot.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') +subparser_micplot.add_argument('--mic_values', help='Comma-separated list of MICs values to be shown on y axis [%(default)s]', default='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024') subparser_micplot.set_defaults(func=ariba.tasks.micplot.run) #----------------------------- prepareref ------------------------------- From 5b354fd14286f7dab95583a6628308f4cbde953c Mon Sep 17 00:00:00 2001 From: martinghunt Date: Wed, 22 Feb 2017 13:25:34 +0000 Subject: [PATCH 20/88] New option --hlines --- ariba/mic_plotter.py | 17 ++++++++++++++++- ariba/tasks/micplot.py | 3 ++- scripts/ariba | 1 + 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 1a9ae6d7..3470d892 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -21,7 +21,8 @@ def __init__(self, jitter_width=0.1, jitter_height=0.01, no_combinations=False, - mic_values='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024' + mic_values='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024', + hlines='0.25,2' ): self.antibiotic = antibiotic self.mic_file = mic_file @@ -46,6 +47,14 @@ def __init__(self, except: raise Error('Error in mic_values option. Needs to be a list of numbers separated by commas. Got this:\n' + mic_values) + try: + if len(hlines) == 0: + self.hlines = [] + else: + self.hlines = [float(x) for x in hlines.split(',')] + except: + raise Error('Error in hlines option. Needs to be a list of numbers separated by commas, or empty. Got this:\n' + hlines) + @classmethod def _mic_string_to_float(cls, s): @@ -295,6 +304,12 @@ def _make_plot(self, else: axis_text_x = 'element_blank()' + for x in self.hlines: + if self.log_y: + print(' geom_hline(yintercept=log(', x, '), lty=2) +', sep='', file=f) + else: + print(' geom_hline(yintercept=', x, ', lty=2) +', sep='', file=f) + print(r''' ylab(expression(paste("MIC ", mu, "g/mL"))) + scale_colour_manual(values = cols) + ggtitle("''' + self.main_title + r'''") + diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index 4199dc0a..68080815 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -15,7 +15,8 @@ def run(options): jitter_width=options.jitter_width, jitter_height=options.jitter_height, no_combinations=options.no_combinations, - mic_values=options.mic_values + mic_values=options.mic_values, + hlines=options.hlines ) plotter.run() diff --git a/scripts/ariba b/scripts/ariba index 3c242387..36a240ec 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -76,6 +76,7 @@ subparser_micplot.add_argument('--jitter_width', help='Jitter width option when subparser_micplot.add_argument('--jitter_height', help='Jitter height option when plotting points [%(default)s]', default=0.02, type=float, metavar='FLOAT') subparser_micplot.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') subparser_micplot.add_argument('--mic_values', help='Comma-separated list of MICs values to be shown on y axis [%(default)s]', default='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024') +subparser_micplot.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. For no lines use an empty string, ie: --hlines '' [%(default)s]', default='0.25,2') subparser_micplot.set_defaults(func=ariba.tasks.micplot.run) #----------------------------- prepareref ------------------------------- From 362ead9b01f5cfc688a65f030bfd8ed48235bac5 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Wed, 22 Feb 2017 13:28:02 +0000 Subject: [PATCH 21/88] fix usage typo --- scripts/ariba | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ariba b/scripts/ariba index 36a240ec..85b71a82 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -75,7 +75,7 @@ subparser_micplot.add_argument('--plot_types', help='Types of plots to make, sep subparser_micplot.add_argument('--jitter_width', help='Jitter width option when plotting points [%(default)s]', default=0.1, type=float, metavar='FLOAT') subparser_micplot.add_argument('--jitter_height', help='Jitter height option when plotting points [%(default)s]', default=0.02, type=float, metavar='FLOAT') subparser_micplot.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') -subparser_micplot.add_argument('--mic_values', help='Comma-separated list of MICs values to be shown on y axis [%(default)s]', default='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024') +subparser_micplot.add_argument('--mic_values', help='Comma-separated list of MIC values to be shown on y axis ticks [%(default)s]', default='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024') subparser_micplot.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. For no lines use an empty string, ie: --hlines '' [%(default)s]', default='0.25,2') subparser_micplot.set_defaults(func=ariba.tasks.micplot.run) From 38520b9daf3474fcc689f0d885416b83f91faea1 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Wed, 22 Feb 2017 13:40:44 +0000 Subject: [PATCH 22/88] add options --point_size --dot_size --- ariba/mic_plotter.py | 10 +++++++--- ariba/tasks/micplot.py | 4 +++- scripts/ariba | 6 ++++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 3470d892..3435c5b6 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -22,7 +22,9 @@ def __init__(self, jitter_height=0.01, no_combinations=False, mic_values='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024', - hlines='0.25,2' + hlines='0.25,2', + point_size=4, + dot_size=8, ): self.antibiotic = antibiotic self.mic_file = mic_file @@ -55,6 +57,8 @@ def __init__(self, except: raise Error('Error in hlines option. Needs to be a list of numbers separated by commas, or empty. Got this:\n' + hlines) + self.point_size = point_size + self.dot_size = dot_size @classmethod def _mic_string_to_float(cls, s): @@ -266,7 +270,7 @@ def _make_plot(self, dotplot <- ggplot(dots.melt, aes(x=var2, y=var1)) + - geom_point(aes(fill=setcols, colour=setcols), size=8) + + geom_point(aes(fill=setcols, colour=setcols), size=''' + str(self.dot_size) + r''') + scale_fill_identity()+ scale_colour_identity()+ ylim(rev(mutations)) + @@ -291,7 +295,7 @@ def _make_plot(self, print(r'''violinplot <- ggplot(data=samples, aes(x=Mutations, y=MIC)) +''', file=f) if 'point' in self.plot_types: - print(r''' geom_point(aes(color=Mutations), position = position_jitter(width=''', self.jitter_width, ', height=', self.jitter_height, '), size=4, alpha=.5) +', sep='', file=f) + print(r''' geom_point(aes(color=Mutations), position = position_jitter(width=''', self.jitter_width, ', height=', self.jitter_height, '), size=', self.point_size, ', alpha=.5) +', sep='', file=f) if 'violin' in self.plot_types: print(r''' geom_violin(aes(color=Mutations),alpha=.10, show.legend = FALSE) +''', file=f) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index 68080815..6e621c50 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -16,7 +16,9 @@ def run(options): jitter_height=options.jitter_height, no_combinations=options.no_combinations, mic_values=options.mic_values, - hlines=options.hlines + hlines=options.hlines, + point_size=options.point_size, + dot_size=options.dot_size ) plotter.run() diff --git a/scripts/ariba b/scripts/ariba index 85b71a82..0fe8b9dc 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -75,8 +75,10 @@ subparser_micplot.add_argument('--plot_types', help='Types of plots to make, sep subparser_micplot.add_argument('--jitter_width', help='Jitter width option when plotting points [%(default)s]', default=0.1, type=float, metavar='FLOAT') subparser_micplot.add_argument('--jitter_height', help='Jitter height option when plotting points [%(default)s]', default=0.02, type=float, metavar='FLOAT') subparser_micplot.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') -subparser_micplot.add_argument('--mic_values', help='Comma-separated list of MIC values to be shown on y axis ticks [%(default)s]', default='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024') -subparser_micplot.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. For no lines use an empty string, ie: --hlines '' [%(default)s]', default='0.25,2') +subparser_micplot.add_argument('--mic_values', help='Comma-separated list of MIC values to be shown on y axis ticks [%(default)s]', default='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024', metavar='float1,float2,...') +subparser_micplot.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. For no lines use an empty string, ie: --hlines '' [%(default)s]', default='0.25,2', metavar='float1,float2,...') +subparser_micplot.add_argument('--point_size', type=float, help='Size of points when --plot_types includes point [%(default)s]', default=4, metavar='FLOAT') +subparser_micplot.add_argument('--dot_size', type=float, help='Size of dots in lower part of plot [%(default)s]', default=8, metavar='FLOAT') subparser_micplot.set_defaults(func=ariba.tasks.micplot.run) #----------------------------- prepareref ------------------------------- From 16946318edcc6261a698438811a8df59aebaec93 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Wed, 22 Feb 2017 13:42:57 +0000 Subject: [PATCH 23/88] Fix quotes in usage --- scripts/ariba | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ariba b/scripts/ariba index 0fe8b9dc..bbee1b61 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -76,7 +76,7 @@ subparser_micplot.add_argument('--jitter_width', help='Jitter width option when subparser_micplot.add_argument('--jitter_height', help='Jitter height option when plotting points [%(default)s]', default=0.02, type=float, metavar='FLOAT') subparser_micplot.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') subparser_micplot.add_argument('--mic_values', help='Comma-separated list of MIC values to be shown on y axis ticks [%(default)s]', default='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024', metavar='float1,float2,...') -subparser_micplot.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. For no lines use an empty string, ie: --hlines '' [%(default)s]', default='0.25,2', metavar='float1,float2,...') +subparser_micplot.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. For no lines use an empty string, ie: --hlines \'\' [%(default)s]', default='0.25,2', metavar='float1,float2,...') subparser_micplot.add_argument('--point_size', type=float, help='Size of points when --plot_types includes point [%(default)s]', default=4, metavar='FLOAT') subparser_micplot.add_argument('--dot_size', type=float, help='Size of dots in lower part of plot [%(default)s]', default=8, metavar='FLOAT') subparser_micplot.set_defaults(func=ariba.tasks.micplot.run) From 902f092d9cdfdd3b97b18cfe54e25c692c84f254 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Wed, 22 Feb 2017 14:20:41 +0000 Subject: [PATCH 24/88] R library latex2exp not needed --- ariba/mic_plotter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 3435c5b6..c25d9f06 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -227,7 +227,7 @@ def _make_plot(self, except: raise Error('Error opening R script for writing "' + r_script + '"') - libraries = ['ggplot2', 'RColorBrewer', 'reshape2', 'cowplot', 'latex2exp'] + libraries = ['ggplot2', 'RColorBrewer', 'reshape2', 'cowplot'] for lib in libraries: print('library(', lib, ')', sep='', file=f) From 663667f298ceba650c9e1b4f92e32dabec41dceb Mon Sep 17 00:00:00 2001 From: martinghunt Date: Wed, 22 Feb 2017 15:38:09 +0000 Subject: [PATCH 25/88] Remove dependency on R library cowplot --- ariba/mic_plotter.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index c25d9f06..a68c42a4 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -227,7 +227,7 @@ def _make_plot(self, except: raise Error('Error opening R script for writing "' + r_script + '"') - libraries = ['ggplot2', 'RColorBrewer', 'reshape2', 'cowplot'] + libraries = ['ggplot2', 'RColorBrewer', 'reshape2'] for lib in libraries: print('library(', lib, ')', sep='', file=f) @@ -335,10 +335,20 @@ def _make_plot(self, if self.no_combinations: print('violinplot', file=f) + print('ggsave("', self.outprefix, '.pdf", useDingbats=FALSE, height=', self.plot_height, ', width=', self.plot_width, ')', sep='', file=f) else: - print('plot_grid(violinplot, dotplot, ncol=1, align="v", rel_heights=c(3,1))', file=f) + print(r'''library(gtable) +library(grid) +g1 <- ggplotGrob(violinplot) +g2 <- ggplotGrob(dotplot) +g <- rbind(g1, g2, size="first") +g$widths <- unit.pmax(g1$widths, g2$widths) +panels <- g$layout$t[grepl("panel", g$layout$name)] +g$heights[panels][1] = unit(2,"null") +grid.newpage() +grid.draw(g) +ggsave("''', self.outprefix, '.pdf", plot=g, useDingbats=FALSE, height=', self.plot_height, ', width=', self.plot_width, ')', sep='', file=f) - print('ggsave("', self.outprefix, '.pdf", useDingbats=FALSE, height=', self.plot_height, ', width=', self.plot_width, ')', sep='', file=f) f.close() common.syscall('R CMD BATCH ' + r_script) From b9a5da920574bfd25a7d40693d363daf61cbd7aa Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 23 Feb 2017 09:12:59 +0000 Subject: [PATCH 26/88] Remove whitespace in R data files --- ariba/mic_plotter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index a68c42a4..11e858cb 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -163,7 +163,7 @@ def _to_boxplot_tsv(cls, summary_data, mic_data, antibiotic, outfile, no_combina continue if value == 'yes': - mutations.add(cluster + '.' + column) + mutations.add(cluster + '.' + column.strip()) if len(mutations) == 0: mutations.add('without_mutation') From 7cc5eddf9193f38c005a7f7a55104a27fa8502e2 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 23 Feb 2017 09:35:23 +0000 Subject: [PATCH 27/88] Add option --panel_heights --- ariba/mic_plotter.py | 10 +++++++++- ariba/tasks/micplot.py | 3 ++- scripts/ariba | 1 + 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 11e858cb..e77fbfbf 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -25,6 +25,7 @@ def __init__(self, hlines='0.25,2', point_size=4, dot_size=8, + panel_heights='5,1' ): self.antibiotic = antibiotic self.mic_file = mic_file @@ -60,6 +61,12 @@ def __init__(self, self.point_size = point_size self.dot_size = dot_size + try: + self.panel_heights = [int(x) for x in panel_heights.split(',')] + except: + raise Error('Error in panel_heights option. Needs to be of the form integer1,integer2. Got this:\n' + panel_heights) + + @classmethod def _mic_string_to_float(cls, s): regex_match = regex_string_to_float.match(s) @@ -344,7 +351,8 @@ def _make_plot(self, g <- rbind(g1, g2, size="first") g$widths <- unit.pmax(g1$widths, g2$widths) panels <- g$layout$t[grepl("panel", g$layout$name)] -g$heights[panels][1] = unit(2,"null") +g$heights[panels][1] = unit(''', self.panel_heights[0], r''',"null") +g$heights[panels][2] = unit(''', self.panel_heights[1], r''',"null") grid.newpage() grid.draw(g) ggsave("''', self.outprefix, '.pdf", plot=g, useDingbats=FALSE, height=', self.plot_height, ', width=', self.plot_width, ')', sep='', file=f) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index 6e621c50..e87b666a 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -18,7 +18,8 @@ def run(options): mic_values=options.mic_values, hlines=options.hlines, point_size=options.point_size, - dot_size=options.dot_size + dot_size=options.dot_size, + panel_heights=options.panel_heights, ) plotter.run() diff --git a/scripts/ariba b/scripts/ariba index bbee1b61..d27923c9 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -79,6 +79,7 @@ subparser_micplot.add_argument('--mic_values', help='Comma-separated list of MIC subparser_micplot.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. For no lines use an empty string, ie: --hlines \'\' [%(default)s]', default='0.25,2', metavar='float1,float2,...') subparser_micplot.add_argument('--point_size', type=float, help='Size of points when --plot_types includes point [%(default)s]', default=4, metavar='FLOAT') subparser_micplot.add_argument('--dot_size', type=float, help='Size of dots in lower part of plot [%(default)s]', default=8, metavar='FLOAT') +subparser_micplot.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom panels. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='5,1', metavar='height1,height2') subparser_micplot.set_defaults(func=ariba.tasks.micplot.run) #----------------------------- prepareref ------------------------------- From 893ac182b2004dff8c3d7a7c40b2a99822e04a85 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 23 Feb 2017 10:23:58 +0000 Subject: [PATCH 28/88] Add options --palette --number_of_colours --- ariba/mic_plotter.py | 33 ++++++++++++++++++++++++++------- ariba/tasks/micplot.py | 2 ++ scripts/ariba | 2 ++ 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index e77fbfbf..8c69ece0 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -25,7 +25,9 @@ def __init__(self, hlines='0.25,2', point_size=4, dot_size=8, - panel_heights='5,1' + panel_heights='5,1', + palette='Accent', + number_of_colours=0 ): self.antibiotic = antibiotic self.mic_file = mic_file @@ -66,6 +68,8 @@ def __init__(self, except: raise Error('Error in panel_heights option. Needs to be of the form integer1,integer2. Got this:\n' + panel_heights) + self.palette = palette + self.number_of_colours = number_of_colours @classmethod def _mic_string_to_float(cls, s): @@ -250,11 +254,26 @@ def _make_plot(self, dots.melt = melt(dots) colnames(dots.melt) <- c("var1", "var2", "value") -accent <- brewer.pal(8, 'Accent') - accentPalette <- colorRampPalette(accent) - ncols <- length(as.vector(unique(samples$Mutations))) - cols <- accentPalette(ncols) +palette.name = "''', self.palette, r'''" +colour.number = ''', self.number_of_colours, r''' +ncols <- length(as.vector(unique(samples$Mutations))) + +if (colour.number == 0) { + accent <- brewer.pal(8, palette.name) + accentPalette <- colorRampPalette(accent) + cols <- accentPalette(ncols) +} else if (colour.number == 1) { + cols <- rep("black", ncols) +} else { + if (colour.number == 2) { + unique_cols <- c("#7FC97F", "#BEAED4") + } + else { + unique_cols <- brewer.pal(colour.number, palette.name) + } + cols <- rep(unique_cols, ncols) +} names(cols) <- sort(as.vector(unique(samples$Mutations))) setcols <- c() @@ -277,7 +296,7 @@ def _make_plot(self, dotplot <- ggplot(dots.melt, aes(x=var2, y=var1)) + - geom_point(aes(fill=setcols, colour=setcols), size=''' + str(self.dot_size) + r''') + + geom_point(aes(fill=setcols, colour=setcols), size=''', self.dot_size, r''') + scale_fill_identity()+ scale_colour_identity()+ ylim(rev(mutations)) + @@ -294,7 +313,7 @@ def _make_plot(self, range.mics <- c(''' + ','.join([str(x) for x in self.mic_values]) + r''') if (use.log){ final.mics <- log(range.mics) }else{ final.mics <- range.mics } -''', file=f) +''', sep='', file=f) if self.log_y: print(r'''violinplot <- ggplot(data=samples, aes(x=Mutations, y=log(MIC))) +''', file=f) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index e87b666a..6f76e396 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -20,6 +20,8 @@ def run(options): point_size=options.point_size, dot_size=options.dot_size, panel_heights=options.panel_heights, + palette=options.palette, + number_of_colours=options.number_of_colours ) plotter.run() diff --git a/scripts/ariba b/scripts/ariba index d27923c9..f382363a 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -80,6 +80,8 @@ subparser_micplot.add_argument('--hlines', help='Comma-separated list of positio subparser_micplot.add_argument('--point_size', type=float, help='Size of points when --plot_types includes point [%(default)s]', default=4, metavar='FLOAT') subparser_micplot.add_argument('--dot_size', type=float, help='Size of dots in lower part of plot [%(default)s]', default=8, metavar='FLOAT') subparser_micplot.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom panels. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='5,1', metavar='height1,height2') +subparser_micplot.add_argument('--palette', help='ColourBrewer palette to use [%(default)s]', default='Accent', metavar='palette_name') +subparser_micplot.add_argument('--number_of_colours', help='Number of colours in plot. 0:same number as columns in the plot. 1:all black. >1: take the first N colours from the colour palette specified by --palette and cycle them [%(default)s]', default=0, metavar='INT') subparser_micplot.set_defaults(func=ariba.tasks.micplot.run) #----------------------------- prepareref ------------------------------- From 673d79606814b0117a0ad9f0194ca9a6c0163033 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 23 Feb 2017 10:34:10 +0000 Subject: [PATCH 29/88] Bug fix using chosen palette with 2 colours --- ariba/mic_plotter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 8c69ece0..c06932d5 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -266,7 +266,7 @@ def _make_plot(self, cols <- rep("black", ncols) } else { if (colour.number == 2) { - unique_cols <- c("#7FC97F", "#BEAED4") + unique_cols <- brewer.pal(3, palette.name)[1:2] } else { unique_cols <- brewer.pal(colour.number, palette.name) From c6de9c149589c4d602dd735dfdf684723d5296ff Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 23 Feb 2017 11:43:09 +0000 Subject: [PATCH 30/88] Proprtionally sized point plotting --- ariba/mic_plotter.py | 25 +++++++++++++++---------- scripts/ariba | 2 +- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index c06932d5..e5a71d00 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -313,21 +313,26 @@ def _make_plot(self, range.mics <- c(''' + ','.join([str(x) for x in self.mic_values]) + r''') if (use.log){ final.mics <- log(range.mics) }else{ final.mics <- range.mics } -''', sep='', file=f) - if self.log_y: - print(r'''violinplot <- ggplot(data=samples, aes(x=Mutations, y=log(MIC))) +''', file=f) - else: - print(r'''violinplot <- ggplot(data=samples, aes(x=Mutations, y=MIC)) +''', file=f) +sized_dot_data <- aggregate(samples$Sample,by=list(x=samples$Mutations,y=samples$MIC),length) +names(sized_dot_data)[3] <- "count" + +top_plot = ggplot() +''', sep='', file=f) + + ymic = 'log(MIC)' if self.log_y else 'MIC' if 'point' in self.plot_types: - print(r''' geom_point(aes(color=Mutations), position = position_jitter(width=''', self.jitter_width, ', height=', self.jitter_height, '), size=', self.point_size, ', alpha=.5) +', sep='', file=f) + if self.point_size > 0: + print(' geom_point(data=samples, aes(x=Mutations, y=', ymic, ', color=Mutations), position = position_jitter(width=''', self.jitter_width, ', height=', self.jitter_height, '), size=', self.point_size, ', alpha=.5) +', sep='', file=f) + else: + y = 'log(y)' if self.log_y else 'y' + print(' geom_point(data=sized_dot_data, aes(x=x, y=', y, ', size=count, color=x)) +', sep='', file=f) if 'violin' in self.plot_types: - print(r''' geom_violin(aes(color=Mutations),alpha=.10, show.legend = FALSE) +''', file=f) + print(' geom_violin(data=samples, aes(x=Mutations, y=', ymic, ', color=Mutations), alpha=.10, show.legend = FALSE) +''', file=f) if 'boxplot' in self.plot_types: - print(r''' geom_boxplot(aes(color=Mutations),alpha=.10, show.legend = FALSE) +''', file=f) + print(' geom_boxplot(data=samples, aes(x=Mutations, y=', ymic, ', color=Mutations), alpha=.10, show.legend = FALSE) +''', file=f) if self.no_combinations: axis_text_x = 'element_text(size=24, angle=45, hjust=1)' @@ -360,12 +365,12 @@ def _make_plot(self, ''', file=f) if self.no_combinations: - print('violinplot', file=f) + print('top_plot', file=f) print('ggsave("', self.outprefix, '.pdf", useDingbats=FALSE, height=', self.plot_height, ', width=', self.plot_width, ')', sep='', file=f) else: print(r'''library(gtable) library(grid) -g1 <- ggplotGrob(violinplot) +g1 <- ggplotGrob(top_plot) g2 <- ggplotGrob(dotplot) g <- rbind(g1, g2, size="first") g$widths <- unit.pmax(g1$widths, g2$widths) diff --git a/scripts/ariba b/scripts/ariba index f382363a..f14b8382 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -77,7 +77,7 @@ subparser_micplot.add_argument('--jitter_height', help='Jitter height option whe subparser_micplot.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') subparser_micplot.add_argument('--mic_values', help='Comma-separated list of MIC values to be shown on y axis ticks [%(default)s]', default='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024', metavar='float1,float2,...') subparser_micplot.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. For no lines use an empty string, ie: --hlines \'\' [%(default)s]', default='0.25,2', metavar='float1,float2,...') -subparser_micplot.add_argument('--point_size', type=float, help='Size of points when --plot_types includes point [%(default)s]', default=4, metavar='FLOAT') +subparser_micplot.add_argument('--point_size', type=float, help='Size of points when --plot_types includes point. If zero, will group points and size them proportional to the group size [%(default)s]', default=4, metavar='FLOAT') subparser_micplot.add_argument('--dot_size', type=float, help='Size of dots in lower part of plot [%(default)s]', default=8, metavar='FLOAT') subparser_micplot.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom panels. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='5,1', metavar='height1,height2') subparser_micplot.add_argument('--palette', help='ColourBrewer palette to use [%(default)s]', default='Accent', metavar='palette_name') From f21dcfebbf595f1380b6e591be00c0955d00b8eb Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 23 Feb 2017 13:49:52 +0000 Subject: [PATCH 31/88] Add option --dot_outline --- ariba/mic_plotter.py | 8 ++++++-- ariba/tasks/micplot.py | 1 + scripts/ariba | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index e5a71d00..922073d1 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -25,6 +25,7 @@ def __init__(self, hlines='0.25,2', point_size=4, dot_size=8, + dot_outline=False, panel_heights='5,1', palette='Accent', number_of_colours=0 @@ -62,6 +63,7 @@ def __init__(self, self.point_size = point_size self.dot_size = dot_size + self.dot_outline = dot_outline try: self.panel_heights = [int(x) for x in panel_heights.split(',')] @@ -250,6 +252,8 @@ def _make_plot(self, else: print('use.log = FALSE', file=f) + dot_colour = '"black"' if self.dot_outline else 'setcols' + print(r''' dots.melt = melt(dots) colnames(dots.melt) <- c("var1", "var2", "value") @@ -283,7 +287,7 @@ def _make_plot(self, setcols <- c(setcols, cols[as.vector(dots.melt[i,2])]) } else{ - setcols <- c(setcols, NA) + setcols <- c(setcols, "white") } } dots.melt <- cbind(dots.melt, setcols) @@ -296,7 +300,7 @@ def _make_plot(self, dotplot <- ggplot(dots.melt, aes(x=var2, y=var1)) + - geom_point(aes(fill=setcols, colour=setcols), size=''', self.dot_size, r''') + + geom_point(aes(fill=setcols, colour=''', dot_colour, '), shape=21, size=''', self.dot_size, r''') + scale_fill_identity()+ scale_colour_identity()+ ylim(rev(mutations)) + diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index 6f76e396..4e93d6e1 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -19,6 +19,7 @@ def run(options): hlines=options.hlines, point_size=options.point_size, dot_size=options.dot_size, + dot_outline=options.dot_outline, panel_heights=options.panel_heights, palette=options.palette, number_of_colours=options.number_of_colours diff --git a/scripts/ariba b/scripts/ariba index f14b8382..b468740e 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -79,6 +79,7 @@ subparser_micplot.add_argument('--mic_values', help='Comma-separated list of MIC subparser_micplot.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. For no lines use an empty string, ie: --hlines \'\' [%(default)s]', default='0.25,2', metavar='float1,float2,...') subparser_micplot.add_argument('--point_size', type=float, help='Size of points when --plot_types includes point. If zero, will group points and size them proportional to the group size [%(default)s]', default=4, metavar='FLOAT') subparser_micplot.add_argument('--dot_size', type=float, help='Size of dots in lower part of plot [%(default)s]', default=8, metavar='FLOAT') +subparser_micplot.add_argument('--dot_outline', action='store_true', help='Black outline around all dots (whether coloured or not) in lower part of plots') subparser_micplot.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom panels. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='5,1', metavar='height1,height2') subparser_micplot.add_argument('--palette', help='ColourBrewer palette to use [%(default)s]', default='Accent', metavar='palette_name') subparser_micplot.add_argument('--number_of_colours', help='Number of colours in plot. 0:same number as columns in the plot. 1:all black. >1: take the first N colours from the colour palette specified by --palette and cycle them [%(default)s]', default=0, metavar='INT') From fdac1d609036e74aa68c259d86053c6462abb66f Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 23 Feb 2017 13:52:42 +0000 Subject: [PATCH 32/88] Tidy up indentation --- ariba/mic_plotter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 922073d1..053932dc 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -301,9 +301,9 @@ def _make_plot(self, dotplot <- ggplot(dots.melt, aes(x=var2, y=var1)) + geom_point(aes(fill=setcols, colour=''', dot_colour, '), shape=21, size=''', self.dot_size, r''') + - scale_fill_identity()+ - scale_colour_identity()+ - ylim(rev(mutations)) + + scale_fill_identity()+ + scale_colour_identity()+ + ylim(rev(mutations)) + theme_bw() + theme(axis.text.x = element_blank(), axis.text.y = element_text(size=18), @@ -321,7 +321,7 @@ def _make_plot(self, sized_dot_data <- aggregate(samples$Sample,by=list(x=samples$Mutations,y=samples$MIC),length) names(sized_dot_data)[3] <- "count" -top_plot = ggplot() +''', sep='', file=f) +top_plot <- ggplot() +''', sep='', file=f) ymic = 'log(MIC)' if self.log_y else 'MIC' From f7a5de7695c588362e90100b0f245989c56d7a7d Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 23 Feb 2017 15:29:01 +0000 Subject: [PATCH 33/88] Add legend and related options --- ariba/mic_plotter.py | 27 ++++++++++++++++++++++++--- ariba/tasks/micplot.py | 4 ++++ scripts/ariba | 4 ++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 053932dc..08eac816 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -24,6 +24,10 @@ def __init__(self, mic_values='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024', hlines='0.25,2', point_size=4, + point_range='2,15', + point_break='10,50,100,200,300', + point_legend_x=-0.15, + point_legend_y=0.9, dot_size=8, dot_outline=False, panel_heights='5,1', @@ -62,6 +66,19 @@ def __init__(self, raise Error('Error in hlines option. Needs to be a list of numbers separated by commas, or empty. Got this:\n' + hlines) self.point_size = point_size + + try: + self.point_range = [int(x) for x in point_range.split(',')] + except: + raise Error('Error in point_range option. Needs to be of the form integer1,integer2. Got this:\n' + point_range) + + try: + self.point_break = [int(x) for x in point_break.split(',')] + except: + raise Error('Error in point_break option. Needs to be comma-sparated list of integers. Got this:\n' + point_break) + + self.point_legend_x = point_legend_x + self.point_legend_y = point_legend_y self.dot_size = dot_size self.dot_outline = dot_outline @@ -324,6 +341,7 @@ def _make_plot(self, top_plot <- ggplot() +''', sep='', file=f) ymic = 'log(MIC)' if self.log_y else 'MIC' + legend_position = '"none"' if 'point' in self.plot_types: if self.point_size > 0: @@ -331,6 +349,7 @@ def _make_plot(self, else: y = 'log(y)' if self.log_y else 'y' print(' geom_point(data=sized_dot_data, aes(x=x, y=', y, ', size=count, color=x)) +', sep='', file=f) + legend_position = 'c(' + str(self.point_legend_x) + ',' + str(self.point_legend_y) + ')' if 'violin' in self.plot_types: print(' geom_violin(data=samples, aes(x=Mutations, y=', ymic, ', color=Mutations), alpha=.10, show.legend = FALSE) +''', file=f) @@ -350,7 +369,8 @@ def _make_plot(self, print(' geom_hline(yintercept=', x, ', lty=2) +', sep='', file=f) print(r''' ylab(expression(paste("MIC ", mu, "g/mL"))) + - scale_colour_manual(values = cols) + + scale_colour_manual(values = cols, guide=FALSE) + + scale_size(range=c(''' + ','.join([str(x) for x in self.point_range]) + r'''), breaks = c(''' + ','.join([str(x) for x in self.point_break]) + r''')) + ggtitle("''' + self.main_title + r'''") + scale_y_continuous(breaks=final.mics, labels=range.mics) + theme_bw() + @@ -365,8 +385,9 @@ def _make_plot(self, axis.text.y = element_text(size=24), axis.title = element_text(size=20), plot.title = element_text(lineheight=.6, size = 24, hjust=.5, face="bold"), - legend.position="none") -''', file=f) + legend.title = element_text(size=30), + legend.text = element_text(size=20), + legend.position=''' + legend_position + ')', file=f) if self.no_combinations: print('top_plot', file=f) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index 4e93d6e1..1e0215ef 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -18,6 +18,10 @@ def run(options): mic_values=options.mic_values, hlines=options.hlines, point_size=options.point_size, + point_range=options.point_range, + point_break=options.point_break, + point_legend_x=options.point_legend_x, + point_legend_y=options.point_legend_y, dot_size=options.dot_size, dot_outline=options.dot_outline, panel_heights=options.panel_heights, diff --git a/scripts/ariba b/scripts/ariba index b468740e..bfc5cd88 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -78,6 +78,10 @@ subparser_micplot.add_argument('--no_combinations', action='store_true', help='D subparser_micplot.add_argument('--mic_values', help='Comma-separated list of MIC values to be shown on y axis ticks [%(default)s]', default='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024', metavar='float1,float2,...') subparser_micplot.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. For no lines use an empty string, ie: --hlines \'\' [%(default)s]', default='0.25,2', metavar='float1,float2,...') subparser_micplot.add_argument('--point_size', type=float, help='Size of points when --plot_types includes point. If zero, will group points and size them proportional to the group size [%(default)s]', default=4, metavar='FLOAT') +subparser_micplot.add_argument('--point_range', help='Min and max size of plotted points when --point_size is 0 [%(default)s]', default='2,15', metavar='min,max') +subparser_micplot.add_argument('--point_break', help='Comma-spearated list of breakpoints in point sizes when --point_size is 0 [%(default)s]', default='10,50,100,200,300', metavar='integer1.integer2,integer3,...') +subparser_micplot.add_argument('--point_legend_x', type=float, help='x coord of legend when --point_size is 0 [%(default)s]', default=-0.15, metavar='FLOAT') +subparser_micplot.add_argument('--point_legend_y', type=float, help='y coord of legend when --point_size is 0 [%(default)s]', default=0.9, metavar='FLOAT') subparser_micplot.add_argument('--dot_size', type=float, help='Size of dots in lower part of plot [%(default)s]', default=8, metavar='FLOAT') subparser_micplot.add_argument('--dot_outline', action='store_true', help='Black outline around all dots (whether coloured or not) in lower part of plots') subparser_micplot.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom panels. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='5,1', metavar='height1,height2') From 9767db174ab269d31a3375f98982b4ac0892b69d Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 24 Feb 2017 10:48:50 +0000 Subject: [PATCH 34/88] Do not try to log zero (breaks older R versions in ggplot) --- ariba/mic_plotter.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 08eac816..f138f4a2 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -332,7 +332,10 @@ def _make_plot(self, panel.grid.major = element_blank(), legend.position="none") -range.mics <- c(''' + ','.join([str(x) for x in self.mic_values]) + r''') +range.mics <- sort(c(''' + ','.join([str(x) for x in self.mic_values]) + r''')) +if (use.log & range.mics[1] == 0) { + range.mics <- range.mics[-1] +} if (use.log){ final.mics <- log(range.mics) }else{ final.mics <- range.mics } sized_dot_data <- aggregate(samples$Sample,by=list(x=samples$Mutations,y=samples$MIC),length) From 50e24d2fe94eb5717d45f7ade74221300f0db538 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 24 Feb 2017 11:07:19 +0000 Subject: [PATCH 35/88] Fix y axis label when logging --- ariba/mic_plotter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index f138f4a2..3501d950 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -371,7 +371,8 @@ def _make_plot(self, else: print(' geom_hline(yintercept=', x, ', lty=2) +', sep='', file=f) - print(r''' ylab(expression(paste("MIC ", mu, "g/mL"))) + + + print(r''' ylab(expression(paste("''' + ymic + r''' ", mu, "g/mL"))) + scale_colour_manual(values = cols, guide=FALSE) + scale_size(range=c(''' + ','.join([str(x) for x in self.point_range]) + r'''), breaks = c(''' + ','.join([str(x) for x in self.point_break]) + r''')) + ggtitle("''' + self.main_title + r'''") + From b212674ac3b27d73487b5e7aa66bf882cf6b131d Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 24 Feb 2017 13:00:33 +0000 Subject: [PATCH 36/88] Fixes for older R versions --- ariba/mic_plotter.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 3501d950..f9a9babd 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -381,8 +381,7 @@ def _make_plot(self, theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_blank(), - axis.line.x = element_line(color="black"), - axis.line.y = element_line(color="black"), + axis.line = element_line(color="black"), axis.title.x = element_blank(), axis.title.y = element_text(size=22), axis.text.x = ''' + axis_text_x + r''', @@ -404,8 +403,16 @@ def _make_plot(self, g <- rbind(g1, g2, size="first") g$widths <- unit.pmax(g1$widths, g2$widths) panels <- g$layout$t[grepl("panel", g$layout$name)] -g$heights[panels][1] = unit(''', self.panel_heights[0], r''',"null") -g$heights[panels][2] = unit(''', self.panel_heights[1], r''',"null") + +if(getRversion() < "3.3.0"){ + g$heights <- grid:::unit.list(g$heights) + g$heights[panels][1] <- list(unit(''', self.panel_heights[0], r''', "null")) + g$heights[panels][2] <- list(unit(''', self.panel_heights[1], r''', "null")) +} else { + g$heights[panels][1] = unit(''', self.panel_heights[0], r''',"null") + g$heights[panels][2] = unit(''', self.panel_heights[1], r''',"null") +} + grid.newpage() grid.draw(g) ggsave("''', self.outprefix, '.pdf", plot=g, useDingbats=FALSE, height=', self.plot_height, ', width=', self.plot_width, ')', sep='', file=f) From ddf74befa23d5d5ff6969b93e6dc04f502adf918 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 24 Feb 2017 13:07:43 +0000 Subject: [PATCH 37/88] Fix for older R version --- ariba/mic_plotter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index f9a9babd..94be1610 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -413,9 +413,11 @@ def _make_plot(self, g$heights[panels][2] = unit(''', self.panel_heights[1], r''',"null") } +pdf("''', self.outprefix, '.pdf", useDingbats=FALSE, height=', self.plot_height, ', width=', self.plot_width, r''') grid.newpage() grid.draw(g) -ggsave("''', self.outprefix, '.pdf", plot=g, useDingbats=FALSE, height=', self.plot_height, ', width=', self.plot_width, ')', sep='', file=f) +dev.off() +''', sep='', file=f) f.close() common.syscall('R CMD BATCH ' + r_script) From bff8514f90acdc0339975ac36771573581c802c1 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 24 Feb 2017 17:21:50 +0000 Subject: [PATCH 38/88] Add option --use_hets --- ariba/mic_plotter.py | 26 +++++- ariba/tasks/micplot.py | 1 + ...t_tsv.antibio1.exclude.no_combinations.tsv | 3 + ...lotter_to_boxplot_tsv.antibio1.exclude.tsv | 2 + ...xplot_tsv.antibio1.no.no_combinations.tsv} | 0 ...ic_plotter_to_boxplot_tsv.antibio1.no.tsv} | 0 ...xplot_tsv.antibio1.yes.no_combinations.tsv | 5 ++ ...ic_plotter_to_boxplot_tsv.antibio1.yes.tsv | 3 + ..._tsv.antibio2.exclude.no_combinations.tsv} | 0 ...otter_to_boxplot_tsv.antibio2.exclude.tsv} | 0 ...oxplot_tsv.antibio2.no.no_combinations.tsv | 4 + ...mic_plotter_to_boxplot_tsv.antibio2.no.tsv | 3 + ...xplot_tsv.antibio2.yes.no_combinations.tsv | 4 + ...ic_plotter_to_boxplot_tsv.antibio2.yes.tsv | 3 + ariba/tests/mic_plotter_test.py | 89 +++++++++++++------ scripts/ariba | 1 + 16 files changed, 112 insertions(+), 32 deletions(-) create mode 100644 ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.no_combinations.tsv create mode 100644 ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.tsv rename ariba/tests/data/{mic_plotter_to_boxplot_tsv.antibio_no_combinations.1.tsv => mic_plotter_to_boxplot_tsv.antibio1.no.no_combinations.tsv} (100%) rename ariba/tests/data/{mic_plotter_to_boxplot_tsv.antibio1.tsv => mic_plotter_to_boxplot_tsv.antibio1.no.tsv} (100%) create mode 100644 ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.no_combinations.tsv create mode 100644 ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.tsv rename ariba/tests/data/{mic_plotter_to_boxplot_tsv.antibio_no_combinations.2.tsv => mic_plotter_to_boxplot_tsv.antibio2.exclude.no_combinations.tsv} (100%) rename ariba/tests/data/{mic_plotter_to_boxplot_tsv.antibio2.tsv => mic_plotter_to_boxplot_tsv.antibio2.exclude.tsv} (100%) create mode 100644 ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.no_combinations.tsv create mode 100644 ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.tsv create mode 100644 ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.no_combinations.tsv create mode 100644 ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.tsv diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 94be1610..ea32786a 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -13,6 +13,7 @@ def __init__(self, mic_file, summary_file, outprefix, + use_hets='yes', main_title=None, plot_height=15, plot_width=15, @@ -38,6 +39,12 @@ def __init__(self, self.mic_file = mic_file self.summary_file = summary_file self.outprefix = outprefix + + allowed_use_hets = {'yes', 'no', 'exclude'} + if not use_hets in allowed_use_hets: + raise Error('Error in use_hets option. Allowed options are: ' + str(allowed_use_hets) + '. Got: ' + use_hets) + self.use_hets = use_hets + self.main_title = self.antibiotic if main_title is None else main_title self.plot_height = plot_height self.plot_width = plot_width @@ -90,6 +97,7 @@ def __init__(self, self.palette = palette self.number_of_colours = number_of_colours + @classmethod def _mic_string_to_float(cls, s): regex_match = regex_string_to_float.match(s) @@ -164,7 +172,8 @@ def _load_summary_file(cls, infile): @classmethod - def _to_boxplot_tsv(cls, summary_data, mic_data, antibiotic, outfile, no_combinations=False): + def _to_boxplot_tsv(cls, summary_data, mic_data, antibiotic, outfile, use_hets, no_combinations=False): + assert use_hets in {'yes', 'no', 'exclude'} ignore_columns = {'assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'} all_mutations = set() all_mutations_seen_combinations = set() @@ -183,6 +192,7 @@ def _to_boxplot_tsv(cls, summary_data, mic_data, antibiotic, outfile, no_combina continue mutations = set() + found_het_and_exclude = False for cluster in summary_data[sample]: if summary_data[sample][cluster]['assembled'] == 'interrupted': @@ -192,8 +202,18 @@ def _to_boxplot_tsv(cls, summary_data, mic_data, antibiotic, outfile, no_combina if column in ignore_columns or column.endswith('.%'): continue - if value == 'yes': + if value == 'yes' or (use_hets == 'yes' and value == 'het'): mutations.add(cluster + '.' + column.strip()) + elif use_hets == 'exclude' and value == 'het': + found_het_and_exclude = True + break + + if found_het_and_exclude: + break + + if found_het_and_exclude: + continue + if len(mutations) == 0: mutations.add('without_mutation') @@ -427,7 +447,7 @@ def run(self): mic_data = MicPlotter._load_mic_file(self.mic_file) summary_data = MicPlotter._load_summary_file(self.summary_file) boxplot_tsv = self.outprefix + '.boxplot.tsv' - all_mutations, combinations = MicPlotter._to_boxplot_tsv(summary_data, mic_data, self.antibiotic, boxplot_tsv, no_combinations=self.no_combinations) + all_mutations, combinations = MicPlotter._to_boxplot_tsv(summary_data, mic_data, self.antibiotic, boxplot_tsv, self.use_hets, no_combinations=self.no_combinations) dots_tsv = self.outprefix + '.dots.tsv' MicPlotter._to_dots_tsv(all_mutations, combinations, dots_tsv) self._make_plot(boxplot_tsv, dots_tsv) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index 1e0215ef..f00ad1a9 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -7,6 +7,7 @@ def run(options): options.mic_file, options.summary_file, options.outprefix, + use_hets=options.use_hets, main_title=options.main_title, plot_height=options.plot_height, plot_width=options.plot_width, diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.no_combinations.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.no_combinations.tsv new file mode 100644 index 00000000..b4c4325a --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.no_combinations.tsv @@ -0,0 +1,3 @@ +Sample MIC Mutations +name1 0.25 cluster2.group2.A43T +name1 0.25 cluster3.interrupted diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.tsv new file mode 100644 index 00000000..59743fb6 --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.tsv @@ -0,0 +1,2 @@ +Sample MIC Mutations +name1 0.25 cluster2.group2.A43T.cluster3.interrupted diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio_no_combinations.1.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.no.no_combinations.tsv similarity index 100% rename from ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio_no_combinations.1.tsv rename to ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.no.no_combinations.tsv diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.no.tsv similarity index 100% rename from ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.tsv rename to ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.no.tsv diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.no_combinations.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.no_combinations.tsv new file mode 100644 index 00000000..0092bde7 --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.no_combinations.tsv @@ -0,0 +1,5 @@ +Sample MIC Mutations +name1 0.25 cluster2.group2.A43T +name1 0.25 cluster3.interrupted +name2 0.125 cluster1.group1.A42T +name2 0.125 cluster4.group4.A44T diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.tsv new file mode 100644 index 00000000..cb8b103c --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.tsv @@ -0,0 +1,3 @@ +Sample MIC Mutations +name1 0.25 cluster2.group2.A43T.cluster3.interrupted +name2 0.125 cluster1.group1.A42T.cluster4.group4.A44T diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio_no_combinations.2.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.exclude.no_combinations.tsv similarity index 100% rename from ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio_no_combinations.2.tsv rename to ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.exclude.no_combinations.tsv diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.exclude.tsv similarity index 100% rename from ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.tsv rename to ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.exclude.tsv diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.no_combinations.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.no_combinations.tsv new file mode 100644 index 00000000..1912500e --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.no_combinations.tsv @@ -0,0 +1,4 @@ +Sample MIC Mutations +name1 0.004 cluster2.group2.A43T +name1 0.004 cluster3.interrupted +name3 0.002 without_mutation diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.tsv new file mode 100644 index 00000000..37b20976 --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.tsv @@ -0,0 +1,3 @@ +Sample MIC Mutations +name1 0.004 cluster2.group2.A43T.cluster3.interrupted +name3 0.002 without_mutation diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.no_combinations.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.no_combinations.tsv new file mode 100644 index 00000000..1912500e --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.no_combinations.tsv @@ -0,0 +1,4 @@ +Sample MIC Mutations +name1 0.004 cluster2.group2.A43T +name1 0.004 cluster3.interrupted +name3 0.002 without_mutation diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.tsv new file mode 100644 index 00000000..37b20976 --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.tsv @@ -0,0 +1,3 @@ +Sample MIC Mutations +name1 0.004 cluster2.group2.A43T.cluster3.interrupted +name3 0.002 without_mutation diff --git a/ariba/tests/mic_plotter_test.py b/ariba/tests/mic_plotter_test.py index e6e1659b..78990c0e 100644 --- a/ariba/tests/mic_plotter_test.py +++ b/ariba/tests/mic_plotter_test.py @@ -75,49 +75,80 @@ def test_to_boxplot_tsv(self): 'cluster1': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref1', 'pct_id': 100.0, 'known_var': 'no', 'novel_var': 'no', 'group1.A42T': 'no', 'group1.A42T.%': 'NA'}, 'cluster2': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref2', 'pct_id': 99.0, 'known_var': 'yes', 'novel_var': 'no', 'group2.A43T': 'yes', 'group2.A43T.%': 95.42}, 'cluster3': {'assembled': 'interrupted', 'match': 'no', 'ref_seq': 'ref3', 'pct_id': 99.0, 'known_var': 'no', 'novel_var': 'yes', 'A42T': 'no', 'A44T.%': 'NA'}, + 'cluster4': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref4', 'pct_id': 100.0, 'known_var': 'no', 'novel_var': 'no', 'group4.A44T': 'no', 'group4.A44T.%': 'NA'}, }, 'name2': { 'cluster1': {'assembled': 'yes', 'match': 'yes_nonunique', 'ref_seq': 'ref3', 'pct_id': 99.0, 'known_var': 'yes', 'novel_var': 'no', 'group1.A42T': 'yes', 'group1.A42T.%': 90.90}, 'cluster2': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'NA', 'novel_var': 'NA', 'group2.A43T': 'NA', 'group2.A43T.%': 'NA'}, 'cluster3': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'no', 'novel_var': 'no', 'A42T': 'no', 'A44T.%': 'NA'}, + 'cluster4': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref4', 'pct_id': 99.0, 'known_var': 'yes', 'novel_var': 'no', 'group4.A44T': 'het', 'group4.A44T.%': '50.0'}, }, 'name3': { 'cluster1': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref_seq42', 'pct_id': 100.0, 'known_var': 'no', 'novel_var': 'no', 'group1.A42T': 'no', 'group1.A42T.%': 'NA'}, 'cluster2': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'NA', 'novel_var': 'NA', 'group2.A43T': 'NA', 'group2.A43T.%': 'NA'}, 'cluster3': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'no', 'novel_var': 'no', 'A42T': 'no', 'A44T.%': 'NA'}, + 'cluster4': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref4', 'pct_id': 100.0, 'known_var': 'yes', 'novel_var': 'no', 'group4.A44T': 'no', 'group4.A44T.%': 'NA'}, }, } + expected_mutations = { + 'antibio1': { + 'yes': {'cluster1.group1.A42T', 'cluster2.group2.A43T', 'cluster3.interrupted', 'cluster4.group4.A44T'}, + 'no': {'cluster1.group1.A42T', 'cluster2.group2.A43T', 'cluster3.interrupted'}, + 'exclude': {'cluster2.group2.A43T', 'cluster3.interrupted'}, + }, + 'antibio2': { + 'yes': {'without_mutation', 'cluster2.group2.A43T', 'cluster3.interrupted'}, + 'no': {'without_mutation', 'cluster2.group2.A43T', 'cluster3.interrupted'}, + 'exclude': {'without_mutation', 'cluster2.group2.A43T', 'cluster3.interrupted'}, + } + } + + expected_combs = { + 'antibio1': { + 'yes': {('cluster2.group2.A43T', 'cluster3.interrupted'), ('cluster1.group1.A42T', 'cluster4.group4.A44T')}, + 'no': {('cluster2.group2.A43T', 'cluster3.interrupted'), ('cluster1.group1.A42T',)}, + 'exclude': {('cluster2.group2.A43T', 'cluster3.interrupted')} + }, + 'antibio2': { + 'yes': {('without_mutation',), ('cluster2.group2.A43T', 'cluster3.interrupted')}, + 'no': {('without_mutation',), ('cluster2.group2.A43T', 'cluster3.interrupted')}, + 'exclude': {('without_mutation',), ('cluster2.group2.A43T', 'cluster3.interrupted')} + } + } + + + expected_no_combs = { + 'antibio1': { + 'yes': {('cluster2.group2.A43T',), ('cluster3.interrupted',), ('cluster1.group1.A42T',), ('cluster4.group4.A44T',)}, + 'no': {('cluster2.group2.A43T',), ('cluster3.interrupted',), ('cluster1.group1.A42T',)}, + 'exclude': {('cluster2.group2.A43T',), ('cluster3.interrupted',)} + }, + 'antibio2': { + 'yes': {('without_mutation',), ('cluster2.group2.A43T', ), ('cluster3.interrupted',)}, + 'no': {('without_mutation',), ('cluster2.group2.A43T', ), ('cluster3.interrupted',)}, + 'exclude': {('without_mutation',), ('cluster2.group2.A43T', ), ('cluster3.interrupted',)} + } + } + + tmp_tsv = 'tmp.mic_plotter_test.to_boxplot.tsv' - expected_mutations = [ - {'cluster1.group1.A42T', 'cluster2.group2.A43T', 'cluster3.interrupted'}, - {'without_mutation', 'cluster2.group2.A43T', 'cluster3.interrupted'}, - ] - - expected_combs = [ - {('cluster2.group2.A43T', 'cluster3.interrupted'), ('cluster1.group1.A42T',)}, - {('without_mutation',), ('cluster2.group2.A43T', 'cluster3.interrupted')} - ] - - expected_no_combs = [ - {('cluster2.group2.A43T',), ('cluster3.interrupted',), ('cluster1.group1.A42T',)}, - {('without_mutation',), ('cluster2.group2.A43T', ), ('cluster3.interrupted',)} - ] - - for i in [1, 2]: - got_muts, got_combs = mic_plotter.MicPlotter._to_boxplot_tsv(summary_data, mic_data, 'antibio' + str(i), tmp_tsv) - expected_tsv = os.path.join(data_dir, 'mic_plotter_to_boxplot_tsv.antibio' + str(i) + '.tsv') - self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False)) - self.assertEqual(got_muts, expected_mutations[i-1]) - self.assertEqual(got_combs, expected_combs[i-1]) - os.unlink(tmp_tsv) - - got_muts, got_combs = mic_plotter.MicPlotter._to_boxplot_tsv(summary_data, mic_data, 'antibio' + str(i), tmp_tsv, no_combinations=True) - expected_tsv = os.path.join(data_dir, 'mic_plotter_to_boxplot_tsv.antibio_no_combinations.' + str(i) + '.tsv') - self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False)) - self.assertEqual(got_muts, expected_mutations[i-1]) - self.assertEqual(got_combs, expected_no_combs[i-1]) - os.unlink(tmp_tsv) + + for antibio in ['antibio1', 'antibio2']: + for use_het in ['no', 'yes', 'exclude']: + got_muts, got_combs = mic_plotter.MicPlotter._to_boxplot_tsv(summary_data, mic_data, antibio, tmp_tsv, use_het) + expected_tsv = os.path.join(data_dir, 'mic_plotter_to_boxplot_tsv.' + antibio + '.' + use_het + '.tsv') + self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False)) + self.assertEqual(got_muts, expected_mutations[antibio][use_het]) + self.assertEqual(got_combs, expected_combs[antibio][use_het]) + os.unlink(tmp_tsv) + + got_muts, got_combs = mic_plotter.MicPlotter._to_boxplot_tsv(summary_data, mic_data, antibio, tmp_tsv, use_het, no_combinations=True) + expected_tsv = os.path.join(data_dir, 'mic_plotter_to_boxplot_tsv.' + antibio + '.' + use_het + '.no_combinations.tsv') + self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False)) + self.assertEqual(got_muts, expected_mutations[antibio][use_het]) + self.assertEqual(got_combs, expected_no_combs[antibio][use_het]) + os.unlink(tmp_tsv) def test_to_dots_tsv(self): diff --git a/scripts/ariba b/scripts/ariba index bfc5cd88..aeb68a76 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -67,6 +67,7 @@ subparser_micplot.add_argument('antibiotic', help='Antibiotic name. Must exactly subparser_micplot.add_argument('mic_file', help='File containing MIC data for each sample and one or more antibiotics') subparser_micplot.add_argument('summary_file', help='File made by running "ariba summary"') subparser_micplot.add_argument('outprefix', help='Prefix of output files') +subparser_micplot.add_argument('--use_hets', choices=['yes', 'no', 'exclude'], help='How to deal with HET snps. Choose from yes,no,exclude. yes: count a het SNP as present. no: do not count a het SNP as present. exclude: completely remove any sample with any het SNP [%(default)s]', default='yes') subparser_micplot.add_argument('--main_title', help='Main title of plot. Default is to use the antibiotic name', metavar='"title in quotes"') subparser_micplot.add_argument('--plot_height', help='Height of plot (used in plot_height=X when running ggsave [%(default)s]', default=15, type=float, metavar='FLOAT') subparser_micplot.add_argument('--plot_width', help='Height of plot (used in plot_width=X when running ggsave [%(default)s]', default=15, type=float, metavar='FLOAT') From b389eff5ebf3ab56eb2a3983c21f16db70423aaf Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 24 Feb 2017 17:26:09 +0000 Subject: [PATCH 39/88] No hlines by default --- scripts/ariba | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ariba b/scripts/ariba index aeb68a76..c447f3d4 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -77,7 +77,7 @@ subparser_micplot.add_argument('--jitter_width', help='Jitter width option when subparser_micplot.add_argument('--jitter_height', help='Jitter height option when plotting points [%(default)s]', default=0.02, type=float, metavar='FLOAT') subparser_micplot.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') subparser_micplot.add_argument('--mic_values', help='Comma-separated list of MIC values to be shown on y axis ticks [%(default)s]', default='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024', metavar='float1,float2,...') -subparser_micplot.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. For no lines use an empty string, ie: --hlines \'\' [%(default)s]', default='0.25,2', metavar='float1,float2,...') +subparser_micplot.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. Default is to draw no lines.', metavar='float1,float2,...', default='') subparser_micplot.add_argument('--point_size', type=float, help='Size of points when --plot_types includes point. If zero, will group points and size them proportional to the group size [%(default)s]', default=4, metavar='FLOAT') subparser_micplot.add_argument('--point_range', help='Min and max size of plotted points when --point_size is 0 [%(default)s]', default='2,15', metavar='min,max') subparser_micplot.add_argument('--point_break', help='Comma-spearated list of breakpoints in point sizes when --point_size is 0 [%(default)s]', default='10,50,100,200,300', metavar='integer1.integer2,integer3,...') From 4909635c0f48c3ce64f55e53dffc66422a18a1b2 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 24 Feb 2017 17:32:06 +0000 Subject: [PATCH 40/88] New option --dot_y_text_size --- ariba/mic_plotter.py | 4 +++- ariba/tasks/micplot.py | 1 + scripts/ariba | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index ea32786a..c4cb7c1d 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -31,6 +31,7 @@ def __init__(self, point_legend_y=0.9, dot_size=8, dot_outline=False, + dot_y_text_size=18, panel_heights='5,1', palette='Accent', number_of_colours=0 @@ -88,6 +89,7 @@ def __init__(self, self.point_legend_y = point_legend_y self.dot_size = dot_size self.dot_outline = dot_outline + self.dot_y_text_size = dot_y_text_size try: self.panel_heights = [int(x) for x in panel_heights.split(',')] @@ -343,7 +345,7 @@ def _make_plot(self, ylim(rev(mutations)) + theme_bw() + theme(axis.text.x = element_blank(), - axis.text.y = element_text(size=18), + axis.text.y = element_text(size=''', self.dot_y_text_size, r'''), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.ticks = element_blank(), diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index f00ad1a9..b78006a2 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -25,6 +25,7 @@ def run(options): point_legend_y=options.point_legend_y, dot_size=options.dot_size, dot_outline=options.dot_outline, + dot_y_text_size=options.dot_y_text_size, panel_heights=options.panel_heights, palette=options.palette, number_of_colours=options.number_of_colours diff --git a/scripts/ariba b/scripts/ariba index c447f3d4..61d90e35 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -85,6 +85,7 @@ subparser_micplot.add_argument('--point_legend_x', type=float, help='x coord of subparser_micplot.add_argument('--point_legend_y', type=float, help='y coord of legend when --point_size is 0 [%(default)s]', default=0.9, metavar='FLOAT') subparser_micplot.add_argument('--dot_size', type=float, help='Size of dots in lower part of plot [%(default)s]', default=8, metavar='FLOAT') subparser_micplot.add_argument('--dot_outline', action='store_true', help='Black outline around all dots (whether coloured or not) in lower part of plots') +subparser_micplot.add_argument('--dot_y_text_size', type=int, help='Text size of labels in lower dot plot [%(default)s]', default=18, metavar='INT') subparser_micplot.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom panels. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='5,1', metavar='height1,height2') subparser_micplot.add_argument('--palette', help='ColourBrewer palette to use [%(default)s]', default='Accent', metavar='palette_name') subparser_micplot.add_argument('--number_of_colours', help='Number of colours in plot. 0:same number as columns in the plot. 1:all black. >1: take the first N colours from the colour palette specified by --palette and cycle them [%(default)s]', default=0, metavar='INT') From 3ca945f0d0b531a382d04ba7e888e0fa3ba296ad Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 24 Feb 2017 23:35:20 +0000 Subject: [PATCH 41/88] Check depth not too low --- ariba/ext/vcfcall_ariba.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ariba/ext/vcfcall_ariba.cpp b/ariba/ext/vcfcall_ariba.cpp index 3ead497e..c2dba617 100644 --- a/ariba/ext/vcfcall_ariba.cpp +++ b/ariba/ext/vcfcall_ariba.cpp @@ -133,7 +133,8 @@ void vectorSumAndMax(const std::vector& v, uint32_t& maxValue, uint32_ bool depthOk(const uint32_t& totalDepth, const uint32_t& testNum, const uint32_t& minSecondDepth, const float& maxAlleleFreq) { - return (testNum >= minSecondDepth && 1.0 * testNum / totalDepth <= maxAlleleFreq); + double depthFreq = 1.0 * testNum / totalDepth; + return (testNum >= minSecondDepth && depthFreq >= (1 - maxAlleleFreq) && depthFreq <= maxAlleleFreq); } From e441a781b16c0ea37026b9fb27be4e8e47f6856a Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 24 Feb 2017 23:35:51 +0000 Subject: [PATCH 42/88] Change variant string format in summary output --- ariba/summary_cluster_variant.py | 53 ++++++--- .../tests/data/summary_test_whole_run.out.csv | 6 +- ariba/tests/summary_cluster_variant_test.py | 104 +++++++++++------- ariba/tests/summary_test.py | 28 ++--- 4 files changed, 118 insertions(+), 73 deletions(-) diff --git a/ariba/summary_cluster_variant.py b/ariba/summary_cluster_variant.py index 62c231a0..1f7d2215 100644 --- a/ariba/summary_cluster_variant.py +++ b/ariba/summary_cluster_variant.py @@ -1,3 +1,5 @@ +import re + class Error (Exception): pass class SummaryClusterVariant: @@ -32,21 +34,34 @@ def _has_nonsynonymous(cls, data_dict): @classmethod - def _depths_look_het(cls, depths): - sorted_depths = sorted(depths) - min_var_read_depth = 5 - min_second_var_read_depth = 2 - max_allele_freq = 0.90 - total_depth = sum(depths) - second_depth_ok = len(depths) == 1 or (len(depths) > 1 and sorted_depths[-2] >= min_second_var_read_depth) - max_depth_ok = total_depth >= min_var_read_depth and sorted_depths[-1] / total_depth <= max_allele_freq - return depths[0] < sorted_depths[-1] or (second_depth_ok and max_depth_ok) + def _filter_depths(cls, ref_base, depths): + if ref_base not in depths: + return {} + + min_freq = 0.1 + max_freq = 0.9 + new_depths = {} + total_depth = sum(depths.values()) + ref_depth = depths[ref_base] + return {x: depths[x] for x in depths if depths[x] >= ref_depth or min_freq <= depths[x] / total_depth <= max_freq} + + + #@classmethod + #def _depths_look_het(cls, depths): + # sorted_depths = sorted(depths) + # min_var_read_depth = 5 + # min_second_var_read_depth = 2 + # max_allele_freq = 0.90 + # total_depth = sum(depths) + # second_depth_ok = len(depths) == 1 or (len(depths) > 1 and sorted_depths[-2] >= min_second_var_read_depth) + # max_depth_ok = total_depth >= min_var_read_depth and sorted_depths[-1] / total_depth <= max_allele_freq + # return depths[0] < sorted_depths[-1] or (second_depth_ok and max_depth_ok) @classmethod def _get_is_het_and_percent(cls, data_dict): if data_dict['gene'] == '1' or not (data_dict['known_var'] == '1' or data_dict['ref_ctg_effect'] == 'SNP' or data_dict['var_type'] == 'HET') or data_dict['smtls_nts'] == '.' or ';' in data_dict['smtls_nts'] or data_dict['smtls_nts_depth'] == 'ND': - return False, None + return False, None, None else: nucleotides = data_dict['smtls_nts'].split(',') depths = data_dict['smtls_nts_depth'].split(',') @@ -72,21 +87,27 @@ def _get_is_het_and_percent(cls, data_dict): elif data_dict['known_var_change'] != '.': var_nucleotide = data_dict['known_var_change'][-1] else: - return False, None + return False, None, None if var_nucleotide == '.': - return False, None + return False, None, None total_depth = sum(depths) if max([len(x) for x in nucleotides]) == 1: var_depth = nuc_to_depth.get(var_nucleotide, 0) else: var_depth = sum([nuc_to_depth[x] for x in nuc_to_depth if x[0] == var_nucleotide]) - is_het = SummaryClusterVariant._depths_look_het(depths) + filtered_depths = SummaryClusterVariant._filter_depths(nucleotides[0], nuc_to_depth) + + if len(filtered_depths) > 0: + bases = ''.join(sorted(list(filtered_depths.keys()))) + return len(filtered_depths) > 1, round(100 * var_depth / total_depth, 1), bases + else: + return False, None, None return is_het, round(100 * var_depth / total_depth, 1) except: - return False, None + return False, None, None def _get_nonsynon_variant_data(self, data_dict): @@ -101,7 +122,9 @@ def _get_nonsynon_variant_data(self, data_dict): else: self.var_string = data_dict['ref_ctg_effect'] - self.is_het, self.het_percent = SummaryClusterVariant._get_is_het_and_percent(data_dict) + self.is_het, self.het_percent, var_bases = SummaryClusterVariant._get_is_het_and_percent(data_dict) + if var_bases is not None: + self.var_string = re.sub(r'[^0-9]', '', self.var_string) + var_bases if not SummaryClusterVariant._has_nonsynonymous(data_dict): self.has_nonsynon = False diff --git a/ariba/tests/data/summary_test_whole_run.out.csv b/ariba/tests/data/summary_test_whole_run.out.csv index 1a282e3f..44ba9db1 100644 --- a/ariba/tests/data/summary_test_whole_run.out.csv +++ b/ariba/tests/data/summary_test_whole_run.out.csv @@ -1,3 +1,3 @@ -name,23S.assembled,23S.match,23S.ref_seq,23S.pct_id,23S.known_var,23S.C2597T,23S.C2597T.%,coding1.assembled,coding1.match,coding1.ref_seq,coding1.pct_id,coding2.assembled,coding2.match,coding2.ref_seq,coding2.pct_id,coding3.assembled,coding3.match,coding3.ref_seq,coding3.pct_id,coding5.assembled,coding5.match,coding5.ref_seq,coding5.pct_id,coding5.known_var,coding5.A42S,coding6.assembled,coding6.match,coding6.ref_seq,coding6.pct_id,coding6.known_var,coding6.A52S,coding7.assembled,coding7.ref_seq,coding7.pct_id,coding8.assembled,coding8.match,coding8.ref_seq,coding8.pct_id,coding8.novel_var,coding8.A53S,mdfA.assembled,mdfA.ref_seq,mdfA.pct_id,mdfA.novel_var,mdfA.G261GGGTGTGGTGTGGT/GGGTGTGGT,noncoding1.assembled,noncoding1.match,noncoding1.ref_seq,noncoding1.pct_id,noncoding10.assembled,noncoding10.match,noncoding10.ref_seq,noncoding10.pct_id,noncoding10.novel_var,noncoding10.C100T,noncoding10.C100T.%,noncoding11.assembled,noncoding11.match,noncoding11.ref_seq,noncoding11.pct_id,noncoding11.novel_var,noncoding11.G101A,noncoding11.G101A.%,noncoding2.assembled,noncoding2.match,noncoding2.ref_seq,noncoding2.pct_id,noncoding3.assembled,noncoding3.match,noncoding3.ref_seq,noncoding3.pct_id,noncoding5.assembled,noncoding5.match,noncoding5.ref_seq,noncoding5.pct_id,noncoding5.known_var,noncoding5.A42T,noncoding5.A42T.%,noncoding6.assembled,noncoding6.match,noncoding6.ref_seq,noncoding6.pct_id,noncoding6.known_var,noncoding6.A52T,noncoding6.A52T.%,noncoding7.assembled,noncoding7.match,noncoding7.ref_seq,noncoding7.pct_id,noncoding7.known_var,noncoding7.A53T,noncoding7.A53T.%,noncoding8.assembled,noncoding8.match,noncoding8.ref_seq,noncoding8.pct_id,noncoding9.assembled,noncoding9.ref_seq,noncoding9.pct_id -/nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/summary_test_whole_run.in.1.tsv,yes,yes,23S.rDNA_WHO_F_01358c,99.86,yes,yes,100.0,interrupted,no,coding1_ref1,99.1,yes,yes,coding2_ref1,98.2,no,no,NA,NA,yes,no,coding5_ref1,97.4,no,no,yes,yes,coding6_ref1,95.5,yes,yes,yes,coding7_ref1,95.4,yes,yes,coding8_ref1,95.3,yes,yes,interrupted,mdfA.3001328.JQ394987.0_1233.561,97.0,yes,yes,yes,yes,noncoding1_ref1,99.1,yes,yes,noncoding10_ref1,95.1,yes,yes,99.0,yes,yes,noncoding11_ref1,95.05,het,het,30.0,yes,yes,noncoding2_ref1,98.2,no,no,NA,NA,yes,yes,noncoding5_ref1,97.4,yes,yes,100.0,yes,yes,noncoding6_ref1,95.5,yes,het,70.0,yes,yes,noncoding7_ref1,95.4,yes,yes,98.6,yes,yes,noncoding8_ref1,95.3,yes,noncoding9_ref1,95.2 -/nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/summary_test_whole_run.in.2.tsv,yes_nonunique,no,23S.rDNA_WHO_F_01358c,99.84,het,het,12.8,yes,yes,coding1_ref2,99.2,no,no,NA,NA,yes,yes,coding3_ref1,97.6,yes,yes,coding5_ref1,97.4,yes,yes,no,no,NA,NA,NA,NA,no,NA,NA,no,no,NA,NA,NA,NA,no,NA,NA,NA,NA,yes,yes,noncoding1_ref2,99.2,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,yes,yes,noncoding3_ref1,97.6,yes,no,noncoding5_ref1,99.42,no,no,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,no,NA,NA +name,23S.assembled,23S.match,23S.ref_seq,23S.pct_id,23S.known_var,23S.2597CT,23S.2597CT.%,23S.2597TC,23S.2597TC.%,coding1.assembled,coding1.match,coding1.ref_seq,coding1.pct_id,coding2.assembled,coding2.match,coding2.ref_seq,coding2.pct_id,coding3.assembled,coding3.match,coding3.ref_seq,coding3.pct_id,coding5.assembled,coding5.match,coding5.ref_seq,coding5.pct_id,coding5.known_var,coding5.A42S,coding6.assembled,coding6.match,coding6.ref_seq,coding6.pct_id,coding6.known_var,coding6.A52S,coding7.assembled,coding7.ref_seq,coding7.pct_id,coding8.assembled,coding8.match,coding8.ref_seq,coding8.pct_id,coding8.novel_var,coding8.A53S,mdfA.assembled,mdfA.ref_seq,mdfA.pct_id,mdfA.novel_var,mdfA.G261GGGTGTGGTGTGGT/GGGTGTGGT,noncoding1.assembled,noncoding1.match,noncoding1.ref_seq,noncoding1.pct_id,noncoding10.assembled,noncoding10.match,noncoding10.ref_seq,noncoding10.pct_id,noncoding10.novel_var,noncoding10.100T,noncoding10.100T.%,noncoding11.assembled,noncoding11.match,noncoding11.ref_seq,noncoding11.pct_id,noncoding11.novel_var,noncoding11.101AG,noncoding11.101AG.%,noncoding2.assembled,noncoding2.match,noncoding2.ref_seq,noncoding2.pct_id,noncoding3.assembled,noncoding3.match,noncoding3.ref_seq,noncoding3.pct_id,noncoding5.assembled,noncoding5.match,noncoding5.ref_seq,noncoding5.pct_id,noncoding5.known_var,noncoding5.42T,noncoding5.42T.%,noncoding6.assembled,noncoding6.match,noncoding6.ref_seq,noncoding6.pct_id,noncoding6.known_var,noncoding6.52CT,noncoding6.52CT.%,noncoding7.assembled,noncoding7.match,noncoding7.ref_seq,noncoding7.pct_id,noncoding7.known_var,noncoding7.53T,noncoding7.53T.%,noncoding8.assembled,noncoding8.match,noncoding8.ref_seq,noncoding8.pct_id,noncoding9.assembled,noncoding9.ref_seq,noncoding9.pct_id +summary_test_whole_run.in.1.tsv,yes,yes,23S.rDNA_WHO_F_01358c,99.86,yes,no,NA,yes,100.0,interrupted,no,coding1_ref1,99.1,yes,yes,coding2_ref1,98.2,no,no,NA,NA,yes,no,coding5_ref1,97.4,no,no,yes,yes,coding6_ref1,95.5,yes,yes,yes,coding7_ref1,95.4,yes,yes,coding8_ref1,95.3,yes,yes,interrupted,mdfA.3001328.JQ394987.0_1233.561,97.0,yes,yes,yes,yes,noncoding1_ref1,99.1,yes,yes,noncoding10_ref1,95.1,yes,yes,99.0,yes,yes,noncoding11_ref1,95.05,het,het,30.0,yes,yes,noncoding2_ref1,98.2,no,no,NA,NA,yes,yes,noncoding5_ref1,97.4,yes,yes,100.0,yes,yes,noncoding6_ref1,95.5,yes,het,70.0,yes,yes,noncoding7_ref1,95.4,yes,yes,98.6,yes,yes,noncoding8_ref1,95.3,yes,noncoding9_ref1,95.2 +summary_test_whole_run.in.2.tsv,yes_nonunique,no,23S.rDNA_WHO_F_01358c,99.84,het,het,12.8,no,NA,yes,yes,coding1_ref2,99.2,no,no,NA,NA,yes,yes,coding3_ref1,97.6,yes,yes,coding5_ref1,97.4,yes,yes,no,no,NA,NA,NA,NA,no,NA,NA,no,no,NA,NA,NA,NA,no,NA,NA,NA,NA,yes,yes,noncoding1_ref2,99.2,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,yes,yes,noncoding3_ref1,97.6,yes,no,noncoding5_ref1,99.42,no,no,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,no,NA,NA diff --git a/ariba/tests/summary_cluster_variant_test.py b/ariba/tests/summary_cluster_variant_test.py index f88cc0c7..10af7d93 100644 --- a/ariba/tests/summary_cluster_variant_test.py +++ b/ariba/tests/summary_cluster_variant_test.py @@ -23,51 +23,73 @@ def test_has_nonsynonymous(self): self.assertEqual(expected[i], summary_cluster_variant.SummaryClusterVariant._has_nonsynonymous(dicts[i])) - def test_depths_look_het(self): - '''test _depths_look_het''' + def test_filter_depths(self): + '''test _filter_depths''' tests = [ - ([1], False), - ([2], False), - ([3], False), - ([4], False), - ([5], False), - ([90, 1], False), - ([90, 9], False), - ([90, 10], True), - ([9, 1], False), - ([9, 2], True), - ([1, 2], True), - ([90, 5, 5], True), - ([90, 2, 1, 1], False), - ([97, 2, 1], False), + ('A', {'A': 1}, {'A': 1}), + ('A', {'A': 2}, {'A': 2}), + ('A', {'A': 3}, {'A': 3}), + ('A', {'A': 4}, {'A': 4}), + ('A', {'A': 5}, {'A': 5}), + ('A', {'A': 90, 'C': 9}, {'A': 90}), + ('C', {'A': 90, 'C': 9}, {'A': 90, 'C': 9}), + ('C', {'A': 90, 'C': 9, 'G':1}, {'A': 90, 'C': 9}), + ('A', {'A': 90, 'C': 10}, {'A': 90, 'C': 10}), + ('A', {'A': 90, 'C': 5, 'G': 5}, {'A': 90}), + ('A', {'A': 89, 'C': 10, 'G': 1}, {'A': 89, 'C': 10}), + ('A', {'A': 80, 'C': 10, 'G': 10}, {'A': 80, 'C': 10, 'G': 10}), ] - for depths, expected in tests: - self.assertEqual(expected, summary_cluster_variant.SummaryClusterVariant._depths_look_het(depths)) + for ref_base, depths, expected in tests: + self.assertEqual(expected, summary_cluster_variant.SummaryClusterVariant._filter_depths(ref_base, depths)) + + + #def test_depths_look_het(self): + # '''test _depths_look_het''' + # tests = [ + # ([1], False), + # ([2], False), + # ([3], False), + # ([4], False), + # ([5], False), + # ([90, 1], False), + # ([90, 9], False), + # ([90, 10], True), + # ([9, 1], False), + # ([9, 2], True), + # ([1, 2], True), + # ([89, 10, 1], True), + # ([89, 9, 2], False), + # ([90, 2, 1, 1], False), + # ([97, 2, 1], False), + # ] + + # for depths, expected in tests: + # self.assertEqual(expected, summary_cluster_variant.SummaryClusterVariant._depths_look_het(depths)) def test_get_is_het_and_percent(self): '''test _get_is_het_and_percent''' tests = [ - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', (False, 100.0)), - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tT,A\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs', (True, 25.0)), - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tA,T\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs', (True, 75.0)), - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 40.0)), - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T\t95,5\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (False, 5.0)), - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T\t90,10\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 10.0)), - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,C\t90,6,4\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 6.0)), - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,C\t3,7,90\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 7.0)), - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tHET\t.\t.\t.\t.\t.\t.\t.\t.\t84\t84\tA\t50\tA,T\t40,10\t.\t.', (True, 20.0)), - ('ariba_ref1\t23S.rDNA_WHO_F_01358c\t0\t1\t531\t9914\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3120\t744.8\t1\tSNP\tn\tC2597T\t1\tC2597T\tSNP\t2597\t2597\tC\t2755\t2755\tT\t823\tTC,T\t487,1\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T.\tHigh-level resistance to Azithromycin', (False, 100.0)), - ('ariba\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t90,10\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 10.0)), - ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t91,9\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (False, 9.0)), - ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t50,50\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 50.0)), - ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t70,30\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 30.0)), - ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t91,9\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (False, 91.0)), - ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t90,10\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 90.0)), - ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t50,50\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 50.0)), - ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t10,90\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 10.0)), - ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t9,91\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 9.0)), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', (False, 100.0, 'T')), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tT,A\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs', (True, 25.0, 'AT')), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tA,T\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs', (True, 75.0, 'AT')), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 40.0, 'AGT')), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T\t95,5\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (False, 5.0, 'A')), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T\t90,10\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 10.0, 'AT')), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,C\t90,6,4\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (False, 6.0, 'A')), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,C\t3,7,90\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 7.0, 'ACT')), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tHET\t.\t.\t.\t.\t.\t.\t.\t.\t84\t84\tA\t50\tA,T\t40,10\t.\t.', (True, 20.0, 'AT')), + ('ariba_ref1\t23S.rDNA_WHO_F_01358c\t0\t1\t531\t9914\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3120\t744.8\t1\tSNP\tn\tC2597T\t1\tC2597T\tSNP\t2597\t2597\tC\t2755\t2755\tT\t823\tC,T\t487,1\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T.\tHigh-level resistance to Azithromycin', (False, 0.2, 'C')), + ('ariba\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t90,10\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 10.0, 'CT')), + ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t91,9\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (False, 9.0, 'C')), + ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t50,50\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 50.0, 'CT')), + ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t70,30\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 30.0, 'CT')), + ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t91,9\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (False, 91.0, 'T')), + ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t90,10\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 90.0, 'CT')), + ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t50,50\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 50.0, 'CT')), + ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t10,90\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 10.0, 'CT')), + ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t9,91\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 9.0, 'CT')), ] for line, expected in tests: @@ -88,10 +110,10 @@ def test_init(self): expected = [ {'coding': True, 'known': True, 'var_string': 'I14L', 'var_group': '.', 'het_percent': None}, - {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 100.0}, - {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 25.0}, - {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 50.0}, - {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 10.0}, + {'coding': False, 'known': True, 'var_string': '14T', 'var_group': 'id1', 'het_percent': 100.0}, + {'coding': False, 'known': True, 'var_string': '14AT', 'var_group': 'id1', 'het_percent': 25.0}, + {'coding': False, 'known': True, 'var_string': '14AGT', 'var_group': 'id1', 'het_percent': 50.0}, + {'coding': False, 'known': True, 'var_string': '14AT', 'var_group': 'id1', 'het_percent': 10.0}, ] assert len(lines) == len(expected) diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index ff427a70..90665fe9 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -244,13 +244,13 @@ def test_gather_unfiltered_output_data(self): self.assertEqual(expected_potential_cols, s.all_potential_columns) self.assertEqual(expected_all, s.all_data) - expected_potential_cols['noncoding1']['vars'] = {'A14T.%', 'A6G', 'A6G.%', 'A14T'} - expected_potential_cols['noncoding2']['vars'] = {'A52T', 'A52T.%', 'A42T', 'A42T.%'} + expected_potential_cols['noncoding1']['vars'] = {'14T', '14T.%', '14GT', '14GT.%', '6G', '6G.%'} + expected_potential_cols['noncoding2']['vars'] = {'52GT', '52GT.%', '42T', '42T.%'} - expected_all[infiles[0]]['noncoding1']['vars'] = {'A14T': 'yes', 'A14T.%': 100.0} - expected_all[infiles[0]]['noncoding2']['vars'] = {'A42T': 'yes', 'A42T.%': 100.0, 'A52T': 'het', 'A52T.%': 40.0} - expected_all[infiles[1]]['noncoding1']['vars'] = {'A14T': 'het', 'A14T.%': 80.0, 'A6G': 'yes', 'A6G.%': 100.0} - expected_all[infiles[1]]['noncoding2']['vars'] = {'A52T': 'het', 'A52T.%': 40.0} + expected_all[infiles[0]]['noncoding1']['vars'] = {'14T': 'yes', '14T.%': 100.0} + expected_all[infiles[0]]['noncoding2']['vars'] = {'42T': 'yes', '42T.%': 100.0, '52GT': 'het', '52GT.%': 40.0} + expected_all[infiles[1]]['noncoding1']['vars'] = {'14GT': 'het', '14GT.%': 80.0, '6G': 'yes', '6G.%': 100.0} + expected_all[infiles[1]]['noncoding2']['vars'] = {'52GT': 'het', '52GT.%': 40.0} s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_known_vars=True) s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() @@ -288,11 +288,11 @@ def test_to_matrix_all_cols(self): s._gather_unfiltered_output_data() got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(s.filenames, s.all_data, s.all_potential_columns, s.cluster_columns) - expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding1.id3.%:c2', 'noncoding1.A14T:o1', 'noncoding1.A14T.%:c2', 'noncoding1.A6G:o1', 'noncoding1.A6G.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.id2:o1', 'noncoding2.id2.%:c2', 'noncoding2.A42T:o1', 'noncoding2.A42T.%:c2', 'noncoding2.A52T:o1', 'noncoding2.A52T.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1', 'presence_absence1.A10V:o1'] - expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding1.id3.%', 'noncoding1.A14T', 'noncoding1.A14T.%', 'noncoding1.A6G', 'noncoding1.A6G.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2.id2.%', 'noncoding2.A42T', 'noncoding2.A42T.%', 'noncoding2.A52T', 'noncoding2.A52T.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var', 'presence_absence1.A10V'] + expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding1.id3.%:c2', 'noncoding1.14GT:o1', 'noncoding1.14GT.%:c2', 'noncoding1.14T:o1', 'noncoding1.14T.%:c2', 'noncoding1.6G:o1', 'noncoding1.6G.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.id2:o1', 'noncoding2.id2.%:c2', 'noncoding2.42T:o1', 'noncoding2.42T.%:c2', 'noncoding2.52GT:o1', 'noncoding2.52GT.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1', 'presence_absence1.A10V:o1'] + expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding1.id3.%', 'noncoding1.14GT', 'noncoding1.14GT.%', 'noncoding1.14T', 'noncoding1.14T.%', 'noncoding1.6G', 'noncoding1.6G.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2.id2.%', 'noncoding2.42T', 'noncoding2.42T.%', 'noncoding2.52GT', 'noncoding2.52GT.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var', 'presence_absence1.A10V'] expected_matrix = [ - ['sample1', 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 100.0, 'no', 'NA', 'yes', 100.0, 'no', 'NA', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 100.0, 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'], - [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 100.0, 'het', 80.0, 'yes', 100.0, 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'het', 40.0, 'no', 'NA', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes'] + ['sample1', 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 100.0, 'no', 'NA', 'no', 'NA', 'yes', 100.0, 'no', 'NA', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 100.0, 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'], + [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 100.0, 'het', 80.0, 'no', 'NA', 'yes', 100.0, 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'het', 40.0, 'no', 'NA', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes'] ] self.assertEqual(expected_phandango_header, got_phandango_header) @@ -336,11 +336,11 @@ def test_to_matrix_with_vars(self): s._gather_unfiltered_output_data() got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(s.filenames, s.all_data, s.all_potential_columns, s.cluster_columns) - expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.A14T:o1', 'noncoding1.A14T.%:c2', 'noncoding1.A6G:o1', 'noncoding1.A6G.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.A42T:o1', 'noncoding2.A42T.%:c2', 'noncoding2.A52T:o1', 'noncoding2.A52T.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1', 'presence_absence1.A10V:o1'] - expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.A14T', 'noncoding1.A14T.%', 'noncoding1.A6G', 'noncoding1.A6G.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.A42T', 'noncoding2.A42T.%', 'noncoding2.A52T', 'noncoding2.A52T.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var', 'presence_absence1.A10V'] + expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.14GT:o1', 'noncoding1.14GT.%:c2', 'noncoding1.14T:o1', 'noncoding1.14T.%:c2', 'noncoding1.6G:o1', 'noncoding1.6G.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.42T:o1', 'noncoding2.42T.%:c2', 'noncoding2.52GT:o1', 'noncoding2.52GT.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1', 'presence_absence1.A10V:o1'] + expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.14GT', 'noncoding1.14GT.%', 'noncoding1.14T', 'noncoding1.14T.%', 'noncoding1.6G', 'noncoding1.6G.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.42T', 'noncoding2.42T.%', 'noncoding2.52GT', 'noncoding2.52GT.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var', 'presence_absence1.A10V'] expected_matrix = [ - [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 100.0, 'no', 'NA', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 100.0, 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'], - [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 100.0, 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'no', 'NA', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes'] + [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'no', 'NA', 'yes', 100.0, 'no', 'NA', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 100.0, 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'], + [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'no', 'NA', 'yes', 100.0, 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'no', 'NA', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes'] ] self.assertEqual(expected_phandango_header, got_phandango_header) From 9d5dfb84de874563b391c46c2ad7289c6497ca7f Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 24 Feb 2017 23:59:12 +0000 Subject: [PATCH 43/88] Group micplot options --- scripts/ariba | 53 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/scripts/ariba b/scripts/ariba index 61d90e35..f2c86750 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -67,28 +67,37 @@ subparser_micplot.add_argument('antibiotic', help='Antibiotic name. Must exactly subparser_micplot.add_argument('mic_file', help='File containing MIC data for each sample and one or more antibiotics') subparser_micplot.add_argument('summary_file', help='File made by running "ariba summary"') subparser_micplot.add_argument('outprefix', help='Prefix of output files') -subparser_micplot.add_argument('--use_hets', choices=['yes', 'no', 'exclude'], help='How to deal with HET snps. Choose from yes,no,exclude. yes: count a het SNP as present. no: do not count a het SNP as present. exclude: completely remove any sample with any het SNP [%(default)s]', default='yes') -subparser_micplot.add_argument('--main_title', help='Main title of plot. Default is to use the antibiotic name', metavar='"title in quotes"') -subparser_micplot.add_argument('--plot_height', help='Height of plot (used in plot_height=X when running ggsave [%(default)s]', default=15, type=float, metavar='FLOAT') -subparser_micplot.add_argument('--plot_width', help='Height of plot (used in plot_width=X when running ggsave [%(default)s]', default=15, type=float, metavar='FLOAT') -subparser_micplot.add_argument('--no_log_y', action='store_true', help='Do not log scale the y axis') -subparser_micplot.add_argument('--plot_types', help='Types of plots to make, separated by commas. Choose from boxplot,violin,point [%(default)s]', default='violin,point', metavar='type1,type2,...') -subparser_micplot.add_argument('--jitter_width', help='Jitter width option when plotting points [%(default)s]', default=0.1, type=float, metavar='FLOAT') -subparser_micplot.add_argument('--jitter_height', help='Jitter height option when plotting points [%(default)s]', default=0.02, type=float, metavar='FLOAT') -subparser_micplot.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') -subparser_micplot.add_argument('--mic_values', help='Comma-separated list of MIC values to be shown on y axis ticks [%(default)s]', default='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024', metavar='float1,float2,...') -subparser_micplot.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. Default is to draw no lines.', metavar='float1,float2,...', default='') -subparser_micplot.add_argument('--point_size', type=float, help='Size of points when --plot_types includes point. If zero, will group points and size them proportional to the group size [%(default)s]', default=4, metavar='FLOAT') -subparser_micplot.add_argument('--point_range', help='Min and max size of plotted points when --point_size is 0 [%(default)s]', default='2,15', metavar='min,max') -subparser_micplot.add_argument('--point_break', help='Comma-spearated list of breakpoints in point sizes when --point_size is 0 [%(default)s]', default='10,50,100,200,300', metavar='integer1.integer2,integer3,...') -subparser_micplot.add_argument('--point_legend_x', type=float, help='x coord of legend when --point_size is 0 [%(default)s]', default=-0.15, metavar='FLOAT') -subparser_micplot.add_argument('--point_legend_y', type=float, help='y coord of legend when --point_size is 0 [%(default)s]', default=0.9, metavar='FLOAT') -subparser_micplot.add_argument('--dot_size', type=float, help='Size of dots in lower part of plot [%(default)s]', default=8, metavar='FLOAT') -subparser_micplot.add_argument('--dot_outline', action='store_true', help='Black outline around all dots (whether coloured or not) in lower part of plots') -subparser_micplot.add_argument('--dot_y_text_size', type=int, help='Text size of labels in lower dot plot [%(default)s]', default=18, metavar='INT') -subparser_micplot.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom panels. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='5,1', metavar='height1,height2') -subparser_micplot.add_argument('--palette', help='ColourBrewer palette to use [%(default)s]', default='Accent', metavar='palette_name') -subparser_micplot.add_argument('--number_of_colours', help='Number of colours in plot. 0:same number as columns in the plot. 1:all black. >1: take the first N colours from the colour palette specified by --palette and cycle them [%(default)s]', default=0, metavar='INT') + +micplot_general_group = subparser_micplot.add_argument_group('General options') +micplot_general_group.add_argument('--main_title', help='Main title of plot. Default is to use the antibiotic name', metavar='"title in quotes"') +micplot_general_group.add_argument('--plot_height', help='Height of plot (used in plot_height=X when running ggsave [%(default)s]', default=15, type=float, metavar='FLOAT') +micplot_general_group.add_argument('--plot_width', help='Height of plot (used in plot_width=X when running ggsave [%(default)s]', default=15, type=float, metavar='FLOAT') +micplot_general_group.add_argument('--use_hets', choices=['yes', 'no', 'exclude'], help='How to deal with HET snps. Choose from yes,no,exclude. yes: count a het SNP as present. no: do not count a het SNP as present. exclude: completely remove any sample with any het SNP [%(default)s]', default='yes') +micplot_general_group.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') +micplot_general_group.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom plots. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='5,1', metavar='height1,height2') + +micplot_colour_group = subparser_micplot.add_argument_group('Colour options') +micplot_colour_group.add_argument('--palette', help='ColourBrewer palette to use [%(default)s]', default='Accent', metavar='palette_name') +micplot_colour_group.add_argument('--number_of_colours', help='Number of colours in plot. 0:same number as columns in the plot. 1:all black. >1: take the first N colours from the colour palette specified by --palette and cycle them [%(default)s]', default=0, metavar='INT') + +micplot_upper_plot_group = subparser_micplot.add_argument_group('Upper plot options') +micplot_upper_plot_group.add_argument('--plot_types', help='Types of plots to make, separated by commas. Choose from boxplot,violin,point [%(default)s]', default='violin,point', metavar='type1,type2,...') +micplot_upper_plot_group.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. Default is to draw no lines.', metavar='float1,float2,...', default='') +micplot_upper_plot_group.add_argument('--jitter_width', help='Jitter width option when plotting points [%(default)s]', default=0.1, type=float, metavar='FLOAT') +micplot_upper_plot_group.add_argument('--jitter_height', help='Jitter height option when plotting points [%(default)s]', default=0.02, type=float, metavar='FLOAT') +micplot_upper_plot_group.add_argument('--no_log_y', action='store_true', help='Do not log scale the y axis') +micplot_upper_plot_group.add_argument('--mic_values', help='Comma-separated list of MIC values to be shown on y axis ticks [%(default)s]', default='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024', metavar='float1,float2,...') +micplot_upper_plot_group.add_argument('--point_size', type=float, help='Size of points when --plot_types includes point. If zero, will group points and size them proportional to the group size [%(default)s]', default=4, metavar='FLOAT') +micplot_upper_plot_group.add_argument('--point_range', help='Min and max size of plotted points when --point_size is 0 [%(default)s]', default='2,15', metavar='min,max') +micplot_upper_plot_group.add_argument('--point_break', help='Comma-separated list of breakpoints in point sizes when --point_size is 0 [%(default)s]', default='10,50,100,200,300', metavar='integer1.integer2,integer3,...') +micplot_upper_plot_group.add_argument('--point_legend_x', type=float, help='x coord of legend when --point_size is 0 [%(default)s]', default=-0.15, metavar='FLOAT') +micplot_upper_plot_group.add_argument('--point_legend_y', type=float, help='y coord of legend when --point_size is 0 [%(default)s]', default=0.9, metavar='FLOAT') + +micplot_lower_plot_group = subparser_micplot.add_argument_group('Lower plot options') +micplot_lower_plot_group.add_argument('--dot_size', type=float, help='Size of dots in lower part of plot [%(default)s]', default=8, metavar='FLOAT') +micplot_lower_plot_group.add_argument('--dot_outline', action='store_true', help='Black outline around all dots (whether coloured or not) in lower part of plots') +micplot_lower_plot_group.add_argument('--dot_y_text_size', type=int, help='Text size of labels [%(default)s]', default=18, metavar='INT') + subparser_micplot.set_defaults(func=ariba.tasks.micplot.run) #----------------------------- prepareref ------------------------------- From 27902b166c4dd06e6979d7a642caf203f1a75351 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Mon, 27 Feb 2017 09:30:50 +0000 Subject: [PATCH 44/88] Sort mutations by position; speed up avoid for loop --- ariba/mic_plotter.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index c4cb7c1d..cbe16382 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -321,17 +321,19 @@ def _make_plot(self, names(cols) <- sort(as.vector(unique(samples$Mutations))) setcols <- c() -for (i in 1:nrow(dots.melt)){ - if (dots.melt[i,3]==1){ - setcols <- c(setcols, cols[as.vector(dots.melt[i,2])]) - } - else{ - setcols <- c(setcols, "white") - } -} +setcols <- apply(dots.melt, 1, function(x){ + + if (x[3]==1){ cols[x[2]] }else{ "white" } + +}) + dots.melt <- cbind(dots.melt, setcols) mutations <- levels(dots.melt$var1) +mut.order <- gsub(".*\\.\\w", "", mutations) +mut.order <- as.numeric(gsub("\\D+", "", mut.order)) +mutations <- mutations[order(mut.order)] + i <- match("without_mutation", mutations) if (!is.na(i)) { mutations <- c(mutations[-i], "without_mutation") From cbc0f39d7966ef34c71d049a45bfd1f58722cd45 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Mon, 27 Feb 2017 10:16:54 +0000 Subject: [PATCH 45/88] Report filename when wrong number of columns --- ariba/summary_cluster.py | 10 ++++++++-- ariba/summary_sample.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py index f3f952c7..512948b1 100644 --- a/ariba/summary_cluster.py +++ b/ariba/summary_cluster.py @@ -1,3 +1,4 @@ +import sys from ariba import flag, report, summary_cluster_variant class Error (Exception): pass @@ -30,10 +31,15 @@ def __eq__(self, other): @classmethod - def line2dict(cls, line): + def line2dict(cls, line, filename=None): data = line.rstrip().split('\t') if len(data) != len(report.columns): - raise Error('Wrong number of columns in the following line. Expected ' + str(len(report.columns)) + ' but got ' + str(len(data)) + '\n' + line) + if filename is not None: + filename_message = 'Error reading ariba summary file "' + filename + '". ' + else: + filename_message = '' + raise Error(filename_message + 'Wrong number of columns in the following line. Expected ' + str(len(report.columns)) + ' but got ' + str(len(data)) + '\n' + line) + d = {report.columns[i]: data[i] for i in range(len(data))} try: d['flag'] = flag.Flag(int(d['flag']) ) diff --git a/ariba/summary_sample.py b/ariba/summary_sample.py index bc1ea25f..11107284 100644 --- a/ariba/summary_sample.py +++ b/ariba/summary_sample.py @@ -27,7 +27,7 @@ def _load_file(filename, min_pc_id, only_clusters=None): raise Error('Error parsing the following line.\n' + line) continue - data_dict = summary_cluster.SummaryCluster.line2dict(line) + data_dict = summary_cluster.SummaryCluster.line2dict(line, filename=filename) cluster = data_dict['cluster'] if only_clusters is not None and cluster not in only_clusters: continue From 9c86b787df9fb205918914744ed3ee172126fdd5 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Mon, 27 Feb 2017 11:18:16 +0000 Subject: [PATCH 46/88] Add option --interrupted --- ariba/mic_plotter.py | 10 ++++++---- ariba/tasks/micplot.py | 3 ++- scripts/ariba | 1 + 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index cbe16382..ca0adf45 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -34,7 +34,8 @@ def __init__(self, dot_y_text_size=18, panel_heights='5,1', palette='Accent', - number_of_colours=0 + number_of_colours=0, + interrupted=False ): self.antibiotic = antibiotic self.mic_file = mic_file @@ -98,6 +99,7 @@ def __init__(self, self.palette = palette self.number_of_colours = number_of_colours + self.interrupted = interrupted @classmethod @@ -174,7 +176,7 @@ def _load_summary_file(cls, infile): @classmethod - def _to_boxplot_tsv(cls, summary_data, mic_data, antibiotic, outfile, use_hets, no_combinations=False): + def _to_boxplot_tsv(cls, summary_data, mic_data, antibiotic, outfile, use_hets, no_combinations=False, interrupted=False): assert use_hets in {'yes', 'no', 'exclude'} ignore_columns = {'assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'} all_mutations = set() @@ -197,7 +199,7 @@ def _to_boxplot_tsv(cls, summary_data, mic_data, antibiotic, outfile, use_hets, found_het_and_exclude = False for cluster in summary_data[sample]: - if summary_data[sample][cluster]['assembled'] == 'interrupted': + if summary_data[sample][cluster]['assembled'] == 'interrupted' and interrupted: mutations.add(cluster + '.interrupted') for column, value in summary_data[sample][cluster].items(): @@ -451,7 +453,7 @@ def run(self): mic_data = MicPlotter._load_mic_file(self.mic_file) summary_data = MicPlotter._load_summary_file(self.summary_file) boxplot_tsv = self.outprefix + '.boxplot.tsv' - all_mutations, combinations = MicPlotter._to_boxplot_tsv(summary_data, mic_data, self.antibiotic, boxplot_tsv, self.use_hets, no_combinations=self.no_combinations) + all_mutations, combinations = MicPlotter._to_boxplot_tsv(summary_data, mic_data, self.antibiotic, boxplot_tsv, self.use_hets, no_combinations=self.no_combinations, interrupted=self.interrupted) dots_tsv = self.outprefix + '.dots.tsv' MicPlotter._to_dots_tsv(all_mutations, combinations, dots_tsv) self._make_plot(boxplot_tsv, dots_tsv) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index b78006a2..3249b33b 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -28,7 +28,8 @@ def run(options): dot_y_text_size=options.dot_y_text_size, panel_heights=options.panel_heights, palette=options.palette, - number_of_colours=options.number_of_colours + number_of_colours=options.number_of_colours, + interrupted=options.interrupted ) plotter.run() diff --git a/scripts/ariba b/scripts/ariba index f2c86750..3d43732c 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -73,6 +73,7 @@ micplot_general_group.add_argument('--main_title', help='Main title of plot. Def micplot_general_group.add_argument('--plot_height', help='Height of plot (used in plot_height=X when running ggsave [%(default)s]', default=15, type=float, metavar='FLOAT') micplot_general_group.add_argument('--plot_width', help='Height of plot (used in plot_width=X when running ggsave [%(default)s]', default=15, type=float, metavar='FLOAT') micplot_general_group.add_argument('--use_hets', choices=['yes', 'no', 'exclude'], help='How to deal with HET snps. Choose from yes,no,exclude. yes: count a het SNP as present. no: do not count a het SNP as present. exclude: completely remove any sample with any het SNP [%(default)s]', default='yes') +micplot_general_group.add_argument('--interrupted', action='store_true', help='Include interrupted genes (as in the assembled column of the ariba summary files)') micplot_general_group.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') micplot_general_group.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom plots. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='5,1', metavar='height1,height2') From c02f1ecce8bcda7f0233aedd2d098c5db8172fc3 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Mon, 27 Feb 2017 11:30:30 +0000 Subject: [PATCH 47/88] Add --interrupted option --- ariba/tests/mic_plotter_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ariba/tests/mic_plotter_test.py b/ariba/tests/mic_plotter_test.py index 78990c0e..3d6a2b33 100644 --- a/ariba/tests/mic_plotter_test.py +++ b/ariba/tests/mic_plotter_test.py @@ -136,14 +136,14 @@ def test_to_boxplot_tsv(self): for antibio in ['antibio1', 'antibio2']: for use_het in ['no', 'yes', 'exclude']: - got_muts, got_combs = mic_plotter.MicPlotter._to_boxplot_tsv(summary_data, mic_data, antibio, tmp_tsv, use_het) + got_muts, got_combs = mic_plotter.MicPlotter._to_boxplot_tsv(summary_data, mic_data, antibio, tmp_tsv, use_het, interrupted=True) expected_tsv = os.path.join(data_dir, 'mic_plotter_to_boxplot_tsv.' + antibio + '.' + use_het + '.tsv') self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False)) self.assertEqual(got_muts, expected_mutations[antibio][use_het]) self.assertEqual(got_combs, expected_combs[antibio][use_het]) os.unlink(tmp_tsv) - got_muts, got_combs = mic_plotter.MicPlotter._to_boxplot_tsv(summary_data, mic_data, antibio, tmp_tsv, use_het, no_combinations=True) + got_muts, got_combs = mic_plotter.MicPlotter._to_boxplot_tsv(summary_data, mic_data, antibio, tmp_tsv, use_het, no_combinations=True, interrupted=True) expected_tsv = os.path.join(data_dir, 'mic_plotter_to_boxplot_tsv.' + antibio + '.' + use_het + '.no_combinations.tsv') self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False)) self.assertEqual(got_muts, expected_mutations[antibio][use_het]) From 4261c351ea47ac12909fe0035f7a791f0fb7e589 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Mon, 27 Feb 2017 11:57:56 +0000 Subject: [PATCH 48/88] Fix sorting variant names --- ariba/mic_plotter.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index ca0adf45..65dfc902 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -35,7 +35,8 @@ def __init__(self, panel_heights='5,1', palette='Accent', number_of_colours=0, - interrupted=False + interrupted=False, + no_clust_when_group=False, ): self.antibiotic = antibiotic self.mic_file = mic_file @@ -331,10 +332,18 @@ def _make_plot(self, dots.melt <- cbind(dots.melt, setcols) -mutations <- levels(dots.melt$var1) -mut.order <- gsub(".*\\.\\w", "", mutations) -mut.order <- as.numeric(gsub("\\D+", "", mut.order)) -mutations <- mutations[order(mut.order)] +mutlevels <- levels(dots.melt$var1) +genes <- unique(gsub("\\..*", "", mutlevels)) +mutations <- c() + +for (i in 1:length(genes)){ + curmutations <- mutlevels[grep(paste0(genes[i], "."), mutlevels)] + curgene <- gsub("^.*\\.", "", curmutations) + curgene <- gsub("^\\D+", "", curgene) # if there is a reference base + genepos <- as.numeric(gsub("\\D+", "", curgene)) + curmutations <- curmutations[order(genepos)] + mutations <- c(mutations, curmutations) +} i <- match("without_mutation", mutations) if (!is.na(i)) { From 3c60d72944ac20bfbe7aefd0573944be2ea9fc77 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Mon, 27 Feb 2017 11:59:12 +0000 Subject: [PATCH 49/88] Remove unused option --- ariba/mic_plotter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 65dfc902..4f7d040f 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -35,8 +35,7 @@ def __init__(self, panel_heights='5,1', palette='Accent', number_of_colours=0, - interrupted=False, - no_clust_when_group=False, + interrupted=False ): self.antibiotic = antibiotic self.mic_file = mic_file From 800fc7ce8a5ebb96d944e45d7865541d4375fb50 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Mon, 27 Feb 2017 14:41:15 +0000 Subject: [PATCH 50/88] Bug fix reporting hets not really present --- ariba/summary_cluster_variant.py | 5 +++-- ariba/tests/data/summary_test_whole_run.out.csv | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/ariba/summary_cluster_variant.py b/ariba/summary_cluster_variant.py index 1f7d2215..709a2b63 100644 --- a/ariba/summary_cluster_variant.py +++ b/ariba/summary_cluster_variant.py @@ -126,8 +126,9 @@ def _get_nonsynon_variant_data(self, data_dict): if var_bases is not None: self.var_string = re.sub(r'[^0-9]', '', self.var_string) + var_bases - if not SummaryClusterVariant._has_nonsynonymous(data_dict): - self.has_nonsynon = False + self.has_nonsynon = SummaryClusterVariant._has_nonsynonymous(data_dict) and not (data_dict['var_type'] == 'HET' and not self.is_het) + + if not self.has_nonsynon: return self.has_nonsynon = True diff --git a/ariba/tests/data/summary_test_whole_run.out.csv b/ariba/tests/data/summary_test_whole_run.out.csv index 44ba9db1..daa0ae38 100644 --- a/ariba/tests/data/summary_test_whole_run.out.csv +++ b/ariba/tests/data/summary_test_whole_run.out.csv @@ -1,3 +1,3 @@ -name,23S.assembled,23S.match,23S.ref_seq,23S.pct_id,23S.known_var,23S.2597CT,23S.2597CT.%,23S.2597TC,23S.2597TC.%,coding1.assembled,coding1.match,coding1.ref_seq,coding1.pct_id,coding2.assembled,coding2.match,coding2.ref_seq,coding2.pct_id,coding3.assembled,coding3.match,coding3.ref_seq,coding3.pct_id,coding5.assembled,coding5.match,coding5.ref_seq,coding5.pct_id,coding5.known_var,coding5.A42S,coding6.assembled,coding6.match,coding6.ref_seq,coding6.pct_id,coding6.known_var,coding6.A52S,coding7.assembled,coding7.ref_seq,coding7.pct_id,coding8.assembled,coding8.match,coding8.ref_seq,coding8.pct_id,coding8.novel_var,coding8.A53S,mdfA.assembled,mdfA.ref_seq,mdfA.pct_id,mdfA.novel_var,mdfA.G261GGGTGTGGTGTGGT/GGGTGTGGT,noncoding1.assembled,noncoding1.match,noncoding1.ref_seq,noncoding1.pct_id,noncoding10.assembled,noncoding10.match,noncoding10.ref_seq,noncoding10.pct_id,noncoding10.novel_var,noncoding10.100T,noncoding10.100T.%,noncoding11.assembled,noncoding11.match,noncoding11.ref_seq,noncoding11.pct_id,noncoding11.novel_var,noncoding11.101AG,noncoding11.101AG.%,noncoding2.assembled,noncoding2.match,noncoding2.ref_seq,noncoding2.pct_id,noncoding3.assembled,noncoding3.match,noncoding3.ref_seq,noncoding3.pct_id,noncoding5.assembled,noncoding5.match,noncoding5.ref_seq,noncoding5.pct_id,noncoding5.known_var,noncoding5.42T,noncoding5.42T.%,noncoding6.assembled,noncoding6.match,noncoding6.ref_seq,noncoding6.pct_id,noncoding6.known_var,noncoding6.52CT,noncoding6.52CT.%,noncoding7.assembled,noncoding7.match,noncoding7.ref_seq,noncoding7.pct_id,noncoding7.known_var,noncoding7.53T,noncoding7.53T.%,noncoding8.assembled,noncoding8.match,noncoding8.ref_seq,noncoding8.pct_id,noncoding9.assembled,noncoding9.ref_seq,noncoding9.pct_id -summary_test_whole_run.in.1.tsv,yes,yes,23S.rDNA_WHO_F_01358c,99.86,yes,no,NA,yes,100.0,interrupted,no,coding1_ref1,99.1,yes,yes,coding2_ref1,98.2,no,no,NA,NA,yes,no,coding5_ref1,97.4,no,no,yes,yes,coding6_ref1,95.5,yes,yes,yes,coding7_ref1,95.4,yes,yes,coding8_ref1,95.3,yes,yes,interrupted,mdfA.3001328.JQ394987.0_1233.561,97.0,yes,yes,yes,yes,noncoding1_ref1,99.1,yes,yes,noncoding10_ref1,95.1,yes,yes,99.0,yes,yes,noncoding11_ref1,95.05,het,het,30.0,yes,yes,noncoding2_ref1,98.2,no,no,NA,NA,yes,yes,noncoding5_ref1,97.4,yes,yes,100.0,yes,yes,noncoding6_ref1,95.5,yes,het,70.0,yes,yes,noncoding7_ref1,95.4,yes,yes,98.6,yes,yes,noncoding8_ref1,95.3,yes,noncoding9_ref1,95.2 -summary_test_whole_run.in.2.tsv,yes_nonunique,no,23S.rDNA_WHO_F_01358c,99.84,het,het,12.8,no,NA,yes,yes,coding1_ref2,99.2,no,no,NA,NA,yes,yes,coding3_ref1,97.6,yes,yes,coding5_ref1,97.4,yes,yes,no,no,NA,NA,NA,NA,no,NA,NA,no,no,NA,NA,NA,NA,no,NA,NA,NA,NA,yes,yes,noncoding1_ref2,99.2,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,yes,yes,noncoding3_ref1,97.6,yes,no,noncoding5_ref1,99.42,no,no,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,no,NA,NA +name,23S.assembled,23S.match,23S.ref_seq,23S.pct_id,23S.known_var,23S.2597CT,23S.2597CT.%,23S.2597TC,23S.2597TC.%,coding1.assembled,coding1.match,coding1.ref_seq,coding1.pct_id,coding2.assembled,coding2.match,coding2.ref_seq,coding2.pct_id,coding3.assembled,coding3.match,coding3.ref_seq,coding3.pct_id,coding5.assembled,coding5.match,coding5.ref_seq,coding5.pct_id,coding5.known_var,coding5.A42S,coding6.assembled,coding6.match,coding6.ref_seq,coding6.pct_id,coding6.known_var,coding6.A52S,coding7.assembled,coding7.ref_seq,coding7.pct_id,coding8.assembled,coding8.match,coding8.ref_seq,coding8.pct_id,coding8.novel_var,coding8.A53S,mdfA.assembled,mdfA.ref_seq,mdfA.pct_id,mdfA.novel_var,noncoding1.assembled,noncoding1.match,noncoding1.ref_seq,noncoding1.pct_id,noncoding10.assembled,noncoding10.match,noncoding10.ref_seq,noncoding10.pct_id,noncoding10.novel_var,noncoding10.100T,noncoding10.100T.%,noncoding11.assembled,noncoding11.match,noncoding11.ref_seq,noncoding11.pct_id,noncoding11.novel_var,noncoding11.101AG,noncoding11.101AG.%,noncoding2.assembled,noncoding2.match,noncoding2.ref_seq,noncoding2.pct_id,noncoding3.assembled,noncoding3.match,noncoding3.ref_seq,noncoding3.pct_id,noncoding5.assembled,noncoding5.match,noncoding5.ref_seq,noncoding5.pct_id,noncoding5.known_var,noncoding5.42T,noncoding5.42T.%,noncoding6.assembled,noncoding6.match,noncoding6.ref_seq,noncoding6.pct_id,noncoding6.known_var,noncoding6.52CT,noncoding6.52CT.%,noncoding7.assembled,noncoding7.match,noncoding7.ref_seq,noncoding7.pct_id,noncoding7.known_var,noncoding7.53T,noncoding7.53T.%,noncoding8.assembled,noncoding8.match,noncoding8.ref_seq,noncoding8.pct_id,noncoding9.assembled,noncoding9.ref_seq,noncoding9.pct_id +summary_test_whole_run.in.1.tsv,yes,yes,23S.rDNA_WHO_F_01358c,99.86,yes,no,NA,yes,100.0,interrupted,no,coding1_ref1,99.1,yes,yes,coding2_ref1,98.2,no,no,NA,NA,yes,no,coding5_ref1,97.4,no,no,yes,yes,coding6_ref1,95.5,yes,yes,yes,coding7_ref1,95.4,yes,yes,coding8_ref1,95.3,yes,yes,interrupted,mdfA.3001328.JQ394987.0_1233.561,97.0,yes,yes,yes,noncoding1_ref1,99.1,yes,yes,noncoding10_ref1,95.1,yes,yes,99.0,yes,yes,noncoding11_ref1,95.05,het,het,30.0,yes,yes,noncoding2_ref1,98.2,no,no,NA,NA,yes,yes,noncoding5_ref1,97.4,yes,yes,100.0,yes,yes,noncoding6_ref1,95.5,yes,het,70.0,yes,yes,noncoding7_ref1,95.4,yes,yes,98.6,yes,yes,noncoding8_ref1,95.3,yes,noncoding9_ref1,95.2 +summary_test_whole_run.in.2.tsv,yes_nonunique,no,23S.rDNA_WHO_F_01358c,99.84,het,het,12.8,no,NA,yes,yes,coding1_ref2,99.2,no,no,NA,NA,yes,yes,coding3_ref1,97.6,yes,yes,coding5_ref1,97.4,yes,yes,no,no,NA,NA,NA,NA,no,NA,NA,no,no,NA,NA,NA,NA,no,NA,NA,NA,yes,yes,noncoding1_ref2,99.2,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,yes,yes,noncoding3_ref1,97.6,yes,no,noncoding5_ref1,99.42,no,no,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,no,NA,NA From 5614d625fbc51f3309158302cee57d772aceb806 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Mon, 27 Feb 2017 15:30:38 +0000 Subject: [PATCH 51/88] bug fix losing without_mutation --- ariba/mic_plotter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 4f7d040f..c2655789 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -331,8 +331,11 @@ def _make_plot(self, dots.melt <- cbind(dots.melt, setcols) + mutlevels <- levels(dots.melt$var1) genes <- unique(gsub("\\..*", "", mutlevels)) +check.without <- match("without_mutation", genes) +if(!is.na(check.without)){ genes <- genes[-check.without] } mutations <- c() for (i in 1:length(genes)){ @@ -344,10 +347,7 @@ def _make_plot(self, mutations <- c(mutations, curmutations) } -i <- match("without_mutation", mutations) -if (!is.na(i)) { - mutations <- c(mutations[-i], "without_mutation") -} +if (!is.na(check.without)){ mutations <- c(mutations, "without_mutation") } dotplot <- ggplot(dots.melt, aes(x=var2, y=var1)) + From 6d76f9def932b3d275bb1d51c04a40a0ea50eaee Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 28 Feb 2017 08:44:46 +0000 Subject: [PATCH 52/88] Add options --violin_y_jitter --violin_scale_width --- ariba/mic_plotter.py | 12 ++++++++++-- ariba/tasks/micplot.py | 4 +++- scripts/ariba | 2 ++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index c2655789..8c208c04 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -35,7 +35,9 @@ def __init__(self, panel_heights='5,1', palette='Accent', number_of_colours=0, - interrupted=False + interrupted=False, + violin_y_jitter=0, + violin_scale_width=False ): self.antibiotic = antibiotic self.mic_file = mic_file @@ -100,6 +102,8 @@ def __init__(self, self.palette = palette self.number_of_colours = number_of_colours self.interrupted = interrupted + self.violin_y_jitter = violin_y_jitter + self.violin_scale_width = violin_scale_width @classmethod @@ -389,7 +393,11 @@ def _make_plot(self, legend_position = 'c(' + str(self.point_legend_x) + ',' + str(self.point_legend_y) + ')' if 'violin' in self.plot_types: - print(' geom_violin(data=samples, aes(x=Mutations, y=', ymic, ', color=Mutations), alpha=.10, show.legend = FALSE) +''', file=f) + print(' geom_violin(data=samples, aes(x=Mutations, y=jitter(', ymic, ',', self.violin_y_jitter, '), color=Mutations)', sep='', end='', file=f) + if self.violin_scale_width: + print(', scale="width"', end='', file=f) + + print(', alpha=.10, show.legend = FALSE) +', file=f) if 'boxplot' in self.plot_types: print(' geom_boxplot(data=samples, aes(x=Mutations, y=', ymic, ', color=Mutations), alpha=.10, show.legend = FALSE) +''', file=f) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index 3249b33b..a72f023a 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -29,7 +29,9 @@ def run(options): panel_heights=options.panel_heights, palette=options.palette, number_of_colours=options.number_of_colours, - interrupted=options.interrupted + interrupted=options.interrupted, + violin_y_jitter=options.violin_y_jitter, + violin_scale_width=options.violin_scale_width ) plotter.run() diff --git a/scripts/ariba b/scripts/ariba index 3d43732c..ecc6ef98 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -93,6 +93,8 @@ micplot_upper_plot_group.add_argument('--point_range', help='Min and max size of micplot_upper_plot_group.add_argument('--point_break', help='Comma-separated list of breakpoints in point sizes when --point_size is 0 [%(default)s]', default='10,50,100,200,300', metavar='integer1.integer2,integer3,...') micplot_upper_plot_group.add_argument('--point_legend_x', type=float, help='x coord of legend when --point_size is 0 [%(default)s]', default=-0.15, metavar='FLOAT') micplot_upper_plot_group.add_argument('--point_legend_y', type=float, help='y coord of legend when --point_size is 0 [%(default)s]', default=0.9, metavar='FLOAT') +micplot_upper_plot_group.add_argument('--violin_y_jitter', type=float, help='y jitter in violin plot. Set to a small number>0, eg 0.001, if violins are not visible. Also see --violin_scale_width [%(default)s]', default=0) +micplot_upper_plot_group.add_argument('--violin_scale_width', action='store_true', help='Use this if violins are too narrow. Also see --violin_y_jitter') micplot_lower_plot_group = subparser_micplot.add_argument_group('Lower plot options') micplot_lower_plot_group.add_argument('--dot_size', type=float, help='Size of dots in lower part of plot [%(default)s]', default=8, metavar='FLOAT') From 7332426539c11b6c6fe393f3b9b321e0ab6cdcc7 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 28 Feb 2017 08:48:19 +0000 Subject: [PATCH 53/88] metavar=float --- scripts/ariba | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ariba b/scripts/ariba index ecc6ef98..7aa57ca9 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -93,7 +93,7 @@ micplot_upper_plot_group.add_argument('--point_range', help='Min and max size of micplot_upper_plot_group.add_argument('--point_break', help='Comma-separated list of breakpoints in point sizes when --point_size is 0 [%(default)s]', default='10,50,100,200,300', metavar='integer1.integer2,integer3,...') micplot_upper_plot_group.add_argument('--point_legend_x', type=float, help='x coord of legend when --point_size is 0 [%(default)s]', default=-0.15, metavar='FLOAT') micplot_upper_plot_group.add_argument('--point_legend_y', type=float, help='y coord of legend when --point_size is 0 [%(default)s]', default=0.9, metavar='FLOAT') -micplot_upper_plot_group.add_argument('--violin_y_jitter', type=float, help='y jitter in violin plot. Set to a small number>0, eg 0.001, if violins are not visible. Also see --violin_scale_width [%(default)s]', default=0) +micplot_upper_plot_group.add_argument('--violin_y_jitter', type=float, help='y jitter in violin plot. Set to a small number>0, eg 0.001, if violins are not visible. Also see --violin_scale_width [%(default)s]', default=0, metavar='FLOAT') micplot_upper_plot_group.add_argument('--violin_scale_width', action='store_true', help='Use this if violins are too narrow. Also see --violin_y_jitter') micplot_lower_plot_group = subparser_micplot.add_argument_group('Lower plot options') From f78a2f5cdc62b0b0192671279751117d0b89fcc7 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 28 Feb 2017 23:13:15 +0000 Subject: [PATCH 54/88] Use matplotlib instead of R --- ariba/mic_plotter.py | 542 +++++++++--------- ariba/tasks/micplot.py | 8 +- ariba/tests/data/mic_plotter_to_dots.tsv | 4 - .../mic_plotter_to_dots_without_mutation.tsv | 6 - ariba/tests/mic_plotter_test.py | 111 +++- scripts/ariba | 14 +- 6 files changed, 369 insertions(+), 316 deletions(-) delete mode 100644 ariba/tests/data/mic_plotter_to_dots.tsv delete mode 100644 ariba/tests/data/mic_plotter_to_dots_without_mutation.tsv diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 8c208c04..34612300 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -1,12 +1,21 @@ import csv import re import os +import itertools +import collections +import matplotlib.pyplot as plt +import matplotlib.gridspec as gridspec +import matplotlib.cm as cmx +import math +import pyfastaq from ariba import common class Error (Exception): pass regex_string_to_float = re.compile(r'\s*(?P[<>]?)\s*(?P=?)\s*(?P[0-9.]+)\s*$') +regex_position_from_var = re.compile(r'^[^0-9]*(?P[0-9]+)[^0-9]*$') + class MicPlotter: def __init__(self, antibiotic, @@ -15,29 +24,25 @@ def __init__(self, outprefix, use_hets='yes', main_title=None, - plot_height=15, - plot_width=15, - log_y=True, + plot_height=7, + plot_width=7, + log_y=2, plot_types="points,violin", jitter_width=0.1, jitter_height=0.01, no_combinations=False, - mic_values='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024', hlines='0.25,2', point_size=4, point_range='2,15', point_break='10,50,100,200,300', - point_legend_x=-0.15, - point_legend_y=0.9, - dot_size=8, + dot_size=100, dot_outline=False, dot_y_text_size=18, panel_heights='5,1', palette='Accent', number_of_colours=0, interrupted=False, - violin_y_jitter=0, - violin_scale_width=False + violin_width=0.75 ): self.antibiotic = antibiotic self.mic_file = mic_file @@ -63,11 +68,6 @@ def __init__(self, self.jitter_height = jitter_height self.no_combinations = no_combinations - try: - self.mic_values = [float(x) for x in mic_values.split(',')] - except: - raise Error('Error in mic_values option. Needs to be a list of numbers separated by commas. Got this:\n' + mic_values) - try: if len(hlines) == 0: self.hlines = [] @@ -88,8 +88,6 @@ def __init__(self, except: raise Error('Error in point_break option. Needs to be comma-sparated list of integers. Got this:\n' + point_break) - self.point_legend_x = point_legend_x - self.point_legend_y = point_legend_y self.dot_size = dot_size self.dot_outline = dot_outline self.dot_y_text_size = dot_y_text_size @@ -102,8 +100,7 @@ def __init__(self, self.palette = palette self.number_of_colours = number_of_colours self.interrupted = interrupted - self.violin_y_jitter = violin_y_jitter - self.violin_scale_width = violin_scale_width + self.violin_width = violin_width @classmethod @@ -180,297 +177,310 @@ def _load_summary_file(cls, infile): @classmethod - def _to_boxplot_tsv(cls, summary_data, mic_data, antibiotic, outfile, use_hets, no_combinations=False, interrupted=False): + def _get_colours(cls, total_length, number_of_colours, colormap): + if number_of_colours == 1: + return ["black"] * total_length + elif number_of_colours == 0: + cmap = cmx.get_cmap(colormap) + vals = [1.0 * x / (total_length - 1) for x in range(total_length)] + return [cmap(x) for x in vals] + else: + cmap = cmx.get_cmap(colormap) + colours = [] + for i in itertools.cycle(range(number_of_colours)): + colours.append(cmap(i)) + if len(colours) >= total_length: + break + return colours + + + @classmethod + def _get_top_plot_data(cls, summary_data, mic_data, antibiotic, use_hets, no_combinations=False, interrupted=False, outfile=None): assert use_hets in {'yes', 'no', 'exclude'} + if outfile is not None: + f = pyfastaq.utils.open_file_write(outfile) + print('Sample\tMIC\tMutations', file=f) + ignore_columns = {'assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'} all_mutations = set() all_mutations_seen_combinations = set() + top_plot_data = {} # cluster combination -> list of y coords (MIC values) - with open(outfile, 'w') as f: - print('Sample\tMIC\tMutations', file=f) - - for sample in sorted(summary_data): - if sample not in mic_data: - raise Error('No MIC data found for sample "' + sample + '". Cannot continue') - - if antibiotic not in mic_data[sample]: - raise Error('Antibiotic "' + antibiotic + '" not found. Cannot continue') + for sample in sorted(summary_data): + if sample not in mic_data: + raise Error('No MIC data found for sample "' + sample + '". Cannot continue') + if antibiotic not in mic_data[sample]: + raise Error('Antibiotic "' + antibiotic + '" not found. Cannot continue') - if mic_data[sample][antibiotic] == 'NA': - continue + if mic_data[sample][antibiotic] == 'NA': + continue - mutations = set() - found_het_and_exclude = False + mutations = set() + found_het_and_exclude = False - for cluster in summary_data[sample]: - if summary_data[sample][cluster]['assembled'] == 'interrupted' and interrupted: - mutations.add(cluster + '.interrupted') + for cluster in summary_data[sample]: + if summary_data[sample][cluster]['assembled'] == 'interrupted' and interrupted: + mutations.add(cluster + '.interrupted') - for column, value in summary_data[sample][cluster].items(): - if column in ignore_columns or column.endswith('.%'): - continue + for column, value in summary_data[sample][cluster].items(): + if column in ignore_columns or column.endswith('.%'): + continue - if value == 'yes' or (use_hets == 'yes' and value == 'het'): - mutations.add(cluster + '.' + column.strip()) - elif use_hets == 'exclude' and value == 'het': - found_het_and_exclude = True - break - - if found_het_and_exclude: + if value == 'yes' or (use_hets == 'yes' and value == 'het'): + mutations.add(cluster + '.' + column.strip()) + elif use_hets == 'exclude' and value == 'het': + found_het_and_exclude = True break if found_het_and_exclude: - continue + break + + if found_het_and_exclude: + continue - if len(mutations) == 0: - mutations.add('without_mutation') + if len(mutations) == 0: + mutations.add('without_mutation') - all_mutations.update(mutations) - mutations = list(mutations) - mutations.sort() - if no_combinations: - for mutation in mutations: - all_mutations_seen_combinations.add((mutation,)) + all_mutations.update(mutations) + mutations = list(mutations) + mutations.sort() + if no_combinations: + for mutation in mutations: + all_mutations_seen_combinations.add((mutation,)) + if mutation not in top_plot_data: + top_plot_data[mutation] = [] + top_plot_data[mutation].append(mic_data[sample][antibiotic]) + if outfile is not None: print(sample, mic_data[sample][antibiotic], mutation, sep='\t', file=f) - else: - all_mutations_seen_combinations.add(tuple(mutations)) - mutations = '.'.join(mutations) + else: + all_mutations_seen_combinations.add(tuple(mutations)) + mutations = '.'.join(mutations) + if mutations not in top_plot_data: + top_plot_data[mutations] = [] + top_plot_data[mutations].append(mic_data[sample][antibiotic]) + if outfile is not None: print(sample, mic_data[sample][antibiotic], mutations, sep='\t', file=f) - return all_mutations, all_mutations_seen_combinations + + if outfile is not None: + pyfastaq.utils.close(f) + + return top_plot_data, all_mutations, all_mutations_seen_combinations @classmethod - def _to_dots_tsv(cls, all_mutations, combinations, outfile): - if 'without_mutation' in all_mutations: - all_mutations.remove('without_mutation') - combinations.remove(('without_mutation',)) - has_without_mutation = True + def _top_plot_y_ticks(cls, mic_data, antibiotic, log_y): + mic_values = set() + for sample in mic_data: + mic = mic_data[sample][antibiotic] + if mic not in [None, 'NA']: + mic_values.add(mic) + + max_mic = max(mic_values) + min_mic = min(mic_values) + new_mic_values = [] + i = 1 + while i < max_mic * 2: + new_mic_values.append(i) + i *= 2 + + i = 0.5 + while i > min_mic / 2: + new_mic_values.append(i) + i *= 0.5 + + new_mic_values.sort() + new_mic_values = [round(x, 4) for x in new_mic_values] + + if log_y > 0: + tick_positions = [math.log(x, log_y) for x in new_mic_values] else: - has_without_mutation = False + tick_positions = new_mic_values - all_mutations = list(all_mutations) - all_mutations.sort() - combinations = list(combinations) - combinations.sort() + return tick_positions, new_mic_values - if has_without_mutation: - all_mutations.append('without_mutation') - combinations.append(('without_mutation',)) - output_columns = {} - for combination in combinations: - output_columns[combination] = [(1 if x in combination else 0) for x in all_mutations] + @classmethod + def _top_plot_scatter_counts(cls, mutations, top_plot_data, colours, log_y): + x_coords = [] + y_coords = [] + sizes = [] + colour_list = [] + + for i, mutation in enumerate(mutations): + counts = collections.Counter(top_plot_data[mutation]) + for mic in sorted(counts): + x_coords.append(i + 1) + if log_y > 0: + y_coords.append(math.log(mic, log_y)) + else: + y_coords.append(mic) + sizes.append(counts[mic]) + colour_list.append(colours[i]) + + return x_coords, y_coords, sizes, colour_list - with open(outfile, 'w') as f: - print('Mutation', end='', file=f) - for x in combinations: - print('\t', '.'.join(x), sep='', end='', file=f) - print('', file=f) - for i in range(len(all_mutations)): - row = [all_mutations[i]] + [output_columns[x][i] for x in combinations] - print(*row, sep='\t', file=f) + @classmethod + def _top_plot_violin_data(cls, mutations, top_plot_data, log_y): + violin_data = [] + violin_pos = [] + for i, mutation in enumerate(mutations): + if log_y > 0: + violin_data.append([math.log(x, log_y) for x in top_plot_data[mutation]]) + else: + violin_data.append(top_plot_data[mutation]) + violin_pos.append(i + 1) - def _make_plot(self, - samples_file, - dots_file, - ): - r_script = self.outprefix + '.R' + return violin_data, violin_pos - try: - f = open(r_script, 'w') - except: - raise Error('Error opening R script for writing "' + r_script + '"') + + @classmethod + def _ordered_bottom_plot_rows(cls, mutations): + l = [] + infinity = float('inf') + + for x in mutations: + try: + cluster, variant = x.split('.', maxsplit=1) + except: + l.append((x, infinity, x)) + continue + + if '.' in variant: + try: + var_group, var = variant.split('.', maxsplit=1) + except: + var_group = None + var = variant + + variant = var + + regex_match = regex_position_from_var.match(variant) + if regex_match is not None and regex_match.group('coord') != '': + coord = int(regex_match.group('coord')) + else: + coord = infinity - libraries = ['ggplot2', 'RColorBrewer', 'reshape2'] - for lib in libraries: - print('library(', lib, ')', sep='', file=f) + l.append((cluster, coord, x)) - print('samples = read.csv(file="', samples_file, r'''", header=TRUE, sep="\t")''', sep='', file=f) - print('dots = read.csv(file="', dots_file, r'''", header=TRUE, sep="\t", check.names=FALSE)''', sep='', file=f) + l.sort() + return [x[-1] for x in l] - if self.log_y: - print('use.log = TRUE', file=f) - else: - print('use.log = FALSE', file=f) - - dot_colour = '"black"' if self.dot_outline else 'setcols' - - print(r''' -dots.melt = melt(dots) -colnames(dots.melt) <- c("var1", "var2", "value") - -palette.name = "''', self.palette, r'''" -colour.number = ''', self.number_of_colours, r''' -ncols <- length(as.vector(unique(samples$Mutations))) - -if (colour.number == 0) { - accent <- brewer.pal(8, palette.name) - accentPalette <- colorRampPalette(accent) - cols <- accentPalette(ncols) -} else if (colour.number == 1) { - cols <- rep("black", ncols) -} else { - if (colour.number == 2) { - unique_cols <- brewer.pal(3, palette.name)[1:2] - } - else { - unique_cols <- brewer.pal(colour.number, palette.name) - } - - cols <- rep(unique_cols, ncols) -} - -names(cols) <- sort(as.vector(unique(samples$Mutations))) -setcols <- c() - -setcols <- apply(dots.melt, 1, function(x){ - - if (x[3]==1){ cols[x[2]] }else{ "white" } - -}) - -dots.melt <- cbind(dots.melt, setcols) - - -mutlevels <- levels(dots.melt$var1) -genes <- unique(gsub("\\..*", "", mutlevels)) -check.without <- match("without_mutation", genes) -if(!is.na(check.without)){ genes <- genes[-check.without] } -mutations <- c() - -for (i in 1:length(genes)){ - curmutations <- mutlevels[grep(paste0(genes[i], "."), mutlevels)] - curgene <- gsub("^.*\\.", "", curmutations) - curgene <- gsub("^\\D+", "", curgene) # if there is a reference base - genepos <- as.numeric(gsub("\\D+", "", curgene)) - curmutations <- curmutations[order(genepos)] - mutations <- c(mutations, curmutations) -} - -if (!is.na(check.without)){ mutations <- c(mutations, "without_mutation") } - - -dotplot <- ggplot(dots.melt, aes(x=var2, y=var1)) + - geom_point(aes(fill=setcols, colour=''', dot_colour, '), shape=21, size=''', self.dot_size, r''') + - scale_fill_identity()+ - scale_colour_identity()+ - ylim(rev(mutations)) + - theme_bw() + - theme(axis.text.x = element_blank(), - axis.text.y = element_text(size=''', self.dot_y_text_size, r'''), - axis.title.x = element_blank(), - axis.title.y = element_blank(), - axis.ticks = element_blank(), - panel.border = element_blank(), - panel.grid.minor = element_blank(), - panel.grid.major = element_blank(), - legend.position="none") - -range.mics <- sort(c(''' + ','.join([str(x) for x in self.mic_values]) + r''')) -if (use.log & range.mics[1] == 0) { - range.mics <- range.mics[-1] -} -if (use.log){ final.mics <- log(range.mics) }else{ final.mics <- range.mics } - -sized_dot_data <- aggregate(samples$Sample,by=list(x=samples$Mutations,y=samples$MIC),length) -names(sized_dot_data)[3] <- "count" - -top_plot <- ggplot() +''', sep='', file=f) - - ymic = 'log(MIC)' if self.log_y else 'MIC' - legend_position = '"none"' - - if 'point' in self.plot_types: - if self.point_size > 0: - print(' geom_point(data=samples, aes(x=Mutations, y=', ymic, ', color=Mutations), position = position_jitter(width=''', self.jitter_width, ', height=', self.jitter_height, '), size=', self.point_size, ', alpha=.5) +', sep='', file=f) - else: - y = 'log(y)' if self.log_y else 'y' - print(' geom_point(data=sized_dot_data, aes(x=x, y=', y, ', size=count, color=x)) +', sep='', file=f) - legend_position = 'c(' + str(self.point_legend_x) + ',' + str(self.point_legend_y) + ')' - if 'violin' in self.plot_types: - print(' geom_violin(data=samples, aes(x=Mutations, y=jitter(', ymic, ',', self.violin_y_jitter, '), color=Mutations)', sep='', end='', file=f) - if self.violin_scale_width: - print(', scale="width"', end='', file=f) + @classmethod + def _ordered_columns(cls, mutations, top_plot_data): + # FIXME + return sorted(list(mutations)) - print(', alpha=.10, show.legend = FALSE) +', file=f) - if 'boxplot' in self.plot_types: - print(' geom_boxplot(data=samples, aes(x=Mutations, y=', ymic, ', color=Mutations), alpha=.10, show.legend = FALSE) +''', file=f) + @classmethod + def _bottom_scatter_data(cls, bottom_plot_rows, columns, colours): + x_coords = [] + y_coords = [] + colour_list = [] - if self.no_combinations: - axis_text_x = 'element_text(size=24, angle=45, hjust=1)' - else: - axis_text_x = 'element_blank()' + for i, row in enumerate(bottom_plot_rows): + for j, col in enumerate(columns): + if row in col: + x_coords.append(j + 1) + y_coords.append(len(bottom_plot_rows) - i) + colour_list.append(colours[j]) - for x in self.hlines: - if self.log_y: - print(' geom_hline(yintercept=log(', x, '), lty=2) +', sep='', file=f) - else: - print(' geom_hline(yintercept=', x, ', lty=2) +', sep='', file=f) - - - print(r''' ylab(expression(paste("''' + ymic + r''' ", mu, "g/mL"))) + - scale_colour_manual(values = cols, guide=FALSE) + - scale_size(range=c(''' + ','.join([str(x) for x in self.point_range]) + r'''), breaks = c(''' + ','.join([str(x) for x in self.point_break]) + r''')) + - ggtitle("''' + self.main_title + r'''") + - scale_y_continuous(breaks=final.mics, labels=range.mics) + - theme_bw() + - theme(panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_blank(), - axis.line = element_line(color="black"), - axis.title.x = element_blank(), - axis.title.y = element_text(size=22), - axis.text.x = ''' + axis_text_x + r''', - axis.text.y = element_text(size=24), - axis.title = element_text(size=20), - plot.title = element_text(lineheight=.6, size = 24, hjust=.5, face="bold"), - legend.title = element_text(size=30), - legend.text = element_text(size=20), - legend.position=''' + legend_position + ')', file=f) - - if self.no_combinations: - print('top_plot', file=f) - print('ggsave("', self.outprefix, '.pdf", useDingbats=FALSE, height=', self.plot_height, ', width=', self.plot_width, ')', sep='', file=f) + return x_coords, y_coords, colour_list + + + @classmethod + def _right_plot_data(cls, scatter_count_sizes, number_of_circles, x_pos): + y_max = max(scatter_count_sizes) + if y_max > 100: + y_max = int(math.ceil(y_max / 100.0)) * 100 + sizes = [5, 50] + [x for x in range(100, y_max, 100)] else: - print(r'''library(gtable) -library(grid) -g1 <- ggplotGrob(top_plot) -g2 <- ggplotGrob(dotplot) -g <- rbind(g1, g2, size="first") -g$widths <- unit.pmax(g1$widths, g2$widths) -panels <- g$layout$t[grepl("panel", g$layout$name)] - -if(getRversion() < "3.3.0"){ - g$heights <- grid:::unit.list(g$heights) - g$heights[panels][1] <- list(unit(''', self.panel_heights[0], r''', "null")) - g$heights[panels][2] <- list(unit(''', self.panel_heights[1], r''', "null")) -} else { - g$heights[panels][1] = unit(''', self.panel_heights[0], r''',"null") - g$heights[panels][2] = unit(''', self.panel_heights[1], r''',"null") -} - -pdf("''', self.outprefix, '.pdf", useDingbats=FALSE, height=', self.plot_height, ', width=', self.plot_width, r''') -grid.newpage() -grid.draw(g) -dev.off() -''', sep='', file=f) - - f.close() - common.syscall('R CMD BATCH ' + r_script) + y_max = int(math.ceil(y_max / 10.0)) * 10 + sizes = [5] + [x for x in range(10, y_max, 10)] + x_coords = [x_pos] * len(sizes) + y_coords = [x + 1 for x in range(len(sizes))] + y_coords.reverse() + return x_coords, y_coords, sizes + + + def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): + bottom_plot_rows = MicPlotter._ordered_bottom_plot_rows(all_mutations) + columns = MicPlotter._ordered_columns(mut_combinations, top_plot_data) + colours = MicPlotter._get_colours(len(columns), self.number_of_colours, self.palette) + bottom_scatter_x, bottom_scatter_y, bottom_colours = MicPlotter._bottom_scatter_data(bottom_plot_rows, columns, colours) + columns = ['.'.join(x) for x in columns] + assert len(colours) == len(columns) + max_x = len(colours) + 1 + + scatter_count_x, scatter_count_y, scatter_count_sizes, scatter_count_colours = MicPlotter._top_plot_scatter_counts(columns, top_plot_data, colours, self.log_y) + + violin_data, violin_positions = MicPlotter._top_plot_violin_data(columns, top_plot_data, self.log_y) + + # -------------------- SET UP GRID & PLOTS ----------------- + fig=plt.figure(figsize=(self.plot_height, self.plot_width)) + gs = gridspec.GridSpec(2, 2, height_ratios=self.panel_heights, width_ratios=[5,1]) + plots=[] + plots.append(plt.subplot(gs[0])) + plots.append(plt.subplot(gs[1])) + plots.append(plt.subplot(gs[2])) + + # ------------------------- TOP PLOT ----------------------- + for h in self.hlines: + if self.log_y > 0: + h = math.log(h, self.log_y) + plots[0].hlines(h, 0, max_x, linestyle='--', linewidth=1, color='black') + + + violins = plots[0].violinplot(violin_data, violin_positions, widths=self.violin_width, showmeans=False, showextrema=False, showmedians=False) + for x, pc in enumerate(violins['bodies']): + pc.set_facecolor(colours[x]) + pc.set_edgecolor(colours[x]) + + plots[0].scatter(scatter_count_x, scatter_count_y, s=scatter_count_sizes, c=scatter_count_colours, linewidth=0) + plots[0].axis([0,max(bottom_scatter_x) + 1,min(scatter_count_y), max(scatter_count_y)]) + + y_tick_positions, y_tick_labels = MicPlotter._top_plot_y_ticks(mic_data, self.antibiotic, self.log_y) + plots[0].yaxis.set_ticks(y_tick_positions) + plots[0].set_yticklabels(y_tick_labels) + ylabel = r'$\log_' + str(int(self.log_y)) + '$(MIC) $\mu$g/mL' if self.log_y > 0 else r'MIC $\mu$g/mL' + plots[0].set_ylabel(ylabel) + plots[0].set_xticklabels([]) + plots[0].set_title(self.main_title, fontsize=18) + + # ------------------------- BOTTOM PLOT ----------------------- + plots[2].axis([0,max(bottom_scatter_x) + 1,0,max(bottom_scatter_y) + 1]) + plots[2].scatter(bottom_scatter_x, bottom_scatter_y, marker='o', s=self.dot_size, color=bottom_colours) + plots[2].spines["top"].set_visible(False) + plots[2].spines["right"].set_visible(False) + plots[2].spines["bottom"].set_visible(False) + plots[2].spines["left"].set_visible(False) + plots[2].yaxis.set_tick_params(length=0) + plots[2].xaxis.set_ticks([]) + plots[2].set_xticklabels([]) + plots[2].yaxis.set_ticks([(i+1) for i in range(len(bottom_plot_rows))]) + plots[2].set_yticklabels(bottom_plot_rows[::-1]) + + # ------------------------- RIGHT PLOT ------------------------- + right_x_coord = 0.75 + right_x, right_y, right_sizes = MicPlotter._right_plot_data(scatter_count_sizes, 5, right_x_coord) + plots[1].scatter(right_x, right_y, s=right_sizes, c="black") + plots[1].axis('off') + plots[1].axis([0,4,-2*len(right_y),len(right_y)+1]) + for i, y in enumerate(right_y): + plots[1].annotate(right_sizes[i], [right_x_coord + 0.75, y-0.2]) + plots[1].annotate("Counts", [right_x_coord - 0.1, len(right_y) + 0.5]) + + plt.tight_layout() + plt.savefig(self.outprefix + '.pdf') def run(self): mic_data = MicPlotter._load_mic_file(self.mic_file) summary_data = MicPlotter._load_summary_file(self.summary_file) boxplot_tsv = self.outprefix + '.boxplot.tsv' - all_mutations, combinations = MicPlotter._to_boxplot_tsv(summary_data, mic_data, self.antibiotic, boxplot_tsv, self.use_hets, no_combinations=self.no_combinations, interrupted=self.interrupted) - dots_tsv = self.outprefix + '.dots.tsv' - MicPlotter._to_dots_tsv(all_mutations, combinations, dots_tsv) - self._make_plot(boxplot_tsv, dots_tsv) - + top_plot_data, all_mutations, combinations = MicPlotter._get_top_plot_data(summary_data, mic_data, self.antibiotic, self.use_hets, no_combinations=self.no_combinations, interrupted=self.interrupted, outfile=boxplot_tsv) + self._make_plot(mic_data, top_plot_data, all_mutations, combinations) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index a72f023a..5136490a 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -11,18 +11,15 @@ def run(options): main_title=options.main_title, plot_height=options.plot_height, plot_width=options.plot_width, - log_y=not options.no_log_y, + log_y=options.log_y, plot_types=options.plot_types, jitter_width=options.jitter_width, jitter_height=options.jitter_height, no_combinations=options.no_combinations, - mic_values=options.mic_values, hlines=options.hlines, point_size=options.point_size, point_range=options.point_range, point_break=options.point_break, - point_legend_x=options.point_legend_x, - point_legend_y=options.point_legend_y, dot_size=options.dot_size, dot_outline=options.dot_outline, dot_y_text_size=options.dot_y_text_size, @@ -30,8 +27,7 @@ def run(options): palette=options.palette, number_of_colours=options.number_of_colours, interrupted=options.interrupted, - violin_y_jitter=options.violin_y_jitter, - violin_scale_width=options.violin_scale_width + violin_width=options.violin_width ) plotter.run() diff --git a/ariba/tests/data/mic_plotter_to_dots.tsv b/ariba/tests/data/mic_plotter_to_dots.tsv deleted file mode 100644 index a188abfb..00000000 --- a/ariba/tests/data/mic_plotter_to_dots.tsv +++ /dev/null @@ -1,4 +0,0 @@ -Mutation m1 m1.m3 m2.m3 -m1 1 1 0 -m2 0 0 1 -m3 0 1 1 diff --git a/ariba/tests/data/mic_plotter_to_dots_without_mutation.tsv b/ariba/tests/data/mic_plotter_to_dots_without_mutation.tsv deleted file mode 100644 index b7b79f00..00000000 --- a/ariba/tests/data/mic_plotter_to_dots_without_mutation.tsv +++ /dev/null @@ -1,6 +0,0 @@ -Mutation m1 m1.m3 m1.z1 m2.m3 without_mutation -m1 1 1 1 0 0 -m2 0 0 0 1 0 -m3 0 1 0 1 0 -z1 0 0 1 0 0 -without_mutation 0 0 0 0 1 diff --git a/ariba/tests/mic_plotter_test.py b/ariba/tests/mic_plotter_test.py index 3d6a2b33..9b4fc9c3 100644 --- a/ariba/tests/mic_plotter_test.py +++ b/ariba/tests/mic_plotter_test.py @@ -62,8 +62,27 @@ def test_load_summary_file(self): self.assertEqual(got, expected) - def test_to_boxplot_tsv(self): - '''Test _to_boxplot_tsv''' + def test_get_colours(self): + '''test _get_colours''' + col1 = (0.0, 0.0, 0.5, 1.0) + col2 = (0.0, 0.0, 0.517825311942959, 1.0) + + tests = [ + (1, 1, 'jet', ["black"]), + (2, 1, 'jet', ["black", "black"]), + (3, 1, 'jet', ["black", "black", "black"]), + (2, 2, 'jet', [col1, col2]), + (3, 2, 'jet', [col1, col2, col1]), + (4, 2, 'jet', [col1, col2, col1, col2]), + (3, 0, 'jet', [(0.0, 0.0, 0.5, 1.0), (0.49019607843137247, 1.0, 0.47754585705249841, 1.0), (0.5, 0.0, 0.0, 1.0)]) + ] + + for total_length, number_of_colours, colormap, expected in tests: + self.assertEqual(expected, mic_plotter.MicPlotter._get_colours(total_length, number_of_colours, colormap)) + + + def test_get_top_plot_data(self): + '''Test _get_top_plot_data''' mic_data = { 'name1': {'antibio1': 0.25, 'antibio2': 0.004}, 'name2': {'antibio1': 0.125, 'antibio2': 'NA'}, @@ -91,6 +110,32 @@ def test_to_boxplot_tsv(self): }, } + expected_top_plot_data = { + 'antibio1': { + 'yes': {'cluster1.group1.A42T.cluster4.group4.A44T': [0.125], 'cluster2.group2.A43T.cluster3.interrupted': [0.25]}, + 'no': {'cluster1.group1.A42T': [0.125], 'cluster2.group2.A43T.cluster3.interrupted': [0.25]}, + 'exclude': {'cluster2.group2.A43T.cluster3.interrupted': [0.25]}, + }, + 'antibio2': { + 'yes': {'without_mutation': [0.002], 'cluster2.group2.A43T.cluster3.interrupted': [0.004]}, + 'no': {'without_mutation': [0.002], 'cluster2.group2.A43T.cluster3.interrupted': [0.004]}, + 'exclude': {'without_mutation': [0.002], 'cluster2.group2.A43T.cluster3.interrupted': [0.004]}, + } + } + + expected_top_plot_data_no_combs = { + 'antibio1': { + 'yes': {'cluster2.group2.A43T': [0.25], 'cluster4.group4.A44T': [0.125], 'cluster3.interrupted': [0.25], 'cluster1.group1.A42T': [0.125]}, + 'no': {'cluster2.group2.A43T': [0.25], 'cluster3.interrupted': [0.25], 'cluster1.group1.A42T': [0.125]}, + 'exclude': {'cluster2.group2.A43T': [0.25], 'cluster3.interrupted': [0.25]}, + }, + 'antibio2': { + 'yes': {'cluster2.group2.A43T': [0.004], 'without_mutation': [0.002], 'cluster3.interrupted': [0.004]}, + 'no': {'cluster2.group2.A43T': [0.004], 'without_mutation': [0.002], 'cluster3.interrupted': [0.004]}, + 'exclude': {'cluster2.group2.A43T': [0.004], 'without_mutation': [0.002], 'cluster3.interrupted': [0.004]}, + } + } + expected_mutations = { 'antibio1': { 'yes': {'cluster1.group1.A42T', 'cluster2.group2.A43T', 'cluster3.interrupted', 'cluster4.group4.A44T'}, @@ -136,41 +181,57 @@ def test_to_boxplot_tsv(self): for antibio in ['antibio1', 'antibio2']: for use_het in ['no', 'yes', 'exclude']: - got_muts, got_combs = mic_plotter.MicPlotter._to_boxplot_tsv(summary_data, mic_data, antibio, tmp_tsv, use_het, interrupted=True) + got_data, got_muts, got_combs = mic_plotter.MicPlotter._get_top_plot_data(summary_data, mic_data, antibio, use_het, interrupted=True, outfile=tmp_tsv) expected_tsv = os.path.join(data_dir, 'mic_plotter_to_boxplot_tsv.' + antibio + '.' + use_het + '.tsv') self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False)) self.assertEqual(got_muts, expected_mutations[antibio][use_het]) self.assertEqual(got_combs, expected_combs[antibio][use_het]) + self.assertEqual(got_data, expected_top_plot_data[antibio][use_het]) os.unlink(tmp_tsv) - got_muts, got_combs = mic_plotter.MicPlotter._to_boxplot_tsv(summary_data, mic_data, antibio, tmp_tsv, use_het, no_combinations=True, interrupted=True) + got_data, got_muts, got_combs = mic_plotter.MicPlotter._get_top_plot_data(summary_data, mic_data, antibio, use_het, no_combinations=True, interrupted=True, outfile=tmp_tsv) expected_tsv = os.path.join(data_dir, 'mic_plotter_to_boxplot_tsv.' + antibio + '.' + use_het + '.no_combinations.tsv') self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False)) self.assertEqual(got_muts, expected_mutations[antibio][use_het]) self.assertEqual(got_combs, expected_no_combs[antibio][use_het]) + self.assertEqual(got_data, expected_top_plot_data_no_combs[antibio][use_het]) os.unlink(tmp_tsv) - def test_to_dots_tsv(self): - '''test _to_dots_tsv''' - all_mutations = {'m1', 'm2', 'm3'} - combinations = { - ('m1',), - ('m1', 'm3'), - ('m2', 'm3'), - } + def test_ordered_bottom_plot_rows(self): + '''test _ordered_bottom_plot_rows''' + to_order = {'clust1.grp1.42T', 'clust1.grp1.47G', 'clust0.10T', 'abcdefg'} + got = mic_plotter.MicPlotter._ordered_bottom_plot_rows(to_order) + expected = ['abcdefg', 'clust0.10T', 'clust1.grp1.42T', 'clust1.grp1.47G'] + self.assertEqual(expected, got) + + + def test_ordered_columns(self): + '''test _ordered_colunns''' + top_plot_data = {} + # FIXME + + + def test_bottom_scatter_data(self): + '''test _bottom_scatter_data''' + #FIXME + pass + + + def test_top_plot_y_ticks(self): + '''test _top_plot_y_ticks''' + # FIXME + pass + + + def test_top_plot_scatter_counts(self): + '''test _top_plot_scatter_counts''' + top_plot_data = {} + # FIXME + - tmp_tsv = 'tmp.test.mic_plotter_to_dots.tsv' - expected_tsv = os.path.join(data_dir, 'mic_plotter_to_dots.tsv') - mic_plotter.MicPlotter._to_dots_tsv(all_mutations, combinations, tmp_tsv) - self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False)) - os.unlink(tmp_tsv) - - all_mutations.update({'without_mutation', 'z1'}) - combinations.add(('without_mutation',)) - combinations.add(('m1', 'z1')) - expected_tsv = os.path.join(data_dir, 'mic_plotter_to_dots_without_mutation.tsv') - mic_plotter.MicPlotter._to_dots_tsv(all_mutations, combinations, tmp_tsv) - self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False)) - os.unlink(tmp_tsv) + def test_top_plot_violin_data(self): + '''test _top_plot_violin_data''' + top_plot_data = {} + # FIXME diff --git a/scripts/ariba b/scripts/ariba index 7aa57ca9..137aa6eb 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -70,8 +70,8 @@ subparser_micplot.add_argument('outprefix', help='Prefix of output files') micplot_general_group = subparser_micplot.add_argument_group('General options') micplot_general_group.add_argument('--main_title', help='Main title of plot. Default is to use the antibiotic name', metavar='"title in quotes"') -micplot_general_group.add_argument('--plot_height', help='Height of plot (used in plot_height=X when running ggsave [%(default)s]', default=15, type=float, metavar='FLOAT') -micplot_general_group.add_argument('--plot_width', help='Height of plot (used in plot_width=X when running ggsave [%(default)s]', default=15, type=float, metavar='FLOAT') +micplot_general_group.add_argument('--plot_height', help='Height of plot (used in plot_height=X when running ggsave [%(default)s]', default=7, type=float, metavar='FLOAT') +micplot_general_group.add_argument('--plot_width', help='Height of plot (used in plot_width=X when running ggsave [%(default)s]', default=7, type=float, metavar='FLOAT') micplot_general_group.add_argument('--use_hets', choices=['yes', 'no', 'exclude'], help='How to deal with HET snps. Choose from yes,no,exclude. yes: count a het SNP as present. no: do not count a het SNP as present. exclude: completely remove any sample with any het SNP [%(default)s]', default='yes') micplot_general_group.add_argument('--interrupted', action='store_true', help='Include interrupted genes (as in the assembled column of the ariba summary files)') micplot_general_group.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') @@ -86,18 +86,14 @@ micplot_upper_plot_group.add_argument('--plot_types', help='Types of plots to ma micplot_upper_plot_group.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. Default is to draw no lines.', metavar='float1,float2,...', default='') micplot_upper_plot_group.add_argument('--jitter_width', help='Jitter width option when plotting points [%(default)s]', default=0.1, type=float, metavar='FLOAT') micplot_upper_plot_group.add_argument('--jitter_height', help='Jitter height option when plotting points [%(default)s]', default=0.02, type=float, metavar='FLOAT') -micplot_upper_plot_group.add_argument('--no_log_y', action='store_true', help='Do not log scale the y axis') -micplot_upper_plot_group.add_argument('--mic_values', help='Comma-separated list of MIC values to be shown on y axis ticks [%(default)s]', default='0,0.001,0.0025,0.0075,0.015,0.03,0.06,0.125,0.25,0.5,1,2,4,8,16,32,64,128,256,512,1024', metavar='float1,float2,...') +micplot_upper_plot_group.add_argument('--log_y', type=float, help='Base of log applied to y values. Set to zero to not log [%(default)s]', default=2, metavar='FLOAT') micplot_upper_plot_group.add_argument('--point_size', type=float, help='Size of points when --plot_types includes point. If zero, will group points and size them proportional to the group size [%(default)s]', default=4, metavar='FLOAT') micplot_upper_plot_group.add_argument('--point_range', help='Min and max size of plotted points when --point_size is 0 [%(default)s]', default='2,15', metavar='min,max') micplot_upper_plot_group.add_argument('--point_break', help='Comma-separated list of breakpoints in point sizes when --point_size is 0 [%(default)s]', default='10,50,100,200,300', metavar='integer1.integer2,integer3,...') -micplot_upper_plot_group.add_argument('--point_legend_x', type=float, help='x coord of legend when --point_size is 0 [%(default)s]', default=-0.15, metavar='FLOAT') -micplot_upper_plot_group.add_argument('--point_legend_y', type=float, help='y coord of legend when --point_size is 0 [%(default)s]', default=0.9, metavar='FLOAT') -micplot_upper_plot_group.add_argument('--violin_y_jitter', type=float, help='y jitter in violin plot. Set to a small number>0, eg 0.001, if violins are not visible. Also see --violin_scale_width [%(default)s]', default=0, metavar='FLOAT') -micplot_upper_plot_group.add_argument('--violin_scale_width', action='store_true', help='Use this if violins are too narrow. Also see --violin_y_jitter') +micplot_upper_plot_group.add_argument('--violin_width', type=float, help='Width of violins [%(default)s]', default=0.75) micplot_lower_plot_group = subparser_micplot.add_argument_group('Lower plot options') -micplot_lower_plot_group.add_argument('--dot_size', type=float, help='Size of dots in lower part of plot [%(default)s]', default=8, metavar='FLOAT') +micplot_lower_plot_group.add_argument('--dot_size', type=float, help='Size of dots in lower part of plot [%(default)s]', default=100, metavar='FLOAT') micplot_lower_plot_group.add_argument('--dot_outline', action='store_true', help='Black outline around all dots (whether coloured or not) in lower part of plots') micplot_lower_plot_group.add_argument('--dot_y_text_size', type=int, help='Text size of labels [%(default)s]', default=18, metavar='INT') From a95def660ef34c1f06bd3cf889da32685aa3f9c5 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 1 Mar 2017 15:41:45 +0000 Subject: [PATCH 55/88] New method _nucmer_hits_to_ref_and_qry_coords --- ariba/assembly_compare.py | 33 ++++++++++++++++++++++++++++ ariba/tests/assembly_compare_test.py | 31 ++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/ariba/assembly_compare.py b/ariba/assembly_compare.py index a0eb6a00..bd8215df 100644 --- a/ariba/assembly_compare.py +++ b/ariba/assembly_compare.py @@ -135,6 +135,39 @@ def nucmer_hits_to_ref_coords(cls, nucmer_hits, contig=None): return coords + @classmethod + def nucmer_hits_to_ref_and_qry_coords(cls, nucmer_hits, contig=None): + '''Same as nucmer_hits_to_ref_coords, except removes containing hits first, + and returns ref and qry coords lists''' + if contig is None: + ctg_coords = {key: [] for key in nucmer_hits.keys()} + else: + ctg_coords = {contig: []} + + ref_coords = {} + + for key in ctg_coords: + hits = copy.copy(nucmer_hits[key]) + hits.sort(key=lambda x: len(x.ref_coords())) + + if len(hits) > 1: + i = 0 + while i < len(hits) - 1: + c1 = hits[i].ref_coords() + c2 = hits[i+1].ref_coords() + if c2.contains(c1): + hits.pop(i) + else: + i += 1 + + ref_coords[key] = [hit.ref_coords() for hit in hits] + ctg_coords[key] = [hit.qry_coords() for hit in hits] + pyfastaq.intervals.merge_overlapping_in_list(ref_coords[key]) + pyfastaq.intervals.merge_overlapping_in_list(ctg_coords[key]) + + return ctg_coords, ref_coords + + @staticmethod def ref_cov_per_contig(nucmer_hits): '''Input is hits made by self._parse_nucmer_coords_file. diff --git a/ariba/tests/assembly_compare_test.py b/ariba/tests/assembly_compare_test.py index 7d4648e0..03da24bc 100644 --- a/ariba/tests/assembly_compare_test.py +++ b/ariba/tests/assembly_compare_test.py @@ -107,6 +107,37 @@ def test_nucmer_hits_to_ref_coords(self): self.assertEqual(expected, got) + def test_nucmer_hits_to_ref_and_qry_coords(self): + '''test _nucmer_hits_to_ref_and_qry_coords''' + hits = [ + ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'ref', 'contig1'], + ['31', '52', '1', '22', '22', '22', '100.00', '1000', '1000', '1', '1', 'ref', 'contig1'], + ['11', '32', '1000', '1022', '22', '22', '100.00', '1000', '1000', '1', '1', 'ref', 'contig1'], + ['100', '142', '200', '242', '42', '42', '99.42', '1000', '1000', '1', '1', 'ref', 'contig1'], + ['100', '110', '200', '210', '11', '11', '99.42', '1000', '1000', '1', '1', 'ref', 'contig2'], + ] + nucmer_hits = { + 'contig1': [ + pymummer.alignment.Alignment('\t'.join(hits[0])), + pymummer.alignment.Alignment('\t'.join(hits[1])), + pymummer.alignment.Alignment('\t'.join(hits[2])), + ], + 'contig2': [ + pymummer.alignment.Alignment('\t'.join(hits[3])), + ] + } + + got_ctg, got_ref = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_and_qry_coords(nucmer_hits) + expected_ctg = { + 'contig1': [pyfastaq.intervals.Interval(0,51), pyfastaq.intervals.Interval(99, 141)], + 'contig2': [pyfastaq.intervals.Interval(99, 109)] + } + expected_ref = { + 'contig1': [pyfastaq.intervals.Interval(0,51), pyfastaq.intervals.Interval(99, 141)], + 'contig2': [pyfastaq.intervals.Interval(99, 109)] + } + + def test_ref_cov_per_contig(self): '''test ref_cov_per_contig''' hits = [ From 9cbaf334f23f540860c61d4979433f5fd20214d4 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 1 Mar 2017 15:43:46 +0000 Subject: [PATCH 56/88] Bug fix when one contig hits same ref position in two different places in contig --- ariba/assembly_variants.py | 23 +++++++++++++++++++---- ariba/cluster.py | 4 ++-- ariba/tests/assembly_variants_test.py | 17 +++++++++++++---- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/ariba/assembly_variants.py b/ariba/assembly_variants.py index 733334cc..fa4e2e21 100644 --- a/ariba/assembly_variants.py +++ b/ariba/assembly_variants.py @@ -260,7 +260,7 @@ def _get_remaining_known_ref_variants(known_ref_variants, used_ref_variants, nuc return variants - def get_variants(self, ref_sequence_name, nucmer_coords): + def get_variants(self, ref_sequence_name, allowed_ctg_coords, allowed_ref_coords, nucmer_matches=None): '''Nucmr coords = dict. Key=contig name. Value = list of intervals of ref coords that match the contig. Made by assembly_compare.AssemblyCompare.nucmer_hits_to_ref_coords Returns dictionary. Key=contig name. Value = list of variants. Each variant @@ -287,12 +287,27 @@ def get_variants(self, ref_sequence_name, nucmer_coords): known_non_wild_variants_in_ref = self.refdata.all_non_wild_type_variants(ref_sequence_name) - for contig in nucmer_coords: + for contig in allowed_ctg_coords: + if contig not in allowed_ref_coords: + continue + used_known_variants = set() variants[contig] = [] if contig in mummer_variants: for mummer_variant_list in mummer_variants[contig]: + ref_start = min([x.ref_start for x in mummer_variant_list]) + ref_end = max([x.ref_end for x in mummer_variant_list]) + ctg_start = min([x.qry_start for x in mummer_variant_list]) + ctg_end = min([x.qry_end for x in mummer_variant_list]) + ref_interval = intervals.Interval(ref_start, ref_end) + ctg_interval = intervals.Interval(ctg_start, ctg_end) + ref_ok = True in {x.intersects(ref_interval) for x in allowed_ref_coords[contig]} + qry_ok = True in {x.intersects(ctg_interval) for x in allowed_ctg_coords[contig]} + + if not (ref_ok and qry_ok): + continue + if seq_type == 'p': new_variant, used_variants = self._get_one_variant_for_one_contig_coding(ref_sequence, refdata_var_dict, mummer_variant_list) else: @@ -306,10 +321,10 @@ def get_variants(self, ref_sequence_name, nucmer_coords): # for this contig, need to know all the ref sequence and coords it maps to. # Then report just the unused known variants, as the contig also has these variants if seq_type == 'p': - new_variants = self._get_remaining_known_ref_variants(known_non_wild_variants_in_ref['p'], used_known_variants, nucmer_coords[contig]) + new_variants = self._get_remaining_known_ref_variants(known_non_wild_variants_in_ref['p'], used_known_variants, allowed_ref_coords[contig]) else: - new_variants = self._get_remaining_known_ref_variants(known_non_wild_variants_in_ref['n'], used_known_variants, nucmer_coords[contig]) + new_variants = self._get_remaining_known_ref_variants(known_non_wild_variants_in_ref['n'], used_known_variants, allowed_ref_coords[contig]) if is_variant_only: new_variants = [x for x in new_variants if len(x[5]) > 0] diff --git a/ariba/cluster.py b/ariba/cluster.py index 4d41d60b..6285513b 100644 --- a/ariba/cluster.py +++ b/ariba/cluster.py @@ -382,9 +382,9 @@ def _run(self): self.assembly_compare.run() self.status_flag = self.assembly_compare.update_flag(self.status_flag) - nucmer_hits_to_ref = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_coords(self.assembly_compare.nucmer_hits) + allowed_ctg_pos, allowed_ref_pos = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_and_qry_coords(self.assembly_compare.nucmer_hits) assembly_variants_obj = assembly_variants.AssemblyVariants(self.refdata, self.assembly_compare.nucmer_snps_file) - self.assembly_variants = assembly_variants_obj.get_variants(self.ref_sequence.id, nucmer_hits_to_ref) + self.assembly_variants = assembly_variants_obj.get_variants(self.ref_sequence.id, allowed_ctg_pos, allowed_ref_pos) for var_list in self.assembly_variants.values(): for var in var_list: diff --git a/ariba/tests/assembly_variants_test.py b/ariba/tests/assembly_variants_test.py index 898106b3..a149fd51 100644 --- a/ariba/tests/assembly_variants_test.py +++ b/ariba/tests/assembly_variants_test.py @@ -309,7 +309,12 @@ def test_get_variants_presence_absence(self): v2 = pymummer.variant.Variant(pymummer.snp.Snp('14\tC\tA\t14\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig1')) v3 = pymummer.variant.Variant(pymummer.snp.Snp('15\tG\tC\t15\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig1')) - nucmer_coords = { + ref_nucmer_coords = { + 'contig1': [pyfastaq.intervals.Interval(0, 30)], + 'contig2': [pyfastaq.intervals.Interval(10, 41)], + } + + ctg_nucmer_coords = { 'contig1': [pyfastaq.intervals.Interval(0, 30)], 'contig2': [pyfastaq.intervals.Interval(10, 41)], } @@ -328,7 +333,7 @@ def test_get_variants_presence_absence(self): } a_variants = assembly_variants.AssemblyVariants(refdata, nucmer_snp_file) - got = a_variants.get_variants('presence_absence', nucmer_coords) + got = a_variants.get_variants('presence_absence', ctg_nucmer_coords, ref_nucmer_coords) self.assertEqual(expected, got) @@ -352,11 +357,15 @@ def test_get_variants_variants_only(self): v2 = pymummer.variant.Variant(pymummer.snp.Snp('14\tC\tA\t14\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1')) v3 = pymummer.variant.Variant(pymummer.snp.Snp('15\tG\tC\t15\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1')) - nucmer_coords = { + ctg_nucmer_coords = { 'contig1': [pyfastaq.intervals.Interval(0, 41)], 'contig2': [pyfastaq.intervals.Interval(10, 41)], } + ref_nucmer_coords = { + 'contig1': [pyfastaq.intervals.Interval(0, 41)], + 'contig2': [pyfastaq.intervals.Interval(10, 41)], + } expected = { 'contig1': [ (4, 'p', 'A5D', 'NONSYN', [v2, v3], set(), set()), @@ -367,6 +376,6 @@ def test_get_variants_variants_only(self): } a_variants = assembly_variants.AssemblyVariants(refdata, nucmer_snp_file) - got = a_variants.get_variants('variants_only', nucmer_coords) + got = a_variants.get_variants('variants_only', ctg_nucmer_coords, ref_nucmer_coords) self.assertEqual(expected, got) From db5ef80b5cb6338a6bbc60b390864d65a5e9ad19 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 08:14:08 +0000 Subject: [PATCH 57/88] add hidden --xkcd option --- ariba/mic_plotter.py | 5 ++++- ariba/tasks/micplot.py | 3 ++- scripts/ariba | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 34612300..0abb6ae1 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -42,7 +42,8 @@ def __init__(self, palette='Accent', number_of_colours=0, interrupted=False, - violin_width=0.75 + violin_width=0.75, + xkcd=False ): self.antibiotic = antibiotic self.mic_file = mic_file @@ -101,6 +102,8 @@ def __init__(self, self.number_of_colours = number_of_colours self.interrupted = interrupted self.violin_width = violin_width + if xkcd: + plt.xkcd() @classmethod diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index 5136490a..c38e78d5 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -27,7 +27,8 @@ def run(options): palette=options.palette, number_of_colours=options.number_of_colours, interrupted=options.interrupted, - violin_width=options.violin_width + violin_width=options.violin_width, + xkcd=options.xkcd ) plotter.run() diff --git a/scripts/ariba b/scripts/ariba index 137aa6eb..5a5ff2c8 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -76,6 +76,7 @@ micplot_general_group.add_argument('--use_hets', choices=['yes', 'no', 'exclude' micplot_general_group.add_argument('--interrupted', action='store_true', help='Include interrupted genes (as in the assembled column of the ariba summary files)') micplot_general_group.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') micplot_general_group.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom plots. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='5,1', metavar='height1,height2') +micplot_general_group.add_argument('--xkcd', action='store_true', help=argparse.SUPPRESS) micplot_colour_group = subparser_micplot.add_argument_group('Colour options') micplot_colour_group.add_argument('--palette', help='ColourBrewer palette to use [%(default)s]', default='Accent', metavar='palette_name') From b5a367bf0fae67b3ea8f72b092e1862a1f6df205 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 08:19:26 +0000 Subject: [PATCH 58/88] plot_width/height fix (were swapped) --- ariba/mic_plotter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 0abb6ae1..e101c66e 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -424,7 +424,7 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): violin_data, violin_positions = MicPlotter._top_plot_violin_data(columns, top_plot_data, self.log_y) # -------------------- SET UP GRID & PLOTS ----------------- - fig=plt.figure(figsize=(self.plot_height, self.plot_width)) + fig=plt.figure(figsize=(self.plot_width, self.plot_height)) gs = gridspec.GridSpec(2, 2, height_ratios=self.panel_heights, width_ratios=[5,1]) plots=[] plots.append(plt.subplot(gs[0])) From cbd4fd495ad6df458c7edaf45e63a74fd4b270d9 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 08:20:22 +0000 Subject: [PATCH 59/88] Fix plot_width/height usage --- scripts/ariba | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ariba b/scripts/ariba index 5a5ff2c8..f3df3cac 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -70,8 +70,8 @@ subparser_micplot.add_argument('outprefix', help='Prefix of output files') micplot_general_group = subparser_micplot.add_argument_group('General options') micplot_general_group.add_argument('--main_title', help='Main title of plot. Default is to use the antibiotic name', metavar='"title in quotes"') -micplot_general_group.add_argument('--plot_height', help='Height of plot (used in plot_height=X when running ggsave [%(default)s]', default=7, type=float, metavar='FLOAT') -micplot_general_group.add_argument('--plot_width', help='Height of plot (used in plot_width=X when running ggsave [%(default)s]', default=7, type=float, metavar='FLOAT') +micplot_general_group.add_argument('--plot_height', help='Height of plot in inches [%(default)s]', default=7, type=float, metavar='FLOAT') +micplot_general_group.add_argument('--plot_width', help='Width of plot in inches [%(default)s]', default=7, type=float, metavar='FLOAT') micplot_general_group.add_argument('--use_hets', choices=['yes', 'no', 'exclude'], help='How to deal with HET snps. Choose from yes,no,exclude. yes: count a het SNP as present. no: do not count a het SNP as present. exclude: completely remove any sample with any het SNP [%(default)s]', default='yes') micplot_general_group.add_argument('--interrupted', action='store_true', help='Include interrupted genes (as in the assembled column of the ariba summary files)') micplot_general_group.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') From 9bcb29d4a7c19656c5898ad441b624661ce565c0 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 09:31:33 +0000 Subject: [PATCH 60/88] Add --colour_skip option --- ariba/mic_plotter.py | 28 +++++++++++++++++++++++++--- ariba/tasks/micplot.py | 1 + scripts/ariba | 3 ++- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index e101c66e..0b7d6349 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -41,6 +41,7 @@ def __init__(self, panel_heights='5,1', palette='Accent', number_of_colours=0, + colour_skip=None, interrupted=False, violin_width=0.75, xkcd=False @@ -100,6 +101,15 @@ def __init__(self, self.palette = palette self.number_of_colours = number_of_colours + + if colour_skip is None: + self.colour_skip = None + else: + try: + self.colour_skip = [float(x) for x in colour_skip.split(',')] + except: + raise Error('Error in colour_skip option. Needs to be of the form a,b where 0 <= a < b <= 1. Got this:\n' + colour_skip) + self.interrupted = interrupted self.violin_width = violin_width if xkcd: @@ -180,12 +190,24 @@ def _load_summary_file(cls, infile): @classmethod - def _get_colours(cls, total_length, number_of_colours, colormap): + def _get_colours(cls, total_length, number_of_colours, colormap, skip=None): if number_of_colours == 1: return ["black"] * total_length elif number_of_colours == 0: cmap = cmx.get_cmap(colormap) - vals = [1.0 * x / (total_length - 1) for x in range(total_length)] + if skip is None: + vals = [1.0 * x / (total_length - 1) for x in range(total_length)] + else: + assert len(skip) == 2 and 0 <= skip[0] <= 1 and 0 <= skip[1] <= 1 + if skip[-1] == 1: + vals = [skip[0] * x / (total_length - 1) for x in range(total_length)] + elif skip[0] == 0: + vals = [skip[1] + (1 - skip[1]) * x / (total_length - 1) for x in range(total_length)] + else: + length = 1 - (skip[1] - skip[0]) + vals = [(length) * x / (total_length - 1) for x in range(total_length)] + vals = [x if x < skip[0] else x + (1-length) for x in vals] + return [cmap(x) for x in vals] else: cmap = cmx.get_cmap(colormap) @@ -413,7 +435,7 @@ def _right_plot_data(cls, scatter_count_sizes, number_of_circles, x_pos): def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): bottom_plot_rows = MicPlotter._ordered_bottom_plot_rows(all_mutations) columns = MicPlotter._ordered_columns(mut_combinations, top_plot_data) - colours = MicPlotter._get_colours(len(columns), self.number_of_colours, self.palette) + colours = MicPlotter._get_colours(len(columns), self.number_of_colours, self.palette, self.colour_skip) bottom_scatter_x, bottom_scatter_y, bottom_colours = MicPlotter._bottom_scatter_data(bottom_plot_rows, columns, colours) columns = ['.'.join(x) for x in columns] assert len(colours) == len(columns) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index c38e78d5..644ed744 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -26,6 +26,7 @@ def run(options): panel_heights=options.panel_heights, palette=options.palette, number_of_colours=options.number_of_colours, + colour_skip=options.colour_skip, interrupted=options.interrupted, violin_width=options.violin_width, xkcd=options.xkcd diff --git a/scripts/ariba b/scripts/ariba index f3df3cac..dc28b23a 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -80,7 +80,8 @@ micplot_general_group.add_argument('--xkcd', action='store_true', help=argparse. micplot_colour_group = subparser_micplot.add_argument_group('Colour options') micplot_colour_group.add_argument('--palette', help='ColourBrewer palette to use [%(default)s]', default='Accent', metavar='palette_name') -micplot_colour_group.add_argument('--number_of_colours', help='Number of colours in plot. 0:same number as columns in the plot. 1:all black. >1: take the first N colours from the colour palette specified by --palette and cycle them [%(default)s]', default=0, metavar='INT') +micplot_colour_group.add_argument('--number_of_colours', type=int, help='Number of colours in plot. 0:same number as columns in the plot. 1:all black. >1: take the first N colours from the colour palette specified by --palette and cycle them [%(default)s]', default=0, metavar='INT') +micplot_colour_group.add_argument('--colour_skip', help='If using a continious palette, --colour_skip a,b (where 0 <= a < b <= 1) will skip the range between a and b. Useful for excluding near-white colours') micplot_upper_plot_group = subparser_micplot.add_argument_group('Upper plot options') micplot_upper_plot_group.add_argument('--plot_types', help='Types of plots to make, separated by commas. Choose from boxplot,violin,point [%(default)s]', default='violin,point', metavar='type1,type2,...') From f44590327f4f662fd0e160c270bdadc56a629041 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 09:37:28 +0000 Subject: [PATCH 61/88] Rename palette -> colourmap --- ariba/mic_plotter.py | 6 +++--- ariba/tasks/micplot.py | 2 +- scripts/ariba | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 0b7d6349..cb9ffe73 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -39,7 +39,7 @@ def __init__(self, dot_outline=False, dot_y_text_size=18, panel_heights='5,1', - palette='Accent', + colourmap='Accent', number_of_colours=0, colour_skip=None, interrupted=False, @@ -99,7 +99,7 @@ def __init__(self, except: raise Error('Error in panel_heights option. Needs to be of the form integer1,integer2. Got this:\n' + panel_heights) - self.palette = palette + self.colourmap = colourmap self.number_of_colours = number_of_colours if colour_skip is None: @@ -435,7 +435,7 @@ def _right_plot_data(cls, scatter_count_sizes, number_of_circles, x_pos): def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): bottom_plot_rows = MicPlotter._ordered_bottom_plot_rows(all_mutations) columns = MicPlotter._ordered_columns(mut_combinations, top_plot_data) - colours = MicPlotter._get_colours(len(columns), self.number_of_colours, self.palette, self.colour_skip) + colours = MicPlotter._get_colours(len(columns), self.number_of_colours, self.colourmap, self.colour_skip) bottom_scatter_x, bottom_scatter_y, bottom_colours = MicPlotter._bottom_scatter_data(bottom_plot_rows, columns, colours) columns = ['.'.join(x) for x in columns] assert len(colours) == len(columns) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index 644ed744..01f45219 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -24,7 +24,7 @@ def run(options): dot_outline=options.dot_outline, dot_y_text_size=options.dot_y_text_size, panel_heights=options.panel_heights, - palette=options.palette, + colourmap=options.colourmap, number_of_colours=options.number_of_colours, colour_skip=options.colour_skip, interrupted=options.interrupted, diff --git a/scripts/ariba b/scripts/ariba index dc28b23a..13c0cb85 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -79,9 +79,9 @@ micplot_general_group.add_argument('--panel_heights', help='Two integers that de micplot_general_group.add_argument('--xkcd', action='store_true', help=argparse.SUPPRESS) micplot_colour_group = subparser_micplot.add_argument_group('Colour options') -micplot_colour_group.add_argument('--palette', help='ColourBrewer palette to use [%(default)s]', default='Accent', metavar='palette_name') -micplot_colour_group.add_argument('--number_of_colours', type=int, help='Number of colours in plot. 0:same number as columns in the plot. 1:all black. >1: take the first N colours from the colour palette specified by --palette and cycle them [%(default)s]', default=0, metavar='INT') -micplot_colour_group.add_argument('--colour_skip', help='If using a continious palette, --colour_skip a,b (where 0 <= a < b <= 1) will skip the range between a and b. Useful for excluding near-white colours') +micplot_colour_group.add_argument('--colourmap', help='Colours to use. See http://matplotlib.org/users/colormaps.html [%(default)s]', default='Accent', metavar='colourmap name') +micplot_colour_group.add_argument('--number_of_colours', type=int, help='Number of colours in plot. 0:same number as columns in the plot. 1:all black. >1: take the first N colours from the colourmap specified by --colourmap and cycle them [%(default)s]', default=0, metavar='INT') +micplot_colour_group.add_argument('--colour_skip', help='If using a continuous colourmap, --colour_skip a,b (where 0 <= a < b <= 1) will skip the range between a and b. Useful for excluding near-white colours') micplot_upper_plot_group = subparser_micplot.add_argument_group('Upper plot options') micplot_upper_plot_group.add_argument('--plot_types', help='Types of plots to make, separated by commas. Choose from boxplot,violin,point [%(default)s]', default='violin,point', metavar='type1,type2,...') From e324a81fe1687fa9d4a666186fdf29c26cb2581f Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 09:39:46 +0000 Subject: [PATCH 62/88] Tweak usage, add metavar --- scripts/ariba | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ariba b/scripts/ariba index 13c0cb85..9a6316f2 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -81,7 +81,7 @@ micplot_general_group.add_argument('--xkcd', action='store_true', help=argparse. micplot_colour_group = subparser_micplot.add_argument_group('Colour options') micplot_colour_group.add_argument('--colourmap', help='Colours to use. See http://matplotlib.org/users/colormaps.html [%(default)s]', default='Accent', metavar='colourmap name') micplot_colour_group.add_argument('--number_of_colours', type=int, help='Number of colours in plot. 0:same number as columns in the plot. 1:all black. >1: take the first N colours from the colourmap specified by --colourmap and cycle them [%(default)s]', default=0, metavar='INT') -micplot_colour_group.add_argument('--colour_skip', help='If using a continuous colourmap, --colour_skip a,b (where 0 <= a < b <= 1) will skip the range between a and b. Useful for excluding near-white colours') +micplot_colour_group.add_argument('--colour_skip', help='If using a continuous colourmap, --colour_skip a,b (where 0 <= a < b <= 1) will skip the range between a and b. Useful for excluding near-white colours', metavar='FLOAT1,FLOAT2') micplot_upper_plot_group = subparser_micplot.add_argument_group('Upper plot options') micplot_upper_plot_group.add_argument('--plot_types', help='Types of plots to make, separated by commas. Choose from boxplot,violin,point [%(default)s]', default='violin,point', metavar='type1,type2,...') From c003682c594279d4c4e6deb54b7c325b701b0a7b Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 09:45:50 +0000 Subject: [PATCH 63/88] Remove jitter_height option --- ariba/mic_plotter.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index cb9ffe73..5d1bd556 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -29,7 +29,6 @@ def __init__(self, log_y=2, plot_types="points,violin", jitter_width=0.1, - jitter_height=0.01, no_combinations=False, hlines='0.25,2', point_size=4, @@ -67,7 +66,6 @@ def __init__(self, raise Error('Error in plot_types option. Allowed types are: ' + str(allowed_plot_types) + '. Got: ' + str(self.plot_types)) self.jitter_width = jitter_width - self.jitter_height = jitter_height self.no_combinations = no_combinations try: From 07ff877ba095b423338a31464bf49b57989b8017 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 10:27:08 +0000 Subject: [PATCH 64/88] New method _filter_top_data --- ariba/mic_plotter.py | 25 ++++++++++++++++++++- ariba/tests/mic_plotter_test.py | 40 +++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 5d1bd556..5ecb80f1 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -43,7 +43,8 @@ def __init__(self, colour_skip=None, interrupted=False, violin_width=0.75, - xkcd=False + xkcd=False, + min_samples=1 ): self.antibiotic = antibiotic self.mic_file = mic_file @@ -112,6 +113,7 @@ def __init__(self, self.violin_width = violin_width if xkcd: plt.xkcd() + self.min_samples = min_samples @classmethod @@ -292,6 +294,27 @@ def _get_top_plot_data(cls, summary_data, mic_data, antibiotic, use_hets, no_com return top_plot_data, all_mutations, all_mutations_seen_combinations + @classmethod + def _filter_top_plot_data(cls, top_plot_data, all_mutations, seen_combinations, min_samples): + if min_samples == 1: + return top_plot_data, all_mutations, seen_combinations + + new_top_plot_data = {} + new_all_mutations = set() + new_seen_combinations = set() + + for mutation_tuple in seen_combinations: + mutation_string = '.'.join(mutation_tuple) + mics = top_plot_data[mutation_string] + + if len(mics) >= min_samples: + new_top_plot_data[mutation_string] = mics + new_seen_combinations.add(mutation_tuple) + new_all_mutations.update(mutation_tuple) + + return new_top_plot_data, new_all_mutations, new_seen_combinations + + @classmethod def _top_plot_y_ticks(cls, mic_data, antibiotic, log_y): mic_values = set() diff --git a/ariba/tests/mic_plotter_test.py b/ariba/tests/mic_plotter_test.py index 9b4fc9c3..1cef8a6d 100644 --- a/ariba/tests/mic_plotter_test.py +++ b/ariba/tests/mic_plotter_test.py @@ -1,4 +1,5 @@ import unittest +import copy import filecmp import os from ariba import mic_plotter @@ -198,6 +199,45 @@ def test_get_top_plot_data(self): os.unlink(tmp_tsv) + def test_filter_top_plot_data(self): + '''test _filter_top_plot_data''' + top_plot_data = { + 'var1': [1, 2, 3], + 'var2.var3': [1], + 'var1.var3': [1, 2], + } + + all_mutations = {'var1', 'var2', 'var3'} + seen_combinations = {('var1',), ('var1', 'var3'), ('var2', 'var3')} + + got_top, got_all, got_seen = mic_plotter.MicPlotter._filter_top_plot_data(top_plot_data, all_mutations, seen_combinations, 1) + self.assertEqual(got_top, top_plot_data) + self.assertEqual(got_all, all_mutations) + self.assertEqual(got_seen, seen_combinations) + + + got_top, got_all, got_seen = mic_plotter.MicPlotter._filter_top_plot_data(top_plot_data, all_mutations, seen_combinations, 2) + expected_top_plot_data = { + 'var1': [1, 2, 3], + 'var1.var3': [1, 2], + } + expected_all_mutations = {'var1', 'var3'} + expected_seen_combinations = {('var1',), ('var1', 'var3')} + self.assertEqual(got_top, expected_top_plot_data) + self.assertEqual(got_all, expected_all_mutations) + self.assertEqual(got_seen, expected_seen_combinations) + + got_top, got_all, got_seen = mic_plotter.MicPlotter._filter_top_plot_data(top_plot_data, all_mutations, seen_combinations, 3) + expected_top_plot_data = { + 'var1': [1, 2, 3], + } + expected_all_mutations = {'var1'} + expected_seen_combinations = {('var1',),} + self.assertEqual(got_top, expected_top_plot_data) + self.assertEqual(got_all, expected_all_mutations) + self.assertEqual(got_seen, expected_seen_combinations) + + def test_ordered_bottom_plot_rows(self): '''test _ordered_bottom_plot_rows''' to_order = {'clust1.grp1.42T', 'clust1.grp1.47G', 'clust0.10T', 'abcdefg'} From 93a8862190064ae6241feeafbecd5f1e95a80379 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 10:33:30 +0000 Subject: [PATCH 65/88] Add option --min_samples --- ariba/mic_plotter.py | 1 + ariba/tasks/micplot.py | 4 ++-- scripts/ariba | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 5ecb80f1..0f051bb6 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -529,4 +529,5 @@ def run(self): summary_data = MicPlotter._load_summary_file(self.summary_file) boxplot_tsv = self.outprefix + '.boxplot.tsv' top_plot_data, all_mutations, combinations = MicPlotter._get_top_plot_data(summary_data, mic_data, self.antibiotic, self.use_hets, no_combinations=self.no_combinations, interrupted=self.interrupted, outfile=boxplot_tsv) + top_plot_data, all_mutations, combinations = MicPlotter._filter_top_plot_data(top_plot_data, all_mutations, combinations, self.min_samples) self._make_plot(mic_data, top_plot_data, all_mutations, combinations) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index 01f45219..59abc15f 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -14,7 +14,6 @@ def run(options): log_y=options.log_y, plot_types=options.plot_types, jitter_width=options.jitter_width, - jitter_height=options.jitter_height, no_combinations=options.no_combinations, hlines=options.hlines, point_size=options.point_size, @@ -29,7 +28,8 @@ def run(options): colour_skip=options.colour_skip, interrupted=options.interrupted, violin_width=options.violin_width, - xkcd=options.xkcd + xkcd=options.xkcd, + min_samples=options.min_samples ) plotter.run() diff --git a/scripts/ariba b/scripts/ariba index 9a6316f2..cd63ba28 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -74,6 +74,7 @@ micplot_general_group.add_argument('--plot_height', help='Height of plot in inch micplot_general_group.add_argument('--plot_width', help='Width of plot in inches [%(default)s]', default=7, type=float, metavar='FLOAT') micplot_general_group.add_argument('--use_hets', choices=['yes', 'no', 'exclude'], help='How to deal with HET snps. Choose from yes,no,exclude. yes: count a het SNP as present. no: do not count a het SNP as present. exclude: completely remove any sample with any het SNP [%(default)s]', default='yes') micplot_general_group.add_argument('--interrupted', action='store_true', help='Include interrupted genes (as in the assembled column of the ariba summary files)') +micplot_general_group.add_argument('--min_samples', type=int, help='Minimum number of samples in each column required to include in plot [(%default)s]', metavar='INT', default=1) micplot_general_group.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') micplot_general_group.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom plots. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='5,1', metavar='height1,height2') micplot_general_group.add_argument('--xkcd', action='store_true', help=argparse.SUPPRESS) @@ -87,7 +88,6 @@ micplot_upper_plot_group = subparser_micplot.add_argument_group('Upper plot opti micplot_upper_plot_group.add_argument('--plot_types', help='Types of plots to make, separated by commas. Choose from boxplot,violin,point [%(default)s]', default='violin,point', metavar='type1,type2,...') micplot_upper_plot_group.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. Default is to draw no lines.', metavar='float1,float2,...', default='') micplot_upper_plot_group.add_argument('--jitter_width', help='Jitter width option when plotting points [%(default)s]', default=0.1, type=float, metavar='FLOAT') -micplot_upper_plot_group.add_argument('--jitter_height', help='Jitter height option when plotting points [%(default)s]', default=0.02, type=float, metavar='FLOAT') micplot_upper_plot_group.add_argument('--log_y', type=float, help='Base of log applied to y values. Set to zero to not log [%(default)s]', default=2, metavar='FLOAT') micplot_upper_plot_group.add_argument('--point_size', type=float, help='Size of points when --plot_types includes point. If zero, will group points and size them proportional to the group size [%(default)s]', default=4, metavar='FLOAT') micplot_upper_plot_group.add_argument('--point_range', help='Min and max size of plotted points when --point_size is 0 [%(default)s]', default='2,15', metavar='min,max') From 9b28a32e5f14353a37bba8e98496bb51b5b8371c Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 10:49:39 +0000 Subject: [PATCH 66/88] Implement --point_size option --- ariba/mic_plotter.py | 77 +++++++++++++++++++++++---------- ariba/tests/mic_plotter_test.py | 6 +++ 2 files changed, 60 insertions(+), 23 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 0f051bb6..33d038e7 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -368,6 +368,24 @@ def _top_plot_scatter_counts(cls, mutations, top_plot_data, colours, log_y): return x_coords, y_coords, sizes, colour_list + @classmethod + def _top_plot_scatter_data(cls, mutations, top_plot_data, colours, log_y): + x_coords = [] + y_coords = [] + colour_list = [] + + for i, mutation in enumerate(mutations): + for mic in top_plot_data[mutation]: + x_coords.append(i + 1) + if log_y > 0: + y_coords.append(math.log(mic, log_y)) + else: + y_coords.append(mic) + colour_list.append(colours[i]) + + return x_coords, y_coords, colour_list + + @classmethod def _top_plot_violin_data(cls, mutations, top_plot_data, log_y): violin_data = [] @@ -463,16 +481,24 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): max_x = len(colours) + 1 scatter_count_x, scatter_count_y, scatter_count_sizes, scatter_count_colours = MicPlotter._top_plot_scatter_counts(columns, top_plot_data, colours, self.log_y) - + scatter_data_x, scatter_data_y, scatter_data_colours = MicPlotter._top_plot_scatter_data(columns, top_plot_data, colours, self.log_y) violin_data, violin_positions = MicPlotter._top_plot_violin_data(columns, top_plot_data, self.log_y) # -------------------- SET UP GRID & PLOTS ----------------- fig=plt.figure(figsize=(self.plot_width, self.plot_height)) - gs = gridspec.GridSpec(2, 2, height_ratios=self.panel_heights, width_ratios=[5,1]) + if self.point_size == 0: + gs = gridspec.GridSpec(2, 2, height_ratios=self.panel_heights, width_ratios=[5,1]) + else: + gs = gridspec.GridSpec(2, 1, height_ratios=self.panel_heights) + plots=[] plots.append(plt.subplot(gs[0])) plots.append(plt.subplot(gs[1])) - plots.append(plt.subplot(gs[2])) + if self.point_size == 0: + plots.append(plt.subplot(gs[2])) + bottom_plot_index = 2 + else: + bottom_plot_index = 1 # ------------------------- TOP PLOT ----------------------- for h in self.hlines: @@ -486,7 +512,11 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): pc.set_facecolor(colours[x]) pc.set_edgecolor(colours[x]) - plots[0].scatter(scatter_count_x, scatter_count_y, s=scatter_count_sizes, c=scatter_count_colours, linewidth=0) + if self.point_size == 0: + plots[0].scatter(scatter_count_x, scatter_count_y, s=scatter_count_sizes, c=scatter_count_colours, linewidth=0) + else: + plots[0].scatter(scatter_data_x, scatter_data_y, c=scatter_data_colours, s=self.point_size) + plots[0].axis([0,max(bottom_scatter_x) + 1,min(scatter_count_y), max(scatter_count_y)]) y_tick_positions, y_tick_labels = MicPlotter._top_plot_y_ticks(mic_data, self.antibiotic, self.log_y) @@ -498,27 +528,28 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): plots[0].set_title(self.main_title, fontsize=18) # ------------------------- BOTTOM PLOT ----------------------- - plots[2].axis([0,max(bottom_scatter_x) + 1,0,max(bottom_scatter_y) + 1]) - plots[2].scatter(bottom_scatter_x, bottom_scatter_y, marker='o', s=self.dot_size, color=bottom_colours) - plots[2].spines["top"].set_visible(False) - plots[2].spines["right"].set_visible(False) - plots[2].spines["bottom"].set_visible(False) - plots[2].spines["left"].set_visible(False) - plots[2].yaxis.set_tick_params(length=0) - plots[2].xaxis.set_ticks([]) - plots[2].set_xticklabels([]) - plots[2].yaxis.set_ticks([(i+1) for i in range(len(bottom_plot_rows))]) - plots[2].set_yticklabels(bottom_plot_rows[::-1]) + plots[bottom_plot_index].axis([0,max(bottom_scatter_x) + 1,0,max(bottom_scatter_y) + 1]) + plots[bottom_plot_index].scatter(bottom_scatter_x, bottom_scatter_y, marker='o', s=self.dot_size, color=bottom_colours) + plots[bottom_plot_index].spines["top"].set_visible(False) + plots[bottom_plot_index].spines["right"].set_visible(False) + plots[bottom_plot_index].spines["bottom"].set_visible(False) + plots[bottom_plot_index].spines["left"].set_visible(False) + plots[bottom_plot_index].yaxis.set_tick_params(length=0) + plots[bottom_plot_index].xaxis.set_ticks([]) + plots[bottom_plot_index].set_xticklabels([]) + plots[bottom_plot_index].yaxis.set_ticks([(i+1) for i in range(len(bottom_plot_rows))]) + plots[bottom_plot_index].set_yticklabels(bottom_plot_rows[::-1]) # ------------------------- RIGHT PLOT ------------------------- - right_x_coord = 0.75 - right_x, right_y, right_sizes = MicPlotter._right_plot_data(scatter_count_sizes, 5, right_x_coord) - plots[1].scatter(right_x, right_y, s=right_sizes, c="black") - plots[1].axis('off') - plots[1].axis([0,4,-2*len(right_y),len(right_y)+1]) - for i, y in enumerate(right_y): - plots[1].annotate(right_sizes[i], [right_x_coord + 0.75, y-0.2]) - plots[1].annotate("Counts", [right_x_coord - 0.1, len(right_y) + 0.5]) + if self.point_size == 0: + right_x_coord = 0.75 + right_x, right_y, right_sizes = MicPlotter._right_plot_data(scatter_count_sizes, 5, right_x_coord) + plots[1].scatter(right_x, right_y, s=right_sizes, c="black") + plots[1].axis('off') + plots[1].axis([0,4,-2*len(right_y),len(right_y)+1]) + for i, y in enumerate(right_y): + plots[1].annotate(right_sizes[i], [right_x_coord + 0.75, y-0.2]) + plots[1].annotate("Counts", [right_x_coord - 0.1, len(right_y) + 0.5]) plt.tight_layout() plt.savefig(self.outprefix + '.pdf') diff --git a/ariba/tests/mic_plotter_test.py b/ariba/tests/mic_plotter_test.py index 1cef8a6d..4ee33771 100644 --- a/ariba/tests/mic_plotter_test.py +++ b/ariba/tests/mic_plotter_test.py @@ -270,6 +270,12 @@ def test_top_plot_scatter_counts(self): # FIXME + def test_top_plot_scatter_data(self): + '''test _top_plot_scatter_data''' + top_plot_data = {} + # FIXME + + def test_top_plot_violin_data(self): '''test _top_plot_violin_data''' top_plot_data = {} From f065a8778bac6a88cdcbc9a327d3e99eade6f03e Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 11:30:42 +0000 Subject: [PATCH 67/88] Fix typo in usage --- scripts/ariba | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ariba b/scripts/ariba index cd63ba28..4cf99370 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -74,7 +74,7 @@ micplot_general_group.add_argument('--plot_height', help='Height of plot in inch micplot_general_group.add_argument('--plot_width', help='Width of plot in inches [%(default)s]', default=7, type=float, metavar='FLOAT') micplot_general_group.add_argument('--use_hets', choices=['yes', 'no', 'exclude'], help='How to deal with HET snps. Choose from yes,no,exclude. yes: count a het SNP as present. no: do not count a het SNP as present. exclude: completely remove any sample with any het SNP [%(default)s]', default='yes') micplot_general_group.add_argument('--interrupted', action='store_true', help='Include interrupted genes (as in the assembled column of the ariba summary files)') -micplot_general_group.add_argument('--min_samples', type=int, help='Minimum number of samples in each column required to include in plot [(%default)s]', metavar='INT', default=1) +micplot_general_group.add_argument('--min_samples', type=int, help='Minimum number of samples in each column required to include in plot [%(default)s]', metavar='INT', default=1) micplot_general_group.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') micplot_general_group.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom plots. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='5,1', metavar='height1,height2') micplot_general_group.add_argument('--xkcd', action='store_true', help=argparse.SUPPRESS) From ec304044bfe857d8cc3c7664c9f9fcf1a470a4fb Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 11:31:05 +0000 Subject: [PATCH 68/88] Implement x jitter --- ariba/mic_plotter.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 33d038e7..ddaacf4e 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -1,4 +1,5 @@ import csv +import random import re import os import itertools @@ -369,14 +370,18 @@ def _top_plot_scatter_counts(cls, mutations, top_plot_data, colours, log_y): @classmethod - def _top_plot_scatter_data(cls, mutations, top_plot_data, colours, log_y): + def _top_plot_scatter_data(cls, mutations, top_plot_data, colours, log_y, x_jitter): x_coords = [] y_coords = [] colour_list = [] for i, mutation in enumerate(mutations): for mic in top_plot_data[mutation]: - x_coords.append(i + 1) + if len(top_plot_data[mutation]) > 1: + x_coords.append(i + 1 + random.uniform(-x_jitter, x_jitter)) + else: + x_coords.append(i + 1) + if log_y > 0: y_coords.append(math.log(mic, log_y)) else: @@ -481,7 +486,7 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): max_x = len(colours) + 1 scatter_count_x, scatter_count_y, scatter_count_sizes, scatter_count_colours = MicPlotter._top_plot_scatter_counts(columns, top_plot_data, colours, self.log_y) - scatter_data_x, scatter_data_y, scatter_data_colours = MicPlotter._top_plot_scatter_data(columns, top_plot_data, colours, self.log_y) + scatter_data_x, scatter_data_y, scatter_data_colours = MicPlotter._top_plot_scatter_data(columns, top_plot_data, colours, self.log_y, self.jitter_width) violin_data, violin_positions = MicPlotter._top_plot_violin_data(columns, top_plot_data, self.log_y) # -------------------- SET UP GRID & PLOTS ----------------- From fe7d072c4162712f0fc78c303f5d6f27a2d4bbce Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 11:52:02 +0000 Subject: [PATCH 69/88] Report presence/absence sequences --- ariba/mic_plotter.py | 17 +++++++++++++---- ariba/tasks/micplot.py | 1 + scripts/ariba | 1 + 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index ddaacf4e..79a06514 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -9,7 +9,7 @@ import matplotlib.cm as cmx import math import pyfastaq -from ariba import common +from ariba import common, reference_data class Error (Exception): pass @@ -19,6 +19,7 @@ class Error (Exception): pass class MicPlotter: def __init__(self, + refdata_dir, antibiotic, mic_file, summary_file, @@ -47,6 +48,9 @@ def __init__(self, xkcd=False, min_samples=1 ): + refdata_fa = os.path.join(refdata_dir, '02.cdhit.all.fa') + refdata_tsv = os.path.join(refdata_dir, '01.filter.check_metadata.tsv') + self.refdata = reference_data.ReferenceData([refdata_fa], [refdata_tsv]) self.antibiotic = antibiotic self.mic_file = mic_file self.summary_file = summary_file @@ -221,7 +225,7 @@ def _get_colours(cls, total_length, number_of_colours, colormap, skip=None): @classmethod - def _get_top_plot_data(cls, summary_data, mic_data, antibiotic, use_hets, no_combinations=False, interrupted=False, outfile=None): + def _get_top_plot_data(cls, summary_data, mic_data, antibiotic, use_hets, refdata=None, no_combinations=False, interrupted=False, outfile=None): assert use_hets in {'yes', 'no', 'exclude'} if outfile is not None: f = pyfastaq.utils.open_file_write(outfile) @@ -245,9 +249,14 @@ def _get_top_plot_data(cls, summary_data, mic_data, antibiotic, use_hets, no_com found_het_and_exclude = False for cluster in summary_data[sample]: - if summary_data[sample][cluster]['assembled'] == 'interrupted' and interrupted: + if 'assembled' in summary_data[sample][cluster] and summary_data[sample][cluster]['assembled'] == 'interrupted' and interrupted: mutations.add(cluster + '.interrupted') + if refdata is not None and 'match' in summary_data[sample][cluster] and summary_data[sample][cluster]['match'] == 'yes' and 'ref_seq' in summary_data[sample][cluster]: + ref_type, variant_only = self.refdata.sequence_type(summary_data[sample][cluster]['ref_seq']) + if not variant_only: + mutations.add(cluster + '.present') + for column, value in summary_data[sample][cluster].items(): if column in ignore_columns or column.endswith('.%'): continue @@ -564,6 +573,6 @@ def run(self): mic_data = MicPlotter._load_mic_file(self.mic_file) summary_data = MicPlotter._load_summary_file(self.summary_file) boxplot_tsv = self.outprefix + '.boxplot.tsv' - top_plot_data, all_mutations, combinations = MicPlotter._get_top_plot_data(summary_data, mic_data, self.antibiotic, self.use_hets, no_combinations=self.no_combinations, interrupted=self.interrupted, outfile=boxplot_tsv) + top_plot_data, all_mutations, combinations = MicPlotter._get_top_plot_data(summary_data, mic_data, self.antibiotic, self.use_hets, refdata=self.refdata, no_combinations=self.no_combinations, interrupted=self.interrupted, outfile=boxplot_tsv) top_plot_data, all_mutations, combinations = MicPlotter._filter_top_plot_data(top_plot_data, all_mutations, combinations, self.min_samples) self._make_plot(mic_data, top_plot_data, all_mutations, combinations) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index 59abc15f..0b12ac9a 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -3,6 +3,7 @@ def run(options): plotter = ariba.mic_plotter.MicPlotter( + options.prepareref_dir, options.antibiotic, options.mic_file, options.summary_file, diff --git a/scripts/ariba b/scripts/ariba index 4cf99370..9511c680 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -63,6 +63,7 @@ subparser_micplot = subparsers.add_parser( usage='ariba prepareref [options] ', description='Makes a violin and scatter plot of MIC per variant in the summary file', ) +subparser_micplot.add_argument('prepareref_dir', help='Name of output directory when "ariba prepareref" was run') subparser_micplot.add_argument('antibiotic', help='Antibiotic name. Must exactly match a column from the MIC file') subparser_micplot.add_argument('mic_file', help='File containing MIC data for each sample and one or more antibiotics') subparser_micplot.add_argument('summary_file', help='File made by running "ariba summary"') From 0927a650362fa73232bd07b56f939ec1ba8a8283 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 11:53:23 +0000 Subject: [PATCH 70/88] Ignore MULTIPLE --- ariba/mic_plotter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 79a06514..23e8d9fc 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -231,7 +231,7 @@ def _get_top_plot_data(cls, summary_data, mic_data, antibiotic, use_hets, refdat f = pyfastaq.utils.open_file_write(outfile) print('Sample\tMIC\tMutations', file=f) - ignore_columns = {'assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'} + ignore_columns = {'assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var', 'MULTIPLE'} all_mutations = set() all_mutations_seen_combinations = set() top_plot_data = {} # cluster combination -> list of y coords (MIC values) From e7f018cbdda8a0e7d75930731a1d7226ac299fad Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 12:06:37 +0000 Subject: [PATCH 71/88] fix --plot_types (remove option of boxplot) --- ariba/mic_plotter.py | 25 +++++++++++++++---------- scripts/ariba | 2 +- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 23e8d9fc..30f6f0de 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -67,7 +67,7 @@ def __init__(self, self.log_y = log_y self.plot_types = set(plot_types.split(',')) - allowed_plot_types = {'point', 'violin', 'boxplot'} + allowed_plot_types = {'point', 'violin'} if not self.plot_types.issubset(allowed_plot_types): raise Error('Error in plot_types option. Allowed types are: ' + str(allowed_plot_types) + '. Got: ' + str(self.plot_types)) @@ -500,6 +500,9 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): # -------------------- SET UP GRID & PLOTS ----------------- fig=plt.figure(figsize=(self.plot_width, self.plot_height)) + if 'point' not in self.plot_types: + self.point_size = 42 + if self.point_size == 0: gs = gridspec.GridSpec(2, 2, height_ratios=self.panel_heights, width_ratios=[5,1]) else: @@ -521,15 +524,17 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): plots[0].hlines(h, 0, max_x, linestyle='--', linewidth=1, color='black') - violins = plots[0].violinplot(violin_data, violin_positions, widths=self.violin_width, showmeans=False, showextrema=False, showmedians=False) - for x, pc in enumerate(violins['bodies']): - pc.set_facecolor(colours[x]) - pc.set_edgecolor(colours[x]) + if 'violin' in self.plot_types: + violins = plots[0].violinplot(violin_data, violin_positions, widths=self.violin_width, showmeans=False, showextrema=False, showmedians=False) + for x, pc in enumerate(violins['bodies']): + pc.set_facecolor(colours[x]) + pc.set_edgecolor(colours[x]) - if self.point_size == 0: - plots[0].scatter(scatter_count_x, scatter_count_y, s=scatter_count_sizes, c=scatter_count_colours, linewidth=0) - else: - plots[0].scatter(scatter_data_x, scatter_data_y, c=scatter_data_colours, s=self.point_size) + if 'point' in self.plot_types: + if self.point_size == 0: + plots[0].scatter(scatter_count_x, scatter_count_y, s=scatter_count_sizes, c=scatter_count_colours, linewidth=0) + else: + plots[0].scatter(scatter_data_x, scatter_data_y, c=scatter_data_colours, s=self.point_size) plots[0].axis([0,max(bottom_scatter_x) + 1,min(scatter_count_y), max(scatter_count_y)]) @@ -572,7 +577,7 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): def run(self): mic_data = MicPlotter._load_mic_file(self.mic_file) summary_data = MicPlotter._load_summary_file(self.summary_file) - boxplot_tsv = self.outprefix + '.boxplot.tsv' + boxplot_tsv = self.outprefix + '.data.tsv' top_plot_data, all_mutations, combinations = MicPlotter._get_top_plot_data(summary_data, mic_data, self.antibiotic, self.use_hets, refdata=self.refdata, no_combinations=self.no_combinations, interrupted=self.interrupted, outfile=boxplot_tsv) top_plot_data, all_mutations, combinations = MicPlotter._filter_top_plot_data(top_plot_data, all_mutations, combinations, self.min_samples) self._make_plot(mic_data, top_plot_data, all_mutations, combinations) diff --git a/scripts/ariba b/scripts/ariba index 9511c680..34785bf5 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -86,7 +86,7 @@ micplot_colour_group.add_argument('--number_of_colours', type=int, help='Number micplot_colour_group.add_argument('--colour_skip', help='If using a continuous colourmap, --colour_skip a,b (where 0 <= a < b <= 1) will skip the range between a and b. Useful for excluding near-white colours', metavar='FLOAT1,FLOAT2') micplot_upper_plot_group = subparser_micplot.add_argument_group('Upper plot options') -micplot_upper_plot_group.add_argument('--plot_types', help='Types of plots to make, separated by commas. Choose from boxplot,violin,point [%(default)s]', default='violin,point', metavar='type1,type2,...') +micplot_upper_plot_group.add_argument('--plot_types', help='Types of plots to make, separated by commas. Choose from violin,point [%(default)s]', default='violin,point', metavar='type1,type2,...') micplot_upper_plot_group.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. Default is to draw no lines.', metavar='float1,float2,...', default='') micplot_upper_plot_group.add_argument('--jitter_width', help='Jitter width option when plotting points [%(default)s]', default=0.1, type=float, metavar='FLOAT') micplot_upper_plot_group.add_argument('--log_y', type=float, help='Base of log applied to y values. Set to zero to not log [%(default)s]', default=2, metavar='FLOAT') From add501b71631a847829e99c82f1c052ed2e81c7f Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 12:11:31 +0000 Subject: [PATCH 72/88] Fix short usage --- scripts/ariba | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ariba b/scripts/ariba index 34785bf5..fe0eb0ac 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -60,7 +60,7 @@ subparser_getref.set_defaults(func=ariba.tasks.getref.run) subparser_micplot = subparsers.add_parser( 'micplot', help='Make violin/dot plots using MIC data', - usage='ariba prepareref [options] ', + usage='ariba prepareref [options] ', description='Makes a violin and scatter plot of MIC per variant in the summary file', ) subparser_micplot.add_argument('prepareref_dir', help='Name of output directory when "ariba prepareref" was run') From 3e1a80ddb7d8541d3169d12c717cebb8b60585d0 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 12:58:03 +0000 Subject: [PATCH 73/88] Remove unused options --- ariba/mic_plotter.py | 15 +-------------- ariba/tasks/micplot.py | 2 -- scripts/ariba | 4 +--- 3 files changed, 2 insertions(+), 19 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 30f6f0de..a543d9c9 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -32,10 +32,8 @@ def __init__(self, plot_types="points,violin", jitter_width=0.1, no_combinations=False, - hlines='0.25,2', + hlines='', point_size=4, - point_range='2,15', - point_break='10,50,100,200,300', dot_size=100, dot_outline=False, dot_y_text_size=18, @@ -83,17 +81,6 @@ def __init__(self, raise Error('Error in hlines option. Needs to be a list of numbers separated by commas, or empty. Got this:\n' + hlines) self.point_size = point_size - - try: - self.point_range = [int(x) for x in point_range.split(',')] - except: - raise Error('Error in point_range option. Needs to be of the form integer1,integer2. Got this:\n' + point_range) - - try: - self.point_break = [int(x) for x in point_break.split(',')] - except: - raise Error('Error in point_break option. Needs to be comma-sparated list of integers. Got this:\n' + point_break) - self.dot_size = dot_size self.dot_outline = dot_outline self.dot_y_text_size = dot_y_text_size diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index 0b12ac9a..bc9e307f 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -18,8 +18,6 @@ def run(options): no_combinations=options.no_combinations, hlines=options.hlines, point_size=options.point_size, - point_range=options.point_range, - point_break=options.point_break, dot_size=options.dot_size, dot_outline=options.dot_outline, dot_y_text_size=options.dot_y_text_size, diff --git a/scripts/ariba b/scripts/ariba index fe0eb0ac..bf578e57 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -78,7 +78,7 @@ micplot_general_group.add_argument('--interrupted', action='store_true', help='I micplot_general_group.add_argument('--min_samples', type=int, help='Minimum number of samples in each column required to include in plot [%(default)s]', metavar='INT', default=1) micplot_general_group.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') micplot_general_group.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom plots. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='5,1', metavar='height1,height2') -micplot_general_group.add_argument('--xkcd', action='store_true', help=argparse.SUPPRESS) +micplot_general_group.add_argument('--xkcd', action='store_true', help='Best used with xkcd font installed ;)') micplot_colour_group = subparser_micplot.add_argument_group('Colour options') micplot_colour_group.add_argument('--colourmap', help='Colours to use. See http://matplotlib.org/users/colormaps.html [%(default)s]', default='Accent', metavar='colourmap name') @@ -91,8 +91,6 @@ micplot_upper_plot_group.add_argument('--hlines', help='Comma-separated list of micplot_upper_plot_group.add_argument('--jitter_width', help='Jitter width option when plotting points [%(default)s]', default=0.1, type=float, metavar='FLOAT') micplot_upper_plot_group.add_argument('--log_y', type=float, help='Base of log applied to y values. Set to zero to not log [%(default)s]', default=2, metavar='FLOAT') micplot_upper_plot_group.add_argument('--point_size', type=float, help='Size of points when --plot_types includes point. If zero, will group points and size them proportional to the group size [%(default)s]', default=4, metavar='FLOAT') -micplot_upper_plot_group.add_argument('--point_range', help='Min and max size of plotted points when --point_size is 0 [%(default)s]', default='2,15', metavar='min,max') -micplot_upper_plot_group.add_argument('--point_break', help='Comma-separated list of breakpoints in point sizes when --point_size is 0 [%(default)s]', default='10,50,100,200,300', metavar='integer1.integer2,integer3,...') micplot_upper_plot_group.add_argument('--violin_width', type=float, help='Width of violins [%(default)s]', default=0.75) micplot_lower_plot_group = subparser_micplot.add_argument_group('Lower plot options') From bdd7746cf6c04e34a6a8f58bff9bff8381bcfa7d Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 13:06:35 +0000 Subject: [PATCH 74/88] Implement --dot_y_text_size --- ariba/mic_plotter.py | 6 +++--- scripts/ariba | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index a543d9c9..93a0e014 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -36,8 +36,8 @@ def __init__(self, point_size=4, dot_size=100, dot_outline=False, - dot_y_text_size=18, - panel_heights='5,1', + dot_y_text_size=7, + panel_heights='9,2', colourmap='Accent', number_of_colours=0, colour_skip=None, @@ -544,7 +544,7 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): plots[bottom_plot_index].xaxis.set_ticks([]) plots[bottom_plot_index].set_xticklabels([]) plots[bottom_plot_index].yaxis.set_ticks([(i+1) for i in range(len(bottom_plot_rows))]) - plots[bottom_plot_index].set_yticklabels(bottom_plot_rows[::-1]) + plots[bottom_plot_index].set_yticklabels(bottom_plot_rows[::-1], fontsize=self.dot_y_text_size) # ------------------------- RIGHT PLOT ------------------------- if self.point_size == 0: diff --git a/scripts/ariba b/scripts/ariba index bf578e57..c24aa8a9 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -77,7 +77,7 @@ micplot_general_group.add_argument('--use_hets', choices=['yes', 'no', 'exclude' micplot_general_group.add_argument('--interrupted', action='store_true', help='Include interrupted genes (as in the assembled column of the ariba summary files)') micplot_general_group.add_argument('--min_samples', type=int, help='Minimum number of samples in each column required to include in plot [%(default)s]', metavar='INT', default=1) micplot_general_group.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') -micplot_general_group.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom plots. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='5,1', metavar='height1,height2') +micplot_general_group.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom plots. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='9,2', metavar='height1,height2') micplot_general_group.add_argument('--xkcd', action='store_true', help='Best used with xkcd font installed ;)') micplot_colour_group = subparser_micplot.add_argument_group('Colour options') @@ -96,7 +96,7 @@ micplot_upper_plot_group.add_argument('--violin_width', type=float, help='Width micplot_lower_plot_group = subparser_micplot.add_argument_group('Lower plot options') micplot_lower_plot_group.add_argument('--dot_size', type=float, help='Size of dots in lower part of plot [%(default)s]', default=100, metavar='FLOAT') micplot_lower_plot_group.add_argument('--dot_outline', action='store_true', help='Black outline around all dots (whether coloured or not) in lower part of plots') -micplot_lower_plot_group.add_argument('--dot_y_text_size', type=int, help='Text size of labels [%(default)s]', default=18, metavar='INT') +micplot_lower_plot_group.add_argument('--dot_y_text_size', type=int, help='Text size of labels [%(default)s]', default=7, metavar='INT') subparser_micplot.set_defaults(func=ariba.tasks.micplot.run) From 0aac51220104b2021219b024990442ca31056225 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 13:16:05 +0000 Subject: [PATCH 75/88] Implement --dot_outline --- ariba/mic_plotter.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 93a0e014..3873f878 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -442,7 +442,7 @@ def _ordered_columns(cls, mutations, top_plot_data): @classmethod - def _bottom_scatter_data(cls, bottom_plot_rows, columns, colours): + def _bottom_scatter_data(cls, bottom_plot_rows, columns, colours, outline=False): x_coords = [] y_coords = [] colour_list = [] @@ -453,6 +453,10 @@ def _bottom_scatter_data(cls, bottom_plot_rows, columns, colours): x_coords.append(j + 1) y_coords.append(len(bottom_plot_rows) - i) colour_list.append(colours[j]) + elif outline: + x_coords.append(j + 1) + y_coords.append(len(bottom_plot_rows) - i) + colour_list.append("white") return x_coords, y_coords, colour_list @@ -476,7 +480,7 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): bottom_plot_rows = MicPlotter._ordered_bottom_plot_rows(all_mutations) columns = MicPlotter._ordered_columns(mut_combinations, top_plot_data) colours = MicPlotter._get_colours(len(columns), self.number_of_colours, self.colourmap, self.colour_skip) - bottom_scatter_x, bottom_scatter_y, bottom_colours = MicPlotter._bottom_scatter_data(bottom_plot_rows, columns, colours) + bottom_scatter_x, bottom_scatter_y, bottom_colours = MicPlotter._bottom_scatter_data(bottom_plot_rows, columns, colours, outline=self.dot_outline) columns = ['.'.join(x) for x in columns] assert len(colours) == len(columns) max_x = len(colours) + 1 @@ -534,8 +538,9 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): plots[0].set_title(self.main_title, fontsize=18) # ------------------------- BOTTOM PLOT ----------------------- + edgecolor = "black" if self.dot_outline else bottom_colours plots[bottom_plot_index].axis([0,max(bottom_scatter_x) + 1,0,max(bottom_scatter_y) + 1]) - plots[bottom_plot_index].scatter(bottom_scatter_x, bottom_scatter_y, marker='o', s=self.dot_size, color=bottom_colours) + plots[bottom_plot_index].scatter(bottom_scatter_x, bottom_scatter_y, marker='o', s=self.dot_size, c=bottom_colours, edgecolor=edgecolor, lw=1) plots[bottom_plot_index].spines["top"].set_visible(False) plots[bottom_plot_index].spines["right"].set_visible(False) plots[bottom_plot_index].spines["bottom"].set_visible(False) From 929625ed3d7acc137fa4ad938d7957e7d77f10b9 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 13:30:23 +0000 Subject: [PATCH 76/88] Add a little to top y axis min/max --- ariba/mic_plotter.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 3873f878..d65ae606 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -527,7 +527,14 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): else: plots[0].scatter(scatter_data_x, scatter_data_y, c=scatter_data_colours, s=self.point_size) - plots[0].axis([0,max(bottom_scatter_x) + 1,min(scatter_count_y), max(scatter_count_y)]) + if self.log_y > 0: + miny = min(scatter_count_y) - 0.5 + maxy = max(scatter_count_y) + 0.5 + else: + miny = 0 + maxy = 1.05 * max(scatter_count_y) + + plots[0].axis([0,max(bottom_scatter_x) + 1, miny, maxy]) y_tick_positions, y_tick_labels = MicPlotter._top_plot_y_ticks(mic_data, self.antibiotic, self.log_y) plots[0].yaxis.set_ticks(y_tick_positions) From 844ee4b7df5f4afb9032455e4827a92395df62e9 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 13:46:00 +0000 Subject: [PATCH 77/88] Add options --panel_widths --count_legend_x --- ariba/mic_plotter.py | 14 +++++++++++--- ariba/tasks/micplot.py | 4 +++- scripts/ariba | 2 ++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index d65ae606..5672abf2 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -38,13 +38,15 @@ def __init__(self, dot_outline=False, dot_y_text_size=7, panel_heights='9,2', + panel_widths='5,1', colourmap='Accent', number_of_colours=0, colour_skip=None, interrupted=False, violin_width=0.75, xkcd=False, - min_samples=1 + min_samples=1, + count_legend_x=-2 ): refdata_fa = os.path.join(refdata_dir, '02.cdhit.all.fa') refdata_tsv = os.path.join(refdata_dir, '01.filter.check_metadata.tsv') @@ -90,6 +92,11 @@ def __init__(self, except: raise Error('Error in panel_heights option. Needs to be of the form integer1,integer2. Got this:\n' + panel_heights) + try: + self.panel_widths = [int(x) for x in panel_widths.split(',')] + except: + raise Error('Error in panel_widths option. Needs to be of the form integer1,integer2. Got this:\n' + panel_widths) + self.colourmap = colourmap self.number_of_colours = number_of_colours @@ -106,6 +113,7 @@ def __init__(self, if xkcd: plt.xkcd() self.min_samples = min_samples + self.count_legend_x = count_legend_x @classmethod @@ -495,7 +503,7 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): self.point_size = 42 if self.point_size == 0: - gs = gridspec.GridSpec(2, 2, height_ratios=self.panel_heights, width_ratios=[5,1]) + gs = gridspec.GridSpec(2, 2, height_ratios=self.panel_heights, width_ratios=self.panel_widths) else: gs = gridspec.GridSpec(2, 1, height_ratios=self.panel_heights) @@ -569,7 +577,7 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): plots[1].annotate(right_sizes[i], [right_x_coord + 0.75, y-0.2]) plots[1].annotate("Counts", [right_x_coord - 0.1, len(right_y) + 0.5]) - plt.tight_layout() + plt.tight_layout(w_pad=self.count_legend_x) plt.savefig(self.outprefix + '.pdf') diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index bc9e307f..9f2154cb 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -22,13 +22,15 @@ def run(options): dot_outline=options.dot_outline, dot_y_text_size=options.dot_y_text_size, panel_heights=options.panel_heights, + panel_widths=options.panel_widths, colourmap=options.colourmap, number_of_colours=options.number_of_colours, colour_skip=options.colour_skip, interrupted=options.interrupted, violin_width=options.violin_width, xkcd=options.xkcd, - min_samples=options.min_samples + min_samples=options.min_samples, + count_legend_x=options.count_legend_x ) plotter.run() diff --git a/scripts/ariba b/scripts/ariba index c24aa8a9..9b96e593 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -78,6 +78,8 @@ micplot_general_group.add_argument('--interrupted', action='store_true', help='I micplot_general_group.add_argument('--min_samples', type=int, help='Minimum number of samples in each column required to include in plot [%(default)s]', metavar='INT', default=1) micplot_general_group.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') micplot_general_group.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom plots. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='9,2', metavar='height1,height2') +micplot_general_group.add_argument('--panel_widths', help='Two integers that determine relative width of plots and space used by counts legend. eg 5,1 means ratio of 5:1 between top and bottom panel widths. Only applies when plotting points and --point_size 0 [%(default)s]', default='5,1', metavar='width1,width2') +micplot_general_group.add_argument('--count_legend_x', type=float, help='Control x position of counts legend when plotting points and --point_size 0 [%(default)s]', default=-2, metavar='FLOAT') micplot_general_group.add_argument('--xkcd', action='store_true', help='Best used with xkcd font installed ;)') micplot_colour_group = subparser_micplot.add_argument_group('Colour options') From 5c42985975c15812840957ff3b51804f005e340b Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 14:46:31 +0000 Subject: [PATCH 78/88] Add option out_format --- ariba/mic_plotter.py | 6 ++++-- ariba/tasks/micplot.py | 3 ++- scripts/ariba | 1 + 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 5672abf2..f03de6f3 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -46,7 +46,8 @@ def __init__(self, violin_width=0.75, xkcd=False, min_samples=1, - count_legend_x=-2 + count_legend_x=-2, + out_format='pdf' ): refdata_fa = os.path.join(refdata_dir, '02.cdhit.all.fa') refdata_tsv = os.path.join(refdata_dir, '01.filter.check_metadata.tsv') @@ -114,6 +115,7 @@ def __init__(self, plt.xkcd() self.min_samples = min_samples self.count_legend_x = count_legend_x + self.out_format = out_format @classmethod @@ -578,7 +580,7 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): plots[1].annotate("Counts", [right_x_coord - 0.1, len(right_y) + 0.5]) plt.tight_layout(w_pad=self.count_legend_x) - plt.savefig(self.outprefix + '.pdf') + plt.savefig(self.outprefix + '.' + self.out_format) def run(self): diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index 9f2154cb..0b7949fb 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -30,7 +30,8 @@ def run(options): violin_width=options.violin_width, xkcd=options.xkcd, min_samples=options.min_samples, - count_legend_x=options.count_legend_x + count_legend_x=options.count_legend_x, + out_format=options.out_format ) plotter.run() diff --git a/scripts/ariba b/scripts/ariba index 9b96e593..00c7ddac 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -70,6 +70,7 @@ subparser_micplot.add_argument('summary_file', help='File made by running "ariba subparser_micplot.add_argument('outprefix', help='Prefix of output files') micplot_general_group = subparser_micplot.add_argument_group('General options') +micplot_general_group.add_argument('--out_format', help='Output format of image file. Use anything that matplotlib can save to, eg pdf or png [%(default)s]', default='pdf') micplot_general_group.add_argument('--main_title', help='Main title of plot. Default is to use the antibiotic name', metavar='"title in quotes"') micplot_general_group.add_argument('--plot_height', help='Height of plot in inches [%(default)s]', default=7, type=float, metavar='FLOAT') micplot_general_group.add_argument('--plot_width', help='Width of plot in inches [%(default)s]', default=7, type=float, metavar='FLOAT') From d4f5ad2b51d42205b87d333491cc93e859c29724 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Thu, 2 Mar 2017 15:49:36 +0000 Subject: [PATCH 79/88] Bug fix checking if variant only --- ariba/mic_plotter.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index f03de6f3..34e47f90 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -150,7 +150,7 @@ def _load_mic_file(cls, infile): reader = csv.DictReader(f, delimiter='\t') if reader.fieldnames[0] != 'Sample': raise Error('Error. Expected first column of MIC file "' + infile + '" to be "Sample"') - + for row in reader: mic_data[row['Sample']] = {x: MicPlotter._mic_string_to_float(row[x]) for x in reader.fieldnames[1:]} @@ -174,7 +174,7 @@ def _load_summary_file(cls, infile): for field in row: if field == 'name': - continue + continue cluster, col = field.split('.', maxsplit=1) if cluster not in clusters: @@ -209,7 +209,7 @@ def _get_colours(cls, total_length, number_of_colours, colormap, skip=None): length = 1 - (skip[1] - skip[0]) vals = [(length) * x / (total_length - 1) for x in range(total_length)] vals = [x if x < skip[0] else x + (1-length) for x in vals] - + return [cmap(x) for x in vals] else: cmap = cmx.get_cmap(colormap) @@ -250,7 +250,7 @@ def _get_top_plot_data(cls, summary_data, mic_data, antibiotic, use_hets, refdat mutations.add(cluster + '.interrupted') if refdata is not None and 'match' in summary_data[sample][cluster] and summary_data[sample][cluster]['match'] == 'yes' and 'ref_seq' in summary_data[sample][cluster]: - ref_type, variant_only = self.refdata.sequence_type(summary_data[sample][cluster]['ref_seq']) + ref_type, variant_only = refdata.sequence_type(summary_data[sample][cluster]['ref_seq']) if not variant_only: mutations.add(cluster + '.present') @@ -371,7 +371,7 @@ def _top_plot_scatter_counts(cls, mutations, top_plot_data, colours, log_y): y_coords.append(mic) sizes.append(counts[mic]) colour_list.append(colours[i]) - + return x_coords, y_coords, sizes, colour_list @@ -393,7 +393,7 @@ def _top_plot_scatter_data(cls, mutations, top_plot_data, colours, log_y, x_jitt else: y_coords.append(mic) colour_list.append(colours[i]) - + return x_coords, y_coords, colour_list @@ -411,7 +411,7 @@ def _top_plot_violin_data(cls, mutations, top_plot_data, log_y): return violin_data, violin_pos - + @classmethod def _ordered_bottom_plot_rows(cls, mutations): l = [] @@ -494,7 +494,7 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): columns = ['.'.join(x) for x in columns] assert len(colours) == len(columns) max_x = len(colours) + 1 - + scatter_count_x, scatter_count_y, scatter_count_sizes, scatter_count_colours = MicPlotter._top_plot_scatter_counts(columns, top_plot_data, colours, self.log_y) scatter_data_x, scatter_data_y, scatter_data_colours = MicPlotter._top_plot_scatter_data(columns, top_plot_data, colours, self.log_y, self.jitter_width) violin_data, violin_positions = MicPlotter._top_plot_violin_data(columns, top_plot_data, self.log_y) From 9d0b3e2153fd6d6a4eec31591756a42cabe23539 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 3 Mar 2017 10:39:37 +0000 Subject: [PATCH 80/88] Add Mann Whiteny test --- ariba/mic_plotter.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 34e47f90..854ebd34 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -1,4 +1,5 @@ import csv +import sys import random import re import os @@ -486,6 +487,32 @@ def _right_plot_data(cls, scatter_count_sizes, number_of_circles, x_pos): return x_coords, y_coords, sizes + @classmethod + def _pairwise_mannwhitney(cls, violin_data, columns, outfile): + try: + from scipy.stats import mannwhitneyu + except: + print('WARNING: skipping Mann Whitney tests because scipy.stats.mannwhitneyu not found', file=sys.stderr) + return + + output = [] + + for i, list1 in enumerate(violin_data): + for j, list2 in enumerate(violin_data): + if j <= i or len(list1) < 2 or len(list2) < 2: + continue + + statistic, pvalue = mannwhitneyu(list1, list2, alternative='two-sided') + output.append((columns[i], columns[j], len(list1), len(list2), statistic, pvalue)) + + output.sort(key=lambda x: x[-1]) + + with open(outfile, 'w') as f: + print('Combination1', 'Combination2', 'Size1', 'Size2', 'Mann_Whitney_U', 'p-value', sep='\t', file=f) + for x in output: + print(*x, sep='\t', file=f) + + def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): bottom_plot_rows = MicPlotter._ordered_bottom_plot_rows(all_mutations) columns = MicPlotter._ordered_columns(mut_combinations, top_plot_data) @@ -498,6 +525,7 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): scatter_count_x, scatter_count_y, scatter_count_sizes, scatter_count_colours = MicPlotter._top_plot_scatter_counts(columns, top_plot_data, colours, self.log_y) scatter_data_x, scatter_data_y, scatter_data_colours = MicPlotter._top_plot_scatter_data(columns, top_plot_data, colours, self.log_y, self.jitter_width) violin_data, violin_positions = MicPlotter._top_plot_violin_data(columns, top_plot_data, self.log_y) + MicPlotter._pairwise_mannwhitney(violin_data, columns, self.outprefix + '.mannwhitney.tsv') # -------------------- SET UP GRID & PLOTS ----------------- fig=plt.figure(figsize=(self.plot_width, self.plot_height)) @@ -582,6 +610,8 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): plt.tight_layout(w_pad=self.count_legend_x) plt.savefig(self.outprefix + '.' + self.out_format) + + def run(self): mic_data = MicPlotter._load_mic_file(self.mic_file) From 24e714fe6a5ef2a4b5b963870eac4d403e2ae591 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 3 Mar 2017 10:48:42 +0000 Subject: [PATCH 81/88] Tidy up counts legend --- ariba/mic_plotter.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 854ebd34..e52af4e8 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -473,14 +473,18 @@ def _bottom_scatter_data(cls, bottom_plot_rows, columns, colours, outline=False) @classmethod - def _right_plot_data(cls, scatter_count_sizes, number_of_circles, x_pos): + def _right_plot_data(cls, scatter_count_sizes, x_pos): y_max = max(scatter_count_sizes) if y_max > 100: y_max = int(math.ceil(y_max / 100.0)) * 100 - sizes = [5, 50] + [x for x in range(100, y_max, 100)] + sizes = [5, 50] else: y_max = int(math.ceil(y_max / 10.0)) * 10 - sizes = [5] + [x for x in range(10, y_max, 10)] + sizes = [5, 10] + + while sizes[-1] < y_max: + sizes.append(sizes[-1]*2) + x_coords = [x_pos] * len(sizes) y_coords = [x + 1 for x in range(len(sizes))] y_coords.reverse() @@ -599,7 +603,7 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): # ------------------------- RIGHT PLOT ------------------------- if self.point_size == 0: right_x_coord = 0.75 - right_x, right_y, right_sizes = MicPlotter._right_plot_data(scatter_count_sizes, 5, right_x_coord) + right_x, right_y, right_sizes = MicPlotter._right_plot_data(scatter_count_sizes, right_x_coord) plots[1].scatter(right_x, right_y, s=right_sizes, c="black") plots[1].axis('off') plots[1].axis([0,4,-2*len(right_y),len(right_y)+1]) From 283b253a33a5f27a54bf599f4f1d51ee07caa756 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 3 Mar 2017 11:01:48 +0000 Subject: [PATCH 82/88] Add --point_scale option --- ariba/mic_plotter.py | 9 +++++++-- ariba/tasks/micplot.py | 1 + scripts/ariba | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index e52af4e8..3b0827e0 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -35,6 +35,7 @@ def __init__(self, no_combinations=False, hlines='', point_size=4, + point_scale=1, dot_size=100, dot_outline=False, dot_y_text_size=7, @@ -85,6 +86,7 @@ def __init__(self, raise Error('Error in hlines option. Needs to be a list of numbers separated by commas, or empty. Got this:\n' + hlines) self.point_size = point_size + self.point_scale = point_scale self.dot_size = dot_size self.dot_outline = dot_outline self.dot_y_text_size = dot_y_text_size @@ -563,9 +565,11 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): pc.set_facecolor(colours[x]) pc.set_edgecolor(colours[x]) + scaled_count_sizes = [self.point_scale * x for x in scatter_count_sizes] + if 'point' in self.plot_types: if self.point_size == 0: - plots[0].scatter(scatter_count_x, scatter_count_y, s=scatter_count_sizes, c=scatter_count_colours, linewidth=0) + plots[0].scatter(scatter_count_x, scatter_count_y, s=scaled_count_sizes, c=scatter_count_colours, linewidth=0) else: plots[0].scatter(scatter_data_x, scatter_data_y, c=scatter_data_colours, s=self.point_size) @@ -604,7 +608,8 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): if self.point_size == 0: right_x_coord = 0.75 right_x, right_y, right_sizes = MicPlotter._right_plot_data(scatter_count_sizes, right_x_coord) - plots[1].scatter(right_x, right_y, s=right_sizes, c="black") + right_scaled_sizes = [self.point_scale * x for x in right_sizes] + plots[1].scatter(right_x, right_y, s=right_scaled_sizes, c="black") plots[1].axis('off') plots[1].axis([0,4,-2*len(right_y),len(right_y)+1]) for i, y in enumerate(right_y): diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index 0b7949fb..0dc73f81 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -18,6 +18,7 @@ def run(options): no_combinations=options.no_combinations, hlines=options.hlines, point_size=options.point_size, + point_scale=options.point_scale, dot_size=options.dot_size, dot_outline=options.dot_outline, dot_y_text_size=options.dot_y_text_size, diff --git a/scripts/ariba b/scripts/ariba index 00c7ddac..b242805b 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -94,6 +94,7 @@ micplot_upper_plot_group.add_argument('--hlines', help='Comma-separated list of micplot_upper_plot_group.add_argument('--jitter_width', help='Jitter width option when plotting points [%(default)s]', default=0.1, type=float, metavar='FLOAT') micplot_upper_plot_group.add_argument('--log_y', type=float, help='Base of log applied to y values. Set to zero to not log [%(default)s]', default=2, metavar='FLOAT') micplot_upper_plot_group.add_argument('--point_size', type=float, help='Size of points when --plot_types includes point. If zero, will group points and size them proportional to the group size [%(default)s]', default=4, metavar='FLOAT') +micplot_upper_plot_group.add_argument('--point_scale', type=float, help='Scale point sizes when --point_size 0. All point sizes are multiplied by this number. Useful if you have large data set [%(default)s]', default=1, metavar='FLOAT') micplot_upper_plot_group.add_argument('--violin_width', type=float, help='Width of violins [%(default)s]', default=0.75) micplot_lower_plot_group = subparser_micplot.add_argument_group('Lower plot options') From 080c59c5e47757cdd5777cded38db2de7a31f29e Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 3 Mar 2017 11:33:20 +0000 Subject: [PATCH 83/88] Add option --p_cutoff and do p-value correction --- ariba/mic_plotter.py | 15 ++++++++++----- ariba/tasks/micplot.py | 3 ++- scripts/ariba | 1 + 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 3b0827e0..0e7506f5 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -49,7 +49,8 @@ def __init__(self, xkcd=False, min_samples=1, count_legend_x=-2, - out_format='pdf' + out_format='pdf', + p_cutoff=0.05 ): refdata_fa = os.path.join(refdata_dir, '02.cdhit.all.fa') refdata_tsv = os.path.join(refdata_dir, '01.filter.check_metadata.tsv') @@ -119,6 +120,7 @@ def __init__(self, self.min_samples = min_samples self.count_legend_x = count_legend_x self.out_format = out_format + self.p_cutoff = p_cutoff @classmethod @@ -494,7 +496,7 @@ def _right_plot_data(cls, scatter_count_sizes, x_pos): @classmethod - def _pairwise_mannwhitney(cls, violin_data, columns, outfile): + def _pairwise_mannwhitney(cls, violin_data, columns, outfile, p_cutoff): try: from scipy.stats import mannwhitneyu except: @@ -514,9 +516,12 @@ def _pairwise_mannwhitney(cls, violin_data, columns, outfile): output.sort(key=lambda x: x[-1]) with open(outfile, 'w') as f: - print('Combination1', 'Combination2', 'Size1', 'Size2', 'Mann_Whitney_U', 'p-value', sep='\t', file=f) + print('Combination1', 'Combination2', 'Size1', 'Size2', 'Mann_Whitney_U', 'p-value', 'significant', 'corrected_p-value', 'corrected_significant', sep='\t', file=f) for x in output: - print(*x, sep='\t', file=f) + significant = 'yes' if x[5] < p_cutoff else 'no' + corrected_p = min(1, len(output) * x[5]) + corrected_significant = 'yes' if corrected_p < p_cutoff else 'no' + print(*x, significant, corrected_p, corrected_significant, sep='\t', file=f) def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): @@ -531,7 +536,7 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): scatter_count_x, scatter_count_y, scatter_count_sizes, scatter_count_colours = MicPlotter._top_plot_scatter_counts(columns, top_plot_data, colours, self.log_y) scatter_data_x, scatter_data_y, scatter_data_colours = MicPlotter._top_plot_scatter_data(columns, top_plot_data, colours, self.log_y, self.jitter_width) violin_data, violin_positions = MicPlotter._top_plot_violin_data(columns, top_plot_data, self.log_y) - MicPlotter._pairwise_mannwhitney(violin_data, columns, self.outprefix + '.mannwhitney.tsv') + MicPlotter._pairwise_mannwhitney(violin_data, columns, self.outprefix + '.mannwhitney.tsv', self.p_cutoff) # -------------------- SET UP GRID & PLOTS ----------------- fig=plt.figure(figsize=(self.plot_width, self.plot_height)) diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py index 0dc73f81..28ad1440 100644 --- a/ariba/tasks/micplot.py +++ b/ariba/tasks/micplot.py @@ -32,7 +32,8 @@ def run(options): xkcd=options.xkcd, min_samples=options.min_samples, count_legend_x=options.count_legend_x, - out_format=options.out_format + out_format=options.out_format, + p_cutoff=options.p_cutoff ) plotter.run() diff --git a/scripts/ariba b/scripts/ariba index b242805b..d59e2649 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -81,6 +81,7 @@ micplot_general_group.add_argument('--no_combinations', action='store_true', hel micplot_general_group.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom plots. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='9,2', metavar='height1,height2') micplot_general_group.add_argument('--panel_widths', help='Two integers that determine relative width of plots and space used by counts legend. eg 5,1 means ratio of 5:1 between top and bottom panel widths. Only applies when plotting points and --point_size 0 [%(default)s]', default='5,1', metavar='width1,width2') micplot_general_group.add_argument('--count_legend_x', type=float, help='Control x position of counts legend when plotting points and --point_size 0 [%(default)s]', default=-2, metavar='FLOAT') +micplot_general_group.add_argument('--p_cutoff', type=float, help='p-value cutoff for Mann-Whitney tests [%(default)s]', default=0.05) micplot_general_group.add_argument('--xkcd', action='store_true', help='Best used with xkcd font installed ;)') micplot_colour_group = subparser_micplot.add_argument_group('Colour options') From 8c8b9b328fcb8fecd8ae102858c8bfc72c4e68c8 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Sat, 4 Mar 2017 10:32:59 +0000 Subject: [PATCH 84/88] report effect size --- ariba/mic_plotter.py | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index 0e7506f5..ad96e1cd 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -496,11 +496,11 @@ def _right_plot_data(cls, scatter_count_sizes, x_pos): @classmethod - def _pairwise_mannwhitney(cls, violin_data, columns, outfile, p_cutoff): + def _pairwise_compare(cls, violin_data, columns, outfile, p_cutoff, compare_test): try: - from scipy.stats import mannwhitneyu + import scipy.stats except: - print('WARNING: skipping Mann Whitney tests because scipy.stats.mannwhitneyu not found', file=sys.stderr) + print('WARNING: skipping Mann Whitney tests because scipy.stats not found', file=sys.stderr) return output = [] @@ -510,18 +510,32 @@ def _pairwise_mannwhitney(cls, violin_data, columns, outfile, p_cutoff): if j <= i or len(list1) < 2 or len(list2) < 2: continue - statistic, pvalue = mannwhitneyu(list1, list2, alternative='two-sided') - output.append((columns[i], columns[j], len(list1), len(list2), statistic, pvalue)) + list1set = set(list1) - output.sort(key=lambda x: x[-1]) + if len(list1set) == 1 and list1set == set(list2): + statistic = 'NA' + pvalue = 1 + else: + if compare_test == 'mannwhitneyu': + statistic, pvalue = scipy.stats.mannwhitneyu(list1, list2, alternative='two-sided') + elif compare_test == 'ks_2samp': + statistic, pvalue = scipy.stats.ks_2samp(list1, list2) + else: + raise Error('Test "' + compare_test + '" not recognised. Cannot continue') + + effect_size = abs(scipy.stats.norm.ppf(pvalue) / math.sqrt(len(list1) + len(list2))) + significant = 'yes' if pvalue < p_cutoff else 'no' + output.append((columns[i], columns[j], len(list1), len(list2), pvalue, significant, effect_size)) + + output.sort(key=lambda x: x[4]) with open(outfile, 'w') as f: - print('Combination1', 'Combination2', 'Size1', 'Size2', 'Mann_Whitney_U', 'p-value', 'significant', 'corrected_p-value', 'corrected_significant', sep='\t', file=f) + print('Combination1', 'Combination2', 'Size1', 'Size2', 'p-value', 'significant', 'effect_size', 'corrected_p-value', 'corrected_significant', 'corrected_effect_size', sep='\t', file=f) for x in output: - significant = 'yes' if x[5] < p_cutoff else 'no' - corrected_p = min(1, len(output) * x[5]) + corrected_p = min(1, len(output) * x[4]) corrected_significant = 'yes' if corrected_p < p_cutoff else 'no' - print(*x, significant, corrected_p, corrected_significant, sep='\t', file=f) + corrected_effect_size = scipy.stats.norm.ppf(corrected_p) / math.sqrt(x[2] + x[3]) + print(*x, corrected_p, corrected_significant, corrected_effect_size, sep='\t', file=f) def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): @@ -536,7 +550,8 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): scatter_count_x, scatter_count_y, scatter_count_sizes, scatter_count_colours = MicPlotter._top_plot_scatter_counts(columns, top_plot_data, colours, self.log_y) scatter_data_x, scatter_data_y, scatter_data_colours = MicPlotter._top_plot_scatter_data(columns, top_plot_data, colours, self.log_y, self.jitter_width) violin_data, violin_positions = MicPlotter._top_plot_violin_data(columns, top_plot_data, self.log_y) - MicPlotter._pairwise_mannwhitney(violin_data, columns, self.outprefix + '.mannwhitney.tsv', self.p_cutoff) + MicPlotter._pairwise_compare(violin_data, columns, self.outprefix + '.mannwhitney.tsv', self.p_cutoff, 'mannwhitneyu') + MicPlotter._pairwise_compare(violin_data, columns, self.outprefix + '.ks_2sample.tsv', self.p_cutoff, 'ks_2samp') # -------------------- SET UP GRID & PLOTS ----------------- fig=plt.figure(figsize=(self.plot_width, self.plot_height)) @@ -624,7 +639,7 @@ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): plt.tight_layout(w_pad=self.count_legend_x) plt.savefig(self.outprefix + '.' + self.out_format) - + def run(self): From 24030162228a2c843d7e112c75e23bfa076d6da7 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Sat, 4 Mar 2017 12:24:26 +0000 Subject: [PATCH 85/88] version bump 2.8.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b50f6a61..2cfd7269 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ setup( ext_modules=[minimap_mod, fermilite_mod, vcfcall_mod], name='ariba', - version='2.7.2', + version='2.8.0', description='ARIBA: Antibiotic Resistance Identification By Assembly', packages = find_packages(), package_data={'ariba': ['test_run_data/*']}, From e6feb257b144ddc9acbe6476decdd157a6048e27 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Sat, 4 Mar 2017 12:26:29 +0000 Subject: [PATCH 86/88] Require matplotlib --- README.md | 3 ++- setup.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index beb1ae1d..5e1ff0d0 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,8 @@ Once the dependencies are installed, install ARIBA using pip: ARIBA also depends on several Python packages, all of which are available via pip, so the above command will get those automatically if they -are not installed. The packages are dendropy >= 4.2.0, +are not installed. The packages are dendropy >= 4.2.0, matplotlib (no +minimum version required, but only tested on 2.0.0), pyfastaq >= 3.12.0, pysam >= 0.9.1, and pymummer >= 0.10.1. Alternatively, you can download the latest release from this github repository, diff --git a/setup.py b/setup.py index 2cfd7269..f0cea2de 100644 --- a/setup.py +++ b/setup.py @@ -68,6 +68,7 @@ install_requires=[ 'BeautifulSoup4 >= 4.1.0', 'dendropy >= 4.2.0', + 'matplotlib', 'pyfastaq >= 3.12.0', 'pysam >= 0.9.1', 'pymummer>=0.10.2', From 0d2904f7b459f84cbf6eb91c2072b322671b145d Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Sat, 4 Mar 2017 12:42:27 +0000 Subject: [PATCH 87/88] fix print to work with python 3.4.2 --- ariba/mic_plotter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py index ad96e1cd..ca5992f4 100644 --- a/ariba/mic_plotter.py +++ b/ariba/mic_plotter.py @@ -535,7 +535,7 @@ def _pairwise_compare(cls, violin_data, columns, outfile, p_cutoff, compare_test corrected_p = min(1, len(output) * x[4]) corrected_significant = 'yes' if corrected_p < p_cutoff else 'no' corrected_effect_size = scipy.stats.norm.ppf(corrected_p) / math.sqrt(x[2] + x[3]) - print(*x, corrected_p, corrected_significant, corrected_effect_size, sep='\t', file=f) + print('\t'.join([str(z) for z in x]), corrected_p, corrected_significant, corrected_effect_size, sep='\t', file=f) def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): From c2f3935b34ca1518a16f3ae21d75dad98c5e6f55 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Sat, 4 Mar 2017 12:51:27 +0000 Subject: [PATCH 88/88] Version bump 2.8.1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f0cea2de..96a51b9b 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ setup( ext_modules=[minimap_mod, fermilite_mod, vcfcall_mod], name='ariba', - version='2.8.0', + version='2.8.1', description='ARIBA: Antibiotic Resistance Identification By Assembly', packages = find_packages(), package_data={'ariba': ['test_run_data/*']},