From b731f74450e20bcc1f95920481adb2c35f407c51 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Tue, 27 Oct 2015 13:48:08 +0000 Subject: [PATCH 1/6] Add flag has_nonsynonymous_variants --- ariba/flag.py | 1 + ariba/tests/flag_test.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ariba/flag.py b/ariba/flag.py index 97f8e1a2..fa0d5cee 100644 --- a/ariba/flag.py +++ b/ariba/flag.py @@ -11,6 +11,7 @@ class Error (Exception): pass 'assembly_fail', 'variants_suggest_collapsed_repeat', 'hit_both_strands', + 'has_nonsynonymous_variants', ] diff --git a/ariba/tests/flag_test.py b/ariba/tests/flag_test.py index b187db1c..3053c538 100644 --- a/ariba/tests/flag_test.py +++ b/ariba/tests/flag_test.py @@ -24,7 +24,7 @@ def test_set_flag(self): def test_add(self): '''Test add''' f = flag.Flag() - expected = [1, 3, 7, 15, 31, 63, 127, 255, 511] + expected = [1, 3, 7, 15, 31, 63, 127, 255, 511, 1023] for i in range(len(flag.flags_in_order)): f.add(flag.flags_in_order[i]) self.assertEqual(f.to_number(), expected[i]) @@ -50,6 +50,7 @@ def test_to_long_str(self): '[ ] assembly_fail', '[ ] variants_suggest_collapsed_repeat', '[ ] hit_both_strands', + '[ ] has_nonsynonymous_variants', ]) self.assertEqual(expected, f.to_long_string()) From f884e8578fa98af461c008b53ec900fe8d9c7895 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Tue, 27 Oct 2015 14:43:57 +0000 Subject: [PATCH 2/6] Use new flag has_nonsynonymous_variants --- ariba/cluster.py | 22 +++++++++++++++++- ariba/tests/cluster_test.py | 45 ++++++++++++++++++++++++++++++++++++- 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/ariba/cluster.py b/ariba/cluster.py index 07a87b7d..f8023ca3 100644 --- a/ariba/cluster.py +++ b/ariba/cluster.py @@ -874,7 +874,10 @@ def _get_vcf_variant_counts(self): self.status_flag.add('variants_suggest_collapsed_repeat') - def _make_report_lines(self): + def _initial_make_report_lines(self): + '''Makes report lines. 
While they are being made, we discover if there were + any non-synonymous variants. This affects the flag, which also gets updated + by the function. To then fix the report lines, must run _update_flag_in_report_lines()''' self.report_lines = [] total_reads = self._get_read_counts() @@ -903,6 +906,9 @@ def _make_report_lines(self): t = self._get_variant_effect(variants) if t is not None: effect, new_bases = t + if effect != 'SYN': + self.status_flag.add('has_nonsynonymous_variants') + for v in variants: depths = self._get_assembly_read_depths(contig, v.qry_start) if depths is None: @@ -987,6 +993,20 @@ def _make_report_lines(self): self.report_lines.sort(key=itemgetter(0, 14, 15)) + + def _update_flag_in_report_lines(self): + '''This corrects the flag in all the report lines made by _initial_make_report_lines()''' + flag_column = 1 + if self.status_flag.has('has_nonsynonymous_variants'): + for line in self.report_lines: + line[flag_column] = self.status_flag.to_number() + + + def _make_report_lines(self): + self._initial_make_report_lines() + self._update_flag_in_report_lines() + + def _clean(self): if self.verbose: print('Cleaning', self.root_dir) diff --git a/ariba/tests/cluster_test.py b/ariba/tests/cluster_test.py index 3106b339..8c60b75e 100644 --- a/ariba/tests/cluster_test.py +++ b/ariba/tests/cluster_test.py @@ -730,7 +730,50 @@ def test_get_vcf_variant_counts(self): clean_cluster_dir(cluster_dir) - def test_make_report_lines(self): + def test_make_report_lines_nonsynonymous(self): + '''test _make_report_lines''' + cluster_dir = os.path.join(data_dir, 'cluster_test_generic') + clean_cluster_dir(cluster_dir) + c = cluster.Cluster(cluster_dir, 'cluster_name') + c.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA') + v1 = pymummer.variant.Variant(pymummer.snp.Snp('8\tA\tG\t8\tx\tx\t39\t39\tx\tx\tgene\tcontig')) + + nucmer_hit = ['1', '10', '1', '10', '10', '10', '90.00', '1000', '1000', '1', '1', 'gene', 'contig'] + c.nucmer_hits 
= {'contig': [pymummer.alignment.Alignment('\t'.join(nucmer_hit))]} + c.mummer_variants = {'contig': [[v1]]} + c.percent_identities = {'contig': 92.42} + c.status_flag.set_flag(42) + c.assembled_ok = True + c.final_assembly_read_depths = os.path.join(data_dir, 'cluster_test_make_report_lines.read_depths.gz') + c._make_report_lines() + expected = [[ + 'gene', + 554, + 2, + 'cluster_name', + 39, + 10, + 92.42, + 'SNP', + 'NONSYN', + 'E3G', + 8, + 8, + 'A', + 'contig', + 39, + 8, + 8, + 'G', + '.', + '.', + '.' + ]] + self.assertEqual(expected, c.report_lines) + clean_cluster_dir(cluster_dir) + + + def test_make_report_lines_synonymous(self): '''test _make_report_lines''' cluster_dir = os.path.join(data_dir, 'cluster_test_generic') clean_cluster_dir(cluster_dir) From 71c65b741f986b4592d961055a5533907c2ea634 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Tue, 27 Oct 2015 15:02:39 +0000 Subject: [PATCH 3/6] Add extra summary number (For presence/absence of nonsynonymous changes) --- ariba/summary.py | 11 +++++++---- ariba/tests/summary_test.py | 7 ++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/ariba/summary.py b/ariba/summary.py index 0262524c..0e34e584 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -122,10 +122,13 @@ def _to_summary_number(self, l): if f.has('hit_both_strands') or (not f.has('complete_orf')): return 1 - if f.has('unique_contig') and f.has('gene_assembled_into_one_contig'): - return 3 - - return 2 + if f.has('unique_contig') and f.has('gene_assembled_into_one_contig') and f.has('complete_orf'): + if f.has('has_nonsynonymous_variants'): + return 3 + else: + return 4 + else: + return 2 def _pc_id_of_longest(self, l): diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 32740d6a..1dac3936 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -75,7 +75,8 @@ def test_to_summary_number(self): (7, 1), (259, 1), (15, 2), - (27, 3), + (539, 3), + (27, 4), ] for t in tests: @@ 
-96,8 +97,8 @@ def test_gather_output_rows(self): s._gather_output_rows() expected = [ ['filename', 'gene1', 'gene2', 'gene3'], - [infiles[0], 3, 2, 0], - [infiles[1], 3, 0, 3], + [infiles[0], 4, 2, 0], + [infiles[1], 4, 0, 4], ] self.assertEqual(expected, s.rows_out) From 8232f676a90c62fbeef40ee32daef0590a1e0fcc Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Tue, 27 Oct 2015 15:17:06 +0000 Subject: [PATCH 4/6] Add option to not filter summary output --- ariba/summary.py | 5 +++++ ariba/tasks/summary.py | 2 ++ ariba/tests/summary_test.py | 19 ++++++++++++++++++- 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/ariba/summary.py b/ariba/summary.py index 0e34e584..935846b9 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -48,6 +48,7 @@ def __init__( outfile, filenames=None, fofn=None, + filter_output=True, min_id=90.0 ): if filenames is None and fofn is None: @@ -61,6 +62,7 @@ def __init__( if fofn is not None: self.filenames.extend(self._load_fofn(fofn)) + self.filter_output = filter_output self.min_id = min_id self.outfile = outfile @@ -168,6 +170,9 @@ def _gather_output_rows(self): def _filter_output_rows(self): + if not self.filter_output: + return + # remove rows that are all zeros self.rows_out = [x for x in self.rows_out if x[1:] != [0]*(len(x)-1)] diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py index 7dbde7d3..02cf4eda 100644 --- a/ariba/tasks/summary.py +++ b/ariba/tasks/summary.py @@ -8,6 +8,7 @@ def run(): epilog = 'Files must be listed after the output file and/or the option --fofn must be used. If both used, all files in the filename specified by --fofn AND the files listed after the output file will be used as input. The input report files must be in tsv format, not xls.') parser.add_argument('-f', '--fofn', help='File of filenames of ariba reports in tsv format (not xls) to be summarised. 
Must be used if no input files listed after the outfile.', metavar='FILENAME') parser.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT') + parser.add_argument('--no_filter', action='store_true', help='Do not filter rows or columns of output that are all 0 (by deafult, they are removed from the output)') parser.add_argument('outfile', help='Name of output file. If file ends with ".xls", then an excel spreadsheet is written. Otherwise a tsv file is written') parser.add_argument('infiles', nargs='*', help='Files to be summarised') options = parser.parse_args() @@ -18,6 +19,7 @@ def run(): options.outfile, fofn=options.fofn, filenames=options.infiles, + filter_output=(not options.no_filter), min_id=options.min_id ) s.run() diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 1dac3936..5e274162 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -1,4 +1,5 @@ import unittest +import copy import filecmp import os from ariba import summary, flag @@ -103,7 +104,7 @@ def test_gather_output_rows(self): self.assertEqual(expected, s.rows_out) - def test_filter_output_rows(self): + def test_filter_output_rows_filter_true(self): '''Test _filter_output_rows''' s = summary.Summary('out', filenames=['spam', 'eggs']) s.rows_out = [ @@ -123,6 +124,22 @@ def test_filter_output_rows(self): self.assertEqual(s.rows_out, expected) + def test_filter_output_rows_filter_false(self): + '''Test _filter_output_rows''' + s = summary.Summary('out', filenames=['spam', 'eggs'], filter_output=False) + rows_out = [ + ['filename', 'gene1', 'gene2', 'gene3'], + ['file1', 0, 0, 0], + ['file2', 1, 0, 3], + ['file3', 2, 0, 4], + ] + + s.rows_out = copy.copy(rows_out) + + s._filter_output_rows() + self.assertEqual(s.rows_out, rows_out) + + def test_write_tsv(self): '''Test _write_tsv''' tmp_out = 'tmp.out.tsv' From 63a42ac04fa98d991a906761a31f76e686f9ea65 Mon Sep 17 
00:00:00 2001 From: Martin Hunt Date: Tue, 27 Oct 2015 15:26:39 +0000 Subject: [PATCH 5/6] Version bumps --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 7d4c109c..39ab459f 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name='ariba', - version='0.5.0', + version='0.6.0', description='ARIBA: Antibiotic Resistance Identification By Assembly', packages = find_packages(), author='Martin Hunt', @@ -18,9 +18,9 @@ tests_require=['nose >= 1.3'], install_requires=[ 'openpyxl', - 'pyfastaq >= 3.0.1', + 'pyfastaq >= 3.10.0', 'pysam >= 0.8.1', - 'pymummer>=0.0.2' + 'pymummer>=0.6.1' ], license='GPLv3', classifiers=[ From a01c30344aba1e270aa2c2d7be22327e4aca8b54 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 28 Oct 2015 08:55:35 +0000 Subject: [PATCH 6/6] Version bump --- ariba/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/common.py b/ariba/common.py index c4798115..fcc298e9 100644 --- a/ariba/common.py +++ b/ariba/common.py @@ -1,7 +1,7 @@ import sys import subprocess -version = '0.5.0' +version = '0.6.0' def syscall(cmd, allow_fail=False, verbose=False): if verbose: