From 573c524698fdfb3f90ec8657d2a40d0f20c203b1 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Mon, 23 Mar 2015 15:52:15 +0000 Subject: [PATCH 1/7] Add method has --- ariba/flag.py | 7 +++++++ ariba/tests/flag_test.py | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/ariba/flag.py b/ariba/flag.py index cea7fc1c..06b0da1b 100644 --- a/ariba/flag.py +++ b/ariba/flag.py @@ -39,6 +39,9 @@ def to_number(self): n += flag_bits[f] return n + def __eq__(self, other): + return type(other) is type(self) and self.__dict__ == other.__dict__ + def __str__(self): return str(self.to_number()) @@ -51,3 +54,7 @@ def to_long_string(self): lines.append('[' + x_or_not + '] ' + f) return '\n'.join(lines) + + def has(self, s): + return self.flags[s] + diff --git a/ariba/tests/flag_test.py b/ariba/tests/flag_test.py index c2801a2c..e70c4374 100644 --- a/ariba/tests/flag_test.py +++ b/ariba/tests/flag_test.py @@ -53,3 +53,11 @@ def test_to_long_str(self): self.assertEqual(expected, f.to_long_string()) + + def test_has(self): + '''Test has''' + for x in flag.flags_in_order: + f = flag.Flag(0) + self.assertFalse(f.has(x)) + f.add(x) + self.assertTrue(f.has(x)) From 0c9e76a6e4bbf8e54e1c15611189a22ca3ca3583 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Mon, 23 Mar 2015 15:52:36 +0000 Subject: [PATCH 2/7] Add module summary --- ariba/__init__.py | 1 + ariba/summary.py | 200 ++++++++++++++++++ .../summary_test_gather_output_rows.in.1.tsv | 4 + .../summary_test_gather_output_rows.in.2.tsv | 3 + ariba/tests/data/summary_test_init.fofn | 2 + .../tests/data/summary_test_load_file.in.tsv | 5 + .../tests/data/summary_test_write_tsv.out.tsv | 3 + ariba/tests/summary_test.py | 132 ++++++++++++ 8 files changed, 350 insertions(+) create mode 100644 ariba/summary.py create mode 100644 ariba/tests/data/summary_test_gather_output_rows.in.1.tsv create mode 100644 ariba/tests/data/summary_test_gather_output_rows.in.2.tsv create mode 100644 ariba/tests/data/summary_test_init.fofn create mode 100644 ariba/tests/data/summary_test_load_file.in.tsv create mode 100644 ariba/tests/data/summary_test_write_tsv.out.tsv create mode 100644 ariba/tests/summary_test.py diff --git a/ariba/__init__.py b/ariba/__init__.py index 1e9dee73..4446c002 100644 --- a/ariba/__init__.py +++ b/ariba/__init__.py @@ -12,6 +12,7 @@ 'mapping', 'refcheck', 'scaffold_graph', + 'summary', 'tasks', ] diff --git a/ariba/summary.py b/ariba/summary.py new file mode 100644 index 00000000..d284f72f --- /dev/null +++ b/ariba/summary.py @@ -0,0 +1,200 @@ +import openpyxl +import pyfastaq +from ariba import flag + +class Error (Exception): pass + +columns = [ + 'gene', + 'flag', + 'cluster', + 'gene_len', + 'assembled', + 'pc_ident', + 'var_type', + 'var_effect', + 'new_aa', + 'gene_start', + 'gene_end', + 'gene_nt', + 'scaffold', + 'scaff_len', + 'scaff_start', + 'scaff_end', + 'scaff_nt', +] + +int_columns = [ + 'gene_len', + 'assembled', + 'gene_start', + 'gene_end', + 'scaff_len', + 'scaff_start', + 'scaff_end', +] + + +class Summary: + def __init__( + self, + outfile, + filenames=None, + fofn=None, + min_id=90.0 + ): + if filenames is None and fofn is None: + raise Error('Error! Must supply filenames or fofn to Summary(). Cannot continue') + + if filenames is None: + self.filenames = [] + else: + self.filenames = filenames + + if fofn is not None: + self.filenames.extend(self._load_fofn(fofn)) + + self.min_id = min_id + self.outfile = outfile + + + def _load_fofn(self, fofn): + f = pyfastaq.utils.open_file_read(fofn) + filenames = [x.rstrip() for x in f.readlines()] + pyfastaq.utils.close(f) + return filenames + + + def _check_files_exist(self): + for fname in self.filenames: + if not os.path.exists(fname): + raise Error('File not found: "' + fname + '". Cannot continue') + + + def _line2dict(self, line): + data = line.rstrip().split('\t') + d = {columns[i]: data[i] for i in range(len(data))} + d['flag'] = flag.Flag(int(d['flag']) ) + for key in int_columns: + try: + d[key] = int(d[key]) + except: + assert d[key] == '.' + try: + d['pc_ident'] = float(d['pc_ident']) + except: + assert d['pc_ident'] == '.' + return d + + + def _load_file(self, filename): + f = pyfastaq.utils.open_file_read(filename) + d = {} + + for line in f: + if line.startswith('#'): + if line.rstrip()[1:].split('\t') != columns: + raise Error('Error parsing the following line.\n' + line) + continue + data = self._line2dict(line) + + if data['gene'] not in d: + d[data['gene']] = [] + + d[data['gene']].append(data) + + pyfastaq.utils.close(f) + return d + + + def _to_summary_number(self, l): + f = l[0]['flag'] + if f.has('assembly_fail') or not f.has('gene_assembled') or self._pc_id_of_longest(l) <= self.min_id: + return 0 + + if not f.has('complete_orf'): + return 1 + + if f.has('unique_contig') and f.has('gene_assembled_into_one_contig'): + return 3 + + return 2 + + + def _pc_id_of_longest(self, l): + longest = 0 + identity = None + for data in l: + if data['assembled'] > longest: + longest = data['assembled'] + identity = data['pc_ident'] + + assert identity is not None + return identity + + + + def _gather_output_rows(self): + self.data = {filename: self._load_file(filename) for filename in self.filenames} + + all_genes = set() + for l in self.data.values(): + all_genes.update(set(l.keys())) + all_genes = list(all_genes) + all_genes.sort() + + self.rows_out = [] + self.rows_out.append(['filename'] + all_genes) + + for filename in self.filenames: + new_row = [filename] + for gene in all_genes: + if gene not in self.data[filename]: + new_row.append(0) + else: + new_row.append(self._to_summary_number(self.data[filename][gene])) + + self.rows_out.append(new_row) + + + def _filter_output_rows(self): + # remove rows that are all zeros + self.rows_out = [x for x in self.rows_out if x[1:] != [0]*(len(x)-1)] + + # remove columns that are all zeros + to_remove = [] + for i in range(1, len(self.rows_out[0])): + if sum([x[i] for x in self.rows_out[1:]]) == 0: + to_remove.append(i) + + for i in range(len(self.rows_out)): + self.rows_out[i] = [self.rows_out[i][j] for j in range(len(self.rows_out[i])) if j not in to_remove] + + + + def _write_tsv(self): + f = pyfastaq.utils.open_file_write(self.outfile) + for row in self.rows_out: + print('\t'.join([str(x) for x in row]), file=f) + pyfastaq.utils.close(f) + + + def _write_xls(self): + workbook = openpyxl.Workbook() + worksheet = workbook.worksheets[0] + worksheet.title = 'ARIBA_summary' + for row in self.rows_out: + worksheet.append(row) + workbook.save(self.outfile) + + + def run(self): + self._check_files_exist() + self._gather_output_rows() + self._filter_output_rows() + if self.outfile.endswith('.xls'): + self._write_xls() + else: + self._write_tsv() + + diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv new file mode 100644 index 00000000..6ed46946 --- /dev/null +++ b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv @@ -0,0 +1,4 @@ +#gene flag cluster gene_len assembled pc_ident var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt +gene1 27 1 822 822 100.0 . . . . . . gene1.scaffold.1 1490 . . . +gene2 15 2 780 780 100.0 . . . . . . gene2.scaffold.2 1124 . . . +gene2 15 2 780 770 99.0 . . . . . . gene2.scaffold.3 1097 . . . diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv new file mode 100644 index 00000000..61c260cc --- /dev/null +++ b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv @@ -0,0 +1,3 @@ +#gene flag cluster gene_len assembled pc_ident var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt +gene1 27 1 822 822 100.0 . . . . . . gene1.scaffold.1 1490 . . . +gene3 27 3 750 750 98.93 . . . . . . gene3.scaffold.1 1047 . . . diff --git a/ariba/tests/data/summary_test_init.fofn b/ariba/tests/data/summary_test_init.fofn new file mode 100644 index 00000000..89361a0a --- /dev/null +++ b/ariba/tests/data/summary_test_init.fofn @@ -0,0 +1,2 @@ +file1 +file2 diff --git a/ariba/tests/data/summary_test_load_file.in.tsv b/ariba/tests/data/summary_test_load_file.in.tsv new file mode 100644 index 00000000..e81496ce --- /dev/null +++ b/ariba/tests/data/summary_test_load_file.in.tsv @@ -0,0 +1,5 @@ +#gene flag cluster gene_len assembled pc_ident var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt +gene1 27 1 822 822 100.0 . . . . . . gene1.scaffold.1 1490 . . . +gene2 15 2 780 780 100.0 . . . . . . gene2.scaffold.2 1124 . . . +gene2 15 2 780 770 99.0 . . . . . . gene2.scaffold.3 1097 . . . +gene3 187 3 750 750 98.93 SNP SYN . 318 318 C gene3.scaffold.1 1047 319 319 G diff --git a/ariba/tests/data/summary_test_write_tsv.out.tsv b/ariba/tests/data/summary_test_write_tsv.out.tsv new file mode 100644 index 00000000..4c81011f --- /dev/null +++ b/ariba/tests/data/summary_test_write_tsv.out.tsv @@ -0,0 +1,3 @@ +filename gene1 gene3 +file2 1 3 +file3 2 4 diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py new file mode 100644 index 00000000..3394eee8 --- /dev/null +++ b/ariba/tests/summary_test.py @@ -0,0 +1,132 @@ +import unittest +import filecmp +import os +from ariba import summary, flag + +modules_dir = os.path.dirname(os.path.abspath(summary.__file__)) +data_dir = os.path.join(modules_dir, 'tests', 'data') + +class TestSummry(unittest.TestCase): + def test_init(self): + '''Test init''' + fofn = os.path.join(data_dir, 'summary_test_init.fofn') + s = summary.Summary('out', fofn=fofn) + self.assertEqual(s.filenames, ['file1', 'file2']) + s = summary.Summary('out', filenames=['file42']) + self.assertEqual(s.filenames, ['file42']) + s = summary.Summary('out', fofn=fofn, filenames=['file42']) + self.assertEqual(s.filenames, ['file42', 'file1', 'file2']) + + + + def test_line2dict(self): + '''Test _line2dict''' + line = '\t'.join(['gene1', '187', '3', '750', '750', '98.93', 'SNP', 'SYN', '.', '66', '66', 'A', 'gene1.scaffold.1', '1047', '67', '67', 'C']) + s = summary.Summary('out', filenames=['spam', 'eggs']) + expected = { + 'gene': 'gene1', + 'flag': flag.Flag(187), + 'cluster': '3', + 'gene_len': 750, + 'assembled': 750, + 'pc_ident': 98.93, + 'var_type': 'SNP', + 'var_effect': 'SYN', + 'new_aa': '.', + 'gene_start': 66, + 'gene_end': 66, + 'gene_nt': 'A', + 'scaffold': 'gene1.scaffold.1', + 'scaff_len': 1047, + 'scaff_start': 67, + 'scaff_end': 67, + 'scaff_nt': 'C' + } + self.assertEqual(s._line2dict(line), expected) + + + def test_load_file(self): + '''Test _load_file''' + s = summary.Summary('out', filenames=['spam', 'eggs']) + infile = os.path.join(data_dir, 'summary_test_load_file.in.tsv') + + lines = [ + ['gene1', '27', '1', '822', '822', '100.0', '.', '.', '.', '.', '.', '.', 'gene1.scaffold.1', '1490', '.', '.', '.'], + ['gene2', '15', '2', '780', '780', '100.0', '.', '.', '.', '.', '.', '.', 'gene2.scaffold.2', '1124', '.', '.', '.'], + ['gene2', '15', '2', '780', '770', '99.0', '.', '.', '.', '.', '.', '.', 'gene2.scaffold.3', '1097', '.', '.', '.'], + ['gene3', '187', '3', '750', '750', '98.93', 'SNP', 'SYN', '.', '318', '318', 'C', 'gene3.scaffold.1', '1047', '319', '319', 'G'] +] + dicts = [s._line2dict('\t'.join(x)) for x in lines] + expected = {'gene1': [dicts[0]], 'gene2': dicts[1:3], 'gene3': [dicts[3]]} + got = s._load_file(infile) + self.assertEqual(expected, got) + + + def test_to_summary_number(self): + '''Test _to_summary_number''' + s = summary.Summary('out', filenames=['spam', 'eggs']) + tests = [ + (0, 0), + (64, 0), + (7, 1), + (15, 2), + (27, 3), + ] + + for t in tests: + l = [{'flag': flag.Flag(t[0]), 'assembled': 42, 'pc_ident': 99}] + self.assertEqual(s._to_summary_number(l), t[1]) + + l = [{'flag': flag.Flag(27), 'assembled': 42, 'pc_ident': 89}] + self.assertEqual(s._to_summary_number(l), 0) + + + def test_gather_output_rows(self): + '''Test _gather_output_rows''' + infiles = [ + os.path.join(data_dir, 'summary_test_gather_output_rows.in.1.tsv'), + os.path.join(data_dir, 'summary_test_gather_output_rows.in.2.tsv') + ] + s = summary.Summary('out', filenames=infiles) + s._gather_output_rows() + expected = [ + ['filename', 'gene1', 'gene2', 'gene3'], + [infiles[0], 3, 2, 0], + [infiles[1], 3, 0, 3], + ] + self.assertEqual(expected, s.rows_out) + + + def test_filter_output_rows(self): + '''Test _filter_output_rows''' + s = summary.Summary('out', filenames=['spam', 'eggs']) + s.rows_out = [ + ['filename', 'gene1', 'gene2', 'gene3'], + ['file1', 0, 0, 0], + ['file2', 1, 0, 3], + ['file3', 2, 0, 4], + ] + + expected = [ + ['filename', 'gene1', 'gene3'], + ['file2', 1, 3], + ['file3', 2, 4], + ] + + s._filter_output_rows() + self.assertEqual(s.rows_out, expected) + + + def test_write_tsv(self): + '''Test _write_tsv''' + tmp_out = 'tmp.out.tsv' + s = summary.Summary(tmp_out, filenames=['spam', 'eggs']) + s.rows_out = [ + ['filename', 'gene1', 'gene3'], + ['file2', 1, 3], + ['file3', 2, 4], + ] + s._write_tsv() + expected = os.path.join(data_dir, 'summary_test_write_tsv.out.tsv') + self.assertTrue(filecmp.cmp(tmp_out, expected, shallow=False)) + os.unlink(tmp_out) From fe772e8fa7b4125c15299bc55bac0bb2901dd714 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Mon, 23 Mar 2015 16:15:42 +0000 Subject: [PATCH 3/7] Fix check files exist --- ariba/summary.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ariba/summary.py b/ariba/summary.py index d284f72f..2a23db91 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -1,3 +1,4 @@ +import os import openpyxl import pyfastaq from ariba import flag From 51023d9ed0eaa541b1cf67cb5c521e8497e01747 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Mon, 23 Mar 2015 16:16:20 +0000 Subject: [PATCH 4/7] Add hash to first line of output --- ariba/tasks/summary.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 ariba/tasks/summary.py diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py new file mode 100644 index 00000000..271dd473 --- /dev/null +++ b/ariba/tasks/summary.py @@ -0,0 +1,23 @@ +import argparse +import ariba + +def run(): + parser = argparse.ArgumentParser( + description = 'Make a summry of ARIBA report files', + usage = 'ariba summary [options] [infiles]', + epilog = 'Files must be listed after the output file and/or the option --fofn must be used. If both used, all files in the filename specified by --fofn AND the files listed after the output file will be used as input') + parser.add_argument('-f', '--fofn', help='File of filenames of ariba reports to be summarised. Must b used if no input files listed after the outfile', metavar='FILENAME') + parser.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT') + parser.add_argument('outfile', help='Name of output file. If file ends with ".xls", then an excel spreadsheet is written. Otherwise a tsv file is written') + parser.add_argument('infiles', nargs='*', help='Files to be summarised') + options = parser.parse_args() + if len(options.infiles) == 0: + options.infiles = None + + s = ariba.summary.Summary( + options.outfile, + fofn=options.fofn, + filenames=options.infiles, + min_id=options.min_id + ) + s.run() From 6251600d603da3d34e96071b1c46f89f9d4271e1 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Mon, 23 Mar 2015 16:18:00 +0000 Subject: [PATCH 5/7] Add hash to first line of output --- ariba/summary.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ariba/summary.py b/ariba/summary.py index 2a23db91..c15b935f 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -175,6 +175,7 @@ def _filter_output_rows(self): def _write_tsv(self): f = pyfastaq.utils.open_file_write(self.outfile) + print('#', end='', file=f) for row in self.rows_out: print('\t'.join([str(x) for x in row]), file=f) pyfastaq.utils.close(f) From e857d0f17adfb8d7d7fb5f7f0c882fae56a5cdd3 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Mon, 23 Mar 2015 16:19:24 +0000 Subject: [PATCH 6/7] Add summary task --- scripts/ariba | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/ariba b/scripts/ariba index 8655bd4c..7a4debea 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -7,6 +7,7 @@ import sys tasks = { 'refcheck': 'Check or fix input genes FASTA', 'run': 'Run the ARIBA local assembly pipeline', + 'summary': 'Summarise multiple reports made by "run"', 'flag': 'Translate the meaning of a flag output by the pipeline', 'version': 'Print version and exit', } @@ -15,6 +16,7 @@ tasks = { ordered_tasks = [ 'refcheck', 'run', + 'summary', 'flag', 'version', ] From 56b5a4bbb35fa7fa79393f7d6e7f24c66b4aeca5 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Mon, 23 Mar 2015 16:20:22 +0000 Subject: [PATCH 7/7] Add hash to first line of output --- ariba/tests/data/summary_test_write_tsv.out.tsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/tests/data/summary_test_write_tsv.out.tsv b/ariba/tests/data/summary_test_write_tsv.out.tsv index 4c81011f..4bb564a9 100644 --- a/ariba/tests/data/summary_test_write_tsv.out.tsv +++ b/ariba/tests/data/summary_test_write_tsv.out.tsv @@ -1,3 +1,3 @@ -filename gene1 gene3 +#filename gene1 gene3 file2 1 3 file3 2 4