Merge pull request #15 from martinghunt/add_summary_task

Add summary task
sanger-pathogens · Mar 23, 2015 · aadce69 · aadce69
2 parents 07cf284 + 56b5a4b
commit aadce69
Show file tree

Hide file tree

Showing 12 changed files with 392 additions and 0 deletions.
diff --git a/ariba/__init__.py b/ariba/__init__.py
@@ -12,6 +12,7 @@
     'mapping',
     'refcheck',
     'scaffold_graph',
+    'summary',
     'tasks',
 ]
 

diff --git a/ariba/flag.py b/ariba/flag.py
@@ -39,6 +39,9 @@ def to_number(self):
                 n += flag_bits[f]
         return n
 
+    def __eq__(self, other):
+        '''Return True if other has the same type and identical attributes.
+
+        Returns NotImplemented for an object of a different type, so that
+        Python can try the reflected comparison before it decides that the
+        two objects are unequal.
+        '''
+        if type(other) is not type(self):
+            return NotImplemented
+        return self.__dict__ == other.__dict__
+
 
     def __str__(self):
         return str(self.to_number())
@@ -51,3 +54,7 @@ def to_long_string(self):
             lines.append('[' + x_or_not + '] ' + f)
         return '\n'.join(lines)
 
+
+    def has(self, s):
+        '''Return the value stored in self.flags for the flag named s.
+
+        Raises KeyError if s is not a key of self.flags.
+        '''
+        is_set = self.flags[s]
+        return is_set
+
diff --git a/ariba/summary.py b/ariba/summary.py
@@ -0,0 +1,202 @@
+import os
+import openpyxl
+import pyfastaq
+from ariba import flag
+
+class Error(Exception):
+    '''Exception raised by this module for missing or unparsable input.'''
+
+
+# Names of the tab-separated columns of a report file, in file order.
+# The '#' header line of every report file is compared against this list.
+columns = [
+    'gene',
+    'flag',
+    'cluster',
+    'gene_len',
+    'assembled',
+    'pc_ident',
+    'var_type',
+    'var_effect',
+    'new_aa',
+    'gene_start',
+    'gene_end',
+    'gene_nt',
+    'scaffold',
+    'scaff_len',
+    'scaff_start',
+    'scaff_end',
+    'scaff_nt',
+]
+
+# Subset of columns whose values are converted to int when a report line is
+# parsed. A '.' in one of these columns is left as the string '.'.
+int_columns = [
+    'gene_len',
+    'assembled',
+    'gene_start',
+    'gene_end',
+    'scaff_len',
+    'scaff_start',
+    'scaff_end',
+]
+
+
+class Summary:
+    '''Summarise report files into one table of per-gene scores.
+
+    The table has a header row followed by one row per input file, and one
+    column per gene. Each cell holds the number from _to_summary_number(),
+    or 0 when the gene is absent from that file. Rows and gene columns that
+    are entirely zero are removed before the table is written.
+    '''
+
+    def __init__(
+      self,
+      outfile,
+      filenames=None,
+      fofn=None,
+      min_id=90.0
+    ):
+        '''Store the settings and build the list of input files.
+
+        outfile   -- name of the file written by run(). A name ending in
+                     ".xls" gets a spreadsheet, anything else a tsv file
+        filenames -- optional list of report filenames
+        fofn      -- optional file of filenames, one per line. These are
+                     added after any names given in filenames
+        min_id    -- percent identity that must be exceeded for a gene to
+                     get a score above 0
+
+        Raises Error if neither filenames nor fofn is given.
+        '''
+        if filenames is None and fofn is None:
+            raise Error('Error! Must supply filenames or fofn to Summary(). Cannot continue')
+
+        # Copy, so that extending with the fofn contents below does not
+        # modify the list object that belongs to the caller.
+        self.filenames = [] if filenames is None else list(filenames)
+
+        if fofn is not None:
+            self.filenames.extend(self._load_fofn(fofn))
+
+        self.min_id = min_id
+        self.outfile = outfile
+
+
+    def _load_fofn(self, fofn):
+        '''Return the list of filenames found one per line in the file fofn.
+
+        Trailing whitespace is stripped and blank lines are skipped.
+        '''
+        f = pyfastaq.utils.open_file_read(fofn)
+        try:
+            stripped = (line.rstrip() for line in f)
+            return [name for name in stripped if name]
+        finally:
+            pyfastaq.utils.close(f)
+
+
+    def _check_files_exist(self):
+        '''Raise Error for the first input file that does not exist.'''
+        for fname in self.filenames:
+            if not os.path.exists(fname):
+                raise Error('File not found: "' + fname + '". Cannot continue')
+
+
+    def _convert_field(self, d, key, converter, line):
+        '''Replace d[key] with converter(d[key]) unless it is the "." placeholder.
+
+        Raises Error, quoting line, if the value is neither "." nor
+        something that converter accepts.
+        '''
+        if d[key] == '.':
+            return
+        try:
+            d[key] = converter(d[key])
+        except ValueError:
+            raise Error('Error parsing column "' + key + '" in the following line.\n' + line)
+
+
+    def _line2dict(self, line):
+        '''Return one report line as a dictionary of column name -> value.
+
+        The flag column becomes a flag.Flag object, the columns listed in
+        int_columns become ints and pc_ident becomes a float. A "." in a
+        numeric column is kept as the string ".".
+
+        Raises Error if the line has the wrong number of columns or a
+        numeric column holds something other than a number or ".".
+        '''
+        data = line.rstrip().split('\t')
+        if len(data) != len(columns):
+            raise Error('Error parsing the following line. Expected ' + str(len(columns)) + ' columns but got ' + str(len(data)) + '.\n' + line)
+
+        d = dict(zip(columns, data))
+        d['flag'] = flag.Flag(int(d['flag']))
+        for key in int_columns:
+            self._convert_field(d, key, int, line)
+        self._convert_field(d, 'pc_ident', float, line)
+        return d
+
+
+    def _load_file(self, filename):
+        '''Return a dictionary of gene name -> list of parsed lines of filename.
+
+        Lines starting with "#" must match the expected column names and
+        are otherwise ignored. Blank lines are skipped.
+
+        Raises Error if a "#" line does not match the expected columns.
+        '''
+        f = pyfastaq.utils.open_file_read(filename)
+        d = {}
+
+        try:
+            for line in f:
+                if line.startswith('#'):
+                    if line.rstrip()[1:].split('\t') != columns:
+                        raise Error('Error parsing the following line.\n' + line)
+                    continue
+
+                if not line.strip():
+                    continue
+
+                data = self._line2dict(line)
+                d.setdefault(data['gene'], []).append(data)
+        finally:
+            pyfastaq.utils.close(f)
+
+        return d
+
+
+    def _to_summary_number(self, l):
+        '''Return the score, from 0 to 3, for the list l of lines of one gene.
+
+        Only the flag of the first line in l is looked at.
+
+        0 -- assembly_fail is set, or gene_assembled is not set, or the
+             percent identity of the longest match is <= self.min_id
+        1 -- complete_orf is not set
+        3 -- unique_contig and gene_assembled_into_one_contig are both set
+        2 -- anything else
+        '''
+        f = l[0]['flag']
+        if f.has('assembly_fail') or not f.has('gene_assembled') or self._pc_id_of_longest(l) <= self.min_id:
+            return 0
+
+        if not f.has('complete_orf'):
+            return 1
+
+        if f.has('unique_contig') and f.has('gene_assembled_into_one_contig'):
+            return 3
+
+        return 2
+
+
+    def _pc_id_of_longest(self, l):
+        '''Return pc_ident of the line in l with the largest "assembled" value.
+
+        If several lines tie for the largest value, the first one wins.
+        Raises Error if no line has an "assembled" value greater than zero.
+        '''
+        longest = 0
+        identity = None
+        for data in l:
+            if data['assembled'] > longest:
+                longest = data['assembled']
+                identity = data['pc_ident']
+
+        if identity is None:
+            raise Error('Error getting percent identity of longest match. No match with assembled length greater than zero')
+        return identity
+
+
+
+    def _gather_output_rows(self):
+        '''Load every input file and build the unfiltered output table.
+
+        Sets self.data to a dictionary of filename -> result of
+        _load_file(), and self.rows_out to a header row followed by one
+        row per input file. Gene columns are in sorted order.
+        '''
+        self.data = {filename: self._load_file(filename) for filename in self.filenames}
+
+        all_genes = sorted(set().union(*self.data.values()))
+
+        self.rows_out = [['filename'] + all_genes]
+
+        for filename in self.filenames:
+            genes = self.data[filename]
+            scores = [self._to_summary_number(genes[gene]) if gene in genes else 0 for gene in all_genes]
+            self.rows_out.append([filename] + scores)
+
+
+    def _filter_output_rows(self):
+        '''Remove all-zero rows and all-zero gene columns from self.rows_out.
+
+        The header row and the filename column are always kept, including
+        when there are no gene columns or no data rows left.
+        '''
+        if not self.rows_out:
+            return
+
+        header = self.rows_out[0]
+
+        # remove rows that are all zeros
+        rows = [row for row in self.rows_out[1:] if any(x != 0 for x in row[1:])]
+
+        # remove columns that are all zeros. Column 0 is the filename.
+        keep = [0] + [i for i in range(1, len(header)) if any(row[i] != 0 for row in rows)]
+
+        self.rows_out = [[row[i] for i in keep] for row in [header] + rows]
+
+
+
+    def _write_tsv(self):
+        '''Write self.rows_out to self.outfile, tab-separated, starting with "#".'''
+        f = pyfastaq.utils.open_file_write(self.outfile)
+        try:
+            print('#', end='', file=f)
+            for row in self.rows_out:
+                print('\t'.join([str(x) for x in row]), file=f)
+        finally:
+            pyfastaq.utils.close(f)
+
+
+    def _write_xls(self):
+        '''Write self.rows_out to self.outfile as an openpyxl workbook.'''
+        workbook = openpyxl.Workbook()
+        worksheet = workbook.worksheets[0]
+        worksheet.title = 'ARIBA_summary'
+        for row in self.rows_out:
+            worksheet.append(row)
+        workbook.save(self.outfile)
+
+
+    def run(self):
+        '''Check the input files, build and filter the table, then write it.
+
+        A spreadsheet is written if self.outfile ends with ".xls",
+        otherwise a tsv file. Raises Error if an input file is missing or
+        cannot be parsed.
+        '''
+        self._check_files_exist()
+        self._gather_output_rows()
+        self._filter_output_rows()
+        if self.outfile.endswith('.xls'):
+            self._write_xls()
+        else:
+            self._write_tsv()
+
+
diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py
@@ -0,0 +1,23 @@
+import argparse
+import ariba
+
+def run():
+    '''Parse the "ariba summary" command line and write the summary file.
+
+    Input files can be listed after the output file, given in a file of
+    filenames with --fofn, or both.
+    '''
+    parser = argparse.ArgumentParser(
+        description = 'Make a summary of ARIBA report files',
+        usage = 'ariba summary [options] <outfile> [infiles]',
+        epilog = 'Files must be listed after the output file and/or the option --fofn must be used. If both used, all files in the filename specified by --fofn AND the files listed after the output file will be used as input')
+    parser.add_argument('-f', '--fofn', help='File of filenames of ariba reports to be summarised. Must be used if no input files listed after the outfile', metavar='FILENAME')
+    parser.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT')
+    parser.add_argument('outfile', help='Name of output file. If file ends with ".xls", then an excel spreadsheet is written. Otherwise a tsv file is written')
+    parser.add_argument('infiles', nargs='*', help='Files to be summarised')
+    options = parser.parse_args()
+
+    # Summary() is given None, not an empty list, when no input files
+    # were listed on the command line.
+    if not options.infiles:
+        options.infiles = None
+
+    s = ariba.summary.Summary(
+        options.outfile,
+        fofn=options.fofn,
+        filenames=options.infiles,
+        min_id=options.min_id
+    )
+    s.run()
diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv
@@ -0,0 +1,4 @@
+#gene	flag	cluster	gene_len	assembled	pc_ident	var_type	var_effect	new_aa	gene_start	gene_end	gene_nt	scaffold	scaff_len	scaff_start	scaff_end	scaff_nt
+gene1	27	1	822	822	100.0	.	.	.	.	.	.	gene1.scaffold.1	1490	.	.	.
+gene2	15	2	780	780	100.0	.	.	.	.	.	.	gene2.scaffold.2	1124	.	.	.
+gene2	15	2	780	770	99.0	.	.	.	.	.	.	gene2.scaffold.3	1097	.	.	.
diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv
@@ -0,0 +1,3 @@
+#gene	flag	cluster	gene_len	assembled	pc_ident	var_type	var_effect	new_aa	gene_start	gene_end	gene_nt	scaffold	scaff_len	scaff_start	scaff_end	scaff_nt
+gene1	27	1	822	822	100.0	.	.	.	.	.	.	gene1.scaffold.1	1490	.	.	.
+gene3	27	3	750	750	98.93	.	.	.	.	.	.	gene3.scaffold.1	1047	.	.	.
diff --git a/ariba/tests/data/summary_test_init.fofn b/ariba/tests/data/summary_test_init.fofn
@@ -0,0 +1,2 @@
+file1
+file2
diff --git a/ariba/tests/data/summary_test_load_file.in.tsv b/ariba/tests/data/summary_test_load_file.in.tsv
@@ -0,0 +1,5 @@
+#gene	flag	cluster	gene_len	assembled	pc_ident	var_type	var_effect	new_aa	gene_start	gene_end	gene_nt	scaffold	scaff_len	scaff_start	scaff_end	scaff_nt
+gene1	27	1	822	822	100.0	.	.	.	.	.	.	gene1.scaffold.1	1490	.	.	.
+gene2	15	2	780	780	100.0	.	.	.	.	.	.	gene2.scaffold.2	1124	.	.	.
+gene2	15	2	780	770	99.0	.	.	.	.	.	.	gene2.scaffold.3	1097	.	.	.
+gene3	187	3	750	750	98.93	SNP	SYN	.	318	318	C	gene3.scaffold.1	1047	319	319	G
diff --git a/ariba/tests/data/summary_test_write_tsv.out.tsv b/ariba/tests/data/summary_test_write_tsv.out.tsv
@@ -0,0 +1,3 @@
+#filename	gene1	gene3
+file2	1	3
+file3	2	4
diff --git a/ariba/tests/flag_test.py b/ariba/tests/flag_test.py
@@ -53,3 +53,11 @@ def test_to_long_str(self):
 
         self.assertEqual(expected, f.to_long_string())
 
+
+    def test_has(self):
+        '''has() is False on a new Flag(0) and True once the flag is added'''
+        for flag_name in flag.flags_in_order:
+            f = flag.Flag(0)
+            # nothing is set on a Flag made from 0
+            self.assertFalse(f.has(flag_name))
+            f.add(flag_name)
+            self.assertTrue(f.has(flag_name))