Skip to content

Commit

Permalink
Merge pull request #15 from martinghunt/add_summary_task
Browse files Browse the repository at this point in the history
Add summary task
  • Loading branch information
martinghunt committed Mar 23, 2015
2 parents 07cf284 + 56b5a4b commit aadce69
Show file tree
Hide file tree
Showing 12 changed files with 392 additions and 0 deletions.
1 change: 1 addition & 0 deletions ariba/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
'mapping',
'refcheck',
'scaffold_graph',
'summary',
'tasks',
]

Expand Down
7 changes: 7 additions & 0 deletions ariba/flag.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ def to_number(self):
n += flag_bits[f]
return n

def __eq__(self, other):
    '''Two objects are equal when they have exactly the same type and the same instance attributes.'''
    if type(other) is not type(self):
        return False
    return self.__dict__ == other.__dict__


def __str__(self):
    '''Return the value of to_number() converted to a string.'''
    number = self.to_number()
    return str(number)
Expand All @@ -51,3 +54,7 @@ def to_long_string(self):
lines.append('[' + x_or_not + '] ' + f)
return '\n'.join(lines)


def has(self, s):
    '''Return the entry stored in self.flags under the flag name s.

    An unknown name is not handled here, so the lookup error from
    self.flags propagates to the caller.
    '''
    state = self.flags[s]
    return state

202 changes: 202 additions & 0 deletions ariba/summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
import os
import openpyxl
import pyfastaq
from ariba import flag

class Error(Exception):
    '''Exception raised for problems detected by this module.'''

# Names of the tab-separated fields of a report line, in file order.
# The header line of each report (without its leading '#') must match
# this list exactly.
columns = [
    'gene',
    'flag',
    'cluster',
    'gene_len',
    'assembled',
    'pc_ident',
    'var_type',
    'var_effect',
    'new_aa',
    'gene_start',
    'gene_end',
    'gene_nt',
    'scaffold',
    'scaff_len',
    'scaff_start',
    'scaff_end',
    'scaff_nt',
]

# Subset of the fields above that are converted to int when a line is
# parsed; a value of '.' is left as the string '.'.
int_columns = [
    'gene_len',
    'assembled',
    'gene_start',
    'gene_end',
    'scaff_len',
    'scaff_start',
    'scaff_end',
]


class Summary:
    '''Summarise a set of report files as a single table.

    The table has a header row ('filename' followed by the gene names) and
    one row per input file. Each cell is the number returned by
    _to_summary_number() for that gene in that file, or 0 when the gene is
    absent from the file. Data rows and gene columns that are entirely zero
    are dropped before the table is written.
    '''

    def __init__(
        self,
        outfile,
        filenames=None,
        fofn=None,
        min_id=90.0
    ):
        '''Store the settings and build the list of input files.

        outfile:   name of the output file; a name ending in ".xls" selects
                   the spreadsheet writer, anything else the tsv writer.
        filenames: optional list of report filenames.
        fofn:      optional file of filenames, one per line. Its contents
                   are appended after those in filenames.
        min_id:    percent identity cutoff used by _to_summary_number().

        Raises Error if neither filenames nor fofn is given.
        '''
        if filenames is None and fofn is None:
            raise Error('Error! Must supply filenames or fofn to Summary(). Cannot continue')

        # Take a copy: the list is extended below and must not change the
        # list object that the caller passed in.
        self.filenames = [] if filenames is None else list(filenames)

        if fofn is not None:
            self.filenames.extend(self._load_fofn(fofn))

        self.min_id = min_id
        self.outfile = outfile


    def _load_fofn(self, fofn):
        '''Return the list of filenames in the file fofn (one per line, blank lines ignored).'''
        f = pyfastaq.utils.open_file_read(fofn)
        try:
            # A blank line would otherwise become an empty filename
            filenames = [x.rstrip() for x in f if x.strip()]
        finally:
            pyfastaq.utils.close(f)
        return filenames


    def _check_files_exist(self):
        '''Raise Error if any of the input files does not exist.'''
        for fname in self.filenames:
            if not os.path.exists(fname):
                raise Error('File not found: "' + fname + '". Cannot continue')


    @staticmethod
    def _number_or_dot(value, converter, line):
        '''Return converter(value), or value itself when it is the placeholder "."

        Raises Error (quoting line) if value is neither "." nor convertible.
        '''
        if value == '.':
            return value

        try:
            return converter(value)
        except (TypeError, ValueError):
            raise Error('Error parsing the following line.\n' + line)


    def _line2dict(self, line):
        '''Convert one tab-separated report line into a dictionary keyed by column name.

        The flag field becomes a flag.Flag, the fields in int_columns become
        ints and pc_ident becomes a float; any of the numeric fields may
        instead be the placeholder "." which is kept as a string.

        Raises Error if the line has too many fields or a field cannot be
        converted.
        '''
        data = line.rstrip().split('\t')
        if len(data) > len(columns):
            raise Error('Error parsing the following line.\n' + line)

        d = dict(zip(columns, data))

        try:
            d['flag'] = flag.Flag(int(d['flag']))
        except (KeyError, ValueError):
            raise Error('Error parsing the following line.\n' + line)

        # d.get() so that a truncated line is reported as an Error
        # instead of a KeyError
        for key in int_columns:
            d[key] = self._number_or_dot(d.get(key), int, line)

        d['pc_ident'] = self._number_or_dot(d.get('pc_ident'), float, line)
        return d


    def _load_file(self, filename):
        '''Load a report file.

        Returns a dictionary of gene name -> list of dictionaries made by
        _line2dict(), one per line of that gene, in file order.

        Raises Error if a line starting with "#" does not match the
        expected column names.
        '''
        f = pyfastaq.utils.open_file_read(filename)
        d = {}

        try:
            for line in f:
                if not line.strip():
                    continue

                if line.startswith('#'):
                    if line.rstrip()[1:].split('\t') != columns:
                        raise Error('Error parsing the following line.\n' + line)
                    continue

                data = self._line2dict(line)
                d.setdefault(data['gene'], []).append(data)
        finally:
            pyfastaq.utils.close(f)

        return d


    def _to_summary_number(self, l):
        '''Convert the list of lines of one gene into a summary number.

        Only the flag of the first line is used. Returns:
          0  flag has assembly_fail, or lacks gene_assembled, or the percent
             identity of the longest match is not above self.min_id
          1  otherwise, if the flag lacks complete_orf
          3  otherwise, if the flag has both unique_contig and
             gene_assembled_into_one_contig
          2  in all other cases
        '''
        f = l[0]['flag']
        if f.has('assembly_fail') or not f.has('gene_assembled'):
            return 0

        pc_id = self._pc_id_of_longest(l)
        # pc_ident can be the placeholder '.', which cannot be compared with
        # a number. No identity means the cutoff is not met.
        if not isinstance(pc_id, (int, float)) or pc_id <= self.min_id:
            return 0

        if not f.has('complete_orf'):
            return 1

        if f.has('unique_contig') and f.has('gene_assembled_into_one_contig'):
            return 3

        return 2


    def _pc_id_of_longest(self, l):
        '''Return the pc_ident of the line with the largest "assembled" value.

        The first line wins a tie. Lines whose "assembled" value is not a
        number (the "." placeholder) are ignored.

        Raises Error if no line has an assembled length above zero.
        '''
        longest = 0
        identity = None
        for data in l:
            assembled = data['assembled']
            if isinstance(assembled, int) and assembled > longest:
                longest = assembled
                identity = data['pc_ident']

        if identity is None:
            raise Error('Error! No assembled length found when getting percent identity. Cannot continue')

        return identity


    def _gather_output_rows(self):
        '''Load every input file into self.data and build self.rows_out.

        self.rows_out gets a header row followed by one row per input file,
        with the genes in sorted order.
        '''
        self.data = {filename: self._load_file(filename) for filename in self.filenames}

        all_genes = set()
        for genes in self.data.values():
            all_genes.update(genes)
        all_genes = sorted(all_genes)

        self.rows_out = [['filename'] + all_genes]

        for filename in self.filenames:
            file_data = self.data[filename]
            new_row = [filename]
            for gene in all_genes:
                if gene in file_data:
                    new_row.append(self._to_summary_number(file_data[gene]))
                else:
                    new_row.append(0)

            self.rows_out.append(new_row)


    def _filter_output_rows(self):
        '''Remove from self.rows_out the data rows and gene columns that are all zeros.

        The header row and the filename column are always kept.
        '''
        if not self.rows_out:
            return

        header = self.rows_out[0]

        # remove rows that are all zeros. The header is handled separately:
        # with no gene columns it would look like an all-zero row.
        body = [x for x in self.rows_out[1:] if any(value != 0 for value in x[1:])]

        # remove columns that are all zeros
        to_keep = [0]
        for i in range(1, len(header)):
            if any(x[i] != 0 for x in body):
                to_keep.append(i)

        self.rows_out = [[row[i] for i in to_keep] for row in [header] + body]


    def _write_tsv(self):
        '''Write self.rows_out to self.outfile, tab-separated, with "#" before the header row.'''
        f = pyfastaq.utils.open_file_write(self.outfile)
        try:
            print('#', end='', file=f)
            for row in self.rows_out:
                print('\t'.join([str(x) for x in row]), file=f)
        finally:
            pyfastaq.utils.close(f)


    def _write_xls(self):
        '''Write self.rows_out to self.outfile as a one-sheet workbook called ARIBA_summary.'''
        # NOTE(review): openpyxl presumably writes the xlsx format whatever
        # the file is called - confirm that the ".xls" name opens as expected.
        workbook = openpyxl.Workbook()
        worksheet = workbook.worksheets[0]
        worksheet.title = 'ARIBA_summary'
        for row in self.rows_out:
            worksheet.append(row)
        workbook.save(self.outfile)


    def run(self):
        '''Check the input files, make the summary table and write it to self.outfile.'''
        self._check_files_exist()
        self._gather_output_rows()
        self._filter_output_rows()
        if self.outfile.endswith('.xls'):
            self._write_xls()
        else:
            self._write_tsv()


23 changes: 23 additions & 0 deletions ariba/tasks/summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import argparse
import ariba

def run():
    '''Entry point of the "ariba summary" task.

    Parses the command line, then makes an ariba.summary.Summary from the
    output filename, the optional file of filenames and any input files
    listed on the command line, and runs it.
    '''
    parser = argparse.ArgumentParser(
        description = 'Make a summary of ARIBA report files',
        usage = 'ariba summary [options] <outfile> [infiles]',
        epilog = 'Files must be listed after the output file and/or the option --fofn must be used. If both used, all files in the filename specified by --fofn AND the files listed after the output file will be used as input')
    parser.add_argument('-f', '--fofn', help='File of filenames of ariba reports to be summarised. Must be used if no input files listed after the outfile', metavar='FILENAME')
    # Float default, the same as the default of Summary(): argparse does not
    # pass a non-string default through type=float.
    parser.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90.0, metavar='FLOAT')
    parser.add_argument('outfile', help='Name of output file. If file ends with ".xls", then an excel spreadsheet is written. Otherwise a tsv file is written')
    parser.add_argument('infiles', nargs='*', help='Files to be summarised')
    options = parser.parse_args()

    # Summary() uses None, not an empty list, to mean "no files given"
    if not options.infiles:
        options.infiles = None

    s = ariba.summary.Summary(
        options.outfile,
        fofn=options.fofn,
        filenames=options.infiles,
        min_id=options.min_id
    )
    s.run()
4 changes: 4 additions & 0 deletions ariba/tests/data/summary_test_gather_output_rows.in.1.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#gene flag cluster gene_len assembled pc_ident var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt
gene1 27 1 822 822 100.0 . . . . . . gene1.scaffold.1 1490 . . .
gene2 15 2 780 780 100.0 . . . . . . gene2.scaffold.2 1124 . . .
gene2 15 2 780 770 99.0 . . . . . . gene2.scaffold.3 1097 . . .
3 changes: 3 additions & 0 deletions ariba/tests/data/summary_test_gather_output_rows.in.2.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#gene flag cluster gene_len assembled pc_ident var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt
gene1 27 1 822 822 100.0 . . . . . . gene1.scaffold.1 1490 . . .
gene3 27 3 750 750 98.93 . . . . . . gene3.scaffold.1 1047 . . .
2 changes: 2 additions & 0 deletions ariba/tests/data/summary_test_init.fofn
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
file1
file2
5 changes: 5 additions & 0 deletions ariba/tests/data/summary_test_load_file.in.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#gene flag cluster gene_len assembled pc_ident var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt
gene1 27 1 822 822 100.0 . . . . . . gene1.scaffold.1 1490 . . .
gene2 15 2 780 780 100.0 . . . . . . gene2.scaffold.2 1124 . . .
gene2 15 2 780 770 99.0 . . . . . . gene2.scaffold.3 1097 . . .
gene3 187 3 750 750 98.93 SNP SYN . 318 318 C gene3.scaffold.1 1047 319 319 G
3 changes: 3 additions & 0 deletions ariba/tests/data/summary_test_write_tsv.out.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#filename gene1 gene3
file2 1 3
file3 2 4
8 changes: 8 additions & 0 deletions ariba/tests/flag_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,11 @@ def test_to_long_str(self):

self.assertEqual(expected, f.to_long_string())


def test_has(self):
    '''has() should be false for each flag of a new Flag(0) and true once that flag is added'''
    for flag_name in flag.flags_in_order:
        flag_obj = flag.Flag(0)
        self.assertFalse(flag_obj.has(flag_name))
        flag_obj.add(flag_name)
        self.assertTrue(flag_obj.has(flag_name))
Loading

0 comments on commit aadce69

Please sign in to comment.