-
Notifications
You must be signed in to change notification settings - Fork 56
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #15 from martinghunt/add_summary_task
Add summary task
- Loading branch information
Showing
12 changed files
with
392 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ | |
'mapping', | ||
'refcheck', | ||
'scaffold_graph', | ||
'summary', | ||
'tasks', | ||
] | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,202 @@ | ||
import os | ||
import openpyxl | ||
import pyfastaq | ||
from ariba import flag | ||
|
||
class Error(Exception):
    """Exception raised when this module cannot continue (bad arguments or input)."""
|
||
# Column names of a report file, in file order.  The header line of every
# input file is compared against this list, so it must remain a list and the
# order must not change.
columns = [
    # gene-level information
    'gene', 'flag', 'cluster', 'gene_len', 'assembled', 'pc_ident',
    # variant information
    'var_type', 'var_effect', 'new_aa',
    # position in the gene
    'gene_start', 'gene_end', 'gene_nt',
    # position in the scaffold
    'scaffold', 'scaff_len', 'scaff_start', 'scaff_end', 'scaff_nt',
]
|
||
# Columns whose values are converted to int when a report line is parsed
# (a '.' placeholder in one of these columns is left as it is).
int_columns = [
    'gene_len', 'assembled',
    'gene_start', 'gene_end',
    'scaff_len', 'scaff_start', 'scaff_end',
]
|
||
|
||
class Summary:
    """Summarise several report files into one table of genes versus files.

    Each input file contributes one output row and each gene one output
    column.  A cell holds a summary number (see ``_to_summary_number``).
    Rows and columns made only of zeros are dropped before writing.
    """

    def __init__(
        self,
        outfile,
        filenames=None,
        fofn=None,
        min_id=90.0
    ):
        """Store the settings and collect the list of input files.

        outfile:   name of the output file; a name ending in '.xls' selects
                   the spreadsheet writer, anything else the tsv writer.
        filenames: optional list of report filenames.
        fofn:      optional file of filenames, one per line; its contents
                   are appended after ``filenames``.
        min_id:    percent identity cutoff used by ``_to_summary_number``.

        Raises Error if neither ``filenames`` nor ``fofn`` is given.
        """
        if filenames is None and fofn is None:
            raise Error('Error! Must supply filenames or fofn to Summary(). Cannot continue')

        # Take a copy: extending with the fofn contents below must not
        # modify the list object owned by the caller.
        self.filenames = [] if filenames is None else list(filenames)

        if fofn is not None:
            self.filenames.extend(self._load_fofn(fofn))

        self.min_id = min_id
        self.outfile = outfile

    def _load_fofn(self, fofn):
        """Return the filenames listed in ``fofn``, one per line."""
        f = pyfastaq.utils.open_file_read(fofn)
        try:
            # Blank lines (for example a trailing empty line) are skipped so
            # that they are not later reported as a missing file named "".
            return [x.rstrip() for x in f if x.strip()]
        finally:
            pyfastaq.utils.close(f)

    def _check_files_exist(self):
        """Raise Error for the first input file that does not exist."""
        for fname in self.filenames:
            if not os.path.exists(fname):
                raise Error('File not found: "' + fname + '". Cannot continue')

    def _line2dict(self, line):
        """Convert one tab-delimited report line into a dict keyed by column name.

        The flag becomes a ``flag.Flag``, the columns in ``int_columns``
        become ints and ``pc_ident`` becomes a float.  A '.' placeholder in a
        numeric column is kept unchanged.

        Raises Error if the line has too many fields, lacks a required field,
        or holds a non-numeric value other than '.' in a numeric column.
        """
        fields = line.rstrip().split('\t')
        if len(fields) > len(columns):
            raise Error('Error parsing the following line.\n' + line)

        d = dict(zip(columns, fields))

        try:
            flag_number = int(d['flag'])
            for key in int_columns:
                if d[key] != '.':
                    d[key] = int(d[key])
            if d['pc_ident'] != '.':
                d['pc_ident'] = float(d['pc_ident'])
        except (KeyError, ValueError) as err:
            raise Error('Error parsing the following line.\n' + line) from err

        d['flag'] = flag.Flag(flag_number)
        return d

    def _load_file(self, filename):
        """Load a report file into a dict of gene name -> list of line dicts.

        Lines starting with '#' must match the expected column header,
        otherwise Error is raised.
        """
        d = {}
        f = pyfastaq.utils.open_file_read(filename)
        try:
            for line in f:
                if line.startswith('#'):
                    if line.rstrip()[1:].split('\t') != columns:
                        raise Error('Error parsing the following line.\n' + line)
                    continue

                data = self._line2dict(line)
                d.setdefault(data['gene'], []).append(data)
        finally:
            # Close the handle even when a line fails to parse
            pyfastaq.utils.close(f)

        return d

    def _to_summary_number(self, l):
        """Return the summary number (0-3) for the report lines ``l`` of one gene.

        Only the flag of the first line is consulted.

        0: assembly failed, gene not assembled, or the identity of the
           longest match is not above ``self.min_id``
        1: assembled, but without a complete ORF
        3: complete ORF, unique contig, and assembled into one contig
        2: complete ORF, otherwise
        """
        f = l[0]['flag']
        if f.has('assembly_fail') or not f.has('gene_assembled') or self._pc_id_of_longest(l) <= self.min_id:
            return 0

        if not f.has('complete_orf'):
            return 1

        if f.has('unique_contig') and f.has('gene_assembled_into_one_contig'):
            return 3

        return 2

    def _pc_id_of_longest(self, l):
        """Return ``pc_ident`` of the line in ``l`` with the largest ``assembled`` value.

        Ties are won by the earliest line.  Lines whose ``assembled`` value
        is the '.' placeholder are ignored, because they cannot be compared
        with a number.

        Raises Error if no line has a positive ``assembled`` value.
        """
        longest = 0
        identity = None
        for data in l:
            assembled = data['assembled']
            if assembled == '.':
                continue
            if assembled > longest:
                longest = assembled
                identity = data['pc_ident']

        if identity is None:
            raise Error('Error getting percent identity of longest match. Cannot continue')
        return identity

    def _gather_output_rows(self):
        """Load every input file and build ``self.rows_out``.

        The first row is the header ('filename' followed by all gene names,
        sorted); then one row per input file, in the order of
        ``self.filenames``.  A gene absent from a file gets 0.
        """
        self.data = {filename: self._load_file(filename) for filename in self.filenames}

        # Iterating a dict yields its keys, so this is the union of gene names
        all_genes = sorted(set().union(*self.data.values()))

        self.rows_out = [['filename'] + all_genes]

        for filename in self.filenames:
            genes = self.data[filename]
            self.rows_out.append(
                [filename]
                + [self._to_summary_number(genes[gene]) if gene in genes else 0 for gene in all_genes]
            )

    def _filter_output_rows(self):
        """Remove data rows and gene columns of ``self.rows_out`` that are all zeros.

        The header row and the filename column are always kept, also when
        no gene columns or no data rows remain.
        """
        if not self.rows_out:
            return

        header = self.rows_out[0]

        # remove rows that are all zeros
        body = [row for row in self.rows_out[1:] if any(x != 0 for x in row[1:])]

        # remove columns that are all zeros (column 0 is the filename)
        keep = [0] + [i for i in range(1, len(header)) if any(row[i] != 0 for row in body)]

        self.rows_out = [[row[i] for i in keep] for row in [header] + body]

    def _write_tsv(self):
        """Write ``self.rows_out`` to ``self.outfile`` as tab-delimited text.

        The output starts with '#', so the header row becomes a comment line.
        """
        f = pyfastaq.utils.open_file_write(self.outfile)
        try:
            print('#', end='', file=f)
            for row in self.rows_out:
                print('\t'.join([str(x) for x in row]), file=f)
        finally:
            pyfastaq.utils.close(f)

    def _write_xls(self):
        """Write ``self.rows_out`` to ``self.outfile`` as a one-sheet workbook."""
        # NOTE(review): openpyxl writes the xlsx (Office Open XML) format;
        # confirm that saving it under a '.xls' name is what is wanted.
        workbook = openpyxl.Workbook()
        worksheet = workbook.worksheets[0]
        worksheet.title = 'ARIBA_summary'
        for row in self.rows_out:
            worksheet.append(row)
        workbook.save(self.outfile)

    def run(self):
        """Check the inputs, build and filter the table, then write the output file."""
        self._check_files_exist()
        self._gather_output_rows()
        self._filter_output_rows()
        if self.outfile.endswith('.xls'):
            self._write_xls()
        else:
            self._write_tsv()
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
import argparse | ||
import ariba | ||
|
||
def run():
    """Parse the command line of ``ariba summary`` and run the summary task.

    The positional arguments are the output file followed by zero or more
    report files; --fofn names a file listing further report files.
    """
    parser = argparse.ArgumentParser(
        description = 'Make a summary of ARIBA report files',
        usage = 'ariba summary [options] <outfile> [infiles]',
        epilog = 'Files must be listed after the output file and/or the option --fofn must be used. If both used, all files in the filename specified by --fofn AND the files listed after the output file will be used as input')
    parser.add_argument('-f', '--fofn', help='File of filenames of ariba reports to be summarised. Must be used if no input files listed after the outfile', metavar='FILENAME')
    parser.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT')
    parser.add_argument('outfile', help='Name of output file. If file ends with ".xls", then an excel spreadsheet is written. Otherwise a tsv file is written')
    parser.add_argument('infiles', nargs='*', help='Files to be summarised')
    options = parser.parse_args()

    s = ariba.summary.Summary(
        options.outfile,
        fofn=options.fofn,
        # An empty list of positional files is passed on as None, so that
        # Summary can tell "no files given" apart from a list of files.
        filenames=options.infiles or None,
        min_id=options.min_id
    )
    s.run()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#gene flag cluster gene_len assembled pc_ident var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt | ||
gene1 27 1 822 822 100.0 . . . . . . gene1.scaffold.1 1490 . . . | ||
gene2 15 2 780 780 100.0 . . . . . . gene2.scaffold.2 1124 . . . | ||
gene2 15 2 780 770 99.0 . . . . . . gene2.scaffold.3 1097 . . . |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
#gene flag cluster gene_len assembled pc_ident var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt | ||
gene1 27 1 822 822 100.0 . . . . . . gene1.scaffold.1 1490 . . . | ||
gene3 27 3 750 750 98.93 . . . . . . gene3.scaffold.1 1047 . . . |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
file1 | ||
file2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#gene flag cluster gene_len assembled pc_ident var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt | ||
gene1 27 1 822 822 100.0 . . . . . . gene1.scaffold.1 1490 . . . | ||
gene2 15 2 780 780 100.0 . . . . . . gene2.scaffold.2 1124 . . . | ||
gene2 15 2 780 770 99.0 . . . . . . gene2.scaffold.3 1097 . . . | ||
gene3 187 3 750 750 98.93 SNP SYN . 318 318 C gene3.scaffold.1 1047 319 319 G |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
#filename gene1 gene3 | ||
file2 1 3 | ||
file3 2 4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.