Skip to content

Commit

Permalink
Merge pull request #37 from martinghunt/always_run_refcheck
Browse files Browse the repository at this point in the history
Always run refcheck; Add JS candy output
  • Loading branch information
aslett1 committed Nov 25, 2015
2 parents dbc9f44 + 1bf542e commit ac94d94
Show file tree
Hide file tree
Showing 10 changed files with 211 additions and 3 deletions.
76 changes: 74 additions & 2 deletions ariba/summary.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import openpyxl
import pyfastaq
from ariba import flag
from ariba import flag, common

class Error (Exception): pass

Expand Down Expand Up @@ -49,6 +49,7 @@ def __init__(
filenames=None,
fofn=None,
filter_output=True,
js_candy_prefix=None,
min_id=90.0
):
if filenames is None and fofn is None:
Expand All @@ -65,6 +66,7 @@ def __init__(
self.filter_output = filter_output
self.min_id = min_id
self.outfile = outfile
self.js_candy_prefix = js_candy_prefix


def _load_fofn(self, fofn):
Expand Down Expand Up @@ -195,6 +197,16 @@ def _write_tsv(self):
pyfastaq.utils.close(f)


def _write_js_candy_csv(self, outfile):
f = pyfastaq.utils.open_file_write(outfile)
# js candy needs the "name" column.
# Names must match those used in the tree file
print('name', *self.rows_out[0][1:], sep=',', file=f)
for row in self.rows_out[1:]:
print(*row, sep=',', file=f)
pyfastaq.utils.close(f)


def _write_xls(self):
workbook = openpyxl.Workbook()
worksheet = workbook.worksheets[0]
Expand All @@ -204,6 +216,65 @@ def _write_xls(self):
workbook.save(self.outfile)


@staticmethod
def _distance_score_between_values(value1, value2):
if value1 != value2 and 0 in [value1, value2]:
return 1
else:
return 0


@classmethod
def _distance_score_between_lists(cls, scores1, scores2):
assert len(scores1) == len(scores2)
return sum([cls._distance_score_between_values(scores1[i], scores2[i]) for i in range(1, len(scores1))])


def _write_distance_matrix(self, outfile):
if len(self.rows_out) < 3:
raise Error('Cannot calculate distance matrix to make tree for js_candy.\n' +
'Only one sample present.')

if len(self.rows_out[0]) < 2:
raise Error('Cannot calculate distance matrix to make tree for js_candy.\n' +
'No genes present in output')

with open(outfile, 'w') as f:
sample_names = [x[0] for x in self.rows_out]
print(*sample_names[1:], sep='\t', file=f)

for i in range(1,len(self.rows_out)):
scores = []
for j in range(2, len(self.rows_out)):
scores.append(self._distance_score_between_lists(self.rows_out[i], self.rows_out[j]))
print(self.rows_out[i][0], *scores, sep='\t', file=f)


@staticmethod
def _newick_from_dist_matrix(distance_file, outfile):
r_script = outfile + '.tmp.R'

with open(r_script, 'w') as f:
print('library(ape)', file=f)
print('a=read.table("', distance_file, '", header=TRUE, row.names=1)', sep='', file=f)
print('h=hclust(dist(a))', file=f)
print('write.tree(as.phylo(h), file="', outfile, '")', sep='', file=f)

common.syscall('R CMD BATCH --no-save ' + r_script)
os.unlink(r_script + 'out')
os.unlink(r_script)


def _write_js_candy_files(self, outprefix):
distance_file = outprefix + '.distance_matrix'
tree_file = outprefix + '.tre'
csv_file = outprefix + '.csv'
self._write_distance_matrix(distance_file)
self._newick_from_dist_matrix(distance_file, tree_file)
os.unlink(distance_file)
self._write_js_candy_csv(csv_file)


def run(self):
self._check_files_exist()
self._gather_output_rows()
Expand All @@ -213,4 +284,5 @@ def run(self):
else:
self._write_tsv()


if self.js_candy_prefix is not None:
self._write_js_candy_files(self.js_candy_prefix)
23 changes: 23 additions & 0 deletions ariba/tasks/run.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
import sys
import pyfastaq
import ariba

Expand Down Expand Up @@ -29,6 +30,10 @@ def run():
assembly_group.add_argument('--assembler_k', type=int, help='kmer size to use with assembler. You can use 0 to set kmer to 2/3 of the read length. Warning - lower kmers are usually better. [%(default)s]', metavar='INT', default=21)
assembly_group.add_argument('--spades_other', help='Put options string to be used with spades in quotes. This will NOT be sanity checked. Do not use -k or -t: for these options you should use the ariba run options --assembler_k and --threads [%(default)s]', default="--only-assembler", metavar="OPTIONS")

refcheck_group = parser.add_argument_group('refcheck options')
refcheck_group.add_argument('--refcheck_min_length', type=int, help='Minimum allowed length in nucleotides of reference gene [%(default)s]', metavar='INT', default=6)
refcheck_group.add_argument('--refcheck_max_length', type=int, help='Maximum allowed length in nucleotides of reference gene [%(default)s]', metavar='INT', default=10000)

other_group = parser.add_argument_group('Other options')
other_group.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
other_group.add_argument('--threads', type=int, help='Number of threads for bowtie2 and spades [%(default)s]', default=1, metavar='INT')
Expand Down Expand Up @@ -58,6 +63,24 @@ def run():
ariba.external_progs.check_versions(options, verbose=options.verbose, not_required=set(['sspace', 'gapfiller']))
pyfastaq.sequences.genetic_code = options.genetic_code


checker = ariba.refcheck.Checker(
options.db_fasta,
min_length=options.refcheck_min_length,
max_length=options.refcheck_max_length,
)
ok, reason, seq = checker.run()

if not ok:
print('\nInput reference file of genes failed refcheck! Cannot continue.', file=sys.stderr)
print('The first failed sequence is:\n', file=sys.stderr)
print(seq, file=sys.stderr)
print('\nIt failed for the reason:', reason, file=sys.stderr)
print('\nTo make a new fasta file called new_genes.fa, with the bad genes fixed/removed run:\n', file=sys.stderr)
print(' ariba refcheck -o new_genes', options.db_fasta, file=sys.stderr)
sys.exit(1)


c = ariba.clusters.Clusters(
options.db_fasta,
options.reads_1,
Expand Down
4 changes: 3 additions & 1 deletion ariba/tasks/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ def run():
usage = 'ariba summary [options] <outfile> [report1.tsv report2.tsv ...]',
epilog = 'Files must be listed after the output file and/or the option --fofn must be used. If both used, all files in the filename specified by --fofn AND the files listed after the output file will be used as input. The input report files must be in tsv format, not xls.')
parser.add_argument('-f', '--fofn', help='File of filenames of ariba reports in tsv format (not xls) to be summarised. Must be used if no input files listed after the outfile.', metavar='FILENAME')
parser.add_argument('--js_candy', help='Write files that can be used as input to JS Candy, named with the given prefix. Writes a tree and csv file, which can be drag and dropped straight into JS Candy', metavar='files_prefix')
parser.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT')
parser.add_argument('--no_filter', action='store_true', help='Do not filter rows or columns of output that are all 0 (by deafult, they are removed from the output)')
parser.add_argument('outfile', help='Name of output file. If file ends with ".xls", then an excel spreadsheet is written. Otherwise a tsv file is written')
Expand All @@ -20,6 +21,7 @@ def run():
fofn=options.fofn,
filenames=options.infiles,
filter_output=(not options.no_filter),
min_id=options.min_id
min_id=options.min_id,
js_candy_prefix=options.js_candy
)
s.run()
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
file1 file2 file3
file1 3 2
file2 0 1
file3 1 0
1 change: 1 addition & 0 deletions ariba/tests/data/summary_test_newick_from_dist_matrix.tre
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
(file1:1.58113883,(file2:0.7071067812,file3:0.7071067812):0.8740320489);
4 changes: 4 additions & 0 deletions ariba/tests/data/summary_test_write_distance_matrix.distances
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
file1 file2 file3
file1 3 2
file2 0 1
file3 1 0
3 changes: 3 additions & 0 deletions ariba/tests/data/summary_test_write_js_candy_csv.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
name,gene1,gene3
file1,1,3
file2,2,4
4 changes: 4 additions & 0 deletions ariba/tests/data/summary_test_write_js_candy_files.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
name,gene1,gene2,gene3
file1,0,1,0
file2,1,0,3
file3,0,0,4
1 change: 1 addition & 0 deletions ariba/tests/data/summary_test_write_js_candy_files.tre
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
(file1:1.58113883,(file2:0.7071067812,file3:0.7071067812):0.8740320489);
94 changes: 94 additions & 0 deletions ariba/tests/summary_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,3 +153,97 @@ def test_write_tsv(self):
expected = os.path.join(data_dir, 'summary_test_write_tsv.out.tsv')
self.assertTrue(filecmp.cmp(tmp_out, expected, shallow=False))
os.unlink(tmp_out)


def test_write_js_candy_csv(self):
'''Test _write_js_candy_csv'''
tmp_out = 'tmp.test_write_js_candy.csv'
s = summary.Summary(tmp_out, filenames=['spam', 'eggs'])
s.rows_out = [
['filename', 'gene1', 'gene3'],
['file1', 1, 3],
['file2', 2, 4],
]
s._write_js_candy_csv(tmp_out)
expected = os.path.join(data_dir, 'summary_test_write_js_candy_csv.csv')
self.assertTrue(filecmp.cmp(expected, tmp_out, shallow=False))
os.unlink(tmp_out)


def test_distance_score_bewteen_values(self):
'''Test _distance_score_bewteen_values'''
tests = [
((0, 0), 0),
((0, 1), 1),
((0, 2), 1),
((1, 0), 1),
((1, 1), 0),
((1, 2), 0),
((2, 0), 1),
((2, 1), 0),
((2, 2), 0),
]

for (val1, val2), expected in tests:
self.assertEqual(expected, summary.Summary._distance_score_between_values(val1, val2))

# check distance calculation is commutative
for val1 in range(5):
for val2 in range(5):
d1 = summary.Summary._distance_score_between_values(val1, val2)
d2 = summary.Summary._distance_score_between_values(val2, val1)
self.assertEqual(d1, d2)


def test_distance_score_between_lists(self):
'''Test _distance_score_between_lists'''
list1 = ['na', 0, 0, 0, 1, 1, 1, 2, 2, 2]
list2 = ['na', 0, 1, 2, 0, 1, 2, 1, 2, 2]
self.assertEqual(3, summary.Summary._distance_score_between_lists(list1, list2))


def test_write_distance_matrix(self):
'''Test _write_distance_matrix'''
s = summary.Summary('out', filenames=['spam', 'eggs'])
s.rows_out = [
['filename', 'gene1', 'gene2', 'gene3'],
['file1', 0, 1, 0],
['file2', 1, 0, 3],
['file3', 0, 0, 4],
]

tmp_distances = 'tmp.test.write_distance_matrix.distances'
s._write_distance_matrix(tmp_distances)
expected = os.path.join(data_dir, 'summary_test_write_distance_matrix.distances')
self.assertTrue(filecmp.cmp(expected, tmp_distances, shallow=False))
os.unlink(tmp_distances)


def test_newick_from_dist_matrix(self):
'''Test _newick_from_dist_matrix'''
tmp_tree = 'tmp.test.newick_from_dist_matrix.tre'
dist_file = os.path.join(data_dir, 'summary_test_newick_from_dist_matrix.distances')
summary.Summary._newick_from_dist_matrix(dist_file, tmp_tree)
expected = os.path.join(data_dir, 'summary_test_newick_from_dist_matrix.tre')
self.assertTrue(filecmp.cmp(expected, tmp_tree, shallow=False))
os.unlink(tmp_tree)


def test_write_js_candy_files(self):
'''Test _write_js_candy_files'''
tmp_prefix = 'tmp.test.write_js_candy_files'
s = summary.Summary('out', filenames=['spam', 'eggs'])
s.rows_out = [
['filename', 'gene1', 'gene2', 'gene3'],
['file1', 0, 1, 0],
['file2', 1, 0, 3],
['file3', 0, 0, 4],
]
s._write_js_candy_files(tmp_prefix)
expected_csv = os.path.join(data_dir, 'summary_test_write_js_candy_files.csv')
expected_tre = os.path.join(data_dir, 'summary_test_write_js_candy_files.tre')
self.assertTrue(filecmp.cmp(expected_csv, tmp_prefix + '.csv', shallow=False))
self.assertTrue(filecmp.cmp(expected_tre, tmp_prefix + '.tre', shallow=False))
os.unlink(tmp_prefix + '.csv')
os.unlink(tmp_prefix + '.tre')

0 comments on commit ac94d94

Please sign in to comment.