Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Always run refcheck; Add JS candy output #37

Merged
merged 6 commits into from
Nov 25, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 74 additions & 2 deletions ariba/summary.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import openpyxl
import pyfastaq
from ariba import flag
from ariba import flag, common

class Error (Exception): pass

Expand Down Expand Up @@ -49,6 +49,7 @@ def __init__(
filenames=None,
fofn=None,
filter_output=True,
js_candy_prefix=None,
min_id=90.0
):
if filenames is None and fofn is None:
Expand All @@ -65,6 +66,7 @@ def __init__(
self.filter_output = filter_output
self.min_id = min_id
self.outfile = outfile
self.js_candy_prefix = js_candy_prefix


def _load_fofn(self, fofn):
Expand Down Expand Up @@ -195,6 +197,16 @@ def _write_tsv(self):
pyfastaq.utils.close(f)


def _write_js_candy_csv(self, outfile):
f = pyfastaq.utils.open_file_write(outfile)
# js candy needs the "name" column.
# Names must match those used in the tree file
print('name', *self.rows_out[0][1:], sep=',', file=f)
for row in self.rows_out[1:]:
print(*row, sep=',', file=f)
pyfastaq.utils.close(f)


def _write_xls(self):
workbook = openpyxl.Workbook()
worksheet = workbook.worksheets[0]
Expand All @@ -204,6 +216,65 @@ def _write_xls(self):
workbook.save(self.outfile)


@staticmethod
def _distance_score_between_values(value1, value2):
if value1 != value2 and 0 in [value1, value2]:
return 1
else:
return 0


@classmethod
def _distance_score_between_lists(cls, scores1, scores2):
assert len(scores1) == len(scores2)
return sum([cls._distance_score_between_values(scores1[i], scores2[i]) for i in range(1, len(scores1))])


def _write_distance_matrix(self, outfile):
if len(self.rows_out) < 3:
raise Error('Cannot calculate distance matrix to make tree for js_candy.\n' +
'Only one sample present.')

if len(self.rows_out[0]) < 2:
raise Error('Cannot calculate distance matrix to make tree for js_candy.\n' +
'No genes present in output')

with open(outfile, 'w') as f:
sample_names = [x[0] for x in self.rows_out]
print(*sample_names[1:], sep='\t', file=f)

for i in range(1,len(self.rows_out)):
scores = []
for j in range(2, len(self.rows_out)):
scores.append(self._distance_score_between_lists(self.rows_out[i], self.rows_out[j]))
print(self.rows_out[i][0], *scores, sep='\t', file=f)


@staticmethod
def _newick_from_dist_matrix(distance_file, outfile):
r_script = outfile + '.tmp.R'

with open(r_script, 'w') as f:
print('library(ape)', file=f)
print('a=read.table("', distance_file, '", header=TRUE, row.names=1)', sep='', file=f)
print('h=hclust(dist(a))', file=f)
print('write.tree(as.phylo(h), file="', outfile, '")', sep='', file=f)

common.syscall('R CMD BATCH --no-save ' + r_script)
os.unlink(r_script + 'out')
os.unlink(r_script)


def _write_js_candy_files(self, outprefix):
distance_file = outprefix + '.distance_matrix'
tree_file = outprefix + '.tre'
csv_file = outprefix + '.csv'
self._write_distance_matrix(distance_file)
self._newick_from_dist_matrix(distance_file, tree_file)
os.unlink(distance_file)
self._write_js_candy_csv(csv_file)


def run(self):
self._check_files_exist()
self._gather_output_rows()
Expand All @@ -213,4 +284,5 @@ def run(self):
else:
self._write_tsv()


if self.js_candy_prefix is not None:
self._write_js_candy_files(self.js_candy_prefix)
23 changes: 23 additions & 0 deletions ariba/tasks/run.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
import sys
import pyfastaq
import ariba

Expand Down Expand Up @@ -29,6 +30,10 @@ def run():
assembly_group.add_argument('--assembler_k', type=int, help='kmer size to use with assembler. You can use 0 to set kmer to 2/3 of the read length. Warning - lower kmers are usually better. [%(default)s]', metavar='INT', default=21)
assembly_group.add_argument('--spades_other', help='Put options string to be used with spades in quotes. This will NOT be sanity checked. Do not use -k or -t: for these options you should use the ariba run options --assembler_k and --threads [%(default)s]', default="--only-assembler", metavar="OPTIONS")

refcheck_group = parser.add_argument_group('refcheck options')
refcheck_group.add_argument('--refcheck_min_length', type=int, help='Minimum allowed length in nucleotides of reference gene [%(default)s]', metavar='INT', default=6)
refcheck_group.add_argument('--refcheck_max_length', type=int, help='Maximum allowed length in nucleotides of reference gene [%(default)s]', metavar='INT', default=10000)

other_group = parser.add_argument_group('Other options')
other_group.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
other_group.add_argument('--threads', type=int, help='Number of threads for bowtie2 and spades [%(default)s]', default=1, metavar='INT')
Expand Down Expand Up @@ -58,6 +63,24 @@ def run():
ariba.external_progs.check_versions(options, verbose=options.verbose, not_required=set(['sspace', 'gapfiller']))
pyfastaq.sequences.genetic_code = options.genetic_code


checker = ariba.refcheck.Checker(
options.db_fasta,
min_length=options.refcheck_min_length,
max_length=options.refcheck_max_length,
)
ok, reason, seq = checker.run()

if not ok:
print('\nInput reference file of genes failed refcheck! Cannot continue.', file=sys.stderr)
print('The first failed sequence is:\n', file=sys.stderr)
print(seq, file=sys.stderr)
print('\nIt failed for the reason:', reason, file=sys.stderr)
print('\nTo make a new fasta file called new_genes.fa, with the bad genes fixed/removed run:\n', file=sys.stderr)
print(' ariba refcheck -o new_genes', options.db_fasta, file=sys.stderr)
sys.exit(1)


c = ariba.clusters.Clusters(
options.db_fasta,
options.reads_1,
Expand Down
4 changes: 3 additions & 1 deletion ariba/tasks/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ def run():
usage = 'ariba summary [options] <outfile> [report1.tsv report2.tsv ...]',
epilog = 'Files must be listed after the output file and/or the option --fofn must be used. If both used, all files in the filename specified by --fofn AND the files listed after the output file will be used as input. The input report files must be in tsv format, not xls.')
parser.add_argument('-f', '--fofn', help='File of filenames of ariba reports in tsv format (not xls) to be summarised. Must be used if no input files listed after the outfile.', metavar='FILENAME')
parser.add_argument('--js_candy', help='Write files that can be used as input to JS Candy, named with the given prefix. Writes a tree and csv file, which can be drag and dropped straight into JS Candy', metavar='files_prefix')
parser.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT')
parser.add_argument('--no_filter', action='store_true', help='Do not filter rows or columns of output that are all 0 (by deafult, they are removed from the output)')
parser.add_argument('outfile', help='Name of output file. If file ends with ".xls", then an excel spreadsheet is written. Otherwise a tsv file is written')
Expand All @@ -20,6 +21,7 @@ def run():
fofn=options.fofn,
filenames=options.infiles,
filter_output=(not options.no_filter),
min_id=options.min_id
min_id=options.min_id,
js_candy_prefix=options.js_candy
)
s.run()
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
file1 file2 file3
file1 3 2
file2 0 1
file3 1 0
1 change: 1 addition & 0 deletions ariba/tests/data/summary_test_newick_from_dist_matrix.tre
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
(file1:1.58113883,(file2:0.7071067812,file3:0.7071067812):0.8740320489);
4 changes: 4 additions & 0 deletions ariba/tests/data/summary_test_write_distance_matrix.distances
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
file1 file2 file3
file1 3 2
file2 0 1
file3 1 0
3 changes: 3 additions & 0 deletions ariba/tests/data/summary_test_write_js_candy_csv.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
name,gene1,gene3
file1,1,3
file2,2,4
4 changes: 4 additions & 0 deletions ariba/tests/data/summary_test_write_js_candy_files.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
name,gene1,gene2,gene3
file1,0,1,0
file2,1,0,3
file3,0,0,4
1 change: 1 addition & 0 deletions ariba/tests/data/summary_test_write_js_candy_files.tre
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
(file1:1.58113883,(file2:0.7071067812,file3:0.7071067812):0.8740320489);
94 changes: 94 additions & 0 deletions ariba/tests/summary_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,3 +153,97 @@ def test_write_tsv(self):
expected = os.path.join(data_dir, 'summary_test_write_tsv.out.tsv')
self.assertTrue(filecmp.cmp(tmp_out, expected, shallow=False))
os.unlink(tmp_out)


def test_write_js_candy_csv(self):
'''Test _write_js_candy_csv'''
tmp_out = 'tmp.test_write_js_candy.csv'
s = summary.Summary(tmp_out, filenames=['spam', 'eggs'])
s.rows_out = [
['filename', 'gene1', 'gene3'],
['file1', 1, 3],
['file2', 2, 4],
]
s._write_js_candy_csv(tmp_out)
expected = os.path.join(data_dir, 'summary_test_write_js_candy_csv.csv')
self.assertTrue(filecmp.cmp(expected, tmp_out, shallow=False))
os.unlink(tmp_out)


def test_distance_score_bewteen_values(self):
'''Test _distance_score_bewteen_values'''
tests = [
((0, 0), 0),
((0, 1), 1),
((0, 2), 1),
((1, 0), 1),
((1, 1), 0),
((1, 2), 0),
((2, 0), 1),
((2, 1), 0),
((2, 2), 0),
]

for (val1, val2), expected in tests:
self.assertEqual(expected, summary.Summary._distance_score_between_values(val1, val2))

# check distance calculation is commutative
for val1 in range(5):
for val2 in range(5):
d1 = summary.Summary._distance_score_between_values(val1, val2)
d2 = summary.Summary._distance_score_between_values(val2, val1)
self.assertEqual(d1, d2)


def test_distance_score_between_lists(self):
'''Test _distance_score_between_lists'''
list1 = ['na', 0, 0, 0, 1, 1, 1, 2, 2, 2]
list2 = ['na', 0, 1, 2, 0, 1, 2, 1, 2, 2]
self.assertEqual(3, summary.Summary._distance_score_between_lists(list1, list2))


def test_write_distance_matrix(self):
'''Test _write_distance_matrix'''
s = summary.Summary('out', filenames=['spam', 'eggs'])
s.rows_out = [
['filename', 'gene1', 'gene2', 'gene3'],
['file1', 0, 1, 0],
['file2', 1, 0, 3],
['file3', 0, 0, 4],
]

tmp_distances = 'tmp.test.write_distance_matrix.distances'
s._write_distance_matrix(tmp_distances)
expected = os.path.join(data_dir, 'summary_test_write_distance_matrix.distances')
self.assertTrue(filecmp.cmp(expected, tmp_distances, shallow=False))
os.unlink(tmp_distances)


def test_newick_from_dist_matrix(self):
'''Test _newick_from_dist_matrix'''
tmp_tree = 'tmp.test.newick_from_dist_matrix.tre'
dist_file = os.path.join(data_dir, 'summary_test_newick_from_dist_matrix.distances')
summary.Summary._newick_from_dist_matrix(dist_file, tmp_tree)
expected = os.path.join(data_dir, 'summary_test_newick_from_dist_matrix.tre')
self.assertTrue(filecmp.cmp(expected, tmp_tree, shallow=False))
os.unlink(tmp_tree)


def test_write_js_candy_files(self):
'''Test _write_js_candy_files'''
tmp_prefix = 'tmp.test.write_js_candy_files'
s = summary.Summary('out', filenames=['spam', 'eggs'])
s.rows_out = [
['filename', 'gene1', 'gene2', 'gene3'],
['file1', 0, 1, 0],
['file2', 1, 0, 3],
['file3', 0, 0, 4],
]
s._write_js_candy_files(tmp_prefix)
expected_csv = os.path.join(data_dir, 'summary_test_write_js_candy_files.csv')
expected_tre = os.path.join(data_dir, 'summary_test_write_js_candy_files.tre')
self.assertTrue(filecmp.cmp(expected_csv, tmp_prefix + '.csv', shallow=False))
self.assertTrue(filecmp.cmp(expected_tre, tmp_prefix + '.tre', shallow=False))
os.unlink(tmp_prefix + '.csv')
os.unlink(tmp_prefix + '.tre')