Skip to content

Commit

Permalink
Merge pull request #146 from martinghunt/torsten_issues
Browse files Browse the repository at this point in the history
Torsten issues
  • Loading branch information
martinghunt authored Oct 12, 2016
2 parents b22557e + ebd2baa commit d02a2e7
Show file tree
Hide file tree
Showing 48 changed files with 4,215 additions and 96 deletions.
20 changes: 8 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,16 +96,18 @@ are put in a temporary directory made by ARIBA. The total size of these
files is small, but there can be many of them. This can be a
problem when running large numbers (100s or 1000s) of jobs simultaneously
on the same file system.
By default, ARIBA creates a temporary directory for these files
inside the output directory of each run.
The parent directory of the temporary directory is determined in the
following order of precedence:

1. The value of the option `--tmp_dir` (if that option was used)
2. The environment variable `$ARIBA_TMPDIR` (if it is set)
3. The environment variable `$TMPDIR` (if it is set)
4. If none of the above is found, then use the run's output directory.

Each temporary directory
is unique to one run of ARIBA, and is automatically deleted at the end
of the run (even if ARIBA was killed by the user or crashed).
The parent directory of the temporary
directory can be changed using the environment variable
`$ARIBA_TMPDIR`. The temporary directory for each run will be made
inside `$ARIBA_TMPDIR`. For example,
For example,

export ARIBA_TMPDIR=/tmp

Expand All @@ -117,12 +119,6 @@ will have a name of the form
where the suffix `abcdef` is a random string of characters, chosen
such that `/tmp/ariba.tmp.abcdef` does not already exist.

The temporary directory can also be changed using the option
`--tmp_dir` when running `ariba run`. Using this option takes precedence
over the environment variable `$ARIBA_TMPDIR`. If neither is
set, then ARIBA creates the temporary directory inside
the output directory given to `ariba run`.

The exception to the above is if the option `--noclean` is used.
This forces the temporary directory to be placed in the output
directory, and temporary files are kept. It is intended for
Expand Down
18 changes: 16 additions & 2 deletions ariba/assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def __init__(self,
final_assembly_fa,
final_assembly_bam,
log_fh,
mash_reference_fasta,
scaff_name_prefix='scaffold',
kmer=0,
assembler='fermilite',
Expand All @@ -42,6 +43,7 @@ def __init__(self,
self.final_assembly_fa = os.path.abspath(final_assembly_fa)
self.final_assembly_bam = os.path.abspath(final_assembly_bam)
self.log_fh = log_fh
self.mash_reference_fasta = os.path.abspath(mash_reference_fasta)
self.scaff_name_prefix = scaff_name_prefix

self.ref_seq_name = None
Expand Down Expand Up @@ -377,14 +379,26 @@ def run(self):
self.log_fh = None
return

masher = mash.Masher(self.ref_fastas, self.gapfilled_length_filtered, self.log_fh, self.extern_progs)
masher = mash.Masher(self.mash_reference_fasta, self.gapfilled_length_filtered, self.log_fh, self.extern_progs)
self.ref_seq_name = masher.run(self.mash_dist_file)
if self.ref_seq_name is None:
print('Could not determine closest reference sequence', file=self.log_fh)
self.log_fh = None
return

faidx.write_fa_subset({self.ref_seq_name}, self.ref_fastas, self.ref_fasta)
file_reader = pyfastaq.sequences.file_reader(self.ref_fastas)
for ref_seq in file_reader:
if self.ref_seq_name == ref_seq.id:
f_out = pyfastaq.utils.open_file_write(self.ref_fasta)
print(ref_seq, file=f_out)
pyfastaq.utils.close(f_out)
break
else:
print('Closest reference sequence ', self.ref_seq_name, ' does not belong to this cluster', file=self.log_fh)
self.ref_seq_name = None
self.log_fh = None
return

print('Closest reference sequence according to mash: ', self.ref_seq_name, file=self.log_fh)

contigs_both_strands = self._fix_contig_orientation(self.gapfilled_length_filtered, self.ref_fasta, self.final_assembly_fa, min_id=self.nucmer_min_id, min_length=self.nucmer_min_len, breaklen=self.nucmer_breaklen)
Expand Down
20 changes: 17 additions & 3 deletions ariba/cluster.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import signal
import traceback
import os
import atexit
import random
import math
import shutil
import sys
import pyfastaq
from ariba import assembly, assembly_compare, assembly_variants, external_progs, flag, mapping, report, samtools_variants
from ariba import assembly, assembly_compare, assembly_variants, external_progs, flag, mapping, mash, report, samtools_variants

class Error (Exception): pass

Expand All @@ -17,6 +18,7 @@ def __init__(self,
root_dir,
name,
refdata,
refdata_seqs_fasta_for_mash=None,
total_reads=None,
total_reads_bases=None,
fail_file=None,
Expand Down Expand Up @@ -126,6 +128,13 @@ def __init__(self,
else:
self.extern_progs = extern_progs

if refdata_seqs_fasta_for_mash is None:
mash.Masher.sketch(self.references_fa, True, self.extern_progs, verbose=False)
self.refdata_seqs_fasta_for_mash = self.references_fa
else:
self.refdata_seqs_fasta_for_mash = os.path.abspath(refdata_seqs_fasta_for_mash)
assert os.path.exists(self.refdata_seqs_fasta_for_mash + '.msh')

self.random_seed = random_seed
wanted_signals = [signal.SIGABRT, signal.SIGINT, signal.SIGSEGV, signal.SIGTERM]
for s in wanted_signals:
Expand Down Expand Up @@ -312,6 +321,7 @@ def _run(self):
self.final_assembly_fa,
self.final_assembly_bam,
self.log_fh,
self.refdata_seqs_fasta_for_mash,
scaff_name_prefix=self.name,
kmer=self.assembly_kmer,
assembler=self.assembler,
Expand Down Expand Up @@ -418,8 +428,12 @@ def _run(self):
print('\nCould not get closest reference sequence\n', file=self.log_fh, flush=True)
self.status_flag.add('ref_seq_choose_fail')

try:
self.report_lines = report.report_lines(self)
except:
print('Error making report for cluster ', self.name, '... traceback:', file=sys.stderr)
traceback.print_exc(file=sys.stderr)
raise Error('Error making report for cluster ' + self.name)

print('\nMaking report lines', file=self.log_fh, flush=True)
self.report_lines = report.report_lines(self)
self._clean()
atexit.unregister(self._atexit)
6 changes: 6 additions & 0 deletions ariba/clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ def __init__(self,
self.extern_progs = extern_progs
self.clusters_tsv = os.path.abspath(os.path.join(refdata_dir, '02.cdhit.clusters.tsv'))
self.all_ref_seqs_fasta = os.path.abspath(os.path.join(refdata_dir, '02.cdhit.all.fa'))
mash_file = self.all_ref_seqs_fasta + '.msh'
if not os.path.exists(mash_file):
raise Error('Error! Mash file ' + mash_file + ' not found.\nThe likely cause is that prepareref was run using an old version of ariba.\nIf this is the case, please rerun ariba preparef.')

if version_report_lines is None:
self.version_report_lines = []
Expand Down Expand Up @@ -137,6 +140,8 @@ def __init__(self,
if tmp_dir is None:
if 'ARIBA_TMPDIR' in os.environ:
tmp_dir = os.path.abspath(os.environ['ARIBA_TMPDIR'])
elif 'TMPDIR' in os.environ:
tmp_dir = os.path.abspath(os.environ['TMPDIR'])
else:
tmp_dir = self.outdir

Expand Down Expand Up @@ -386,6 +391,7 @@ def _init_and_run_clusters(self):
new_dir,
cluster_name,
self.refdata,
refdata_seqs_fasta_for_mash=self.all_ref_seqs_fasta,
fail_file=os.path.join(self.fails_dir, cluster_name),
read_store=self.read_store,
reference_names=self.cluster_ids[cluster_name],
Expand Down
15 changes: 10 additions & 5 deletions ariba/mash.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,13 @@ def __init__(self,
self.extern_progs = extern_progs


def _sketch(self, infile, individual):
@classmethod
def sketch(cls, infile, individual, extern_progs, verbose=True, verbose_filehandle=None):
if verbose:
assert verbose_filehandle is not None

cmd_list = [
self.extern_progs.exe('mash'),
extern_progs.exe('mash'),
'sketch',
'-s 100000'
]
Expand All @@ -30,7 +34,7 @@ def _sketch(self, infile, individual):
cmd_list.append('-i')

cmd_list.append(infile)
common.syscall(' '.join(cmd_list), verbose=True, verbose_filehandle=self.log_fh)
common.syscall(' '.join(cmd_list), verbose=verbose, verbose_filehandle=verbose_filehandle)


def _dist(self, outfile):
Expand All @@ -45,8 +49,9 @@ def _dist(self, outfile):


def run(self, outfile):
self._sketch(self.reference_fa, True)
self._sketch(self.query_fa, False)
if not os.path.exists(self.reference_fa + '.msh'):
Masher.sketch(self.reference_fa, True, self.extern_progs, verbose=True, verbose_filehandle=self.log_fh)
Masher.sketch(self.query_fa, False, self.extern_progs, verbose=True, verbose_filehandle=self.log_fh)
self._dist(outfile)
if os.path.getsize(outfile) == 0:
return None
Expand Down
20 changes: 15 additions & 5 deletions ariba/ref_genes_getter.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,11 @@ class Error (Exception): pass


class RefGenesGetter:
def __init__(self, ref_db, version=None):
def __init__(self, ref_db, version=None, debug=False):
if ref_db not in allowed_ref_dbs:
raise Error('Error in RefGenesGetter. ref_db must be one of: ' + str(allowed_ref_dbs) + ', but I got "' + ref_db)
self.ref_db=ref_db
self.debug = debug
self.genetic_code = 11
self.max_download_attempts = 3
self.sleep_time = 2
Expand Down Expand Up @@ -185,6 +186,9 @@ def _get_from_card(self, outprefix):
pyfastaq.utils.close(f_out_tsv)
pyfastaq.utils.close(f_out_log)
os.chdir(current_dir)
if not self.debug:
shutil.rmtree(tmpdir)

print('Extracted data and written ARIBA input files\n')
print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
print('You can use them with ARIBA like this:')
Expand Down Expand Up @@ -244,7 +248,8 @@ def _get_from_resfinder(self, outprefix):
pyfastaq.utils.close(fout_tsv)
print('\nFinished combining files\n')
os.chdir(current_dir)
shutil.rmtree(tmpdir)
if not self.debug:
shutil.rmtree(tmpdir)
print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
print('You can use them with ARIBA like this:')
print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
Expand Down Expand Up @@ -286,7 +291,8 @@ def _get_from_argannot(self, outprefix):

pyfastaq.utils.close(f_out_tsv)
pyfastaq.utils.close(f_out_fa)
shutil.rmtree(tmpdir)
if not self.debug:
shutil.rmtree(tmpdir)

print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
print('You can use them with ARIBA like this:')
Expand Down Expand Up @@ -339,7 +345,8 @@ def _get_from_plasmidfinder(self, outprefix):
pyfastaq.utils.close(fout_tsv)
print('\nFinished combining files\n')
os.chdir(current_dir)
shutil.rmtree(tmpdir)
if not self.debug:
shutil.rmtree(tmpdir)
print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
print('You can use them with ARIBA like this:')
print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
Expand Down Expand Up @@ -371,6 +378,8 @@ def _get_from_srst2_argannot(self, outprefix):

pyfastaq.utils.close(f_out_fa)
pyfastaq.utils.close(f_out_meta)
if not self.debug:
os.unlink(srst2_fa)

print('Finished downloading and converting data. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
print('You can use them with ARIBA like this:')
Expand Down Expand Up @@ -403,7 +412,8 @@ def _get_from_vfdb_common(self, outprefix, filename, info_text):
print('Extracting files ... ', end='', flush=True)
vparser = vfdb_parser.VfdbParser(zipfile, outprefix)
vparser.run()
shutil.rmtree(tmpdir)
if not self.debug:
shutil.rmtree(tmpdir)
print('done')
final_fasta = outprefix + '.fa'
final_tsv = outprefix + '.tsv'
Expand Down
13 changes: 12 additions & 1 deletion ariba/ref_preparer.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import sys
import os
import shutil
import pickle
import pyfastaq
from ariba import reference_data
from ariba import reference_data, mash

class Error (Exception): pass

Expand All @@ -23,6 +24,7 @@ def __init__(self,
clusters_file=None,
threads=1,
verbose=False,
force=False,
):
self.extern_progs = extern_progs

Expand All @@ -43,6 +45,7 @@ def __init__(self,
self.clusters_file = clusters_file
self.threads = threads
self.verbose = verbose
self.force = force


@classmethod
Expand Down Expand Up @@ -136,6 +139,9 @@ def _rename_clusters(clusters_in):
def run(self, outdir):
original_dir = os.getcwd()

if self.force and os.path.exists(outdir):
shutil.rmtree(outdir)

if os.path.exists(outdir):
raise Error('Error! Output directory ' + outdir + ' already exists. Cannot continue')

Expand Down Expand Up @@ -204,3 +210,8 @@ def run(self, outdir):
with open(clusters_pickle_file, 'wb') as f:
pickle.dump(clusters, f)

if self.verbose:
print('\nMash-sketching all reference sequences', flush=True)

mash.Masher.sketch(os.path.join(outdir, '02.cdhit.all.fa'), True, self.extern_progs, self.verbose, sys.stdout)

Loading

0 comments on commit d02a2e7

Please sign in to comment.