Merge pull request #146 from martinghunt/torsten_issues

Torsten issues
sanger-pathogens · Oct 12, 2016 · d02a2e7 · d02a2e7
2 parents b22557e + ebd2baa
commit d02a2e7
Show file tree

Hide file tree

Showing 48 changed files with 4,215 additions and 96 deletions.
diff --git a/README.md b/README.md
@@ -96,16 +96,18 @@ are put in a temporary directory made by ARIBA.  The total size of these
 files is small, but there can be many of them. This can be a
 problem when running large numbers (100s or 1000s) of jobs simultaneously
 on the same file system.
-By default, ARIBA creates a temporary directory for these files
-inside the output directory of each run.
+The parent directory of the temporary directory is determined in the
+following order of precedence:
+
+1. The value of the option `--tmp_dir` (if that option was used)
+2. The environment variable `$ARIBA_TMPDIR` (if it is set)
+3. The environment variable `$TMPDIR` (if it is set)
+4. If none of the above is found, then use the run's output directory.
 
 Each temporary directory
 is unique to one run of ARIBA, and is automatically deleted at the end
 of the run (even if ARIBA was killed by the user or crashed).
-The parent directory of the temporary
-directory can be changed using the environment variable
-`$ARIBA_TMPDIR`. The temporary directory for each run will be made
-inside `$ARIBA_TMPDIR`. For example,
+For example,
 
     export ARIBA_TMPDIR=/tmp
 
@@ -117,12 +119,6 @@ will have a name of the form
 where the suffix `abcdef` is a random string of characters, chosen
 such that `/tmp/ariba.tmp.abcdef` does not already exist.
 
-The temporary directory can also be changed using the option
-`--tmp_dir` when running `ariba run`. Using this option takes precedence
-over the environment variable `$ARIBA_TMPDIR`. If neither are
-set, then ARIBA creates the temporary directory inside
-the output directory given to `ariba run`.
-
 The exception to the above is if the option `--noclean` is used.
 This forces the temporary directory to be placed in the output
 directory, and temporary files are kept. It is intended for

diff --git a/ariba/assembly.py b/ariba/assembly.py
@@ -18,6 +18,7 @@ def __init__(self,
       final_assembly_fa,
       final_assembly_bam,
       log_fh,
+      mash_reference_fasta,
       scaff_name_prefix='scaffold',
       kmer=0,
       assembler='fermilite',
@@ -42,6 +43,7 @@ def __init__(self,
         self.final_assembly_fa = os.path.abspath(final_assembly_fa)
         self.final_assembly_bam = os.path.abspath(final_assembly_bam)
         self.log_fh = log_fh
+        self.mash_reference_fasta = os.path.abspath(mash_reference_fasta)
         self.scaff_name_prefix = scaff_name_prefix
 
         self.ref_seq_name = None
@@ -377,14 +379,26 @@ def run(self):
                 self.log_fh = None
                 return
 
-            masher = mash.Masher(self.ref_fastas, self.gapfilled_length_filtered, self.log_fh, self.extern_progs)
+            masher = mash.Masher(self.mash_reference_fasta, self.gapfilled_length_filtered, self.log_fh, self.extern_progs)
             self.ref_seq_name = masher.run(self.mash_dist_file)
             if self.ref_seq_name is None:
                 print('Could not determine closest reference sequence', file=self.log_fh)
                 self.log_fh = None
                 return
 
-            faidx.write_fa_subset({self.ref_seq_name}, self.ref_fastas, self.ref_fasta)
+            file_reader = pyfastaq.sequences.file_reader(self.ref_fastas)
+            for ref_seq in file_reader:
+                if self.ref_seq_name == ref_seq.id:
+                    f_out = pyfastaq.utils.open_file_write(self.ref_fasta)
+                    print(ref_seq, file=f_out)
+                    pyfastaq.utils.close(f_out)
+                    break
+            else:
+                print('Closest reference sequence ', self.ref_seq_name, ' does not belong to this cluster', file=self.log_fh)
+                self.ref_seq_name = None
+                self.log_fh = None
+                return
+
             print('Closest reference sequence according to mash: ', self.ref_seq_name, file=self.log_fh)
 
             contigs_both_strands = self._fix_contig_orientation(self.gapfilled_length_filtered, self.ref_fasta, self.final_assembly_fa, min_id=self.nucmer_min_id, min_length=self.nucmer_min_len, breaklen=self.nucmer_breaklen)

diff --git a/ariba/cluster.py b/ariba/cluster.py
@@ -1,12 +1,13 @@
 import signal
+import traceback
 import os
 import atexit
 import random
 import math
 import shutil
 import sys
 import pyfastaq
-from ariba import assembly, assembly_compare, assembly_variants, external_progs, flag, mapping, report, samtools_variants
+from ariba import assembly, assembly_compare, assembly_variants, external_progs, flag, mapping, mash, report, samtools_variants
 
 class Error (Exception): pass
 
@@ -17,6 +18,7 @@ def __init__(self,
       root_dir,
       name,
       refdata,
+      refdata_seqs_fasta_for_mash=None,
       total_reads=None,
       total_reads_bases=None,
       fail_file=None,
@@ -126,6 +128,13 @@ def __init__(self,
         else:
             self.extern_progs = extern_progs
 
+        if refdata_seqs_fasta_for_mash is None:
+            mash.Masher.sketch(self.references_fa, True, self.extern_progs, verbose=False)
+            self.refdata_seqs_fasta_for_mash = self.references_fa
+        else:
+            self.refdata_seqs_fasta_for_mash = os.path.abspath(refdata_seqs_fasta_for_mash)
+            assert os.path.exists(self.refdata_seqs_fasta_for_mash + '.msh')
+
         self.random_seed = random_seed
         wanted_signals = [signal.SIGABRT, signal.SIGINT, signal.SIGSEGV, signal.SIGTERM]
         for s in wanted_signals:
@@ -312,6 +321,7 @@ def _run(self):
               self.final_assembly_fa,
               self.final_assembly_bam,
               self.log_fh,
+              self.refdata_seqs_fasta_for_mash,
               scaff_name_prefix=self.name,
               kmer=self.assembly_kmer,
               assembler=self.assembler,
@@ -418,8 +428,12 @@ def _run(self):
             print('\nCould not get closest reference sequence\n', file=self.log_fh, flush=True)
             self.status_flag.add('ref_seq_choose_fail')
 
+        try:
+            self.report_lines = report.report_lines(self)
+        except:
+            print('Error making report for cluster ', self.name, '... traceback:', file=sys.stderr)
+            traceback.print_exc(file=sys.stderr)
+            raise Error('Error making report for cluster ' + self.name)
 
-        print('\nMaking report lines', file=self.log_fh, flush=True)
-        self.report_lines = report.report_lines(self)
         self._clean()
         atexit.unregister(self._atexit)
diff --git a/ariba/clusters.py b/ariba/clusters.py
@@ -76,6 +76,9 @@ def __init__(self,
         self.extern_progs = extern_progs
         self.clusters_tsv = os.path.abspath(os.path.join(refdata_dir, '02.cdhit.clusters.tsv'))
         self.all_ref_seqs_fasta = os.path.abspath(os.path.join(refdata_dir, '02.cdhit.all.fa'))
+        mash_file = self.all_ref_seqs_fasta + '.msh'
+        if not os.path.exists(mash_file):
+            raise Error('Error! Mash file ' + mash_file + ' not found.\nThe likely cause is that prepareref was run using an old version of ariba.\nIf this is the case, please rerun ariba preparef.')
 
         if version_report_lines is None:
             self.version_report_lines = []
@@ -137,6 +140,8 @@ def __init__(self,
         if tmp_dir is None:
             if 'ARIBA_TMPDIR' in os.environ:
                 tmp_dir = os.path.abspath(os.environ['ARIBA_TMPDIR'])
+            elif 'TMPDIR' in os.environ:
+                tmp_dir = os.path.abspath(os.environ['TMPDIR'])
             else:
                 tmp_dir = self.outdir
 
@@ -386,6 +391,7 @@ def _init_and_run_clusters(self):
                 new_dir,
                 cluster_name,
                 self.refdata,
+                refdata_seqs_fasta_for_mash=self.all_ref_seqs_fasta,
                 fail_file=os.path.join(self.fails_dir, cluster_name),
                 read_store=self.read_store,
                 reference_names=self.cluster_ids[cluster_name],

diff --git a/ariba/mash.py b/ariba/mash.py
@@ -19,9 +19,13 @@ def __init__(self,
             self.extern_progs = extern_progs
 
 
-    def _sketch(self, infile, individual):
+    @classmethod
+    def sketch(cls, infile, individual, extern_progs, verbose=True, verbose_filehandle=None):
+        if verbose:
+            assert verbose_filehandle is not None
+
         cmd_list = [
-            self.extern_progs.exe('mash'),
+            extern_progs.exe('mash'),
             'sketch',
             '-s 100000'
         ]
@@ -30,7 +34,7 @@ def _sketch(self, infile, individual):
             cmd_list.append('-i')
 
         cmd_list.append(infile)
-        common.syscall(' '.join(cmd_list), verbose=True, verbose_filehandle=self.log_fh)
+        common.syscall(' '.join(cmd_list), verbose=verbose, verbose_filehandle=verbose_filehandle)
 
 
     def _dist(self, outfile):
@@ -45,8 +49,9 @@ def _dist(self, outfile):
 
 
     def run(self, outfile):
-        self._sketch(self.reference_fa, True)
-        self._sketch(self.query_fa, False)
+        if not os.path.exists(self.reference_fa + '.msh'):
+            Masher.sketch(self.reference_fa, True, self.extern_progs, verbose=True, verbose_filehandle=self.log_fh)
+        Masher.sketch(self.query_fa, False, self.extern_progs, verbose=True, verbose_filehandle=self.log_fh)
         self._dist(outfile)
         if os.path.getsize(outfile) == 0:
             return None

diff --git a/ariba/ref_genes_getter.py b/ariba/ref_genes_getter.py
@@ -25,10 +25,11 @@ class Error (Exception): pass
 
 
 class RefGenesGetter:
-    def __init__(self, ref_db, version=None):
+    def __init__(self, ref_db, version=None, debug=False):
         if ref_db not in allowed_ref_dbs:
             raise Error('Error in RefGenesGetter. ref_db must be one of: ' + str(allowed_ref_dbs) + ', but I got "' + ref_db)
         self.ref_db=ref_db
+        self.debug = debug
         self.genetic_code = 11
         self.max_download_attempts = 3
         self.sleep_time = 2
@@ -185,6 +186,9 @@ def _get_from_card(self, outprefix):
         pyfastaq.utils.close(f_out_tsv)
         pyfastaq.utils.close(f_out_log)
         os.chdir(current_dir)
+        if not self.debug:
+            shutil.rmtree(tmpdir)
+
         print('Extracted data and written ARIBA input files\n')
         print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
         print('You can use them with ARIBA like this:')
@@ -244,7 +248,8 @@ def _get_from_resfinder(self, outprefix):
         pyfastaq.utils.close(fout_tsv)
         print('\nFinished combining files\n')
         os.chdir(current_dir)
-        shutil.rmtree(tmpdir)
+        if not self.debug:
+            shutil.rmtree(tmpdir)
         print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
         print('You can use them with ARIBA like this:')
         print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
@@ -286,7 +291,8 @@ def _get_from_argannot(self, outprefix):
 
         pyfastaq.utils.close(f_out_tsv)
         pyfastaq.utils.close(f_out_fa)
-        shutil.rmtree(tmpdir)
+        if not self.debug:
+            shutil.rmtree(tmpdir)
 
         print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
         print('You can use them with ARIBA like this:')
@@ -339,7 +345,8 @@ def _get_from_plasmidfinder(self, outprefix):
         pyfastaq.utils.close(fout_tsv)
         print('\nFinished combining files\n')
         os.chdir(current_dir)
-        shutil.rmtree(tmpdir)
+        if not self.debug:
+            shutil.rmtree(tmpdir)
         print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
         print('You can use them with ARIBA like this:')
         print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
@@ -371,6 +378,8 @@ def _get_from_srst2_argannot(self, outprefix):
 
         pyfastaq.utils.close(f_out_fa)
         pyfastaq.utils.close(f_out_meta)
+        if not self.debug:
+            os.unlink(srst2_fa)
 
         print('Finished downloading and converting data. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
         print('You can use them with ARIBA like this:')
@@ -403,7 +412,8 @@ def _get_from_vfdb_common(self, outprefix, filename, info_text):
         print('Extracting files ... ', end='', flush=True)
         vparser = vfdb_parser.VfdbParser(zipfile, outprefix)
         vparser.run()
-        shutil.rmtree(tmpdir)
+        if not self.debug:
+            shutil.rmtree(tmpdir)
         print('done')
         final_fasta = outprefix + '.fa'
         final_tsv = outprefix + '.tsv'

diff --git a/ariba/ref_preparer.py b/ariba/ref_preparer.py
@@ -1,8 +1,9 @@
 import sys
 import os
+import shutil
 import pickle
 import pyfastaq
-from ariba import reference_data
+from ariba import reference_data, mash
 
 class Error (Exception): pass
 
@@ -23,6 +24,7 @@ def __init__(self,
         clusters_file=None,
         threads=1,
         verbose=False,
+        force=False,
     ):
         self.extern_progs = extern_progs
 
@@ -43,6 +45,7 @@ def __init__(self,
         self.clusters_file = clusters_file
         self.threads = threads
         self.verbose = verbose
+        self.force = force
 
 
     @classmethod
@@ -136,6 +139,9 @@ def _rename_clusters(clusters_in):
     def run(self, outdir):
         original_dir = os.getcwd()
 
+        if self.force and os.path.exists(outdir):
+            shutil.rmtree(outdir)
+
         if os.path.exists(outdir):
             raise Error('Error! Output directory ' + outdir + ' already exists. Cannot continue')
 
@@ -204,3 +210,8 @@ def run(self, outdir):
         with open(clusters_pickle_file, 'wb') as f:
             pickle.dump(clusters, f)
 
+        if self.verbose:
+            print('\nMash-sketching all reference sequences', flush=True)
+
+        mash.Masher.sketch(os.path.join(outdir, '02.cdhit.all.fa'), True, self.extern_progs, self.verbose, sys.stdout)
+