Skip to content

Commit

Permalink
Merge pull request #56 from martinghunt/new_task_refprepare
Browse files Browse the repository at this point in the history
New task prepareref
  • Loading branch information
martinghunt committed Apr 20, 2016
2 parents e31c6ea + e9c2d48 commit c1c1cea
Show file tree
Hide file tree
Showing 26 changed files with 645 additions and 265 deletions.
2 changes: 2 additions & 0 deletions ariba/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
'mapping',
'reference_data',
'ref_genes_getter',
'ref_preparer',
'report',
'report_filter',
'scaffold_graph',
'samtools_variants',
'sequence_metadata',
'sequence_variant',
'summary',
Expand Down
1 change: 1 addition & 0 deletions ariba/assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def _assemble_with_spades(self, unittest=False):
'-2', self.reads2,
'-o', self.assembler_dir,
'-k', str(self.assembly_kmer),
'--threads 1', # otherwise defaults to 16!
'--untrusted-contigs', self.ref_fasta,
])
if self.spades_other_options is not None:
Expand Down
143 changes: 54 additions & 89 deletions ariba/clusters.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import os
import copy
import pickle
import itertools
import sys
import shutil
import openpyxl
import multiprocessing
import pysam
import pyfastaq
from ariba import cluster, common, mapping, histogram, report, report_filter
from ariba import cluster, common, mapping, histogram, report, report_filter, reference_data

class Error (Exception): pass

Expand All @@ -23,7 +24,7 @@ def _run_cluster(obj, verbose):

class Clusters:
def __init__(self,
refdata,
refdata_dir,
reads_1,
reads_2,
outdir,
Expand All @@ -42,12 +43,10 @@ def __init__(self,
assembled_threshold=0.95,
unique_threshold=0.03,
bowtie2_preset='very-sensitive-local',
cdhit_seq_identity_threshold=0.9,
cdhit_length_diff_cutoff=0.9,
run_cd_hit=True,
clean=1,
):
self.refdata = refdata
self.refdata_dir = os.path.abspath(refdata_dir)
self.refdata, self.cluster_ids = self._load_reference_data_from_dir(refdata_dir)
self.reads_1 = os.path.abspath(reads_1)
self.reads_2 = os.path.abspath(reads_2)
self.outdir = os.path.abspath(outdir)
Expand All @@ -61,11 +60,9 @@ def __init__(self,
self.assembly_coverage = assembly_coverage
self.spades_other = spades_other

self.refdata_files_prefix = os.path.join(self.outdir, 'refdata')
self.cdhit_files_prefix = os.path.join(self.outdir, 'cdhit')
self.cdhit_files_prefix = os.path.join(self.refdata_dir, 'cdhit')
self.cdhit_cluster_representatives_fa = self.cdhit_files_prefix + '.cluster_representatives.fa'
self.cluster_ids = {}
self.bam_prefix = self.cdhit_cluster_representatives_fa + '.map_reads'
self.bam_prefix = os.path.join(self.outdir, 'map_reads_to_cluster_reps')
self.bam = self.bam_prefix + '.bam'
self.report_file_all_tsv = os.path.join(self.outdir, 'report.all.tsv')
self.report_file_all_xls = os.path.join(self.outdir, 'report.all.xls')
Expand Down Expand Up @@ -96,28 +93,58 @@ def __init__(self,
self.cluster_read_counts = {} # gene name -> number of reads
self.cluster_base_counts = {} # gene name -> number of bases

self.cdhit_seq_identity_threshold = cdhit_seq_identity_threshold
self.cdhit_length_diff_cutoff = cdhit_length_diff_cutoff
self.run_cd_hit = run_cd_hit

for d in [self.outdir, self.clusters_outdir]:
try:
os.mkdir(d)
except:
raise Error('Error mkdir ' + d)


def _run_cdhit(self):
self.cluster_ids = self.refdata.cluster_with_cdhit(
self.refdata_files_prefix + '.01.check_variants',
self.cdhit_files_prefix,
seq_identity_threshold=self.cdhit_seq_identity_threshold,
threads=self.threads,
length_diff_cutoff=self.cdhit_length_diff_cutoff,
nocluster=not self.run_cd_hit,
verbose=self.verbose,
@classmethod
def _load_reference_data_info_file(cls, filename):
data = {
'genetic_code': None
}

with open(filename) as f:
for line in f:
key, val = line.rstrip().split('\t')
if key in data:
data[key] = val

if None in data.values():
missing_values = [x for x in data if data[x] is None]
raise Error('Error reading reference info file ' + filename + '. These values not found: ' + ','.join(missing_values))

data['genetic_code'] = int(data['genetic_code'])
return data


@staticmethod
def _load_reference_data_from_dir(indir):
    '''Loads prepared reference data from the directory made by the
    prepareref task. Returns a tuple (ReferenceData object, cluster ids
    loaded from the cd-hit pickle). Raises Error if indir does not exist.'''
    if not os.path.exists(indir):
        raise Error('Error loading reference data. Input directory ' + indir + ' not found. Cannot continue')

    def _existing_or_none(path):
        # Optional input files: pass None to ReferenceData when absent.
        return path if os.path.exists(path) else None

    prefix = os.path.join(indir, 'refcheck.01.check_variants')
    params = Clusters._load_reference_data_info_file(os.path.join(indir, 'info.txt'))

    refdata = reference_data.ReferenceData(
        presence_absence_fa=_existing_or_none(prefix + '.presence_absence.fa'),
        variants_only_fa=_existing_or_none(prefix + '.variants_only.fa'),
        non_coding_fa=_existing_or_none(prefix + '.non_coding.fa'),
        metadata_tsv=_existing_or_none(prefix + '.tsv'),
        genetic_code=params['genetic_code'],
    )

    with open(os.path.join(indir, 'cdhit.clusters.pickle'), 'rb') as f:
        cluster_ids = pickle.load(f)

    return refdata, cluster_ids


def _map_reads_to_clustered_genes(self):
mapping.run_bowtie2(
Expand All @@ -134,40 +161,6 @@ def _map_reads_to_clustered_genes(self):
)


def _sam_to_fastq(self, s):
    '''Converts a pysam alignment record into a pyfastaq Fastq object.
    Appends /1 or /2 to the read name according to the pair flags, and
    reverse complements the sequence when the read mapped in reverse.'''
    if s.is_read1:
        suffix = '/1'
    elif s.is_read2:
        suffix = '/2'
    else:
        raise Error('Read ' + s.qname + ' must be first or second of pair according to flag. Cannot continue')

    fq = pyfastaq.sequences.Fastq(s.qname + suffix, common.decode(s.seq), common.decode(s.qual))

    if s.is_reverse:
        fq.revcomp()

    return fq


def _sam_pair_to_insert(self, s1, s2):
if s1.is_unmapped or s2.is_unmapped or (s1.tid != s2.tid) or (s1.is_reverse == s2.is_reverse):
return None

# If here, reads are both mapped to the same ref, and in opposite orientations
if s1.is_reverse:
end = s1.reference_end - 1
start = s2.reference_start
else:
end = s2.reference_end - 1
start = s1.reference_start

if start < end:
return end - start + 1
else:
return None


def _bam_to_clusters_reads(self):
'''Sets up Cluster directories (one for each gene that has reads that mapped to it), writes reads fwd and rev files. Also gathers histogram data of insert size'''
filehandles_1 = {} # gene name -> filehandle of fwd_reads
Expand All @@ -186,12 +179,12 @@ def _bam_to_clusters_reads(self):
if not sam1.is_unmapped:
ref_seqs.add(sam_reader.getrname(sam1.tid))

read1 = self._sam_to_fastq(sam1)
read2 = self._sam_to_fastq(s)
read1 = mapping.sam_to_fastq(sam1)
read2 = mapping.sam_to_fastq(s)
if read1.id.endswith('/2'):
read1, read2 = read2, read1

insert = self._sam_pair_to_insert(s, sam1)
insert = mapping.sam_pair_to_insert(s, sam1)
if insert is not None:
self.insert_hist.add(insert)

Expand Down Expand Up @@ -349,31 +342,13 @@ def _clean(self):
print(' ... not deleting anything because --clean 0 used')
return

to_delete= [
self.bam,
self.cdhit_cluster_representatives_fa,
self.cdhit_cluster_representatives_fa + '.fai',
self.cdhit_files_prefix + '.non_coding.cdhit',
self.cdhit_files_prefix + '.presence_absence.cdhit',
self.cdhit_files_prefix + '.variants_only.cdhit',
]
to_delete= [self.bam]

if self.clean == 2:
if self.verbose:
print(' rm -r', self.clusters_outdir)
shutil.rmtree(self.clusters_outdir)

to_delete.extend([
self.cdhit_files_prefix + '.clusters.tsv',
self.refdata_files_prefix + '.00.check_fasta_presence_absence.log',
self.refdata_files_prefix + '.00.check_fasta_variants_only.log',
self.refdata_files_prefix + '.01.check_variants.log',
self.refdata_files_prefix + '.01.check_variants.non_coding.fa',
self.refdata_files_prefix + '.01.check_variants.presence_absence.fa',
self.refdata_files_prefix + '.01.check_variants.tsv',
self.refdata_files_prefix + '.01.check_variants.variants_only.fa',
])

for filename in to_delete:
if os.path.exists(filename):
if self.verbose:
Expand All @@ -400,16 +375,6 @@ def run(self):
self.write_versions_file(cwd)

if self.verbose:
print('{:_^79}'.format(' Checking reference data '), flush=True)
self.refdata.sanity_check(self.refdata_files_prefix)

if self.verbose:
print()
print('{:_^79}'.format(' Running cd-hit '), flush=True)
self._run_cdhit()

if self.verbose:
print('Finished cd-hit\n')
print('{:_^79}'.format(' Mapping reads to clustered genes '), flush=True)
self._map_reads_to_clustered_genes()

Expand Down
76 changes: 68 additions & 8 deletions ariba/mapping.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,33 @@
import os
import sys
import pysam
import pyfastaq
from ariba import common

class Error (Exception): pass


def bowtie2_index(ref_fa, outprefix, bowtie2='bowtie2', verbose=False, verbose_filehandle=sys.stdout):
    '''Runs bowtie2-build on ref_fa with output prefix outprefix, unless
    all six .bt2 index files already exist, in which case does nothing.'''
    suffixes = ('1', '2', '3', '4', 'rev.1', 'rev.2')

    if all(os.path.exists(outprefix + '.' + x + '.bt2') for x in suffixes):
        # Index already built, so skip the (slow) bowtie2-build call.
        return

    cmd = ' '.join([
        bowtie2 + '-build',
        '-q',
        ref_fa,
        outprefix
    ])

    common.syscall(cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)


def run_bowtie2(
reads_fwd,
reads_rev,
Expand All @@ -20,16 +42,15 @@ def run_bowtie2(
verbose=False,
verbose_filehandle=sys.stdout,
remove_both_unmapped=False,
clean_index=True,
):

map_index = out_prefix + '.map_index'
clean_files = [map_index + '.' + x + '.bt2' for x in ['1', '2', '3', '4', 'rev.1', 'rev.2']]
index_cmd = ' '.join([
bowtie2 + '-build',
'-q',
ref_fa,
map_index
])

if clean_index:
clean_files = [map_index + '.' + x + '.bt2' for x in ['1', '2', '3', '4', 'rev.1', 'rev.2']]
else:
clean_files = []

final_bam = out_prefix + '.bam'
if sort:
Expand Down Expand Up @@ -60,7 +81,7 @@ def run_bowtie2(

map_cmd = ' '.join(map_cmd)

common.syscall(index_cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)
bowtie2_index(ref_fa, map_index, bowtie2=bowtie2, verbose=verbose, verbose_filehandle=verbose_filehandle)
common.syscall(map_cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)

if sort:
Expand Down Expand Up @@ -96,3 +117,42 @@ def get_total_alignment_score(bam):
pass
return total


def sam_to_fastq(sam):
    '''Given a pysam alignment, returns the sequence a Fastq object.
    Reverse complements as required and add suffix /1 or /2 as appropriate from the flag'''
    if sam.is_read1:
        name = sam.qname + '/1'
    elif sam.is_read2:
        name = sam.qname + '/2'
    else:
        raise Error('Read ' + sam.qname + ' must be first or second of pair according to flag. Cannot continue')

    fq = pyfastaq.sequences.Fastq(name, common.decode(sam.seq), common.decode(sam.qual))

    if sam.is_reverse:
        fq.revcomp()

    return fq


def sam_pair_to_insert(s1, s2):
    '''Returns insert size from pair of sam records, as long as their orientation is "innies".
    Otherwise returns None.'''
    mapped_innies = (not (s1.is_unmapped or s2.is_unmapped)
                     and s1.tid == s2.tid
                     and s1.is_reverse != s2.is_reverse)

    if not mapped_innies:
        return None

    # Reads map to the same reference in opposite orientations: the
    # forward read gives the start and the reverse read gives the end.
    if s2.is_reverse:
        start, end = s1.reference_start, s2.reference_end - 1
    else:
        start, end = s2.reference_start, s1.reference_end - 1

    return end - start + 1 if start < end else None


Loading

0 comments on commit c1c1cea

Please sign in to comment.