sanger-pathogens · jgtate · Sep 10, 2015 · Sep 9, 2015 · Sep 10, 2015 · Sep 10, 2015
diff --git a/ariba/cluster.py b/ariba/cluster.py
@@ -129,6 +129,7 @@ def __init__(self,
         self.final_assembly_bam = os.path.join(self.root_dir, 'assembly.reads_mapped.bam')
         self.final_assembly_read_depths = os.path.join(self.root_dir, 'assembly.reads_mapped.bam.read_depths.gz')
         self.final_assembly_vcf = os.path.join(self.root_dir, 'assembly.reads_mapped.bam.vcf')
+        self.final_assembled_genes_fa = os.path.join(self.root_dir, 'assembly.genes.fa')
         self.final_assembly = {}
         self.mummer_variants = {}
         self.variant_depths = {}
@@ -525,6 +526,38 @@ def _nucmer_hits_to_gene_cov_per_contig(self):
         return cov
 
 
+    @staticmethod
+    def _nucmer_hits_to_assembled_gene_sequences(nucmer_hits, ref_gene, assembly, outfile):
+        f = pyfastaq.utils.open_file_write(outfile)
+
+        for contig in sorted(nucmer_hits):
+            for hit in nucmer_hits[contig]:
+                qry_coords = hit.qry_coords()
+                fa = assembly[hit.qry_name].subseq(qry_coords.start, qry_coords.end + 1)
+                if hit.on_same_strand():
+                    strand = '+'
+                else:
+                    fa.revcomp()
+                    strand = '-'
+                ref_coords = hit.ref_coords()
+                fa.id = '.'.join([
+                    ref_gene.id,
+                    str(ref_coords.start + 1),
+                    str(ref_coords.end + 1),
+                    contig,
+                    str(qry_coords.start + 1),
+                    str(qry_coords.end + 1),
+                    strand
+                ])
+
+                if hit.hit_length_ref == hit.ref_length:
+                    fa.id += '.complete'
+
+                print(fa, file=f)
+
+        pyfastaq.utils.close(f)
+
+
     def _whole_gene_covered_by_nucmer_hits(self):
         covered = self._nucmer_hits_to_ref_coords()
         pyfastaq.intervals.merge_overlapping_in_list(covered)
@@ -1046,6 +1079,7 @@ def run(self):
             self._update_flag_from_nucmer_file()
             self._make_assembly_vcf()
             self._get_vcf_variant_counts()
+            self._nucmer_hits_to_assembled_gene_sequences(self.nucmer_hits, self.gene, self.final_assembly, self.final_assembled_genes_fa)
 
         self._make_report_lines()
         self._clean()
diff --git a/ariba/clusters.py b/ariba/clusters.py
@@ -44,7 +44,6 @@ def __init__(self,
       run_cd_hit=True,
       clean=1,
     ):
-        self.db_fasta = os.path.abspath(db_fasta)
         self.reads_1 = os.path.abspath(reads_1)
         self.reads_2 = os.path.abspath(reads_2)
         self.outdir = os.path.abspath(outdir)
@@ -57,12 +56,13 @@ def __init__(self,
         self.assembly_kmer = assembly_kmer
         self.spades_other = spades_other
 
-        self.db_fasta_clustered = os.path.join(self.outdir, 'genes.clustered.fa')
+        self.db_fasta_clustered = os.path.join(self.outdir, 'input_genes.clustered.fa')
         self.cluster_ids = {}
         self.bam_prefix = os.path.join(self.outdir, 'map_all_reads')
         self.bam = self.bam_prefix + '.bam'
         self.report_file_tsv = os.path.join(self.outdir, 'report.tsv')
         self.report_file_xls = os.path.join(self.outdir, 'report.xls')
+        self.catted_assembled_genes_fasta = os.path.join(self.outdir, 'assembled_genes.fa')
         self.threads = threads
         self.verbose = verbose
 
@@ -120,6 +120,10 @@ def __init__(self,
             except:
                 raise Error('Error mkdir ' + d)
 
+        self.db_fasta = os.path.join(self.outdir, 'input_genes.not_clustered.fa')
+        pyfastaq.tasks.to_fasta(db_fasta, self.db_fasta, check_unique=True)
+
+
     def _run_cdhit(self):
         r = cdhit.Runner(
             self.db_fasta,
@@ -357,12 +361,27 @@ def _write_reports(self):
         workbook.save(self.report_file_xls)
 
 
+    def _write_catted_assembled_genes_fasta(self):
+        f = pyfastaq.utils.open_file_write(self.catted_assembled_genes_fasta)
+
+        for gene in sorted(self.clusters):
+            cluster_fasta = self.clusters[gene].final_assembled_genes_fa
+            if os.path.exists(cluster_fasta):
+                file_reader = pyfastaq.sequences.file_reader(cluster_fasta)
+                for seq in file_reader:
+                    print(seq, file=f)
+
+        pyfastaq.utils.close(f)
+
+
     def _clean(self):
         to_clean = [
             [
             ],
             [
-                self.bam
+                self.bam,
+                self.db_fasta,
+                self.db_fasta + '.fai',
             ],
             [
                 self.db_fasta_clustered,
@@ -400,14 +419,22 @@ def run(self):
             print('Finished mapping\n')
             print('{:_^79}'.format(' Generating clusters '), flush=True)
         self._bam_to_clusters_reads()
-        self._set_insert_size_data()
-        if self.verbose:
-            print('{:_^79}'.format(' Assembling each cluster '), flush=True)
-        self._init_and_run_clusters()
+        if len(self.cluster_to_dir) > 0:
+            self._set_insert_size_data()
+            if self.verbose:
+                print('{:_^79}'.format(' Assembling each cluster '), flush=True)
+            self._init_and_run_clusters()
+            if self.verbose:
+                print('Finished assembling clusters\n')
+        else:
+            if self.verbose:
+                print('No reads mapped. Skipping all assemblies', flush=True)
+            print('WARNING: no reads mapped to reference genes. Therefore no local assemblies will be run', file=sys.stderr)
+
         if self.verbose:
-            print('Finished assembling clusters\n')
             print('{:_^79}'.format(' Writing report files '), flush=True)
         self._write_reports()
+        self._write_catted_assembled_genes_fasta()
         if self.verbose:
             print('Finished writing report files. Cleaning files', flush=True)
         self._clean()

diff --git a/ariba/common.py b/ariba/common.py
@@ -1,7 +1,7 @@
 import sys
 import subprocess
 
-version = '0.4.1'
+version = '0.5.0'
 
 def syscall(cmd, allow_fail=False, verbose=False):
     if verbose:

diff --git a/ariba/refcheck.py b/ariba/refcheck.py
@@ -5,78 +5,86 @@ class Error (Exception): pass
 
 
 class Checker:
-    def __init__(self, infile, min_length=1, max_length=10000):
+    def __init__(self, infile, min_length=1, max_length=10000, outprefix=None):
         self.infile = os.path.abspath(infile)
         if not os.path.exists(self.infile):
             raise Error('File not found: "' + self.infile + '". Cannot continue')
 
         self.min_length = min_length
         self.max_length = max_length
+        self.outprefix = outprefix
 
 
-    def check(self, error_code_on_exit=None):
+    def run(self):
         file_reader = pyfastaq.sequences.file_reader(self.infile)
-
-        for seq in file_reader:
-            if not seq.looks_like_gene():
-                return False, 'Not a gene', seq
-            elif len(seq) < self.min_length:
-                return False, 'Too short', seq
-            elif len(seq) > self.max_length:
-                return False, 'Too long', seq
-
-        return True, None, None
-
-
-    def fix(self, outprefix):
-        file_reader = pyfastaq.sequences.file_reader(self.infile)
-        old2new_out = outprefix + '.rename'
-        fasta_out = outprefix + '.fa'
-        bad_seqs_out = outprefix + '.removed.fa'
-        log_out = outprefix + '.log'
         names = {}
-        old2new_out_fh = pyfastaq.utils.open_file_write(old2new_out)
-        fasta_out_fh = pyfastaq.utils.open_file_write(fasta_out)
-        bad_seqs_out_fh = pyfastaq.utils.open_file_write(bad_seqs_out)
-        log_out_fh = pyfastaq.utils.open_file_write(log_out)
+
+        if self.outprefix is not None:
+            old2new_out = self.outprefix + '.rename'
+            fasta_out = self.outprefix + '.fa'
+            bad_seqs_out = self.outprefix + '.removed.fa'
+            log_out = self.outprefix + '.log'
+            old2new_out_fh = pyfastaq.utils.open_file_write(old2new_out)
+            fasta_out_fh = pyfastaq.utils.open_file_write(fasta_out)
+            bad_seqs_out_fh = pyfastaq.utils.open_file_write(bad_seqs_out)
+            log_out_fh = pyfastaq.utils.open_file_write(log_out)
 
         for seq in file_reader:
             seq.seq = seq.seq.upper()
             if len(seq) < self.min_length:
-                print(seq.id, 'Too short. Skipping', sep='\t', file=log_out_fh)
-                print(seq, file=bad_seqs_out_fh)
-                continue
+                if self.outprefix is None:
+                    return False, 'Too short', seq
+                else:
+                    print(seq.id, 'Too short. Skipping', sep='\t', file=log_out_fh)
+                    print(seq, file=bad_seqs_out_fh)
+                    continue
             elif len(seq) > self.max_length:
-                print(seq.id, 'Too long. Skipping', sep='\t', file=log_out_fh)
-                print(seq, file=bad_seqs_out_fh)
-                continue
-
+                if self.outprefix is None:
+                    return False, 'Too long', seq
+                else:
+                    print(seq.id, 'Too long. Skipping', sep='\t', file=log_out_fh)
+                    print(seq, file=bad_seqs_out_fh)
+                    continue
 
             if not seq.looks_like_gene():
-                seq.revcomp()
-                if seq.looks_like_gene():
-                    print(seq.id, 'Reverse complemented', sep='\t', file=log_out_fh)
+                if self.outprefix is None:
+                    return False, 'Not a gene', seq
                 else:
-                    print(seq.id, 'Does not look like a gene. Skipping', sep='\t', file=log_out_fh)
                     seq.revcomp()
-                    print(seq, file=bad_seqs_out_fh)
-                    continue
-
+                    if seq.looks_like_gene():
+                        print(seq.id, 'Reverse complemented', sep='\t', file=log_out_fh)
+                    else:
+                        print(seq.id, 'Does not look like a gene. Skipping', sep='\t', file=log_out_fh)
+                        seq.revcomp()
+                        print(seq, file=bad_seqs_out_fh)
+                        continue
+
             original_id = seq.id
             # replace unwanted characters with underscores
             to_replace = ' '
             seq.id = seq.id.translate(str.maketrans(to_replace, '_' *  len(to_replace)))
 
+            if self.outprefix is None and original_id != seq.id:
+                seq.id = original_id
+                return False, 'Name has spaces', seq
+
             if seq.id in names:
-                names[seq.id] += 1
-                seq.id += '.' + str(names[seq.id])
+                if self.outprefix is None:
+                    return False, 'Duplicate name', seq
+                else:
+                    names[seq.id] += 1
+                    seq.id += '.' + str(names[seq.id])
             else:
                 names[seq.id] = 1
 
-            print(original_id, seq.id, sep='\t', file=old2new_out_fh)
-            print(seq, file=fasta_out_fh)
+            if self.outprefix is not None:
+                print(original_id, seq.id, sep='\t', file=old2new_out_fh)
+                print(seq, file=fasta_out_fh)
 
-        pyfastaq.utils.close(fasta_out_fh)
-        pyfastaq.utils.close(bad_seqs_out_fh)
-        pyfastaq.utils.close(log_out_fh)
-        pyfastaq.utils.close(old2new_out_fh)
+        if self.outprefix is not None:
+            pyfastaq.utils.close(fasta_out_fh)
+            pyfastaq.utils.close(bad_seqs_out_fh)
+            pyfastaq.utils.close(log_out_fh)
+            pyfastaq.utils.close(old2new_out_fh)
+
+        return True, None, None
diff --git a/ariba/tasks/refcheck.py b/ariba/tasks/refcheck.py
@@ -18,14 +18,14 @@ def run():
     checker = ariba.refcheck.Checker(
         options.infile,
         min_length=options.min_length,
-        max_length=options.max_length
+        max_length=options.max_length,
+        outprefix=options.outprefix
     )
-
-    if options.outprefix:
-        checker.fix(options.outprefix)
-    else:
-        ok, reason, seq = checker.check()
-        if not ok:
-            print('The following sequence not OK, for the reason:', reason)
-            print(seq)
-            sys.exit(1)
+
+    ok, reason, seq = checker.run()
+
+    if options.outprefix is None and not ok:
+        print('The following sequence not OK, for the reason:', reason)
+        print(seq)
+        sys.exit(1)
+
diff --git a/ariba/tests/cluster_test.py b/ariba/tests/cluster_test.py
@@ -34,7 +34,7 @@ def clean_cluster_dir(d, exclude=None):
 def file2lines(filename):
     f = pyfastaq.utils.open_file_read(filename)
     lines = f.readlines()
-    pyfastaq.utils.close(f) 
+    pyfastaq.utils.close(f)
     return lines
 
 
@@ -303,8 +303,8 @@ def test_nucmer_hits_to_percent_identity(self):
             ]
         }
         expected = {'scaff1': round((90*10 + 100*34) / (10+34), 2), 'scaff2': 42.42}
-        c._nucmer_hits_to_percent_identity() 
-        self.assertEqual(expected, c.percent_identities) 
+        c._nucmer_hits_to_percent_identity()
+        self.assertEqual(expected, c.percent_identities)
 
 
     def test_nucmer_hits_to_scaff_coords(self):
@@ -396,12 +396,41 @@ def test_nucmer_hits_to_gene_cov_per_contig(self):
                 pymummer.alignment.Alignment('\t'.join(hits[2])),
             ]
         }
-        
+
         expected = {'contig1': 85, 'contig2': 11}
         self.assertEqual(expected, c._nucmer_hits_to_gene_cov_per_contig())
         clean_cluster_dir(cluster_dir)
 
 
+    def test_nucmer_hits_to_assembled_gene_sequences(self):
+        '''test _nucmer_hits_to_assembled_gene_sequences'''
+        ref_gene = pyfastaq.sequences.Fasta('ref_gene', 'ATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAG')
+        assembly = {
+            'contig1': pyfastaq.sequences.Fasta('contig1', 'CATCTATGCTGCATCGATCACTGACGTATCATCATCAGCGTACTGACGTATTAGTTTGTAATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAGACGTCGTACTACTGTATATGCATCGATCTGAA'),
+            'contig2': pyfastaq.sequences.Fasta('contig2', 'AGTGATATCCTGCGATCTATAATTTTTTTCGCGGGATCTTGAACGCGACGATGTTCGATAATTCAATGCAAAGGAGCGACCCGCAAGTACACAGGACTGCAAA')
+        }
+
+        hits = [
+            ['1', '147', '61', '207', '147', '147', '100.00', '147', '239', '1', '1', 'ref_gene', 'contig1'],
+            ['18', '120', '103', '1', '103', '103', '100.00', '147', '103', '1', '-1', 'ref_gene', 'contig2']
+        ]
+        nucmer_hits = {
+            'contig1': [
+                pymummer.alignment.Alignment('\t'.join(hits[0])),
+            ],
+            'contig2': [
+                pymummer.alignment.Alignment('\t'.join(hits[1])),
+            ]
+        }
+
+        assembly_fasta = os.path.join(data_dir, 'cluster_test_nucmer_hits_to_assembled_gene_sequences.assembly.fa')
+        tmp_outfile = 'tmp.test_nucmer_hits_to_assembled_gene_sequences.out.fa'
+        expected_outfile = os.path.join(data_dir, 'cluster_test_nucmer_hits_to_assembled_gene_sequences.expected.out.fa')
+        cluster.Cluster._nucmer_hits_to_assembled_gene_sequences(nucmer_hits, ref_gene, assembly, tmp_outfile)
+        self.assertTrue(filecmp.cmp(tmp_outfile, expected_outfile, shallow=False))
+        os.unlink(tmp_outfile)
+
+
     def test_whole_gene_covered_by_nucmer_hits(self):
         '''test _whole_gene_covered_by_nucmer_hits'''
         cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
@@ -637,7 +666,7 @@ def test_get_assembly_read_depths(self):
             ( ('ref1', 3), ('C', 'A,G', 42, '21,11,10') ),
             ( ('ref1', 4), ('C', 'AC', 41, '0,42') )
         ]
- 
+
         for t in tests:
             self.assertEqual(c._get_assembly_read_depths(t[0][0], t[0][1]), t[1])
 
@@ -652,7 +681,7 @@ def test_get_samtools_variant_positions(self):
             ('16__cat_2_M35190.scaffold.1', 179),
             ('16__cat_2_M35190.scaffold.1', 263),
             ('16__cat_2_M35190.scaffold.6', 93)
-        ] 
+        ]
         self.assertEqual(expected, c._get_samtools_variant_positions())
 
 
@@ -764,4 +793,4 @@ def test_make_report_lines_assembly_fail(self):
         ]
         self.assertEqual(expected, c.report_lines)
         clean_cluster_dir(cluster_dir)
-        
+