Commit
Merge pull request #139 from martinghunt/het_snp_bugs
Het snp bugs
martinghunt authored Sep 29, 2016
2 parents 6343add + b23daae commit 49e8da3
Showing 41 changed files with 495 additions and 397 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -78,7 +78,7 @@ to the following dependencies.
| MASH | `mash` | `$ARIBA_MASH` |


For example, you could specify an exact version of a Samtools executable
For example, you could specify an exact version of a bowtie2 executable
that you compiled and downloaded in your home directory (assuming BASH):

export ARIBA_BOWTIE2=$HOME/bowtie2-2.1.0/bowtie2
6 changes: 6 additions & 0 deletions ariba/clusters.py
@@ -372,6 +372,12 @@ def _init_and_run_clusters(self):

for cluster_name in sorted(self.cluster_to_dir):
counter += 1

if self.cluster_read_counts[cluster_name] <= 2:
if self.verbose:
print('Not constructing cluster ', cluster_name, ' because it only has ', self.cluster_read_counts[cluster_name], ' reads (', counter, ' of ', len(self.cluster_to_dir), ')', sep='')
continue

if self.verbose:
print('Constructing cluster ', cluster_name, ' (', counter, ' of ', len(self.cluster_to_dir), ')', sep='')
new_dir = self.cluster_to_dir[cluster_name]
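
For clarity, the new guard in isolation: clusters with two or fewer reads are no longer assembled. A minimal sketch assuming a plain {cluster_name: read_count} dict rather than the full Clusters object (the names here are illustrative, not ARIBA's API):

# Hypothetical standalone version of the guard added above; ARIBA applies
# the same test inside Clusters._init_and_run_clusters() using
# self.cluster_read_counts.
def clusters_to_construct(cluster_read_counts):
    '''Return the cluster names with more than 2 reads; smaller clusters are skipped.'''
    return [name for name in sorted(cluster_read_counts) if cluster_read_counts[name] > 2]

print(clusters_to_construct({'cls_a': 1, 'cls_b': 2, 'cls_c': 10}))
# ['cls_c']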
49 changes: 27 additions & 22 deletions ariba/report.py
@@ -31,8 +31,8 @@ class Error (Exception): pass
'ctg_end', # 23 end position of variant in contig
'ctg_nt', # 24 nucleotide(s) in contig at variant position
'smtls_total_depth', # 25 total read depth at variant start position in contig, reported by mpileup
'smtls_alt_nt', # 26 alt nucleotides on contig, reported by mpileup
'smtls_alt_depth', # 27 alt depth on contig, reported by mpileup
'smtls_nts', # 26 alt nucleotides on contig, reported by mpileup
'smtls_nts_depth', # 27 alt depth on contig, reported by mpileup
'var_description', # 28 description of variant from reference metadata
'free_text', # 29 other free text about reference sequence, from reference metadata
]
@@ -53,8 +53,8 @@ class Error (Exception): pass
'ctg_end',
'ctg_nt',
'smtls_total_depth',
'smtls_alt_nt',
'smtls_alt_depth',
'smtls_nts',
'smtls_nts_depth',
'var_description',
]

@@ -91,11 +91,12 @@ def _samtools_depths_at_known_snps_all_wild(sequence_meta, contig_name, cluster,
if ref_nuc_range is None:
return None

bases = []
ctg_nts = []
ref_nts = []
smtls_total_depths = []
smtls_alt_nts = []
smtls_alt_depths = []
smtls_nts = []
smtls_depths = []
contig_positions = []

for ref_position in range(ref_nuc_range[0], ref_nuc_range[1]+1, 1):
@@ -106,17 +107,19 @@ def _samtools_depths_at_known_snps_all_wild(sequence_meta, contig_name, cluster,
ref_nts.append(cluster.ref_sequence[ref_position])
contig_position, in_indel = nucmer_match.qry_coords_from_ref_coord(ref_position, variant_list)
contig_positions.append(contig_position)
ref, alt, total_depth, alt_depths = cluster.samtools_vars.get_depths_at_position(contig_name, contig_position)
ctg_nts.append(ref)
smtls_alt_nts.append(alt)
bases, total_depth, base_depths = cluster.samtools_vars.get_depths_at_position(contig_name, contig_position)
#ctg_nts.append(ref)
#samtools_nts.append(bases)
ctg_nts.append(cluster.assembly.sequences[contig_name][contig_position])
smtls_nts.append(bases)
smtls_total_depths.append(total_depth)
smtls_alt_depths.append(alt_depths)
smtls_depths.append(base_depths)

ctg_nts = ';'.join(ctg_nts) if len(ctg_nts) else '.'
ref_nts = ';'.join(ref_nts) if len(ref_nts) else '.'
smtls_alt_nts = ';'.join(smtls_alt_nts) if len(smtls_alt_nts) else '.'
smtls_nts = ';'.join(smtls_nts) if len(smtls_nts) else '.'
smtls_total_depths = ';'.join([str(x)for x in smtls_total_depths]) if len(smtls_total_depths) else '.'
smtls_alt_depths = ';'.join([str(x)for x in smtls_alt_depths]) if len(smtls_alt_depths) else '.'
smtls_depths = ';'.join([str(x)for x in smtls_depths]) if len(smtls_depths) else '.'
ctg_start = str(min(contig_positions) + 1) if contig_positions is not None else '.'
ctg_end = str(max(contig_positions) + 1) if contig_positions is not None else '.'

@@ -128,8 +131,8 @@ def _samtools_depths_at_known_snps_all_wild(sequence_meta, contig_name, cluster,
ctg_end,
ctg_nts,
smtls_total_depths,
smtls_alt_nts,
smtls_alt_depths
smtls_nts,
smtls_depths
]]


@@ -206,9 +209,9 @@ def _report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, pymum
depths_tuple = cluster.samtools_vars.get_depths_at_position(contig_name, var.qry_start)

if depths_tuple is not None:
smtls_alt_nt.append(depths_tuple[1])
smtls_total_depth.append(str(depths_tuple[2]))
smtls_alt_depth.append(str(depths_tuple[3]))
smtls_alt_nt.append(depths_tuple[0])
smtls_total_depth.append(str(depths_tuple[1]))
smtls_alt_depth.append(str(depths_tuple[2]))

smtls_total_depth = ';'.join(smtls_total_depth) if len(smtls_total_depth) else '.'
smtls_alt_nt = ';'.join(smtls_alt_nt) if len(smtls_alt_nt) else '.'
@@ -271,7 +274,9 @@ def _report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, pymum
var_string = None
else:
ref_nt = cluster.ref_sequence[ref_coord]
var_string = depths_tuple[0] + str(ref_coord + 1) + depths_tuple[1]
ctg_nt = cluster.assembly.sequences[contig_name][var_position]
alt_strings = [x for x in depths_tuple[0].split(',') if x != ctg_nt]
var_string = ctg_nt + str(ref_coord + 1) + ','.join(alt_strings)
ref_coord = str(ref_coord + 1)

if var_string not in reported_known_vars:
@@ -280,10 +285,10 @@ def _report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, pymum
'HET', # var_type
'.', '.', '.', var_string, '.', ref_coord, ref_coord, ref_nt, # var_seq_type ... ref_nt
str(var_position + 1), str(var_position + 1), # ctg_start, ctg_end
depths_tuple[0], # ctg_nt
str(depths_tuple[2]), # smtls_total_depth
depths_tuple[1], # smtls_alt_nt
str(depths_tuple[3]), # smtls_alt_depth
ctg_nt, # ctg_nt
str(depths_tuple[1]), # smtls_total_depth
depths_tuple[0], # smtls_alt_nt
str(depths_tuple[2]), # smtls_alt_depth
'.',
free_text_column,
]
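
The renamed report columns smtls_nts and smtls_nts_depth now carry every base mpileup saw at each variant position, with positions joined by ';' and the bases/depths at one position joined by ','. A hedged sketch of how a downstream script could split them back out (not ARIBA code; the example values are invented):

# Hypothetical helper for parsing the renamed report columns, assuming the
# layout built above: positions joined by ';', bases/depths per position
# joined by ','.
def parse_smtls_columns(smtls_nts, smtls_nts_depth):
    '''Return a list of (bases, depths) tuples, one per variant position.'''
    if smtls_nts == '.' or smtls_nts_depth == '.':
        return []
    positions = []
    for nts, depths in zip(smtls_nts.split(';'), smtls_nts_depth.split(';')):
        bases = nts.split(',')
        depth_values = [int(x) for x in depths.split(',')]
        positions.append((bases, depth_values))
    return positions

# Example with made-up values: one homozygous position and one het position.
print(parse_smtls_columns('A;C,T', '42;18,20'))
# [(['A'], [42]), (['C', 'T'], [18, 20])]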
5 changes: 3 additions & 2 deletions ariba/samtools_variants.py
@@ -76,7 +76,8 @@ def _get_read_depths(cls, read_depths_file, sequence_name, position):

if len(rows) == 1:
r, p, ref_base, alt_base, ref_counts, alt_counts = rows[0].rstrip().split()
return ref_base, alt_base, int(ref_counts), alt_counts
bases = ref_base if alt_base == '.' else ref_base + ',' + alt_base
return bases, int(ref_counts), alt_counts
else:
return None

@@ -161,7 +162,7 @@ def get_depths_at_position(self, seq_name, position):
if seq_name in d and position in d[seq_name]:
return d[seq_name][position]
else:
return 'ND', 'ND', 'ND', 'ND'
return 'ND', 'ND', 'ND'


def run(self):
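
With this change, get_depths_at_position() returns a 3-tuple (bases seen, total depth, per-base depths) instead of the old 4-tuple (ref base, alt base, depth, alt depths), and 'ND', 'ND', 'ND' when the position is absent from the depths file. A small caller-side sketch using invented values rather than real mpileup output:

# Illustrative only: shows how the tuple shape changes for callers.
# Old shape: ('C', 'T', 31, '18,13')  ->  ref base, alt base, depth, alt depths
# New shape: ('C,T', 31, '18,13')     ->  all bases seen, total depth, per-base depths

def describe_position(depths_tuple):
    '''Summarise the new 3-tuple returned by get_depths_at_position().'''
    bases, total_depth, base_depths = depths_tuple
    if bases == 'ND':
        return 'no data at this position'
    return 'bases %s, total depth %s, per-base depths %s' % (bases, total_depth, base_depths)

print(describe_position(('C,T', 31, '18,13')))   # het position
print(describe_position(('A', 40, '40')))        # homozygous position
print(describe_position(('ND', 'ND', 'ND')))     # position not in the depths file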
15 changes: 9 additions & 6 deletions ariba/summary.py
@@ -126,13 +126,15 @@ def _gather_unfiltered_output_data(self):
if variant.var_group not in seen_groups:
seen_groups[variant.var_group] = {'yes': 0, 'het': 0}

if variant.het_percent is None:
seen_groups[variant.var_group]['yes'] += 1
this_cluster_dict['groups'][variant.var_group] = 'yes'
else:
if variant.het_percent is not None:
this_cluster_dict['groups'][variant.var_group + '.%'] = variant.het_percent

if variant.is_het:
seen_groups[variant.var_group]['het'] += 1
this_cluster_dict['groups'][variant.var_group] = 'het'
this_cluster_dict['groups'][variant.var_group + '.%'] = variant.het_percent
else:
seen_groups[variant.var_group]['yes'] += 1
this_cluster_dict['groups'][variant.var_group] = 'yes'

for group, d in seen_groups.items():
if d['het'] > 0 and d['het'] + d['yes'] > 1:
@@ -272,7 +274,8 @@ def _add_phandango_colour_columns(cls, header, matrix):
@classmethod
def _matrix_to_csv(cls, matrix, header, outfile, remove_nas=False):
f = pyfastaq.utils.open_file_write(outfile)
print(*header, sep=',', file=f)
fixed_header = [x.replace(',', '/') for x in header]
print(*fixed_header, sep=',', file=f)
for line in matrix:
if remove_nas:
new_line = ['' if x=='NA' else x for x in line]
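
The _matrix_to_csv() change protects against a column name that itself contains a comma, which would otherwise add a spurious column to the CSV header. A minimal sketch of the same idea on plain lists (the group name in the example is made up):

# Sketch of the header sanitisation used above: commas inside a column
# name are replaced with '/' so the header stays aligned with the rows.
def matrix_to_csv_lines(header, matrix):
    '''Return CSV lines with comma-safe header names.'''
    fixed_header = [x.replace(',', '/') for x in header]
    lines = [','.join(fixed_header)]
    for row in matrix:
        lines.append(','.join(str(x) for x in row))
    return lines

for line in matrix_to_csv_lines(['name', 'groupA,23S.%'], [['sample1', 12.5]]):
    print(line)
# name,groupA/23S.%
# sample1,12.5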
87 changes: 59 additions & 28 deletions ariba/summary_cluster.py
@@ -126,42 +126,70 @@ def _to_cluster_summary_assembled(self):

@classmethod
def _has_known_variant(cls, data_dict):
return data_dict['has_known_var'] == '1'
if data_dict['has_known_var'] == '1':
return 'yes'
elif data_dict['known_var'] == '0':
return 'no'
elif data_dict['gene'] == '1': # we don't yet call hets in genes
return 'no'
else:
cluster_var = summary_cluster_variant.SummaryClusterVariant(data_dict)
return 'het' if cluster_var.is_het else 'no'


def _has_any_known_variant(self):
for d in self.data:
if self._has_known_variant(d):
return 'yes'
return 'no'
results = {self._has_known_variant(d) for d in self.data}
if 'yes' in results:
return 'yes'
else:
return 'het' if 'het' in results else 'no'


@classmethod
def _has_nonsynonymous(cls, data_dict):
return data_dict['ref_ctg_effect'] != 'SYN' and \
(
data_dict['has_known_var'] == '1' or \
(data_dict['known_var'] != '1' and (data_dict['ref_ctg_change'] != '.' or data_dict['ref_ctg_effect'] != '.'))
)
cluster_var = summary_cluster_variant.SummaryClusterVariant(data_dict)

has_non_het = data_dict['ref_ctg_effect'] != 'SYN' and \
(
data_dict['has_known_var'] == '1' or \
(data_dict['known_var'] != '1' and (data_dict['ref_ctg_change'] != '.' or data_dict['ref_ctg_effect'] != '.'))
)

if has_non_het and not cluster_var.is_het:
return 'yes'
else:
return 'het' if cluster_var.is_het else 'no'


def _has_any_nonsynonymous(self):
for d in self.data:
if self._has_nonsynonymous(d):
return 'yes'
return 'no'
results = {SummaryCluster._has_nonsynonymous(d) for d in self.data}

if 'yes' in results:
return 'yes'
else:
return 'het' if 'het' in results else 'no'


@classmethod
def _has_novel_nonsynonymous(cls, data_dict):
return SummaryCluster._has_nonsynonymous(data_dict) and not SummaryCluster._has_known_variant(data_dict)
has_nonsynon = SummaryCluster._has_nonsynonymous(data_dict)
if has_nonsynon == 'no':
return 'no'
else:
has_known = SummaryCluster._has_known_variant(data_dict)
if has_known == 'no':
return has_nonsynon
else:
return 'no'


def _has_any_novel_nonsynonymous(self):
for d in self.data:
if self._has_novel_nonsynonymous(d):
return 'yes'
return 'no'
results = {SummaryCluster._has_novel_nonsynonymous(d) for d in self.data}

if 'yes' in results:
return 'yes'
else:
return 'het' if 'het' in results else 'no'


def _to_cluster_summary_has_known_nonsynonymous(self, assembled_summary):
@@ -198,12 +226,12 @@ def _get_known_noncoding_het_snp(data_dict):
return None

if data_dict['known_var'] == '1' and data_dict['ref_ctg_effect'] == 'SNP' \
and data_dict['smtls_alt_nt'] != '.' and ';' not in data_dict['smtls_alt_nt']:
nucleotides = [data_dict['ctg_nt']] + data_dict['smtls_alt_nt'].split(',')
depths = data_dict['smtls_alt_depth'].split(',')
and data_dict['smtls_nts'] != '.' and ';' not in data_dict['smtls_nts']:
nucleotides = data_dict['smtls_nts'].split(',')
depths = data_dict['smtls_nts_depth'].split(',')

if len(nucleotides) != len(depths):
raise Error('Mismatch in number of inferred nucleotides from ctg_nt, smtls_alt_nt, smtls_alt_depth columns. Cannot continue\n' + str(data_dict))
raise Error('Mismatch in number of inferred nucleotides from ctg_nt, smtls_nts, smtls_nts_depth columns. Cannot continue\n' + str(data_dict))

try:
var_nucleotide = data_dict['known_var_change'][-1]
Expand All @@ -220,14 +248,13 @@ def _get_known_noncoding_het_snp(data_dict):
return None



@staticmethod
def _get_nonsynonymous_var(data_dict):
'''if data_dict has a non synonymous variant, return string:
ref_name.change. Otherwise return None'''
has_nonsyn = SummaryCluster._has_nonsynonymous(data_dict)

if not has_nonsyn:
if has_nonsyn == 'no':
return None
elif data_dict['known_var_change'] == data_dict['ref_ctg_change'] == '.' == data_dict['ref_ctg_effect']:
raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change, ref_ctg_change, ref_ctg_effect all equal to ".", but has a non synonymous change. Something is inconsistent. Cannot continue')
@@ -251,6 +278,7 @@ def _get_nonsynonymous_var(data_dict):

return (data_dict['ref_name'], var_change) + var_group


def _has_match(self, assembled_summary):
'''assembled_summary should be output of _to_cluster_summary_assembled'''
if assembled_summary.startswith('yes'):
Expand All @@ -266,7 +294,7 @@ def has_var_groups(self):
'''Returns a set of the variant group ids that this cluster has'''
ids = set()
for d in self.data:
if self._has_known_variant(d) and d['var_group'] != '.':
if self._has_known_variant(d) != 'no' and d['var_group'] != '.':
ids.add(d['var_group'])
return ids

@@ -298,7 +326,10 @@ def known_noncoding_het_snps(self):
for d in self.data:
snp_tuple = self._get_known_noncoding_het_snp(d)
if snp_tuple is not None:
snp_id = d['var_description'].split(':')[4]
try:
snp_id = d['var_description'].split(':')[4]
except:
raise Error('Error getting ID from ' + str(d) + '\n' + d['var_description'])
if snp_id not in snps:
snps[snp_id] = {}
snps[snp_id][snp_tuple[0]] = snp_tuple[1]
@@ -311,7 +342,7 @@ def _get_all_nonsynon_variants_set(cls, data_dicts):

for data_dict in data_dicts:
cluster_var = summary_cluster_variant.SummaryClusterVariant(data_dict)
if cluster_var.has_nonsynon:
if cluster_var.has_nonsynon or cluster_var.is_het:
variants.add(cluster_var)

return variants
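
Several predicates above now return 'yes', 'het' or 'no' instead of a boolean, and the _has_any_* wrappers all combine per-row answers the same way: 'yes' wins, then 'het', then 'no'. That shared aggregation rule on its own, with invented row values:

# Sketch of the aggregation pattern used by _has_any_known_variant,
# _has_any_nonsynonymous and _has_any_novel_nonsynonymous above.
def aggregate_yes_het_no(per_row_answers):
    '''Combine per-row 'yes'/'het'/'no' answers: 'yes' wins, then 'het', then 'no'.'''
    results = set(per_row_answers)
    if 'yes' in results:
        return 'yes'
    return 'het' if 'het' in results else 'no'

print(aggregate_yes_het_no(['no', 'het', 'no']))   # het
print(aggregate_yes_het_no(['het', 'yes']))        # yes
print(aggregate_yes_het_no(['no', 'no']))          # no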
(Diffs for the remaining 35 changed files are not shown.)
