Commit
Merge pull request #139 from martinghunt/het_snp_bugs
Het snp bugs
martinghunt authored Sep 29, 2016
2 parents 6343add + b23daae commit 49e8da3
Showing 41 changed files with 495 additions and 397 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -78,7 +78,7 @@ to the following dependencies.
| MASH | `mash` | `$ARIBA_MASH` |


For example, you could specify an exact version of a Samtools executable
For example, you could specify an exact version of a bowtie2 executable
that you compiled and downloaded in your home directory (assuming BASH):

export ARIBA_BOWTIE2=$HOME/bowtie2-2.1.0/bowtie2
6 changes: 6 additions & 0 deletions ariba/clusters.py
@@ -372,6 +372,12 @@ def _init_and_run_clusters(self):

for cluster_name in sorted(self.cluster_to_dir):
counter += 1

if self.cluster_read_counts[cluster_name] <= 2:
if self.verbose:
print('Not constructing cluster ', cluster_name, ' because it only has ', self.cluster_read_counts[cluster_name], ' reads (', counter, ' of ', len(self.cluster_to_dir), ')', sep='')
continue

if self.verbose:
print('Constructing cluster ', cluster_name, ' (', counter, ' of ', len(self.cluster_to_dir), ')', sep='')
new_dir = self.cluster_to_dir[cluster_name]
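
For clarity, the new guard in isolation: clusters with two or fewer reads are no longer assembled. A minimal sketch assuming a plain {cluster_name: read_count} dict rather than the full Clusters object (the names here are illustrative, not ARIBA's API):

# Hypothetical standalone version of the guard added above; ARIBA applies
# the same test inside Clusters._init_and_run_clusters() using
# self.cluster_read_counts.
def clusters_to_construct(cluster_read_counts):
    '''Return the cluster names with more than 2 reads; smaller clusters are skipped.'''
    return [name for name in sorted(cluster_read_counts) if cluster_read_counts[name] > 2]

print(clusters_to_construct({'cls_a': 1, 'cls_b': 2, 'cls_c': 10}))
# ['cls_c']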
49 changes: 27 additions & 22 deletions ariba/report.py
@@ -31,8 +31,8 @@ class Error (Exception): pass
'ctg_end', # 23 end position of variant in contig
'ctg_nt', # 24 nucleotide(s) in contig at variant position
'smtls_total_depth', # 25 total read depth at variant start position in contig, reported by mpileup
'smtls_alt_nt', # 26 alt nucleotides on contig, reported by mpileup
'smtls_alt_depth', # 27 alt depth on contig, reported by mpileup
'smtls_nts', # 26 alt nucleotides on contig, reported by mpileup
'smtls_nts_depth', # 27 alt depth on contig, reported by mpileup
'var_description', # 28 description of variant from reference metadata
'free_text', # 29 other free text about reference sequence, from reference metadata
]
@@ -53,8 +53,8 @@ class Error (Exception): pass
'ctg_end',
'ctg_nt',
'smtls_total_depth',
'smtls_alt_nt',
'smtls_alt_depth',
'smtls_nts',
'smtls_nts_depth',
'var_description',
]

@@ -91,11 +91,12 @@ def _samtools_depths_at_known_snps_all_wild(sequence_meta, contig_name, cluster,
if ref_nuc_range is None:
return None

bases = []
ctg_nts = []
ref_nts = []
smtls_total_depths = []
smtls_alt_nts = []
smtls_alt_depths = []
smtls_nts = []
smtls_depths = []
contig_positions = []

for ref_position in range(ref_nuc_range[0], ref_nuc_range[1]+1, 1):
@@ -106,17 +107,19 @@ def _samtools_depths_at_known_snps_all_wild(sequence_meta, contig_name, cluster,
ref_nts.append(cluster.ref_sequence[ref_position])
contig_position, in_indel = nucmer_match.qry_coords_from_ref_coord(ref_position, variant_list)
contig_positions.append(contig_position)
ref, alt, total_depth, alt_depths = cluster.samtools_vars.get_depths_at_position(contig_name, contig_position)
ctg_nts.append(ref)
smtls_alt_nts.append(alt)
bases, total_depth, base_depths = cluster.samtools_vars.get_depths_at_position(contig_name, contig_position)
#ctg_nts.append(ref)
#samtools_nts.append(bases)
ctg_nts.append(cluster.assembly.sequences[contig_name][contig_position])
smtls_nts.append(bases)
smtls_total_depths.append(total_depth)
smtls_alt_depths.append(alt_depths)
smtls_depths.append(base_depths)

ctg_nts = ';'.join(ctg_nts) if len(ctg_nts) else '.'
ref_nts = ';'.join(ref_nts) if len(ref_nts) else '.'
smtls_alt_nts = ';'.join(smtls_alt_nts) if len(smtls_alt_nts) else '.'
smtls_nts = ';'.join(smtls_nts) if len(smtls_nts) else '.'
smtls_total_depths = ';'.join([str(x)for x in smtls_total_depths]) if len(smtls_total_depths) else '.'
smtls_alt_depths = ';'.join([str(x)for x in smtls_alt_depths]) if len(smtls_alt_depths) else '.'
smtls_depths = ';'.join([str(x)for x in smtls_depths]) if len(smtls_depths) else '.'
ctg_start = str(min(contig_positions) + 1) if contig_positions is not None else '.'
ctg_end = str(max(contig_positions) + 1) if contig_positions is not None else '.'

@@ -128,8 +131,8 @@ def _samtools_depths_at_known_snps_all_wild(sequence_meta, contig_name, cluster,
ctg_end,
ctg_nts,
smtls_total_depths,
smtls_alt_nts,
smtls_alt_depths
smtls_nts,
smtls_depths
]]


@@ -206,9 +209,9 @@ def _report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, pymum
depths_tuple = cluster.samtools_vars.get_depths_at_position(contig_name, var.qry_start)

if depths_tuple is not None:
smtls_alt_nt.append(depths_tuple[1])
smtls_total_depth.append(str(depths_tuple[2]))
smtls_alt_depth.append(str(depths_tuple[3]))
smtls_alt_nt.append(depths_tuple[0])
smtls_total_depth.append(str(depths_tuple[1]))
smtls_alt_depth.append(str(depths_tuple[2]))

smtls_total_depth = ';'.join(smtls_total_depth) if len(smtls_total_depth) else '.'
smtls_alt_nt = ';'.join(smtls_alt_nt) if len(smtls_alt_nt) else '.'
@@ -271,7 +274,9 @@ def _report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, pymum
var_string = None
else:
ref_nt = cluster.ref_sequence[ref_coord]
var_string = depths_tuple[0] + str(ref_coord + 1) + depths_tuple[1]
ctg_nt = cluster.assembly.sequences[contig_name][var_position]
alt_strings = [x for x in depths_tuple[0].split(',') if x != ctg_nt]
var_string = ctg_nt + str(ref_coord + 1) + ','.join(alt_strings)
ref_coord = str(ref_coord + 1)

if var_string not in reported_known_vars:
@@ -280,10 +285,10 @@ def _report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, pymum
'HET', # var_type
'.', '.', '.', var_string, '.', ref_coord, ref_coord, ref_nt, # var_seq_type ... ref_nt
str(var_position + 1), str(var_position + 1), # ctg_start, ctg_end
depths_tuple[0], # ctg_nt
str(depths_tuple[2]), # smtls_total_depth
depths_tuple[1], # smtls_alt_nt
str(depths_tuple[3]), # smtls_alt_depth
ctg_nt, # ctg_nt
str(depths_tuple[1]), # smtls_total_depth
depths_tuple[0], # smtls_alt_nt
str(depths_tuple[2]), # smtls_alt_depth
'.',
free_text_column,
]
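
The renamed report columns smtls_nts and smtls_nts_depth now carry every base mpileup saw at each variant position, with positions joined by ';' and the bases/depths at one position joined by ','. A hedged sketch of how a downstream script could split them back out (not ARIBA code; the example values are invented):

# Hypothetical helper for parsing the renamed report columns, assuming the
# layout built above: positions joined by ';', bases/depths per position
# joined by ','.
def parse_smtls_columns(smtls_nts, smtls_nts_depth):
    '''Return a list of (bases, depths) tuples, one per variant position.'''
    if smtls_nts == '.' or smtls_nts_depth == '.':
        return []
    positions = []
    for nts, depths in zip(smtls_nts.split(';'), smtls_nts_depth.split(';')):
        bases = nts.split(',')
        depth_values = [int(x) for x in depths.split(',')]
        positions.append((bases, depth_values))
    return positions

# Example with made-up values: one homozygous position and one het position.
print(parse_smtls_columns('A;C,T', '42;18,20'))
# [(['A'], [42]), (['C', 'T'], [18, 20])]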
5 changes: 3 additions & 2 deletions ariba/samtools_variants.py
@@ -76,7 +76,8 @@ def _get_read_depths(cls, read_depths_file, sequence_name, position):

if len(rows) == 1:
r, p, ref_base, alt_base, ref_counts, alt_counts = rows[0].rstrip().split()
return ref_base, alt_base, int(ref_counts), alt_counts
bases = ref_base if alt_base == '.' else ref_base + ',' + alt_base
return bases, int(ref_counts), alt_counts
else:
return None

@@ -161,7 +162,7 @@ def get_depths_at_position(self, seq_name, position):
if seq_name in d and position in d[seq_name]:
return d[seq_name][position]
else:
return 'ND', 'ND', 'ND', 'ND'
return 'ND', 'ND', 'ND'


def run(self):
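
With this change, get_depths_at_position() returns a 3-tuple (bases seen, total depth, per-base depths) instead of the old 4-tuple (ref base, alt base, depth, alt depths), and 'ND', 'ND', 'ND' when the position is absent from the depths file. A small caller-side sketch using invented values rather than real mpileup output:

# Illustrative only: shows how the tuple shape changes for callers.
# Old shape: ('C', 'T', 31, '18,13')  ->  ref base, alt base, depth, alt depths
# New shape: ('C,T', 31, '18,13')     ->  all bases seen, total depth, per-base depths

def describe_position(depths_tuple):
    '''Summarise the new 3-tuple returned by get_depths_at_position().'''
    bases, total_depth, base_depths = depths_tuple
    if bases == 'ND':
        return 'no data at this position'
    return 'bases %s, total depth %s, per-base depths %s' % (bases, total_depth, base_depths)

print(describe_position(('C,T', 31, '18,13')))   # het position
print(describe_position(('A', 40, '40')))        # homozygous position
print(describe_position(('ND', 'ND', 'ND')))     # position not in the depths file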
15 changes: 9 additions & 6 deletions ariba/summary.py
@@ -126,13 +126,15 @@ def _gather_unfiltered_output_data(self):
if variant.var_group not in seen_groups:
seen_groups[variant.var_group] = {'yes': 0, 'het': 0}

if variant.het_percent is None:
seen_groups[variant.var_group]['yes'] += 1
this_cluster_dict['groups'][variant.var_group] = 'yes'
else:
if variant.het_percent is not None:
this_cluster_dict['groups'][variant.var_group + '.%'] = variant.het_percent

if variant.is_het:
seen_groups[variant.var_group]['het'] += 1
this_cluster_dict['groups'][variant.var_group] = 'het'
this_cluster_dict['groups'][variant.var_group + '.%'] = variant.het_percent
else:
seen_groups[variant.var_group]['yes'] += 1
this_cluster_dict['groups'][variant.var_group] = 'yes'

for group, d in seen_groups.items():
if d['het'] > 0 and d['het'] + d['yes'] > 1:
@@ -272,7 +274,8 @@ def _add_phandango_colour_columns(cls, header, matrix):
@classmethod
def _matrix_to_csv(cls, matrix, header, outfile, remove_nas=False):
f = pyfastaq.utils.open_file_write(outfile)
print(*header, sep=',', file=f)
fixed_header = [x.replace(',', '/') for x in header]
print(*fixed_header, sep=',', file=f)
for line in matrix:
if remove_nas:
new_line = ['' if x=='NA' else x for x in line]
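
The _matrix_to_csv() change protects against a column name that itself contains a comma, which would otherwise add a spurious column to the CSV header. A minimal sketch of the same idea on plain lists (the group name in the example is made up):

# Sketch of the header sanitisation used above: commas inside a column
# name are replaced with '/' so the header stays aligned with the rows.
def matrix_to_csv_lines(header, matrix):
    '''Return CSV lines with comma-safe header names.'''
    fixed_header = [x.replace(',', '/') for x in header]
    lines = [','.join(fixed_header)]
    for row in matrix:
        lines.append(','.join(str(x) for x in row))
    return lines

for line in matrix_to_csv_lines(['name', 'groupA,23S.%'], [['sample1', 12.5]]):
    print(line)
# name,groupA/23S.%
# sample1,12.5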
87 changes: 59 additions & 28 deletions ariba/summary_cluster.py
@@ -126,42 +126,70 @@ def _to_cluster_summary_assembled(self):

@classmethod
def _has_known_variant(cls, data_dict):
return data_dict['has_known_var'] == '1'
if data_dict['has_known_var'] == '1':
return 'yes'
elif data_dict['known_var'] == '0':
return 'no'
elif data_dict['gene'] == '1': # we don't yet call hets in genes
return 'no'
else:
cluster_var = summary_cluster_variant.SummaryClusterVariant(data_dict)
return 'het' if cluster_var.is_het else 'no'


def _has_any_known_variant(self):
for d in self.data:
if self._has_known_variant(d):
return 'yes'
return 'no'
results = {self._has_known_variant(d) for d in self.data}
if 'yes' in results:
return 'yes'
else:
return 'het' if 'het' in results else 'no'


@classmethod
def _has_nonsynonymous(cls, data_dict):
return data_dict['ref_ctg_effect'] != 'SYN' and \
(
data_dict['has_known_var'] == '1' or \
(data_dict['known_var'] != '1' and (data_dict['ref_ctg_change'] != '.' or data_dict['ref_ctg_effect'] != '.'))
)
cluster_var = summary_cluster_variant.SummaryClusterVariant(data_dict)

has_non_het = data_dict['ref_ctg_effect'] != 'SYN' and \
(
data_dict['has_known_var'] == '1' or \
(data_dict['known_var'] != '1' and (data_dict['ref_ctg_change'] != '.' or data_dict['ref_ctg_effect'] != '.'))
)

if has_non_het and not cluster_var.is_het:
return 'yes'
else:
return 'het' if cluster_var.is_het else 'no'


def _has_any_nonsynonymous(self):
for d in self.data:
if self._has_nonsynonymous(d):
return 'yes'
return 'no'
results = {SummaryCluster._has_nonsynonymous(d) for d in self.data}

if 'yes' in results:
return 'yes'
else:
return 'het' if 'het' in results else 'no'


@classmethod
def _has_novel_nonsynonymous(cls, data_dict):
return SummaryCluster._has_nonsynonymous(data_dict) and not SummaryCluster._has_known_variant(data_dict)
has_nonsynon = SummaryCluster._has_nonsynonymous(data_dict)
if has_nonsynon == 'no':
return 'no'
else:
has_known = SummaryCluster._has_known_variant(data_dict)
if has_known == 'no':
return has_nonsynon
else:
return 'no'


def _has_any_novel_nonsynonymous(self):
for d in self.data:
if self._has_novel_nonsynonymous(d):
return 'yes'
return 'no'
results = {SummaryCluster._has_novel_nonsynonymous(d) for d in self.data}

if 'yes' in results:
return 'yes'
else:
return 'het' if 'het' in results else 'no'


def _to_cluster_summary_has_known_nonsynonymous(self, assembled_summary):
@@ -198,12 +226,12 @@ def _get_known_noncoding_het_snp(data_dict):
return None

if data_dict['known_var'] == '1' and data_dict['ref_ctg_effect'] == 'SNP' \
and data_dict['smtls_alt_nt'] != '.' and ';' not in data_dict['smtls_alt_nt']:
nucleotides = [data_dict['ctg_nt']] + data_dict['smtls_alt_nt'].split(',')
depths = data_dict['smtls_alt_depth'].split(',')
and data_dict['smtls_nts'] != '.' and ';' not in data_dict['smtls_nts']:
nucleotides = data_dict['smtls_nts'].split(',')
depths = data_dict['smtls_nts_depth'].split(',')

if len(nucleotides) != len(depths):
raise Error('Mismatch in number of inferred nucleotides from ctg_nt, smtls_alt_nt, smtls_alt_depth columns. Cannot continue\n' + str(data_dict))
raise Error('Mismatch in number of inferred nucleotides from ctg_nt, smtls_nts, smtls_nts_depth columns. Cannot continue\n' + str(data_dict))

try:
var_nucleotide = data_dict['known_var_change'][-1]
Expand All @@ -220,14 +248,13 @@ def _get_known_noncoding_het_snp(data_dict):
return None



@staticmethod
def _get_nonsynonymous_var(data_dict):
'''if data_dict has a non synonymous variant, return string:
ref_name.change. Otherwise return None'''
has_nonsyn = SummaryCluster._has_nonsynonymous(data_dict)

if not has_nonsyn:
if has_nonsyn == 'no':
return None
elif data_dict['known_var_change'] == data_dict['ref_ctg_change'] == '.' == data_dict['ref_ctg_effect']:
raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change, ref_ctg_change, ref_ctg_effect all equal to ".", but has a non synonymous change. Something is inconsistent. Cannot continue')
@@ -251,6 +278,7 @@ def _get_nonsynonymous_var(data_dict):

return (data_dict['ref_name'], var_change) + var_group


def _has_match(self, assembled_summary):
'''assembled_summary should be output of _to_cluster_summary_assembled'''
if assembled_summary.startswith('yes'):
Expand All @@ -266,7 +294,7 @@ def has_var_groups(self):
'''Returns a set of the variant group ids that this cluster has'''
ids = set()
for d in self.data:
if self._has_known_variant(d) and d['var_group'] != '.':
if self._has_known_variant(d) != 'no' and d['var_group'] != '.':
ids.add(d['var_group'])
return ids

@@ -298,7 +326,10 @@ def known_noncoding_het_snps(self):
for d in self.data:
snp_tuple = self._get_known_noncoding_het_snp(d)
if snp_tuple is not None:
snp_id = d['var_description'].split(':')[4]
try:
snp_id = d['var_description'].split(':')[4]
except:
raise Error('Error getting ID from ' + str(d) + '\n' + d['var_description'])
if snp_id not in snps:
snps[snp_id] = {}
snps[snp_id][snp_tuple[0]] = snp_tuple[1]
@@ -311,7 +342,7 @@ def _get_all_nonsynon_variants_set(cls, data_dicts):

for data_dict in data_dicts:
cluster_var = summary_cluster_variant.SummaryClusterVariant(data_dict)
if cluster_var.has_nonsynon:
if cluster_var.has_nonsynon or cluster_var.is_het:
variants.add(cluster_var)

return variants
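
Several predicates above now return 'yes', 'het' or 'no' instead of a boolean, and the _has_any_* wrappers all combine per-row answers the same way: 'yes' wins, then 'het', then 'no'. That shared aggregation rule on its own, with invented row values:

# Sketch of the aggregation pattern used by _has_any_known_variant,
# _has_any_nonsynonymous and _has_any_novel_nonsynonymous above.
def aggregate_yes_het_no(per_row_answers):
    '''Combine per-row 'yes'/'het'/'no' answers: 'yes' wins, then 'het', then 'no'.'''
    results = set(per_row_answers)
    if 'yes' in results:
        return 'yes'
    return 'het' if 'het' in results else 'no'

print(aggregate_yes_het_no(['no', 'het', 'no']))   # het
print(aggregate_yes_het_no(['het', 'yes']))        # yes
print(aggregate_yes_het_no(['no', 'no']))          # no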
(Diffs for the remaining 35 changed files are not shown.)
