Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Het snp bugs #139

Merged
merged 15 commits into from
Sep 29, 2016
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ to the following dependencies.
| MASH | `mash` | `$ARIBA_MASH` |


For example, you could specify an exact version of a Samtools executable
For example, you could specify an exact version of a bowtie2 executable
that you compiled and downloaded in your home directory (assuming BASH):

export ARIBA_BOWTIE2=$HOME/bowtie2-2.1.0/bowtie2
Expand Down
6 changes: 6 additions & 0 deletions ariba/clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,12 @@ def _init_and_run_clusters(self):

for cluster_name in sorted(self.cluster_to_dir):
counter += 1

if self.cluster_read_counts[cluster_name] <= 2:
if self.verbose:
print('Not constructing cluster ', cluster_name, ' because it only has ', self.cluster_read_counts[cluster_name], ' reads (', counter, ' of ', len(self.cluster_to_dir), ')', sep='')
continue

if self.verbose:
print('Constructing cluster ', cluster_name, ' (', counter, ' of ', len(self.cluster_to_dir), ')', sep='')
new_dir = self.cluster_to_dir[cluster_name]
Expand Down
49 changes: 27 additions & 22 deletions ariba/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ class Error (Exception): pass
'ctg_end', # 23 end position of variant in contig
'ctg_nt', # 24 nucleotide(s) in contig at variant position
'smtls_total_depth', # 25 total read depth at variant start position in contig, reported by mpileup
'smtls_alt_nt', # 26 alt nucleotides on contig, reported by mpileup
'smtls_alt_depth', # 27 alt depth on contig, reported by mpileup
'smtls_nts', # 26 alt nucleotides on contig, reported by mpileup
'smtls_nts_depth', # 27 alt depth on contig, reported by mpileup
'var_description',      # 28 description of variant from reference metadata
'free_text', # 29 other free text about reference sequence, from reference metadata
]
Expand All @@ -53,8 +53,8 @@ class Error (Exception): pass
'ctg_end',
'ctg_nt',
'smtls_total_depth',
'smtls_alt_nt',
'smtls_alt_depth',
'smtls_nts',
'smtls_nts_depth',
'var_description',
]

Expand Down Expand Up @@ -91,11 +91,12 @@ def _samtools_depths_at_known_snps_all_wild(sequence_meta, contig_name, cluster,
if ref_nuc_range is None:
return None

bases = []
ctg_nts = []
ref_nts = []
smtls_total_depths = []
smtls_alt_nts = []
smtls_alt_depths = []
smtls_nts = []
smtls_depths = []
contig_positions = []

for ref_position in range(ref_nuc_range[0], ref_nuc_range[1]+1, 1):
Expand All @@ -106,17 +107,19 @@ def _samtools_depths_at_known_snps_all_wild(sequence_meta, contig_name, cluster,
ref_nts.append(cluster.ref_sequence[ref_position])
contig_position, in_indel = nucmer_match.qry_coords_from_ref_coord(ref_position, variant_list)
contig_positions.append(contig_position)
ref, alt, total_depth, alt_depths = cluster.samtools_vars.get_depths_at_position(contig_name, contig_position)
ctg_nts.append(ref)
smtls_alt_nts.append(alt)
bases, total_depth, base_depths = cluster.samtools_vars.get_depths_at_position(contig_name, contig_position)
#ctg_nts.append(ref)
#samtools_nts.append(bases)
ctg_nts.append(cluster.assembly.sequences[contig_name][contig_position])
smtls_nts.append(bases)
smtls_total_depths.append(total_depth)
smtls_alt_depths.append(alt_depths)
smtls_depths.append(base_depths)

ctg_nts = ';'.join(ctg_nts) if len(ctg_nts) else '.'
ref_nts = ';'.join(ref_nts) if len(ref_nts) else '.'
smtls_alt_nts = ';'.join(smtls_alt_nts) if len(smtls_alt_nts) else '.'
smtls_nts = ';'.join(smtls_nts) if len(smtls_nts) else '.'
smtls_total_depths = ';'.join([str(x)for x in smtls_total_depths]) if len(smtls_total_depths) else '.'
smtls_alt_depths = ';'.join([str(x)for x in smtls_alt_depths]) if len(smtls_alt_depths) else '.'
smtls_depths = ';'.join([str(x)for x in smtls_depths]) if len(smtls_depths) else '.'
ctg_start = str(min(contig_positions) + 1) if contig_positions is not None else '.'
ctg_end = str(max(contig_positions) + 1) if contig_positions is not None else '.'

Expand All @@ -128,8 +131,8 @@ def _samtools_depths_at_known_snps_all_wild(sequence_meta, contig_name, cluster,
ctg_end,
ctg_nts,
smtls_total_depths,
smtls_alt_nts,
smtls_alt_depths
smtls_nts,
smtls_depths
]]


Expand Down Expand Up @@ -206,9 +209,9 @@ def _report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, pymum
depths_tuple = cluster.samtools_vars.get_depths_at_position(contig_name, var.qry_start)

if depths_tuple is not None:
smtls_alt_nt.append(depths_tuple[1])
smtls_total_depth.append(str(depths_tuple[2]))
smtls_alt_depth.append(str(depths_tuple[3]))
smtls_alt_nt.append(depths_tuple[0])
smtls_total_depth.append(str(depths_tuple[1]))
smtls_alt_depth.append(str(depths_tuple[2]))

smtls_total_depth = ';'.join(smtls_total_depth) if len(smtls_total_depth) else '.'
smtls_alt_nt = ';'.join(smtls_alt_nt) if len(smtls_alt_nt) else '.'
Expand Down Expand Up @@ -271,7 +274,9 @@ def _report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, pymum
var_string = None
else:
ref_nt = cluster.ref_sequence[ref_coord]
var_string = depths_tuple[0] + str(ref_coord + 1) + depths_tuple[1]
ctg_nt = cluster.assembly.sequences[contig_name][var_position]
alt_strings = [x for x in depths_tuple[0].split(',') if x != ctg_nt]
var_string = ctg_nt + str(ref_coord + 1) + ','.join(alt_strings)
ref_coord = str(ref_coord + 1)

if var_string not in reported_known_vars:
Expand All @@ -280,10 +285,10 @@ def _report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, pymum
'HET', # var_type
'.', '.', '.', var_string, '.', ref_coord, ref_coord, ref_nt, # var_seq_type ... ref_nt
str(var_position + 1), str(var_position + 1), # ctg_start, ctg_end
depths_tuple[0], # ctg_nt
str(depths_tuple[2]), # smtls_total_depth
depths_tuple[1], # smtls_alt_nt
str(depths_tuple[3]), # smtls_alt_depth
ctg_nt, # ctg_nt
str(depths_tuple[1]), # smtls_total_depth
depths_tuple[0], # smtls_alt_nt
str(depths_tuple[2]), # smtls_alt_depth
'.',
free_text_column,
]
Expand Down
5 changes: 3 additions & 2 deletions ariba/samtools_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ def _get_read_depths(cls, read_depths_file, sequence_name, position):

if len(rows) == 1:
r, p, ref_base, alt_base, ref_counts, alt_counts = rows[0].rstrip().split()
return ref_base, alt_base, int(ref_counts), alt_counts
bases = ref_base if alt_base == '.' else ref_base + ',' + alt_base
return bases, int(ref_counts), alt_counts
else:
return None

Expand Down Expand Up @@ -161,7 +162,7 @@ def get_depths_at_position(self, seq_name, position):
if seq_name in d and position in d[seq_name]:
return d[seq_name][position]
else:
return 'ND', 'ND', 'ND', 'ND'
return 'ND', 'ND', 'ND'


def run(self):
Expand Down
15 changes: 9 additions & 6 deletions ariba/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,13 +126,15 @@ def _gather_unfiltered_output_data(self):
if variant.var_group not in seen_groups:
seen_groups[variant.var_group] = {'yes': 0, 'het': 0}

if variant.het_percent is None:
seen_groups[variant.var_group]['yes'] += 1
this_cluster_dict['groups'][variant.var_group] = 'yes'
else:
if variant.het_percent is not None:
this_cluster_dict['groups'][variant.var_group + '.%'] = variant.het_percent

if variant.is_het:
seen_groups[variant.var_group]['het'] += 1
this_cluster_dict['groups'][variant.var_group] = 'het'
this_cluster_dict['groups'][variant.var_group + '.%'] = variant.het_percent
else:
seen_groups[variant.var_group]['yes'] += 1
this_cluster_dict['groups'][variant.var_group] = 'yes'

for group, d in seen_groups.items():
if d['het'] > 0 and d['het'] + d['yes'] > 1:
Expand Down Expand Up @@ -272,7 +274,8 @@ def _add_phandango_colour_columns(cls, header, matrix):
@classmethod
def _matrix_to_csv(cls, matrix, header, outfile, remove_nas=False):
f = pyfastaq.utils.open_file_write(outfile)
print(*header, sep=',', file=f)
fixed_header = [x.replace(',', '/') for x in header]
print(*fixed_header, sep=',', file=f)
for line in matrix:
if remove_nas:
new_line = ['' if x=='NA' else x for x in line]
Expand Down
87 changes: 59 additions & 28 deletions ariba/summary_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,42 +126,70 @@ def _to_cluster_summary_assembled(self):

@classmethod
def _has_known_variant(cls, data_dict):
return data_dict['has_known_var'] == '1'
if data_dict['has_known_var'] == '1':
return 'yes'
elif data_dict['known_var'] == '0':
return 'no'
elif data_dict['gene'] == '1': # we don't yet call hets in genes
return 'no'
else:
cluster_var = summary_cluster_variant.SummaryClusterVariant(data_dict)
return 'het' if cluster_var.is_het else 'no'


def _has_any_known_variant(self):
for d in self.data:
if self._has_known_variant(d):
return 'yes'
return 'no'
results = {self._has_known_variant(d) for d in self.data}
if 'yes' in results:
return 'yes'
else:
return 'het' if 'het' in results else 'no'


@classmethod
def _has_nonsynonymous(cls, data_dict):
return data_dict['ref_ctg_effect'] != 'SYN' and \
(
data_dict['has_known_var'] == '1' or \
(data_dict['known_var'] != '1' and (data_dict['ref_ctg_change'] != '.' or data_dict['ref_ctg_effect'] != '.'))
)
cluster_var = summary_cluster_variant.SummaryClusterVariant(data_dict)

has_non_het = data_dict['ref_ctg_effect'] != 'SYN' and \
(
data_dict['has_known_var'] == '1' or \
(data_dict['known_var'] != '1' and (data_dict['ref_ctg_change'] != '.' or data_dict['ref_ctg_effect'] != '.'))
)

if has_non_het and not cluster_var.is_het:
return 'yes'
else:
return 'het' if cluster_var.is_het else 'no'


def _has_any_nonsynonymous(self):
for d in self.data:
if self._has_nonsynonymous(d):
return 'yes'
return 'no'
results = {SummaryCluster._has_nonsynonymous(d) for d in self.data}

if 'yes' in results:
return 'yes'
else:
return 'het' if 'het' in results else 'no'


@classmethod
def _has_novel_nonsynonymous(cls, data_dict):
return SummaryCluster._has_nonsynonymous(data_dict) and not SummaryCluster._has_known_variant(data_dict)
has_nonsynon = SummaryCluster._has_nonsynonymous(data_dict)
if has_nonsynon == 'no':
return 'no'
else:
has_known = SummaryCluster._has_known_variant(data_dict)
if has_known == 'no':
return has_nonsynon
else:
return 'no'


def _has_any_novel_nonsynonymous(self):
for d in self.data:
if self._has_novel_nonsynonymous(d):
return 'yes'
return 'no'
results = {SummaryCluster._has_novel_nonsynonymous(d) for d in self.data}

if 'yes' in results:
return 'yes'
else:
return 'het' if 'het' in results else 'no'


def _to_cluster_summary_has_known_nonsynonymous(self, assembled_summary):
Expand Down Expand Up @@ -198,12 +226,12 @@ def _get_known_noncoding_het_snp(data_dict):
return None

if data_dict['known_var'] == '1' and data_dict['ref_ctg_effect'] == 'SNP' \
and data_dict['smtls_alt_nt'] != '.' and ';' not in data_dict['smtls_alt_nt']:
nucleotides = [data_dict['ctg_nt']] + data_dict['smtls_alt_nt'].split(',')
depths = data_dict['smtls_alt_depth'].split(',')
and data_dict['smtls_nts'] != '.' and ';' not in data_dict['smtls_nts']:
nucleotides = data_dict['smtls_nts'].split(',')
depths = data_dict['smtls_nts_depth'].split(',')

if len(nucleotides) != len(depths):
raise Error('Mismatch in number of inferred nucleotides from ctg_nt, smtls_alt_nt, smtls_alt_depth columns. Cannot continue\n' + str(data_dict))
raise Error('Mismatch in number of inferred nucleotides from ctg_nt, smtls_nts, smtls_nts_depth columns. Cannot continue\n' + str(data_dict))

try:
var_nucleotide = data_dict['known_var_change'][-1]
Expand All @@ -220,14 +248,13 @@ def _get_known_noncoding_het_snp(data_dict):
return None



@staticmethod
def _get_nonsynonymous_var(data_dict):
'''if data_dict has a non synonymous variant, return string:
ref_name.change. Otherwise return None'''
has_nonsyn = SummaryCluster._has_nonsynonymous(data_dict)

if not has_nonsyn:
if has_nonsyn == 'no':
return None
elif data_dict['known_var_change'] == data_dict['ref_ctg_change'] == '.' == data_dict['ref_ctg_effect']:
raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change, ref_ctg_change, ref_ctg_effect all equal to ".", but has a non synonymous change. Something is inconsistent. Cannot continue')
Expand All @@ -251,6 +278,7 @@ def _get_nonsynonymous_var(data_dict):

return (data_dict['ref_name'], var_change) + var_group


def _has_match(self, assembled_summary):
'''assembled_summary should be output of _to_cluster_summary_assembled'''
if assembled_summary.startswith('yes'):
Expand All @@ -266,7 +294,7 @@ def has_var_groups(self):
'''Returns a set of the variant group ids that this cluster has'''
ids = set()
for d in self.data:
if self._has_known_variant(d) and d['var_group'] != '.':
if self._has_known_variant(d) != 'no' and d['var_group'] != '.':
ids.add(d['var_group'])
return ids

Expand Down Expand Up @@ -298,7 +326,10 @@ def known_noncoding_het_snps(self):
for d in self.data:
snp_tuple = self._get_known_noncoding_het_snp(d)
if snp_tuple is not None:
snp_id = d['var_description'].split(':')[4]
try:
snp_id = d['var_description'].split(':')[4]
except:
raise Error('Error getting ID from ' + str(d) + '\n' + d['var_description'])
if snp_id not in snps:
snps[snp_id] = {}
snps[snp_id][snp_tuple[0]] = snp_tuple[1]
Expand All @@ -311,7 +342,7 @@ def _get_all_nonsynon_variants_set(cls, data_dicts):

for data_dict in data_dicts:
cluster_var = summary_cluster_variant.SummaryClusterVariant(data_dict)
if cluster_var.has_nonsynon:
if cluster_var.has_nonsynon or cluster_var.is_het:
variants.add(cluster_var)

return variants
Expand Down
Loading