Skip to content

Commit

Permalink
Merge pull request #90 from martinghunt/group_snps
Browse files Browse the repository at this point in the history
Group snps
  • Loading branch information
martinghunt committed May 26, 2016
2 parents eaeb7ce + 40a7843 commit 4b2c976
Show file tree
Hide file tree
Showing 80 changed files with 1,414 additions and 446 deletions.
1 change: 1 addition & 0 deletions ariba/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@


__all__ = [
'aln_to_metadata',
'assembly',
'assembly_compare',
'assembly_variants',
Expand Down
269 changes: 269 additions & 0 deletions ariba/aln_to_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,269 @@
import os
import re
import sys
import shutil
import pyfastaq
from ariba import sequence_variant

class Error (Exception): pass

class AlnToMetadata:
def __init__(self,
aln_file,
vars_file,
refs_are_coding,
cluster_rep_name,
genetic_code=11,
):
self.padded_seqs = AlnToMetadata._load_aln_file(aln_file)
self.refs_are_coding = refs_are_coding
self.variants = AlnToMetadata._load_vars_file(vars_file, self.refs_are_coding)
self.genetic_code = genetic_code
self.cluster_rep_name = cluster_rep_name


@classmethod
def _load_aln_file(cls, aln_file):
seqs = {}
pyfastaq.tasks.file_to_dict(aln_file, seqs)
return seqs


@classmethod
def _load_vars_file(cls, vars_file, refs_are_coding):
var_type = 'p' if refs_are_coding else 'n'
f = pyfastaq.utils.open_file_read(vars_file)
variants = {}

for line in f:
try:
ref_name, variant, identifier, description = line.rstrip().split('\t')
variant = sequence_variant.Variant(var_type, variant, identifier)
except:
pyfastaq.utils.close(f)
raise Error('Error in this line of variants file:\n' + line)

if ref_name not in variants:
variants[ref_name] = []

variants[ref_name].append((variant, description))

pyfastaq.utils.close(f)
return variants


@classmethod
def _make_unpadded_seqs(cls, padded_seqs):
unpadded_seqs = {}
for seq in padded_seqs.values():
unpadded_seqs[seq.id] = pyfastaq.sequences.Fasta(seq.id, seq.seq.replace('-', ''))
return unpadded_seqs


@classmethod
def _check_seq_lengths_same(cls, seqs):
sequence_lengths = set([len(x) for x in seqs.values()])
if len(sequence_lengths) > 1:
raise Error('Input sequences must all be the same length. Cannot continue. Lengths found: ' + ','.join([str(x) for x in sequence_lengths]))
return len(sequence_lengths) == 1


@classmethod
def _insertion_coords(cls, sequence):
insertions = []
regex = re.compile('-+')
for m in regex.finditer(sequence.seq):
insertions.append(pyfastaq.intervals.Interval(m.span()[0], m.span()[1] - 1))
return insertions


@classmethod
def _make_unpadded_insertion_coords(cls, unpadded_sequences):
return {x.id: AlnToMetadata._insertion_coords(x) for x in unpadded_sequences.values()}


@classmethod
def _check_insertion_coords(cls, sequence):
insertions = AlnToMetadata._insertion_coords(sequence)
for coords in insertions:
if coords.start % 3 !=0:
raise Error('Insertion does not start in frame in sequence "' + sequence.id + '". Cannot continue')
elif len(coords) % 3 != 0:
raise Error('Insertion of length not a mulitple of 3 in sequence "' + sequence.id + '". Cannot continue')

return True


@classmethod
def _check_coding_seq(cls, sequence, genetic_code=11):
if len(sequence) % 3 != 0:
raise Error('Length of sequence ' + sequence.id + ' is ' + str(len(sequence)) + ', which is not a multiple of 3. Cannot continue')

original_code = pyfastaq.sequences.genetic_code
pyfastaq.sequences.genetic_code = genetic_code
protein_seq = sequence.translate()
start_ok = sequence.seq[0:3].upper() in pyfastaq.genetic_codes.starts[genetic_code]
pyfastaq.sequences.genetic_code = original_code

if not start_ok:
raise Error('Sequence "' + sequence.id + '" does not start with a start codon. Cannot continue')
elif protein_seq[-1] != '*':
raise Error('Sequence "' + sequence.id + '" does not end with a stop codon. Cannot continue')
elif '*' in protein_seq[:-1]:
raise Error('Sequence "' + sequence.id + '" has an internal stop codon. Cannot continue')

return True


@classmethod
def _check_sequences(cls, padded_sequences, unpadded_sequences, seqs_are_coding, genetic_code=11):
AlnToMetadata._check_seq_lengths_same(padded_sequences)

if seqs_are_coding:
for sequence in unpadded_sequences.values():
AlnToMetadata._check_insertion_coords(sequence)
AlnToMetadata._check_coding_seq(sequence, genetic_code=genetic_code)

return True


@classmethod
def _check_variants_match_sequences(cls, unpadded_sequences, variants, seqs_are_coding, genetic_code=11):
original_code = pyfastaq.sequences.genetic_code
pyfastaq.sequences.genetic_code = genetic_code
for seqname, variant_list in variants.items():
if seqname not in unpadded_sequences:
pyfastaq.sequences.genetic_code = original_code
raise Error('Sequence name "' + seqname + '" given in variants file, but sequence not found')
for variant, description in variant_list:
if not variant.sanity_check_against_seq(unpadded_sequences[seqname], translate_seq=seqs_are_coding):
pyfastaq.sequences.genetic_code = original_code
raise Error('Variant "' + str(variant) + '" for sequence "' + seqname + '" does not match sequence. cannot continue')

pyfastaq.sequences.genetic_code = original_code
return True


@classmethod
def _variant_ids_are_unique(cls, variants):
seen_variants = set()
for variants_list in variants.values():
for variant, description in variants_list:
if variant.identifier in seen_variants:
raise Error('Variant identifier "' + variant.identifier + '" found more than once. Cannot continue')
else:
seen_variants.add(variant.identifier)

return True


@classmethod
def _unpadded_to_padded_nt_position(cls, position, insertions):
if len(insertions) == 0:
return position

i = 0
while i < len(insertions) and insertions[i].start <= position:
position += len(insertions[i])
i += 1

return position


@classmethod
def _padded_to_unpadded_nt_position(cls, position, insertions):
if len(insertions) == 0:
return position

i = 0
total_gap_length = 0
while i < len(insertions) and insertions[i].end < position:
total_gap_length += len(insertions[i])
i += 1

if i < len(insertions) and insertions[i].distance_to_point(position) == 0:
return None
else:
return position - total_gap_length


@classmethod
def _variants_to_tsv_lines(cls, variants, unpadded_sequences, padded_sequences, insertions, seqs_are_coding):
if seqs_are_coding:
unpadded_aa_sequences = {x: unpadded_sequences[x].translate() for x in unpadded_sequences}

lines = []
for refname in sorted(variants):
for variant, description in variants[refname]:
if seqs_are_coding:
ref_unpadded_nt_position = 3 * variant.position
else:
ref_unpadded_nt_position = variant.position

padded_nt_position = AlnToMetadata._unpadded_to_padded_nt_position(ref_unpadded_nt_position, insertions[refname])
lines.append('\t'.join([refname, variant.variant_type, str(variant), variant.identifier, description]))

for seqname, seq in sorted(padded_sequences.items()):
if seqname == refname:
continue

if seq[padded_nt_position] == '-':
print('Warning: position has a gap in sequence ', seqname, 'corresponding to variant', variant, '(' + variant.identifier + ') in sequence ', refname, '... Ignoring for ' + seqname, file=sys.stderr)
continue

unpadded_nt_position = AlnToMetadata._padded_to_unpadded_nt_position(padded_nt_position, insertions[seqname])
assert unpadded_nt_position is not None

if seqs_are_coding:
assert unpadded_nt_position % 3 == 0
unpadded_aa_position = unpadded_nt_position // 3
pos_string = str(unpadded_aa_position)
if unpadded_aa_sequences[seqname][unpadded_aa_position] in {variant.wild_value, variant.variant_value}:
variant_string = variant.wild_value
else:
variant_string = unpadded_aa_sequences[seqname][unpadded_aa_position]
variant_string += str(unpadded_aa_position + 1) + variant.variant_value
else:
pos_string = str(unpadded_nt_position)
if unpadded_sequences[seqname][unpadded_nt_position] in {variant.wild_value, variant.variant_value}:
variant_string = variant.wild_value
else:
variant_string = unpadded_sequences[seqname][unpadded_nt_position]
variant_string += str(unpadded_nt_position + 1) + variant.variant_value

lines.append('\t'.join([seqname, variant.variant_type, variant_string, variant.identifier, description]))

return lines


@classmethod
def _make_cluster_file(cls, cluster_name, sequences, filename):
if cluster_name not in sequences:
raise Error('Sequence name "' + cluster_name + '" to be used as cluster representative not found. Cannot continue')
names = [x for x in sequences.keys() if x != cluster_name]
names.sort()
with open(filename, 'w') as f:
print(cluster_name, *names, sep='\t', file=f)


def run(self, outprefix):
if self.cluster_rep_name not in self.padded_seqs:
raise Error('Sequence name "' + self.cluster_rep_name + '" to be used as cluster representative not found. Cannot continue')
original_code = pyfastaq.sequences.genetic_code
pyfastaq.sequences.genetic_code = self.genetic_code
unpadded_seqs = AlnToMetadata._make_unpadded_seqs(self.padded_seqs)
insertions = AlnToMetadata._make_unpadded_insertion_coords(self.padded_seqs)
AlnToMetadata._check_sequences(self.padded_seqs, unpadded_seqs, self.refs_are_coding, genetic_code=self.genetic_code)
AlnToMetadata._variant_ids_are_unique(self.variants)
AlnToMetadata._check_variants_match_sequences(unpadded_seqs, self.variants, self.refs_are_coding, genetic_code=self.genetic_code)

tsv_lines = AlnToMetadata._variants_to_tsv_lines(self.variants, unpadded_seqs, self.padded_seqs, insertions, self.refs_are_coding)
with open(outprefix + '.tsv', 'w') as f:
print(*tsv_lines, sep='\n', file=f)

with open(outprefix + '.fa', 'w') as f:
for seqname in sorted(unpadded_seqs):
print(unpadded_seqs[seqname], sep='\n', file=f)

AlnToMetadata._make_cluster_file(self.cluster_rep_name, unpadded_seqs, outprefix + '.cluster')
pyfastaq.sequences.genetic_code = original_code
2 changes: 1 addition & 1 deletion ariba/assembly_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def _get_one_variant_for_one_contig_coding(ref_sequence, refdata_var_dict, mumme
# if this variant is at the same position as a known variant in the reference
if refdata_var_dict is not None and aa_var_position in refdata_var_dict['p']:
if aa_var_effect == 'NONSYN':
aa_variant = sequence_variant.Variant('p', aa_var_string)
aa_variant = sequence_variant.Variant('p', aa_var_string, '.')
variants_at_this_position = {x for x in refdata_var_dict['p'][aa_variant.position]}
matching_variants = {x for x in variants_at_this_position if aa_variant.variant_value == x.variant.variant_value}
not_interesting_variants = {x for x in variants_at_this_position if aa_variant.variant_value == x.variant.wild_value}
Expand Down
2 changes: 1 addition & 1 deletion ariba/cdhit.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def run_get_clusters_from_file(self, infile):
f = pyfastaq.utils.open_file_write(tmp_fa)

for seq in seq_reader:
if seq.id in clusters:
if seq.id in clusters and seq.id in clusters[seq.id]:
pyfastaq.utils.close(f)
shutil.rmtree(tmpdir)
raise Error('Sequence name "' + seq.id + '" not unique. Cannot continue')
Expand Down
6 changes: 3 additions & 3 deletions ariba/ref_genes_getter.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,15 +126,15 @@ def _get_from_card(self, outprefix):
else:
fasta_filehandle = f_out_var_only

print(fasta.id, '.', '.', data['ARO_name'], sep='\t', file=f_out_tsv)
print(fasta.id, '.', '.', '.', data['ARO_name'], sep='\t', file=f_out_tsv)

if len(data['snps']) == 0:
print(fasta, file=fasta_filehandle)
print(fasta.id, '.', '.', data['ARO_description'], sep='\t', file=f_out_tsv)
print(fasta.id, '.', '.', '.', data['ARO_description'], sep='\t', file=f_out_tsv)
else:
print(fasta, file=fasta_filehandle)
for snp in data['snps']:
print(fasta.id, variant_type, snp, data['ARO_description'], sep='\t', file=f_out_tsv)
print(fasta.id, variant_type, snp, '.', data['ARO_description'], sep='\t', file=f_out_tsv)


pyfastaq.utils.close(f_out_tsv)
Expand Down
4 changes: 2 additions & 2 deletions ariba/reference_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def _write_metadata_tsv(metadata, filename):
f = pyfastaq.utils.open_file_write(filename)

for gene_name, data_dict in sorted(metadata.items()):
for meta in data_dict['.']:
for meta in sorted([str(x) for x in data_dict['.']]):
print(meta, file=f)

variants = []
Expand Down Expand Up @@ -190,7 +190,7 @@ def _filter_bad_variant_data(self, out_prefix, presence_absence_removed, variant
to_remove = []

for metadata in metadata_dict['.']:
if metadata.free_text is None:
if metadata.free_text == '.':
print(gene_name, 'metadata has no info. Just gene name given. Removing. Line of file was:', metadata, file=log_fh)
to_remove.append(metadata)

Expand Down
10 changes: 7 additions & 3 deletions ariba/report.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import sys
import pymummer

class Error (Exception): pass

columns = [
'ref_name', # 0 name of reference sequence
'ref_type', # 1 type of reference sequence (presence/absence, variants only, noncoding)
Expand Down Expand Up @@ -165,7 +167,7 @@ def _report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, pymum
known_var_change = 'unknown'
var_type = 'SNP'
has_known_var = '1'
matching_vars_column = ';;;'.join([x.to_string(separator='_') for x in matching_vars_set])
matching_vars_column = ';;;'.join([x.to_string(separator=':') for x in matching_vars_set])
else:
is_known_var = '0'
known_var_change = '.'
Expand Down Expand Up @@ -253,8 +255,10 @@ def report_lines(cluster):

for line in lines:
if len(line.split('\t')) != len(columns):
print('Error making report - wrong number of columns. Expected', len(columns), 'but got', len(line.split('\t')), file=sys.stderr)
print(line, file=sys.stderr)
cols = line.split('\t')
print('Error making report - wrong number of columns. Expected', len(columns), 'but got', len(cols), file=sys.stderr)
for i in range(len(cols)):
print(i, cols[i], sep='\t', file=sys.stderr)
lines_ok = False

if not lines_ok:
Expand Down
Loading

0 comments on commit 4b2c976

Please sign in to comment.