diff --git a/ariba/__init__.py b/ariba/__init__.py index 68461b4f..7c85be26 100644 --- a/ariba/__init__.py +++ b/ariba/__init__.py @@ -7,6 +7,7 @@ __all__ = [ + 'aln_to_metadata', 'assembly', 'assembly_compare', 'assembly_variants', diff --git a/ariba/aln_to_metadata.py b/ariba/aln_to_metadata.py new file mode 100644 index 00000000..555b69d6 --- /dev/null +++ b/ariba/aln_to_metadata.py @@ -0,0 +1,269 @@ +import os +import re +import sys +import shutil +import pyfastaq +from ariba import sequence_variant + +class Error (Exception): pass + +class AlnToMetadata: + def __init__(self, + aln_file, + vars_file, + refs_are_coding, + cluster_rep_name, + genetic_code=11, + ): + self.padded_seqs = AlnToMetadata._load_aln_file(aln_file) + self.refs_are_coding = refs_are_coding + self.variants = AlnToMetadata._load_vars_file(vars_file, self.refs_are_coding) + self.genetic_code = genetic_code + self.cluster_rep_name = cluster_rep_name + + + @classmethod + def _load_aln_file(cls, aln_file): + seqs = {} + pyfastaq.tasks.file_to_dict(aln_file, seqs) + return seqs + + + @classmethod + def _load_vars_file(cls, vars_file, refs_are_coding): + var_type = 'p' if refs_are_coding else 'n' + f = pyfastaq.utils.open_file_read(vars_file) + variants = {} + + for line in f: + try: + ref_name, variant, identifier, description = line.rstrip().split('\t') + variant = sequence_variant.Variant(var_type, variant, identifier) + except: + pyfastaq.utils.close(f) + raise Error('Error in this line of variants file:\n' + line) + + if ref_name not in variants: + variants[ref_name] = [] + + variants[ref_name].append((variant, description)) + + pyfastaq.utils.close(f) + return variants + + + @classmethod + def _make_unpadded_seqs(cls, padded_seqs): + unpadded_seqs = {} + for seq in padded_seqs.values(): + unpadded_seqs[seq.id] = pyfastaq.sequences.Fasta(seq.id, seq.seq.replace('-', '')) + return unpadded_seqs + + + @classmethod + def _check_seq_lengths_same(cls, seqs): + sequence_lengths = set([len(x) for x 
in seqs.values()]) + if len(sequence_lengths) > 1: + raise Error('Input sequences must all be the same length. Cannot continue. Lengths found: ' + ','.join([str(x) for x in sequence_lengths])) + return len(sequence_lengths) == 1 + + + @classmethod + def _insertion_coords(cls, sequence): + insertions = [] + regex = re.compile('-+') + for m in regex.finditer(sequence.seq): + insertions.append(pyfastaq.intervals.Interval(m.span()[0], m.span()[1] - 1)) + return insertions + + + @classmethod + def _make_unpadded_insertion_coords(cls, unpadded_sequences): + return {x.id: AlnToMetadata._insertion_coords(x) for x in unpadded_sequences.values()} + + + @classmethod + def _check_insertion_coords(cls, sequence): + insertions = AlnToMetadata._insertion_coords(sequence) + for coords in insertions: + if coords.start % 3 !=0: + raise Error('Insertion does not start in frame in sequence "' + sequence.id + '". Cannot continue') + elif len(coords) % 3 != 0: + raise Error('Insertion of length not a mulitple of 3 in sequence "' + sequence.id + '". Cannot continue') + + return True + + + @classmethod + def _check_coding_seq(cls, sequence, genetic_code=11): + if len(sequence) % 3 != 0: + raise Error('Length of sequence ' + sequence.id + ' is ' + str(len(sequence)) + ', which is not a multiple of 3. Cannot continue') + + original_code = pyfastaq.sequences.genetic_code + pyfastaq.sequences.genetic_code = genetic_code + protein_seq = sequence.translate() + start_ok = sequence.seq[0:3].upper() in pyfastaq.genetic_codes.starts[genetic_code] + pyfastaq.sequences.genetic_code = original_code + + if not start_ok: + raise Error('Sequence "' + sequence.id + '" does not start with a start codon. Cannot continue') + elif protein_seq[-1] != '*': + raise Error('Sequence "' + sequence.id + '" does not end with a stop codon. Cannot continue') + elif '*' in protein_seq[:-1]: + raise Error('Sequence "' + sequence.id + '" has an internal stop codon. 
Cannot continue') + + return True + + + @classmethod + def _check_sequences(cls, padded_sequences, unpadded_sequences, seqs_are_coding, genetic_code=11): + AlnToMetadata._check_seq_lengths_same(padded_sequences) + + if seqs_are_coding: + for sequence in unpadded_sequences.values(): + AlnToMetadata._check_insertion_coords(sequence) + AlnToMetadata._check_coding_seq(sequence, genetic_code=genetic_code) + + return True + + + @classmethod + def _check_variants_match_sequences(cls, unpadded_sequences, variants, seqs_are_coding, genetic_code=11): + original_code = pyfastaq.sequences.genetic_code + pyfastaq.sequences.genetic_code = genetic_code + for seqname, variant_list in variants.items(): + if seqname not in unpadded_sequences: + pyfastaq.sequences.genetic_code = original_code + raise Error('Sequence name "' + seqname + '" given in variants file, but sequence not found') + for variant, description in variant_list: + if not variant.sanity_check_against_seq(unpadded_sequences[seqname], translate_seq=seqs_are_coding): + pyfastaq.sequences.genetic_code = original_code + raise Error('Variant "' + str(variant) + '" for sequence "' + seqname + '" does not match sequence. cannot continue') + + pyfastaq.sequences.genetic_code = original_code + return True + + + @classmethod + def _variant_ids_are_unique(cls, variants): + seen_variants = set() + for variants_list in variants.values(): + for variant, description in variants_list: + if variant.identifier in seen_variants: + raise Error('Variant identifier "' + variant.identifier + '" found more than once. 
Cannot continue') + else: + seen_variants.add(variant.identifier) + + return True + + + @classmethod + def _unpadded_to_padded_nt_position(cls, position, insertions): + if len(insertions) == 0: + return position + + i = 0 + while i < len(insertions) and insertions[i].start <= position: + position += len(insertions[i]) + i += 1 + + return position + + + @classmethod + def _padded_to_unpadded_nt_position(cls, position, insertions): + if len(insertions) == 0: + return position + + i = 0 + total_gap_length = 0 + while i < len(insertions) and insertions[i].end < position: + total_gap_length += len(insertions[i]) + i += 1 + + if i < len(insertions) and insertions[i].distance_to_point(position) == 0: + return None + else: + return position - total_gap_length + + + @classmethod + def _variants_to_tsv_lines(cls, variants, unpadded_sequences, padded_sequences, insertions, seqs_are_coding): + if seqs_are_coding: + unpadded_aa_sequences = {x: unpadded_sequences[x].translate() for x in unpadded_sequences} + + lines = [] + for refname in sorted(variants): + for variant, description in variants[refname]: + if seqs_are_coding: + ref_unpadded_nt_position = 3 * variant.position + else: + ref_unpadded_nt_position = variant.position + + padded_nt_position = AlnToMetadata._unpadded_to_padded_nt_position(ref_unpadded_nt_position, insertions[refname]) + lines.append('\t'.join([refname, variant.variant_type, str(variant), variant.identifier, description])) + + for seqname, seq in sorted(padded_sequences.items()): + if seqname == refname: + continue + + if seq[padded_nt_position] == '-': + print('Warning: position has a gap in sequence ', seqname, 'corresponding to variant', variant, '(' + variant.identifier + ') in sequence ', refname, '... 
Ignoring for ' + seqname, file=sys.stderr) + continue + + unpadded_nt_position = AlnToMetadata._padded_to_unpadded_nt_position(padded_nt_position, insertions[seqname]) + assert unpadded_nt_position is not None + + if seqs_are_coding: + assert unpadded_nt_position % 3 == 0 + unpadded_aa_position = unpadded_nt_position // 3 + pos_string = str(unpadded_aa_position) + if unpadded_aa_sequences[seqname][unpadded_aa_position] in {variant.wild_value, variant.variant_value}: + variant_string = variant.wild_value + else: + variant_string = unpadded_aa_sequences[seqname][unpadded_aa_position] + variant_string += str(unpadded_aa_position + 1) + variant.variant_value + else: + pos_string = str(unpadded_nt_position) + if unpadded_sequences[seqname][unpadded_nt_position] in {variant.wild_value, variant.variant_value}: + variant_string = variant.wild_value + else: + variant_string = unpadded_sequences[seqname][unpadded_nt_position] + variant_string += str(unpadded_nt_position + 1) + variant.variant_value + + lines.append('\t'.join([seqname, variant.variant_type, variant_string, variant.identifier, description])) + + return lines + + + @classmethod + def _make_cluster_file(cls, cluster_name, sequences, filename): + if cluster_name not in sequences: + raise Error('Sequence name "' + cluster_name + '" to be used as cluster representative not found. Cannot continue') + names = [x for x in sequences.keys() if x != cluster_name] + names.sort() + with open(filename, 'w') as f: + print(cluster_name, *names, sep='\t', file=f) + + + def run(self, outprefix): + if self.cluster_rep_name not in self.padded_seqs: + raise Error('Sequence name "' + self.cluster_rep_name + '" to be used as cluster representative not found. 
Cannot continue') + original_code = pyfastaq.sequences.genetic_code + pyfastaq.sequences.genetic_code = self.genetic_code + unpadded_seqs = AlnToMetadata._make_unpadded_seqs(self.padded_seqs) + insertions = AlnToMetadata._make_unpadded_insertion_coords(self.padded_seqs) + AlnToMetadata._check_sequences(self.padded_seqs, unpadded_seqs, self.refs_are_coding, genetic_code=self.genetic_code) + AlnToMetadata._variant_ids_are_unique(self.variants) + AlnToMetadata._check_variants_match_sequences(unpadded_seqs, self.variants, self.refs_are_coding, genetic_code=self.genetic_code) + + tsv_lines = AlnToMetadata._variants_to_tsv_lines(self.variants, unpadded_seqs, self.padded_seqs, insertions, self.refs_are_coding) + with open(outprefix + '.tsv', 'w') as f: + print(*tsv_lines, sep='\n', file=f) + + with open(outprefix + '.fa', 'w') as f: + for seqname in sorted(unpadded_seqs): + print(unpadded_seqs[seqname], sep='\n', file=f) + + AlnToMetadata._make_cluster_file(self.cluster_rep_name, unpadded_seqs, outprefix + '.cluster') + pyfastaq.sequences.genetic_code = original_code diff --git a/ariba/assembly_variants.py b/ariba/assembly_variants.py index ab8658fa..137cb233 100644 --- a/ariba/assembly_variants.py +++ b/ariba/assembly_variants.py @@ -193,7 +193,7 @@ def _get_one_variant_for_one_contig_coding(ref_sequence, refdata_var_dict, mumme # if this variant is at the same position as a known variant in the reference if refdata_var_dict is not None and aa_var_position in refdata_var_dict['p']: if aa_var_effect == 'NONSYN': - aa_variant = sequence_variant.Variant('p', aa_var_string) + aa_variant = sequence_variant.Variant('p', aa_var_string, '.') variants_at_this_position = {x for x in refdata_var_dict['p'][aa_variant.position]} matching_variants = {x for x in variants_at_this_position if aa_variant.variant_value == x.variant.variant_value} not_interesting_variants = {x for x in variants_at_this_position if aa_variant.variant_value == x.variant.wild_value} diff --git a/ariba/cdhit.py 
b/ariba/cdhit.py index 1450c9a7..c49750cd 100644 --- a/ariba/cdhit.py +++ b/ariba/cdhit.py @@ -84,7 +84,7 @@ def run_get_clusters_from_file(self, infile): f = pyfastaq.utils.open_file_write(tmp_fa) for seq in seq_reader: - if seq.id in clusters: + if seq.id in clusters and seq.id in clusters[seq.id]: pyfastaq.utils.close(f) shutil.rmtree(tmpdir) raise Error('Sequence name "' + seq.id + '" not unique. Cannot continue') diff --git a/ariba/ref_genes_getter.py b/ariba/ref_genes_getter.py index 82114ec0..07f097bb 100644 --- a/ariba/ref_genes_getter.py +++ b/ariba/ref_genes_getter.py @@ -126,15 +126,15 @@ def _get_from_card(self, outprefix): else: fasta_filehandle = f_out_var_only - print(fasta.id, '.', '.', data['ARO_name'], sep='\t', file=f_out_tsv) + print(fasta.id, '.', '.', '.', data['ARO_name'], sep='\t', file=f_out_tsv) if len(data['snps']) == 0: print(fasta, file=fasta_filehandle) - print(fasta.id, '.', '.', data['ARO_description'], sep='\t', file=f_out_tsv) + print(fasta.id, '.', '.', '.', data['ARO_description'], sep='\t', file=f_out_tsv) else: print(fasta, file=fasta_filehandle) for snp in data['snps']: - print(fasta.id, variant_type, snp, data['ARO_description'], sep='\t', file=f_out_tsv) + print(fasta.id, variant_type, snp, '.', data['ARO_description'], sep='\t', file=f_out_tsv) pyfastaq.utils.close(f_out_tsv) diff --git a/ariba/reference_data.py b/ariba/reference_data.py index f43ab119..3251a059 100644 --- a/ariba/reference_data.py +++ b/ariba/reference_data.py @@ -132,7 +132,7 @@ def _write_metadata_tsv(metadata, filename): f = pyfastaq.utils.open_file_write(filename) for gene_name, data_dict in sorted(metadata.items()): - for meta in data_dict['.']: + for meta in sorted([str(x) for x in data_dict['.']]): print(meta, file=f) variants = [] @@ -190,7 +190,7 @@ def _filter_bad_variant_data(self, out_prefix, presence_absence_removed, variant to_remove = [] for metadata in metadata_dict['.']: - if metadata.free_text is None: + if metadata.free_text == '.': 
print(gene_name, 'metadata has no info. Just gene name given. Removing. Line of file was:', metadata, file=log_fh) to_remove.append(metadata) diff --git a/ariba/report.py b/ariba/report.py index df06311c..f32234e3 100644 --- a/ariba/report.py +++ b/ariba/report.py @@ -1,6 +1,8 @@ import sys import pymummer +class Error (Exception): pass + columns = [ 'ref_name', # 0 name of reference sequence 'ref_type', # 1 type of reference sequence (presence/absence, variants only, noncoding) @@ -165,7 +167,7 @@ def _report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, pymum known_var_change = 'unknown' var_type = 'SNP' has_known_var = '1' - matching_vars_column = ';;;'.join([x.to_string(separator='_') for x in matching_vars_set]) + matching_vars_column = ';;;'.join([x.to_string(separator=':') for x in matching_vars_set]) else: is_known_var = '0' known_var_change = '.' @@ -253,8 +255,10 @@ def report_lines(cluster): for line in lines: if len(line.split('\t')) != len(columns): - print('Error making report - wrong number of columns. Expected', len(columns), 'but got', len(line.split('\t')), file=sys.stderr) - print(line, file=sys.stderr) + cols = line.split('\t') + print('Error making report - wrong number of columns. 
Expected', len(columns), 'but got', len(cols), file=sys.stderr) + for i in range(len(cols)): + print(i, cols[i], sep='\t', file=sys.stderr) lines_ok = False if not lines_ok: diff --git a/ariba/sequence_metadata.py b/ariba/sequence_metadata.py index a1efde2e..5b888a29 100644 --- a/ariba/sequence_metadata.py +++ b/ariba/sequence_metadata.py @@ -6,23 +6,16 @@ class Error (Exception): pass class SequenceMetadata: def __init__(self, line): try: - self.name, variant_type, variant_string, *extra_columns = line.rstrip().split('\t') + self.name, variant_type, variant_string, identifier, self.free_text = line.rstrip().split('\t') except: raise Error('Error parsing line of file:\n' + line) - if len(extra_columns) == 0: - self.free_text = None - elif len(extra_columns) == 1: - self.free_text = extra_columns[0] - else: - raise Error('Too many columns in this line:\n' + line) - self.variant_type = variant_type if self.variant_type == '.': self.variant = None else: - self.variant = sequence_variant.Variant(self.variant_type, variant_string) + self.variant = sequence_variant.Variant(self.variant_type, variant_string, identifier) def __eq__(self, other): @@ -42,16 +35,13 @@ def __str__(self): def to_string(self, separator='\t'): - fields = [self.name, self.variant_type] - if self.variant is None: - fields.append('.') - else: - fields.append(str(self.variant)) - - if self.free_text: - return separator.join(fields + [self.free_text]) - else: - return separator.join(fields) + return separator.join([ + self.name, + self.variant_type, + '.' if self.variant is None else str(self.variant), + '.' 
if (self.variant is None or self.variant.identifier is None) else self.variant.identifier, + self.free_text + ]) def has_variant(self, seq): diff --git a/ariba/sequence_variant.py b/ariba/sequence_variant.py index fa475c92..7e32515d 100644 --- a/ariba/sequence_variant.py +++ b/ariba/sequence_variant.py @@ -7,12 +7,13 @@ class Error (Exception): pass allowed_variant_types = {'n', 'p'} class Variant: - def __init__(self, variant_type, variant_string): + def __init__(self, variant_type, variant_string, identifier): if variant_type not in allowed_variant_types: raise Error('Error! Variant type "' + variant_type + '" not recognised.\n' + \ 'Must be one of:' + ', '.join(allowed_variant_types)) self.variant_type = variant_type + self.identifier = None if identifier == '.' else identifier m = re.match('^([A-Z])([0-9]+)([A-Z])$', variant_string.upper()) diff --git a/ariba/summary.py b/ariba/summary.py index 8e7133bb..94604863 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -15,12 +15,11 @@ def __init__( outprefix, filenames=None, fofn=None, - include_all_known_variant_columns=True, - include_all_novel_variant_columns=False, filter_rows=True, filter_columns=True, min_id=90.0, cluster_cols='assembled,has_res,ref_seq,pct_id,known_var,novel_var', + variant_cols='groups,grouped,ungrouped,novel', verbose=False, ): if filenames is None and fofn is None: @@ -35,8 +34,7 @@ def __init__( self.filenames.extend(self._load_fofn(fofn)) self.cluster_columns = self._determine_cluster_cols(cluster_cols) - self.include_all_known_variant_columns = include_all_known_variant_columns - self.include_all_novel_variant_columns = include_all_novel_variant_columns + self.var_columns = self._determine_var_cols(variant_cols) self.filter_rows = filter_rows self.filter_columns = filter_columns self.min_id = min_id @@ -44,17 +42,28 @@ def __init__( self.verbose = verbose - @staticmethod - def _determine_cluster_cols(cols_string): - allowed_cols = {'assembled', 'has_res', 'ref_seq', 'pct_id', 
'known_var', 'novel_var'} + @classmethod + def _determine_cols(cls, cols_string, allowed_cols, error_string): if cols_string == '' or cols_string is None: return {x: False for x in allowed_cols} wanted_cols = set(cols_string.split(',')) if not wanted_cols.issubset(allowed_cols): - raise Error('Error in cluster names. Allowed values are: ' + str(','.join(list(allowed_cols))) + '. Got: ' + cols_string) + raise Error('Error in ' + error_string + '. Allowed values are: ' + str(','.join(list(allowed_cols))) + '. Got: ' + cols_string) return {x: x in wanted_cols for x in allowed_cols} + @staticmethod + def _determine_cluster_cols(cols_string): + allowed_cols = {'assembled', 'has_res', 'ref_seq', 'pct_id', 'known_var', 'novel_var'} + return Summary._determine_cols(cols_string, allowed_cols, 'cluster columns') + + + @staticmethod + def _determine_var_cols(cols_string): + allowed_cols = {'groups', 'grouped', 'ungrouped', 'novel'} + return Summary._determine_cols(cols_string, allowed_cols, 'variant columns') + + def _load_fofn(self, fofn): f = pyfastaq.utils.open_file_read(fofn) filenames = [x.rstrip() for x in f.readlines()] @@ -103,9 +112,24 @@ def _get_all_variant_columns(cls, samples_dict): return columns + @classmethod + def _get_all_var_groups(cls, samples_dict): + groups = {} + for filename, sample in samples_dict.items(): + for name, name_set in sample.var_groups.items(): + if name not in groups: + groups[name] = set() + groups[name].update(name_set) + return groups + + def _gather_output_rows(self): all_cluster_names = Summary._get_all_cluster_names(self.samples) all_var_columns = Summary._get_all_variant_columns(self.samples) + if self.var_columns['groups']: + var_groups = Summary._get_all_var_groups(self.samples) + else: + var_groups = set() rows = {} for filename, sample in self.samples.items(): @@ -126,21 +150,22 @@ def _gather_output_rows(self): 'pct_id': 'NA' } - wanted_var_types = set() - if self.include_all_known_variant_columns: - 
wanted_var_types.add('known') - if self.include_all_novel_variant_columns: - wanted_var_types.add('unknown') + if self.var_columns['groups']: + for group_name in var_groups[cluster]: + if cluster in sample.var_groups and group_name in sample.var_groups[cluster]: + rows[filename][cluster]['vgroup.' + group_name] = 'yes' + else: + rows[filename][cluster]['vgroup.' + group_name] = 'no' - if len(wanted_var_types) and cluster in all_var_columns: - for (ref_name, variant, known_or_unknown) in all_var_columns[cluster]: - if known_or_unknown not in wanted_var_types: + if cluster in all_var_columns: + for (ref_name, variant, grouped_or_novel, group_name) in all_var_columns[cluster]: + if not self.var_columns[grouped_or_novel]: continue key = ref_name + '.' + variant if rows[filename][cluster]['assembled'] == 'no': rows[filename][cluster][key] = 'NA' - elif cluster in sample.variant_column_names_tuples and (ref_name, variant, known_or_unknown) in sample.variant_column_names_tuples[cluster]: + elif cluster in sample.variant_column_names_tuples and (ref_name, variant, grouped_or_novel, group_name) in sample.variant_column_names_tuples[cluster]: rows[filename][cluster][key] = 'yes' else: rows[filename][cluster][key] = 'no' diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py index dab808d0..f1bc7bbb 100644 --- a/ariba/summary_cluster.py +++ b/ariba/summary_cluster.py @@ -52,6 +52,14 @@ def line2dict(cls, line): except: assert d[key] == '.' + if d['var_description'] == '.': + d['var_group'] = '.' + else: + try: + d['var_group'] = d['var_description'].split(':')[3] + except: + raise Error('Error getting variant group from the following line:\n' + line) + return d @@ -193,13 +201,20 @@ def _get_nonsynonymous_var(data_dict): data_dict['known_var_change'] != data_dict['ref_ctg_change']: raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change != ref_ctg_change. 
Cannot continue') - if data_dict['known_var_change'] != '.': - return data_dict['known_var_change'] + var_group = 'novel', None + + if data_dict['known_var'] == '1' and data_dict['known_var_change'] != '.': + var_change = data_dict['known_var_change'] + if data_dict['var_group'] == '.': + var_group = 'ungrouped', None + else: + var_group = 'grouped', data_dict['var_group'] elif data_dict['ref_ctg_change'] != '.': - return data_dict['ref_ctg_change'] + var_change = data_dict['ref_ctg_change'] else: - return data_dict['ref_ctg_effect'] + var_change = data_dict['ref_ctg_effect'] + return (data_dict['ref_name'], var_change) + var_group def _has_resistance(self, assembled_summary): '''assembled_summary should be output of _to_cluster_summary_assembled''' @@ -212,6 +227,15 @@ def _has_resistance(self, assembled_summary): return 'no' + def has_var_groups(self): + '''Returns a set of the variant group ids that this cluster has''' + ids = set() + for d in self.data: + if self._has_known_variant(d) and d['var_group'] != '.': + ids.add(d['var_group']) + return ids + + def column_summary_data(self): '''Returns a dictionary of column name -> value, for cluster-level results''' assembled_summary = self._to_cluster_summary_assembled() diff --git a/ariba/summary_sample.py b/ariba/summary_sample.py index 0a994b45..fcf05331 100644 --- a/ariba/summary_sample.py +++ b/ariba/summary_sample.py @@ -40,7 +40,11 @@ def _column_summary_data(self): return {c: self.clusters[c].column_summary_data() for c in self.clusters} - def _non_synon_variants(self): + def _var_groups(self): + return {c: self.clusters[c].has_var_groups() for c in self.clusters} + + + def _variant_column_names_tuples(self): variants = {} for cluster_name, cluster in self.clusters.items(): cluster_vars = cluster.non_synon_variants() @@ -49,25 +53,9 @@ def _non_synon_variants(self): return variants - def _variant_column_names_tuples(self): - # assumes this has been run: - # self.column_summary_data = 
self._column_summary_data() - # self.variants = self._non_synon_variants() - columns = {} - for cluster_name, variants in self.variants.items(): - ref_name = self.column_summary_data[cluster_name]['ref_seq'] - columns[cluster_name] = set() - for var in variants: - if self.column_summary_data[cluster_name]['known_var'] == 'yes': - columns[cluster_name].add((ref_name, var, 'known')) - else: - columns[cluster_name].add((ref_name, var, 'unknown')) - return columns - - def run(self): self.clusters = self._load_file(self.report_tsv, self.min_pc_id) self.column_summary_data = self._column_summary_data() - self.variants = self._non_synon_variants() self.variant_column_names_tuples = self._variant_column_names_tuples() + self.var_groups = self._var_groups() diff --git a/ariba/tasks/aln2meta.py b/ariba/tasks/aln2meta.py new file mode 100644 index 00000000..f0b655f1 --- /dev/null +++ b/ariba/tasks/aln2meta.py @@ -0,0 +1,28 @@ +import argparse +from ariba import aln_to_metadata + + +def run(): + coding_choices = ['coding', 'noncoding'] + parser = argparse.ArgumentParser( + description = 'Converts multi-alignment fasta and SNP info to metadata', + usage = 'ariba aln2meta [options] <(non)coding> ' + ) + + parser.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT') + parser.add_argument('aln_fasta', help='Multi-fasta file of alignments') + parser.add_argument('variants_tsv', help='TSV file of variants information') + parser.add_argument('coding_or_non', help='Sequences are coding or noncoding. Must be one of: ' + ' '.join(coding_choices), choices=coding_choices, metavar='(non)coding') + parser.add_argument('cluster_rep', help='Name of sequence to be used as cluster representative. 
Must exactly match a sequence in aln_fasta file') + parser.add_argument('outprefix', help='Prefix of output filenames') + options = parser.parse_args() + + aln_to_meta = aln_to_metadata.AlnToMetadata( + options.aln_fasta, + options.variants_tsv, + options.coding_or_non == 'coding', + options.cluster_rep, + genetic_code=options.genetic_code + ) + aln_to_meta.run(options.outprefix) + diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py index 0083be96..d4a22f43 100644 --- a/ariba/tasks/summary.py +++ b/ariba/tasks/summary.py @@ -9,43 +9,64 @@ def use_preset(options): preset_to_vals = { 'minimal': { 'cluster_cols': 'has_res', + 'variant_cols': '', 'col_filter': 'y', 'row_filter': 'y', + 'var_groups': 'n', 'known_vars': 'n', 'novel_vars': 'n' }, 'cluster_small': { 'cluster_cols': 'assembled,has_res,ref_seq,known_var', + 'variant_cols': '', 'col_filter': 'y', 'row_filter': 'y', + 'var_groups': 'n', 'known_vars': 'n', 'novel_vars': 'n' }, 'cluster_all': { 'cluster_cols': 'assembled,has_res,ref_seq,pct_id,known_var,novel_var', + 'variant_cols': '', 'col_filter': 'y', 'row_filter': 'y', + 'var_groups': 'n', + 'known_vars': 'n', + 'novel_vars': 'n' + }, + 'cluster_var_groups': { + 'cluster_cols': 'assembled,has_res,ref_seq,pct_id,known_var,novel_var', + 'variant_cols': 'groups', + 'col_filter': 'y', + 'row_filter': 'y', + 'var_groups': 'y', 'known_vars': 'n', 'novel_vars': 'n' }, 'cluster_known_vars': { 'cluster_cols': 'assembled,has_res,ref_seq,pct_id,known_var,novel_var', + 'variant_cols': 'groups,grouped,ungrouped', 'col_filter': 'y', 'row_filter': 'y', + 'var_groups': 'y', 'known_vars': 'y', 'novel_vars': 'n' }, 'all': { 'cluster_cols': 'assembled,has_res,ref_seq,pct_id,known_var,novel_var', + 'variant_cols': 'groups,grouped,ungrouped,novel', 'col_filter': 'y', 'row_filter': 'y', + 'var_groups': 'y', 'known_vars': 'y', 'novel_vars': 'y' }, 'all_no_filter': { 'cluster_cols': 'assembled,has_res,ref_seq,pct_id,known_var,novel_var', + 'variant_cols': 
'groups,grouped,ungrouped,novel', 'col_filter': 'n', 'row_filter': 'n', + 'var_groups': 'y', 'known_vars': 'y', 'novel_vars': 'y' }, @@ -60,7 +81,7 @@ def use_preset(options): def run(): - presets = ['minimal', 'cluster_small', 'cluster_all', 'cluster_known_vars', 'all', 'all_no_filter'] + presets = ['minimal', 'cluster_small', 'cluster_all', 'cluster_var_groups', 'cluster_known_vars', 'all', 'all_no_filter'] parser = argparse.ArgumentParser( description = 'Make a summary of ARIBA report files, and Phandango files', @@ -71,8 +92,7 @@ def run(): parser.add_argument('--cluster_cols', help='Comma separated list of cluster columns to include. Choose from: assembled, has_res, ref_seq, pct_id, known_var, novel_var [%(default)s]', default='has_res', metavar='col1,col2,...') parser.add_argument('--col_filter', choices=['y', 'n'], default='y', help='Choose whether columns where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n') parser.add_argument('--row_filter', choices=['y', 'n'], default='y', help='Choose whether rows where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n') - parser.add_argument('--known_vars', choices=['y', 'n'], default='n', help='Output a column for every known variant [%(default)s]', metavar='y|n') - parser.add_argument('--novel_vars', choices=['y', 'n'], default='n', help='Output a column for every novel variant [%(default)s]', metavar='y|n') + parser.add_argument('--var_cols', help='Comma separated list of variant columns to include. 
Choose from: groups, grouped, ungrouped, novel [none by default]', metavar='col1,col2,...', default='') parser.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT') parser.add_argument('--verbose', action='store_true', help='Be verbose') parser.add_argument('outprefix', help='Prefix of output files') @@ -87,12 +107,11 @@ def run(): options.outprefix, fofn=options.fofn, filenames=options.infiles, - include_all_known_variant_columns=options.known_vars == 'y', - include_all_novel_variant_columns=options.novel_vars == 'y', filter_rows=options.col_filter == 'y', filter_columns=options.row_filter == 'y', min_id=options.min_id, cluster_cols=options.cluster_cols, + variant_cols=options.var_cols, verbose=options.verbose ) s.run() diff --git a/ariba/test_run_data/metadata.tsv b/ariba/test_run_data/metadata.tsv index f9c3fb7e..04b61078 100644 --- a/ariba/test_run_data/metadata.tsv +++ b/ariba/test_run_data/metadata.tsv @@ -1,14 +1,14 @@ -presence_absence1 . . Generic description of presence_absence1 -presence_absence1 p R3S Ref and assembly have wild type, so do not report -presence_absence1 p A10V Ref has wild, reads have variant so report -presence_absence1 p I5A Ref and reads have variant so report -variants_only1 . . Generic description of variants_only1 -variants_only1 p I3L Ref and assembly have wild type, so do not report -variants_only1 p S5T Ref and reads have variant so report -variants_only2 p R3I Ref and reads have wild so do not report -variants_only2 . . Generic description of variants_only2 -noncoding1 . . generic description of noncoding1 -noncoding1 n A6G variant in ref and reads so should report -noncoding1 n G9T wild type in ref and reads so should not report -noncoding1 n A14T ref has wild type, reads have variant so should report -noncoding1 n A40C ref has variant, reads have wild type so should not report +presence_absence1 . . . 
Generic description of presence_absence1 +presence_absence1 p R3S . Ref and assembly have wild type, so do not report +presence_absence1 p A10V . Ref has wild, reads have variant so report +presence_absence1 p I5A . Ref and reads have variant so report +variants_only1 . . . Generic description of variants_only1 +variants_only1 p I3L . Ref and assembly have wild type, so do not report +variants_only1 p S5T . Ref and reads have variant so report +variants_only2 p R3I . Ref and reads have wild so do not report +variants_only2 . . . Generic description of variants_only2 +noncoding1 . . . generic description of noncoding1 +noncoding1 n A6G . variant in ref and reads so should report +noncoding1 n G9T . wild type in ref and reads so should not report +noncoding1 n A14T noncoding_group1 ref has wild type, reads have variant so should report +noncoding1 n A40C . ref has variant, reads have wild type so should not report diff --git a/ariba/tests/aln_to_metadata_test.py b/ariba/tests/aln_to_metadata_test.py new file mode 100644 index 00000000..7f55dd3f --- /dev/null +++ b/ariba/tests/aln_to_metadata_test.py @@ -0,0 +1,411 @@ +import unittest +import os +import copy +import shutil +import filecmp +import pyfastaq +from ariba import aln_to_metadata, sequence_variant + +modules_dir = os.path.dirname(os.path.abspath(aln_to_metadata.__file__)) +data_dir = os.path.join(modules_dir, 'tests', 'data') + + +class TestAlnToMetadata(unittest.TestCase): + def test_load_aln_file(self): + '''test _load_aln_file''' + aln_file = os.path.join(data_dir, 'aln_to_metadata_load_aln_file.in.fa') + expected = { + 'seq1': pyfastaq.sequences.Fasta('seq1', 'ABC-DE'), + 'seq2': pyfastaq.sequences.Fasta('seq2', 'ABCQDE'), + } + got = aln_to_metadata.AlnToMetadata._load_aln_file(aln_file) + self.assertEqual(expected, got) + + + def test_load_vars_file_good_file(self): + '''test _load_vars_file good input file''' + infile = os.path.join(data_dir, 'aln_to_metadata_load_vars_file_good.tsv') + variant1 = 
sequence_variant.Variant('p', 'A42B', 'id1') + variant2 = sequence_variant.Variant('p', 'C43D', 'id2') + variant3 = sequence_variant.Variant('p', 'E100F', 'id3') + expected = { + 'seq1': [(variant1, 'description 1')], + 'seq2': [(variant2, 'description 2'), (variant3, 'description 3')] + } + got = aln_to_metadata.AlnToMetadata._load_vars_file(infile, True) + self.assertEqual(expected, got) + + + def test_load_vars_bad_files(self): + '''test _load_vars_file bad input files''' + infiles = [ + os.path.join(data_dir, 'aln_to_metadata_load_vars_file_bad.1.tsv'), + os.path.join(data_dir, 'aln_to_metadata_load_vars_file_bad.2.tsv') + ] + + for infile in infiles: + with self.assertRaises(aln_to_metadata.Error): + aln_to_metadata.AlnToMetadata._load_vars_file(infile, True) + + + def test_make_unpadded_seqs(self): + '''test _make_unpadded_seqs''' + padded = { + 'seq1': pyfastaq.sequences.Fasta('seq1', 'acg---t'), + 'seq2': pyfastaq.sequences.Fasta('seq2', '---a-cgt-'), + } + expected = { + 'seq1': pyfastaq.sequences.Fasta('seq1', 'acgt'), + 'seq2': pyfastaq.sequences.Fasta('seq2', 'acgt'), + } + got = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded) + self.assertEqual(expected, got) + + + def test_check_seq_lengths_same(self): + '''test _check_seq_lengths_same''' + seqs = { + 'seq1': pyfastaq.sequences.Fasta('seq1', 'acgt'), + 'seq2': pyfastaq.sequences.Fasta('seq2', 'acgt'), + } + + self.assertTrue(aln_to_metadata.AlnToMetadata._check_seq_lengths_same(seqs)) + seqs['seq1'].seq = 'a' + with self.assertRaises(aln_to_metadata.Error): + aln_to_metadata.AlnToMetadata._check_seq_lengths_same(seqs) + + + def test_insertion_coords(self): + '''test _insertion_coords''' + ivl = pyfastaq.intervals.Interval + tests = [ + ('acgt', []), + ('-a', [pyfastaq.intervals.Interval(0, 0)]), + ('a---cgt--', [pyfastaq.intervals.Interval(1, 3), pyfastaq.intervals.Interval(7, 8)]), + ] + + for seq, expected in tests: + fa = pyfastaq.sequences.Fasta('x', seq) + got = 
aln_to_metadata.AlnToMetadata._insertion_coords(fa) + self.assertEqual(expected, got) + + + def test_make_unpadded_insertion_coords(self): + '''test _make_unpadded_insertion_coords''' + seqs = { + 'seq1': pyfastaq.sequences.Fasta('seq1', 'acgt'), + 'seq2': pyfastaq.sequences.Fasta('seq2', 'ac-gt'), + 'seq3': pyfastaq.sequences.Fasta('seq3', '--acg-t'), + } + + expected = { + 'seq1': [], + 'seq2': [pyfastaq.intervals.Interval(2, 2)], + 'seq3': [pyfastaq.intervals.Interval(0, 1), pyfastaq.intervals.Interval(5, 5)], + + } + got = aln_to_metadata.AlnToMetadata._make_unpadded_insertion_coords(seqs) + self.assertEqual(expected, got) + + + def test_check_insertion_coords(self): + '''test _check_insertion_coords''' + seq = pyfastaq.sequences.Fasta('name', 'AAA---GGG------TTT---') + self.assertTrue(aln_to_metadata.AlnToMetadata._check_insertion_coords(seq)) + + bad_seqs = [ + pyfastaq.sequences.Fasta('name', 'AAA--GGG'), # bad length + pyfastaq.sequences.Fasta('name', 'A---AA'), # bad start position + pyfastaq.sequences.Fasta('name', 'AA---AA'), # bad start position + ] + + for seq in bad_seqs: + with self.assertRaises(aln_to_metadata.Error): + aln_to_metadata.AlnToMetadata._check_insertion_coords(seq) + + + def test_check_coding_seq(self): + '''test _check_coding_seq''' + seq = pyfastaq.sequences.Fasta('name', 'ATGCTTTAG') + self.assertTrue(aln_to_metadata.AlnToMetadata._check_coding_seq(seq)) + + bad_seqs = [ + pyfastaq.sequences.Fasta('name', 'TTGCTTAG'), # length not a mutliple of 3 + pyfastaq.sequences.Fasta('name', 'TTTCTTTAG'), # no start codon + pyfastaq.sequences.Fasta('name', 'ATGTAGCTTTAG'), # stop codon in middle + pyfastaq.sequences.Fasta('name', 'TTGCTTTTT'), # no stop at end + ] + + for seq in bad_seqs: + with self.assertRaises(aln_to_metadata.Error): + aln_to_metadata.AlnToMetadata._check_coding_seq(seq) + + + def test_check_sequences_non_coding(self): + '''test _check_sequences with noncoding seqs''' + padded_sequences = { + 'seq1': 
pyfastaq.sequences.Fasta('seq1', 'AC-T') + } + + unpadded_sequences = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_sequences) + self.assertTrue(aln_to_metadata.AlnToMetadata._check_sequences(padded_sequences, unpadded_sequences, False)) + padded_sequences['seq2'] = pyfastaq.sequences.Fasta('seq2', 'AC-') + unpadded_sequences = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_sequences) + with self.assertRaises(aln_to_metadata.Error): + aln_to_metadata.AlnToMetadata._check_sequences(padded_sequences, unpadded_sequences, False) + + + def test_check_sequences_coding(self): + '''test _check_sequences with coding seqs''' + padded_sequences = { + 'seq1': pyfastaq.sequences.Fasta('seq1', 'ATGCTTTAG'), + 'seq2': pyfastaq.sequences.Fasta('seq2', 'ATG---TAG') + } + + unpadded_sequences = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_sequences) + + self.assertTrue(aln_to_metadata.AlnToMetadata._check_sequences(padded_sequences, unpadded_sequences, True)) + + bad_seqs = [ + 'ATGCTTAG', # length not a mutliple of 3 + 'TTTCTTTAG', # no start codon + 'ATGTAGCTTTAG', # stop codon in middle + 'ATGTTTTTT', # no stop at end + 'ATGC---TTTAG', # bad insertion + 'ATGCT---TTAG', # bad insertion + 'ATG-CTTTAG', # bad insertion + 'ATG--CTTTAG', # bad insertion + 'ATG----CTTTAG', # bad insertion + ] + + for seq in bad_seqs: + padded_sequences['seq2'] = pyfastaq.sequences.Fasta('seq2', seq) + unpadded_sequences = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_sequences) + with self.assertRaises(aln_to_metadata.Error): + aln_to_metadata.AlnToMetadata._check_sequences(padded_sequences, unpadded_sequences, True) + + + def test_check_variants_match_sequences(self): + '''test _check_variants_match_sequences''' + seqs = { + 'seq1': pyfastaq.sequences.Fasta('seq1', 'ATGCTTTAG'), + 'seq2': pyfastaq.sequences.Fasta('seq2', 'ATGCTTCTTTAG'), + 'seq3': pyfastaq.sequences.Fasta('seq3', 'ATG---TAG') + } + + variants = {'seq1': [(sequence_variant.Variant('p', 'L2M', 
'id1'), 'description1')]} + self.assertTrue(aln_to_metadata.AlnToMetadata._check_variants_match_sequences(seqs, variants, True)) + variants = {'seq1': [(sequence_variant.Variant('p', 'M2L', 'id1'), 'description1')]} + self.assertTrue(aln_to_metadata.AlnToMetadata._check_variants_match_sequences(seqs, variants, True)) + + variants = {'seq1': [(sequence_variant.Variant('p', 'A2M', 'id1'), 'description1')]} + with self.assertRaises(aln_to_metadata.Error): + self.assertTrue(aln_to_metadata.AlnToMetadata._check_variants_match_sequences(seqs, variants, True)) + + variants = {'seq4': [(sequence_variant.Variant('p', 'A2M', 'id1'), 'description1')]} + with self.assertRaises(aln_to_metadata.Error): + self.assertTrue(aln_to_metadata.AlnToMetadata._check_variants_match_sequences(seqs, variants, True)) + + + def test_variant_ids_are_unique(self): + '''test variant_ids_are_unique''' + variants = { + 'seq1': [(sequence_variant.Variant('p', 'L2M', 'id1'), 'description1')], + 'seq2': [(sequence_variant.Variant('p', 'L2M', 'id2'), 'description2')] + } + + self.assertTrue(aln_to_metadata.AlnToMetadata._variant_ids_are_unique(variants)) + variants['seq2'].append((sequence_variant.Variant('p', 'I3K', 'id1'), 'description3')) + with self.assertRaises(aln_to_metadata.Error): + self.assertTrue(aln_to_metadata.AlnToMetadata._variant_ids_are_unique(variants)) + + + def test_unpadded_to_padded_nt_position(self): + '''test _unpadded_to_padded_nt_position''' + ivl = pyfastaq.intervals.Interval + + tests = [ + (0, [], 0), + (1, [], 1), + (2, [], 2), + (0, [ivl(3, 5)], 0), + (1, [ivl(3, 5)], 1), + (2, [ivl(3, 5)], 2), + (3, [ivl(3, 5)], 6), + (4, [ivl(3, 5)], 7), + (5, [ivl(3, 5)], 8), + (0, [ivl(3, 5), ivl(9,14)], 0), + (1, [ivl(3, 5), ivl(9,14)], 1), + (2, [ivl(3, 5), ivl(9,14)], 2), + (3, [ivl(3, 5), ivl(9,14)], 6), + (4, [ivl(3, 5), ivl(9,14)], 7), + (5, [ivl(3, 5), ivl(9,14)], 8), + (6, [ivl(3, 5), ivl(9,14)], 15), + (7, [ivl(3, 5), ivl(9,14)], 16), + (8, [ivl(3, 5), ivl(9,14)], 17), + ] + 
+ for position, insertions, expected in tests: + got = aln_to_metadata.AlnToMetadata._unpadded_to_padded_nt_position(position, insertions) + self.assertEqual(expected, got) + + + def test_padded_to_unpadded_nt_position(self): + '''test _padded_to_unpadded_nt_position''' + ivl = pyfastaq.intervals.Interval + + tests = [ + (0, [], 0), + (1, [], 1), + (2, [], 2), + (0, [ivl(3, 5)], 0), + (1, [ivl(3, 5)], 1), + (2, [ivl(3, 5)], 2), + (3, [ivl(3, 5)], None), + (4, [ivl(3, 5)], None), + (5, [ivl(3, 5)], None), + (6, [ivl(3, 5)], 3), + (7, [ivl(3, 5)], 4), + (8, [ivl(3, 5)], 5), + (0, [ivl(3, 5), ivl(7,10)], 0), + (1, [ivl(3, 5), ivl(7,10)], 1), + (2, [ivl(3, 5), ivl(7,10)], 2), + (3, [ivl(3, 5), ivl(7,10)], None), + (4, [ivl(3, 5), ivl(7,10)], None), + (5, [ivl(3, 5), ivl(7,10)], None), + (6, [ivl(3, 5), ivl(7,10)], 3), + (7, [ivl(3, 5), ivl(7,10)], None), + (8, [ivl(3, 5), ivl(7,10)], None), + (9, [ivl(3, 5), ivl(7,10)], None), + (10, [ivl(3, 5), ivl(7,10)], None), + (11, [ivl(3, 5), ivl(7,10)], 4), + (12, [ivl(3, 5), ivl(7,10)], 5), + ] + + for position, insertions, expected in tests: + got = aln_to_metadata.AlnToMetadata._padded_to_unpadded_nt_position(position, insertions) + self.assertEqual(expected, got) + + + def test_variants_to_tsv_lines_coding(self): + '''test _variants_to_tsv_lines coding sequences''' + padded_seqs = { + 'seq1': pyfastaq.sequences.Fasta('seq1', 'ATG---GCTAATTAG'), # M-AN* + 'seq2': pyfastaq.sequences.Fasta('seq2', 'ATG---GCTAATTAG'), # MFAN* + 'seq3': pyfastaq.sequences.Fasta('seq3', 'ATGTTT---AATTAG'), # MF-N* + 'seq4': pyfastaq.sequences.Fasta('seq4', 'ATGTTTTGTAATTAG'), # MFCN* + 'seq5': pyfastaq.sequences.Fasta('seq5', 'ATGTTTGATAATTAG'), # MFDN* + } + + unpadded_seqs = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_seqs) + insertions = aln_to_metadata.AlnToMetadata._make_unpadded_insertion_coords(padded_seqs) + + variant1 = sequence_variant.Variant('p', 'A2D', 'id1') + variant2 = sequence_variant.Variant('p', 'F2E', 'id2') + 
variants = { + 'seq1': [(variant1, 'description 1')], + 'seq5': [(variant2, 'description 2')], + } + + expected = [ + 'seq1\tp\tA2D\tid1\tdescription 1', + 'seq2\tp\tA2D\tid1\tdescription 1', + 'seq4\tp\tC3D\tid1\tdescription 1', + 'seq5\tp\tA3D\tid1\tdescription 1', + 'seq5\tp\tF2E\tid2\tdescription 2', + 'seq3\tp\tF2E\tid2\tdescription 2', + 'seq4\tp\tF2E\tid2\tdescription 2', + ] + + got = aln_to_metadata.AlnToMetadata._variants_to_tsv_lines(variants, unpadded_seqs, padded_seqs, insertions, True) + self.assertEqual(expected, got) + + + def test_variants_to_tsv_lines_noncoding(self): + '''test _variants_to_tsv_lines noncoding sequences''' + padded_seqs = { + 'seq1': pyfastaq.sequences.Fasta('seq1', 'ATG---GCTAATTAG'), + 'seq2': pyfastaq.sequences.Fasta('seq2', 'ATG---GCTAATTAG'), + 'seq3': pyfastaq.sequences.Fasta('seq3', 'ATGTAT---AATTAG'), + 'seq4': pyfastaq.sequences.Fasta('seq4', 'ATGTGTTGTAATTAG'), + 'seq5': pyfastaq.sequences.Fasta('seq5', 'ATGTTTGATAATTAG'), + } + + unpadded_seqs = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_seqs) + unpadded_aa_seqs = {x: unpadded_seqs[x].translate() for x in unpadded_seqs} + insertions = aln_to_metadata.AlnToMetadata._make_unpadded_insertion_coords(padded_seqs) + + variant1 = sequence_variant.Variant('n', 'C5T', 'id1') + variant2 = sequence_variant.Variant('n', 'A5T', 'id2') + variants = { + 'seq1': [(variant1, 'description 1')], + 'seq5': [(variant2, 'description 2')], + } + + expected = [ + 'seq1\tn\tC5T\tid1\tdescription 1', + 'seq2\tn\tC5T\tid1\tdescription 1', + 'seq4\tn\tG8T\tid1\tdescription 1', + 'seq5\tn\tA8T\tid1\tdescription 1', + 'seq5\tn\tA5T\tid2\tdescription 2', + 'seq3\tn\tA5T\tid2\tdescription 2', + 'seq4\tn\tG5T\tid2\tdescription 2', + ] + + got = aln_to_metadata.AlnToMetadata._variants_to_tsv_lines(variants, unpadded_seqs, padded_seqs, insertions, False) + self.assertEqual(expected, got) + + + def test_make_cluster_file(self): + '''test _make_cluster_file''' + seqs = { + 'seq1': 
pyfastaq.sequences.Fasta('seq1', 'a'), + 'seq2': pyfastaq.sequences.Fasta('seq2', 'c'), + 'seq3': pyfastaq.sequences.Fasta('seq3', 'g'), + } + tmpfile = 'tmp.aln_to_meta_test_make_cluster_file.out' + expected_file = os.path.join(data_dir, 'aln_to_metadata_make_cluster_file.out') + + with self.assertRaises(aln_to_metadata.Error): + aln_to_metadata.AlnToMetadata._make_cluster_file('not_found', seqs, tmpfile) + + aln_to_metadata.AlnToMetadata._make_cluster_file('seq2', seqs, tmpfile) + self.assertTrue(filecmp.cmp(expected_file, tmpfile, shallow=False)) + os.unlink(tmpfile) + + + def test_run_coding(self): + '''test run coding sequences''' + fa_in = os.path.join(data_dir, 'aln_to_metadata_run_coding.in.fa') + fa_expected = os.path.join(data_dir, 'aln_to_metadata_run_coding.out.fa') + tsv_in = os.path.join(data_dir, 'aln_to_metadata_run_coding.in.tsv') + tsv_expected = os.path.join(data_dir, 'aln_to_metadata_run_coding.out.tsv') + cluster_expected = os.path.join(data_dir, 'aln_to_metadata_run_coding.out.cluster') + a_to_m = aln_to_metadata.AlnToMetadata(fa_in, tsv_in, True, 'seq3') + outprefix = 'tmp.test.aln_to_metadata.run_coding' + a_to_m.run(outprefix) + self.assertTrue(filecmp.cmp(tsv_expected, outprefix + '.tsv', shallow=False)) + self.assertTrue(filecmp.cmp(fa_expected, outprefix + '.fa', shallow=False)) + self.assertTrue(filecmp.cmp(cluster_expected, outprefix + '.cluster', shallow=False)) + os.unlink(outprefix + '.tsv') + os.unlink(outprefix + '.fa') + os.unlink(outprefix + '.cluster') + + + def test_run_noncoding(self): + '''test run noncoding sequences''' + fa_in = os.path.join(data_dir, 'aln_to_metadata_run_noncoding.in.fa') + fa_expected = os.path.join(data_dir, 'aln_to_metadata_run_noncoding.out.fa') + tsv_in = os.path.join(data_dir, 'aln_to_metadata_run_noncoding.in.tsv') + tsv_expected = os.path.join(data_dir, 'aln_to_metadata_run_noncoding.out.tsv') + cluster_expected = os.path.join(data_dir, 'aln_to_metadata_run_noncoding.out.cluster') + a_to_m = 
aln_to_metadata.AlnToMetadata(fa_in, tsv_in, False, 'seq2') + outprefix = 'tmp.test.aln_to_metadata.run_noncoding' + a_to_m.run(outprefix) + self.assertTrue(filecmp.cmp(tsv_expected, outprefix + '.tsv', shallow=False)) + self.assertTrue(filecmp.cmp(fa_expected, outprefix + '.fa', shallow=False)) + self.assertTrue(filecmp.cmp(cluster_expected, outprefix + '.cluster', shallow=False)) + os.unlink(outprefix + '.tsv') + os.unlink(outprefix + '.fa') + os.unlink(outprefix + '.cluster') + diff --git a/ariba/tests/assembly_variants_test.py b/ariba/tests/assembly_variants_test.py index c8dea8bb..6061b292 100644 --- a/ariba/tests/assembly_variants_test.py +++ b/ariba/tests/assembly_variants_test.py @@ -120,8 +120,8 @@ def test_get_one_variant_for_one_contig_non_coding(self): # ref has T at position 5, which is wild type. This gives contig variant type A. Should report v2 = pymummer.variant.Variant(pymummer.snp.Snp('5\tT\tA\t5\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig')) - meta0 = sequence_metadata.SequenceMetadata('non_coding\tn\tC3A\tref has variant type A') - meta2 = sequence_metadata.SequenceMetadata('non_coding\tn\tT5A\tref has wild type T') + meta0 = sequence_metadata.SequenceMetadata('non_coding\tn\tC3A\tid1\tref has variant type A') + meta2 = sequence_metadata.SequenceMetadata('non_coding\tn\tT5A\tid1\tref has wild type T') mummer_variants = [v0, v1, v2] @@ -188,8 +188,8 @@ def test_get_one_variant_for_one_contig_coding(self): mummer_variants = [[v0], [v1], [v2], [v3], [v4], [v5], [v6], [v7], [v8], [v9], [v10]] - meta0 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD2E\tref has wild type D (GAT=D, GAA=E)') - meta4 = sequence_metadata.SequenceMetadata('presence_absence\tp\tS3R\tref has variant type R (AGA=R, AGT=S)') + meta0 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD2E\tid1\tref has wild type D (GAT=D, GAA=E)') + meta4 = sequence_metadata.SequenceMetadata('presence_absence\tp\tS3R\tid1\tref has variant type R (AGA=R, AGT=S)') expected_tuples 
= [ (1, 'p', 'D2E', 'NONSYN', [v0], {meta0}, set()), #0 @@ -230,13 +230,13 @@ def test_get_one_variant_for_one_contig_coding(self): def test_get_remaining_known_ref_variants_amino_acids(self): '''test _get_remaining_known_ref_variants with amino acids''' - ref_var1 = sequence_metadata.SequenceMetadata('gene1\tp\tD2E\tfoo bar') - ref_var2 = sequence_metadata.SequenceMetadata('gene1\tp\tD3E\tfoo bar baz') - ref_var3 = sequence_metadata.SequenceMetadata('gene1\tp\tD3I\tfoo bar baz spam') - ref_var4 = sequence_metadata.SequenceMetadata('gene1\tp\tD10E\tfoo bar baz spam egg') - ref_var5 = sequence_metadata.SequenceMetadata('gene1\tp\tD14E\tfoo bar baz spam egg chips') - ref_var6 = sequence_metadata.SequenceMetadata('gene1\tp\tD15E\tfoo bar baz spam egg chips') - ref_var7 = sequence_metadata.SequenceMetadata('gene1\tp\tD40E\tfoo bar baz spam egg chips') + ref_var1 = sequence_metadata.SequenceMetadata('gene1\tp\tD2E\tid1\tfoo bar') + ref_var2 = sequence_metadata.SequenceMetadata('gene1\tp\tD3E\tid1\tfoo bar baz') + ref_var3 = sequence_metadata.SequenceMetadata('gene1\tp\tD3I\tid1\tfoo bar baz spam') + ref_var4 = sequence_metadata.SequenceMetadata('gene1\tp\tD10E\tid1\tfoo bar baz spam egg') + ref_var5 = sequence_metadata.SequenceMetadata('gene1\tp\tD14E\tid1\tfoo bar baz spam egg chips') + ref_var6 = sequence_metadata.SequenceMetadata('gene1\tp\tD15E\tid1\tfoo bar baz spam egg chips') + ref_var7 = sequence_metadata.SequenceMetadata('gene1\tp\tD40E\tid1\tfoo bar baz spam egg chips') known_ref_variants = { 1: {ref_var1}, @@ -261,13 +261,13 @@ def test_get_remaining_known_ref_variants_amino_acids(self): def test_get_remaining_known_ref_variants_nucleotides(self): '''test _get_remaining_known_ref_variants with nucleotides''' - ref_var1 = sequence_metadata.SequenceMetadata('gene1\tn\tA2C\tfoo bar') - ref_var2 = sequence_metadata.SequenceMetadata('gene1\tn\tA3C\tfoo bar baz') - ref_var3 = sequence_metadata.SequenceMetadata('gene1\tn\tA3T\tfoo bar baz spam') - ref_var4 = 
sequence_metadata.SequenceMetadata('gene1\tn\tA10C\tfoo bar baz spam egg') - ref_var5 = sequence_metadata.SequenceMetadata('gene1\tn\tA14C\tfoo bar baz spam egg chips') - ref_var6 = sequence_metadata.SequenceMetadata('gene1\tn\tA15C\tfoo bar baz spam egg chips') - ref_var7 = sequence_metadata.SequenceMetadata('gene1\tn\tA40C\tfoo bar baz spam egg chips') + ref_var1 = sequence_metadata.SequenceMetadata('gene1\tn\tA2C\tid1\tfoo bar') + ref_var2 = sequence_metadata.SequenceMetadata('gene1\tn\tA3C\tid1\tfoo bar baz') + ref_var3 = sequence_metadata.SequenceMetadata('gene1\tn\tA3T\tid1\tfoo bar baz spam') + ref_var4 = sequence_metadata.SequenceMetadata('gene1\tn\tA10C\tid1\tfoo bar baz spam egg') + ref_var5 = sequence_metadata.SequenceMetadata('gene1\tn\tA14C\tid1\tfoo bar baz spam egg chips') + ref_var6 = sequence_metadata.SequenceMetadata('gene1\tn\tA15C\tid1\tfoo bar baz spam egg chips') + ref_var7 = sequence_metadata.SequenceMetadata('gene1\tn\tA40C\tid1\tfoo bar baz spam egg chips') known_ref_variants = { 1: {ref_var1}, @@ -292,11 +292,11 @@ def test_get_remaining_known_ref_variants_nucleotides(self): def test_get_variants_presence_absence(self): '''test get_variants presence absence genes''' - meta1 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD2E\tref has wild type D, contig has var (GAT=D, GAA=E)') - meta2 = sequence_metadata.SequenceMetadata('presence_absence\tp\tS3R\tref has variant type R, contig has wild (AGA=R, AGT=S)') - meta3 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD4E\tref has variant type E, contig has var (GAA=E, GAC=D)') - meta4 = sequence_metadata.SequenceMetadata('presence_absence\tp\tA5D\tref has wild type A, contig has var (GCG=A, GAC=D)') - meta5 = sequence_metadata.SequenceMetadata('presence_absence\tp\tR13S\tref and qry have wild type') + meta1 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD2E\tid1\tref has wild type D, contig has var (GAT=D, GAA=E)') + meta2 = 
sequence_metadata.SequenceMetadata('presence_absence\tp\tS3R\tid1\tref has variant type R, contig has wild (AGA=R, AGT=S)') + meta3 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD4E\tid1\tref has variant type E, contig has var (GAA=E, GAC=D)') + meta4 = sequence_metadata.SequenceMetadata('presence_absence\tp\tA5D\tid1\tref has wild type A, contig has var (GCG=A, GAC=D)') + meta5 = sequence_metadata.SequenceMetadata('presence_absence\tp\tR13S\tid1\tref and qry have wild type') metadata_tsv = 'tmp.test_get_variants_presence_absence.metadata.tsv' with open(metadata_tsv, 'w') as f: @@ -343,9 +343,9 @@ def test_get_variants_presence_absence(self): def test_get_variants_variants_only(self): '''test get_variants variants only''' - meta1 = sequence_metadata.SequenceMetadata('variants_only\tp\tD2E\tref has wild type D (GAT=D, GAA=E)') - meta2 = sequence_metadata.SequenceMetadata('variants_only\tp\tS3R\tref has variant type R (AGA=R, AGT=S)') - meta3 = sequence_metadata.SequenceMetadata('variants_only\tp\tD4E\tref has variant type E (GAA=E, GAC=D)') + meta1 = sequence_metadata.SequenceMetadata('variants_only\tp\tD2E\tid1\tref has wild type D (GAT=D, GAA=E)') + meta2 = sequence_metadata.SequenceMetadata('variants_only\tp\tS3R\tid1\tref has variant type R (AGA=R, AGT=S)') + meta3 = sequence_metadata.SequenceMetadata('variants_only\tp\tD4E\tid1\tref has variant type E (GAA=E, GAC=D)') metadata_tsv = 'tmp.test_get_variants_variants_only.metadata.tsv' with open(metadata_tsv, 'w') as f: diff --git a/ariba/tests/cluster_test.py b/ariba/tests/cluster_test.py index 7ccdd61d..6e959eb0 100644 --- a/ariba/tests/cluster_test.py +++ b/ariba/tests/cluster_test.py @@ -151,12 +151,12 @@ def test_full_run_ok_non_coding(self): c.run() expected = [ - 'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t73\t73\tT\t19\t.\t19\tnoncoding1_n_A14T_ref has wild type, reads has variant so should 
report\tgeneric description of noncoding1', + 'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t73\t73\tT\t19\t.\t19\tnoncoding1:n:A14T:.:ref has wild type, reads has variant so should report\tgeneric description of noncoding1', 'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t0\t.\tn\t.\t0\tG61T\tSNP\t60\t60\tG\t120\t120\tT\t24\t.\t24\t.\tgeneric description of noncoding1', 'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t0\t.\tn\t.\t0\t.82C\tINS\t81\t81\t.\t142\t142\tC\t23\t.\t23\t.\tgeneric description of noncoding1', 'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t0\t.\tn\t.\t0\tT108.\tDEL\t107\t107\tT\t167\t167\t.\t17\t.\t17\t.\tgeneric description of noncoding1', - 'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t1\tSNP\tn\tA6G\t1\t.\t.\t6\t6\tG\t66\t66\tG\t19\t.\t19\tnoncoding1_n_A6G_variant in ref and reads so should report\tgeneric description of noncoding1', - 'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t1\tSNP\tn\tG9T\t0\t.\t.\t9\t9\tG\t69\t69\tG\t19\t.\t19\tnoncoding1_n_G9T_wild type in ref and reads\tgeneric description of noncoding1' + 'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t1\tSNP\tn\tA6G\t1\t.\t.\t6\t6\tG\t66\t66\tG\t19\t.\t19\tnoncoding1:n:A6G:.:variant in ref and reads so should report\tgeneric description of noncoding1', + 'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t1\tSNP\tn\tG9T\t0\t.\t.\t9\t9\tG\t69\t69\tG\t19\t.\t19\tnoncoding1:n:G9T:.:wild type in ref and reads\tgeneric description of noncoding1' ] self.assertEqual(expected, c.report_lines) @@ -177,12 +177,12 @@ def 
test_full_run_ok_presence_absence(self): c.run() expected = [ - 'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t1\tSNP\tp\tA10V\t1\tA10V\tNONSYN\t28\t28\tC\t83\t83\tT\t22\t.\t22\tpresence_absence1_p_A10V_Ref has wild, reads have variant so report\tGeneric description of presence_absence1', + 'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t1\tSNP\tp\tA10V\t1\tA10V\tNONSYN\t28\t28\tC\t83\t83\tT\t22\t.\t22\tpresence_absence1:p:A10V:.:Ref has wild, reads have variant so report\tGeneric description of presence_absence1', 'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t0\t.\tp\t.\t0\t.\tSYN\t53\t53\tT\t108\t108\tC\t32\t.\t32\t.\tGeneric description of presence_absence1', - 'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t62\t64\tC;G;C\t18;17;17\t.;.;.\t18;17;17\tpresence_absence1_p_R3S_Ref and assembly have wild type\tGeneric description of presence_absence1', + 'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t62\t64\tC;G;C\t18;17;17\t.;.;.\t18;17;17\tpresence_absence1:p:R3S:.:Ref and assembly have wild type\tGeneric description of presence_absence1', - 'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tG;C;G\t68\t70\tG;C;G\t18;20;20\t.;.;.\t18;20;20\tpresence_absence1_p_I5A_Ref and reads have variant so report\tGeneric description of presence_absence1', + 
'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tG;C;G\t68\t70\tG;C;G\t18;20;20\t.;.;.\t18;20;20\tpresence_absence1:p:I5A:.:Ref and reads have variant so report\tGeneric description of presence_absence1', ] self.assertEqual(expected, c.report_lines) @@ -202,7 +202,7 @@ def test_full_run_ok_variants_only_variant_not_present(self): c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=66, total_reads_bases=3300) c.run() expected = [ - 'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t65\t67\tC;G;C\t18;18;19\t.;.;.\t18;18;19\tvariants_only1_p_R3S_Ref and assembly have wild type, so do not report\tGeneric description of variants_only1' + 'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t65\t67\tC;G;C\t18;18;19\t.;.;.\t18;18;19\tvariants_only1:p:R3S:.:Ref and assembly have wild type, so do not report\tGeneric description of variants_only1' ] self.assertEqual(expected, c.report_lines) shutil.rmtree(tmpdir) @@ -221,7 +221,7 @@ def test_full_run_ok_variants_only_variant_not_present_always_report(self): c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=66, total_reads_bases=3300) c.run() expected = [ - 'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t65\t67\tC;G;C\t18;18;19\t.;.;.\t18;18;19\tvariants_only1_p_R3S_Ref and assembly have wild type, but always report anyway\tGeneric description of variants_only1' + 
'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t65\t67\tC;G;C\t18;18;19\t.;.;.\t18;18;19\tvariants_only1:p:R3S:.:Ref and assembly have wild type, but always report anyway\tGeneric description of variants_only1' ] self.assertEqual(expected, c.report_lines) shutil.rmtree(tmpdir) @@ -241,8 +241,8 @@ def test_full_run_ok_variants_only_variant_is_present(self): c.run() expected = [ - 'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t65\t67\tC;G;C\t18;18;19\t.;.;.\t18;18;19\tvariants_only1_p_R3S_Ref and assembly have wild type\tGeneric description of variants_only1', - 'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tG;C;G\t71\t73\tG;C;G\t17;17;17\t.;.;.\t17;17;17\tvariants_only1_p_I5A_Ref and reads have variant so report\tGeneric description of variants_only1', + 'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t65\t67\tC;G;C\t18;18;19\t.;.;.\t18;18;19\tvariants_only1:p:R3S:.:Ref and assembly have wild type\tGeneric description of variants_only1', + 'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tG;C;G\t71\t73\tG;C;G\t17;17;17\t.;.;.\t17;17;17\tvariants_only1:p:I5A:.:Ref and reads have variant so report\tGeneric description of variants_only1', ] self.assertEqual(expected, c.report_lines) shutil.rmtree(tmpdir) diff --git a/ariba/tests/clusters_test.py b/ariba/tests/clusters_test.py index a263de0e..60ec0c41 100644 --- a/ariba/tests/clusters_test.py +++ b/ariba/tests/clusters_test.py @@ -63,14 +63,14 @@ def test_load_reference_data_from_dir(self): expected_metadata = { 'presabs1': { - '.': 
{sequence_metadata.SequenceMetadata('presabs1\t.\t.\tpresabs1 description')}, + '.': {sequence_metadata.SequenceMetadata('presabs1\t.\t.\t.\tpresabs1 description')}, 'n': {}, 'p': {} }, 'variants_only1': { '.': set(), 'n': {}, - 'p': {1: {sequence_metadata.SequenceMetadata('variants_only1\tp\tC2I\tdescription of variants_only1 C2I')}} + 'p': {1: {sequence_metadata.SequenceMetadata('variants_only1\tp\tC2I\t.\tdescription of variants_only1 C2I')}} } } self.assertEqual(expected_metadata, got_refdata.metadata) diff --git a/ariba/tests/data/aln_to_metadata_load_aln_file.in.fa b/ariba/tests/data/aln_to_metadata_load_aln_file.in.fa new file mode 100644 index 00000000..4c7c0fe7 --- /dev/null +++ b/ariba/tests/data/aln_to_metadata_load_aln_file.in.fa @@ -0,0 +1,4 @@ +>seq1 +ABC-DE +>seq2 +ABCQDE diff --git a/ariba/tests/data/aln_to_metadata_load_vars_file_bad.1.tsv b/ariba/tests/data/aln_to_metadata_load_vars_file_bad.1.tsv new file mode 100644 index 00000000..6152c4a8 --- /dev/null +++ b/ariba/tests/data/aln_to_metadata_load_vars_file_bad.1.tsv @@ -0,0 +1,2 @@ +seq1 A42B id1 description 1 +seq2 C43D id2 diff --git a/ariba/tests/data/aln_to_metadata_load_vars_file_bad.2.tsv b/ariba/tests/data/aln_to_metadata_load_vars_file_bad.2.tsv new file mode 100644 index 00000000..da6dd350 --- /dev/null +++ b/ariba/tests/data/aln_to_metadata_load_vars_file_bad.2.tsv @@ -0,0 +1,2 @@ +seq1 A42B id1 description 1 +seq2 wrong_format id2 description 2 diff --git a/ariba/tests/data/aln_to_metadata_load_vars_file_good.tsv b/ariba/tests/data/aln_to_metadata_load_vars_file_good.tsv new file mode 100644 index 00000000..058b1dee --- /dev/null +++ b/ariba/tests/data/aln_to_metadata_load_vars_file_good.tsv @@ -0,0 +1,3 @@ +seq1 A42B id1 description 1 +seq2 C43D id2 description 2 +seq2 E100F id3 description 3 diff --git a/ariba/tests/data/aln_to_metadata_make_cluster_file.out b/ariba/tests/data/aln_to_metadata_make_cluster_file.out new file mode 100644 index 00000000..529a3cc6 --- /dev/null +++ 
b/ariba/tests/data/aln_to_metadata_make_cluster_file.out @@ -0,0 +1 @@ +seq2 seq1 seq3 diff --git a/ariba/tests/data/aln_to_metadata_run_coding.in.fa b/ariba/tests/data/aln_to_metadata_run_coding.in.fa new file mode 100644 index 00000000..c71f8c11 --- /dev/null +++ b/ariba/tests/data/aln_to_metadata_run_coding.in.fa @@ -0,0 +1,10 @@ +>seq1 +ATG---GCTAATTAG +>seq2 +ATG---GCTAATTAG +>seq3 +ATGTTT---AATTAG +>seq4 +ATGTTTTGTAATTAG +>seq5 +ATGTTTGATAATTAG diff --git a/ariba/tests/data/aln_to_metadata_run_coding.in.tsv b/ariba/tests/data/aln_to_metadata_run_coding.in.tsv new file mode 100644 index 00000000..552e7a51 --- /dev/null +++ b/ariba/tests/data/aln_to_metadata_run_coding.in.tsv @@ -0,0 +1,2 @@ +seq1 A2D id1 description 1 +seq5 F2E id2 description 2 diff --git a/ariba/tests/data/aln_to_metadata_run_coding.out.cluster b/ariba/tests/data/aln_to_metadata_run_coding.out.cluster new file mode 100644 index 00000000..6df8ac7b --- /dev/null +++ b/ariba/tests/data/aln_to_metadata_run_coding.out.cluster @@ -0,0 +1 @@ +seq3 seq1 seq2 seq4 seq5 diff --git a/ariba/tests/data/aln_to_metadata_run_coding.out.fa b/ariba/tests/data/aln_to_metadata_run_coding.out.fa new file mode 100644 index 00000000..97d0f121 --- /dev/null +++ b/ariba/tests/data/aln_to_metadata_run_coding.out.fa @@ -0,0 +1,10 @@ +>seq1 +ATGGCTAATTAG +>seq2 +ATGGCTAATTAG +>seq3 +ATGTTTAATTAG +>seq4 +ATGTTTTGTAATTAG +>seq5 +ATGTTTGATAATTAG diff --git a/ariba/tests/data/aln_to_metadata_run_coding.out.tsv b/ariba/tests/data/aln_to_metadata_run_coding.out.tsv new file mode 100644 index 00000000..ee957fa6 --- /dev/null +++ b/ariba/tests/data/aln_to_metadata_run_coding.out.tsv @@ -0,0 +1,7 @@ +seq1 p A2D id1 description 1 +seq2 p A2D id1 description 1 +seq4 p C3D id1 description 1 +seq5 p A3D id1 description 1 +seq5 p F2E id2 description 2 +seq3 p F2E id2 description 2 +seq4 p F2E id2 description 2 diff --git a/ariba/tests/data/aln_to_metadata_run_noncoding.in.fa b/ariba/tests/data/aln_to_metadata_run_noncoding.in.fa new 
file mode 100644 index 00000000..2bc56571 --- /dev/null +++ b/ariba/tests/data/aln_to_metadata_run_noncoding.in.fa @@ -0,0 +1,10 @@ +>seq1 +ATG---GCTAATTAG +>seq2 +ATG---GCTAATTAG +>seq3 +ATGTAT---AATTAG +>seq4 +ATGTGTTGTAATTAG +>seq5 +ATGTTTGATAATTAG diff --git a/ariba/tests/data/aln_to_metadata_run_noncoding.in.tsv b/ariba/tests/data/aln_to_metadata_run_noncoding.in.tsv new file mode 100644 index 00000000..3d32d779 --- /dev/null +++ b/ariba/tests/data/aln_to_metadata_run_noncoding.in.tsv @@ -0,0 +1,2 @@ +seq1 C5T id1 description 1 +seq5 A5T id2 description 2 diff --git a/ariba/tests/data/aln_to_metadata_run_noncoding.out.cluster b/ariba/tests/data/aln_to_metadata_run_noncoding.out.cluster new file mode 100644 index 00000000..aee4e5a9 --- /dev/null +++ b/ariba/tests/data/aln_to_metadata_run_noncoding.out.cluster @@ -0,0 +1 @@ +seq2 seq1 seq3 seq4 seq5 diff --git a/ariba/tests/data/aln_to_metadata_run_noncoding.out.fa b/ariba/tests/data/aln_to_metadata_run_noncoding.out.fa new file mode 100644 index 00000000..e737be69 --- /dev/null +++ b/ariba/tests/data/aln_to_metadata_run_noncoding.out.fa @@ -0,0 +1,10 @@ +>seq1 +ATGGCTAATTAG +>seq2 +ATGGCTAATTAG +>seq3 +ATGTATAATTAG +>seq4 +ATGTGTTGTAATTAG +>seq5 +ATGTTTGATAATTAG diff --git a/ariba/tests/data/aln_to_metadata_run_noncoding.out.tsv b/ariba/tests/data/aln_to_metadata_run_noncoding.out.tsv new file mode 100644 index 00000000..7ba82bf3 --- /dev/null +++ b/ariba/tests/data/aln_to_metadata_run_noncoding.out.tsv @@ -0,0 +1,7 @@ +seq1 n C5T id1 description 1 +seq2 n C5T id1 description 1 +seq4 n G8T id1 description 1 +seq5 n A8T id1 description 1 +seq5 n A5T id2 description 2 +seq3 n A5T id2 description 2 +seq4 n G5T id2 description 2 diff --git a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv index 5d0fd041..f1e3583e 100644 --- 
a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv +++ b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv @@ -1,11 +1,11 @@ -presence_absence p D2E ref has wild type D (GAT=D, GAA=E) -presence_absence p S3R ref has variant type R (AGA=R, AGT=S) -presence_absence p D4E ref has variant type E (GAA=E, GAC=D) -presence_absence p A5D ref has wild type A (GCG=A) -variants_only p D2E ref has wild type D (GAT=D, GAA=E) -variants_only p S3R ref has variant type R (AGA=R, AGT=S) -variants_only p D4E ref has variant type E (GAA=E, GAC=D) -variants_only p A5D ref has wild type A (GCG=A) -non_coding n C3A ref has variant type A -non_coding n T5A ref has wild type T -non_coding n C6G ref has variant type G +presence_absence p D2E id1 ref has wild type D (GAT=D, GAA=E) +presence_absence p S3R id1 ref has variant type R (AGA=R, AGT=S) +presence_absence p D4E id1 ref has variant type E (GAA=E, GAC=D) +presence_absence p A5D id1 ref has wild type A (GCG=A) +variants_only p D2E id1 ref has wild type D (GAT=D, GAA=E) +variants_only p S3R id1 ref has variant type R (AGA=R, AGT=S) +variants_only p D4E id1 ref has variant type E (GAA=E, GAC=D) +variants_only p A5D id1 ref has wild type A (GCG=A) +non_coding n C3A id1 ref has variant type A +non_coding n T5A id1 ref has wild type T +non_coding n C6G id1 ref has variant type G diff --git a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv index 5d0fd041..f1e3583e 100644 --- a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv +++ b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv @@ -1,11 +1,11 @@ -presence_absence p D2E ref has wild type D (GAT=D, GAA=E) -presence_absence p S3R ref has variant type R (AGA=R, AGT=S) 
-presence_absence p D4E ref has variant type E (GAA=E, GAC=D) -presence_absence p A5D ref has wild type A (GCG=A) -variants_only p D2E ref has wild type D (GAT=D, GAA=E) -variants_only p S3R ref has variant type R (AGA=R, AGT=S) -variants_only p D4E ref has variant type E (GAA=E, GAC=D) -variants_only p A5D ref has wild type A (GCG=A) -non_coding n C3A ref has variant type A -non_coding n T5A ref has wild type T -non_coding n C6G ref has variant type G +presence_absence p D2E id1 ref has wild type D (GAT=D, GAA=E) +presence_absence p S3R id1 ref has variant type R (AGA=R, AGT=S) +presence_absence p D4E id1 ref has variant type E (GAA=E, GAC=D) +presence_absence p A5D id1 ref has wild type A (GCG=A) +variants_only p D2E id1 ref has wild type D (GAT=D, GAA=E) +variants_only p S3R id1 ref has variant type R (AGA=R, AGT=S) +variants_only p D4E id1 ref has variant type E (GAA=E, GAC=D) +variants_only p A5D id1 ref has wild type A (GCG=A) +non_coding n C3A id1 ref has variant type A +non_coding n T5A id1 ref has wild type T +non_coding n C6G id1 ref has variant type G diff --git a/ariba/tests/data/cluster_test_full_run_ok_non_coding.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_non_coding.metadata.tsv index ba79712a..97cfd32f 100644 --- a/ariba/tests/data/cluster_test_full_run_ok_non_coding.metadata.tsv +++ b/ariba/tests/data/cluster_test_full_run_ok_non_coding.metadata.tsv @@ -1,5 +1,5 @@ -noncoding1 . . generic description of noncoding1 -noncoding1 n A6G variant in ref and reads so should report -noncoding1 n G9T wild type in ref and reads -noncoding1 n A14T ref has wild type, reads has variant so should report -noncoding1 n A40C ref has variant, reads has wild type +noncoding1 . . . generic description of noncoding1 +noncoding1 n A6G . variant in ref and reads so should report +noncoding1 n G9T . wild type in ref and reads +noncoding1 n A14T . ref has wild type, reads has variant so should report +noncoding1 n A40C . 
ref has variant, reads has wild type diff --git a/ariba/tests/data/cluster_test_full_run_ok_presence_absence.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_presence_absence.metadata.tsv index 8adc93cf..bc5a3d97 100644 --- a/ariba/tests/data/cluster_test_full_run_ok_presence_absence.metadata.tsv +++ b/ariba/tests/data/cluster_test_full_run_ok_presence_absence.metadata.tsv @@ -1,4 +1,4 @@ -presence_absence1 . . Generic description of presence_absence1 -presence_absence1 p R3S Ref and assembly have wild type -presence_absence1 p A10V Ref has wild, reads have variant so report -presence_absence1 p I5A Ref and reads have variant so report +presence_absence1 . . . Generic description of presence_absence1 +presence_absence1 p R3S . Ref and assembly have wild type +presence_absence1 p A10V . Ref has wild, reads have variant so report +presence_absence1 p I5A . Ref and reads have variant so report diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv index b0ee54de..7e193f69 100644 --- a/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv +++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv @@ -1,2 +1,2 @@ -variants_only1 . . Generic description of variants_only1 -variants_only1 p R3S Ref and assembly have wild type, but always report anyway +variants_only1 . . . Generic description of variants_only1 +variants_only1 p R3S . 
Ref and assembly have wild type, but always report anyway diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.metadata.tsv index c314c207..de14a1b3 100644 --- a/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.metadata.tsv +++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.metadata.tsv @@ -1,2 +1,2 @@ -variants_only1 . . Generic description of variants_only1 -variants_only1 p R3S Ref and assembly have wild type, so do not report +variants_only1 . . . Generic description of variants_only1 +variants_only1 p R3S . Ref and assembly have wild type, so do not report diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only.present.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_variants_only.present.metadata.tsv index f4b198da..621f2c90 100644 --- a/ariba/tests/data/cluster_test_full_run_ok_variants_only.present.metadata.tsv +++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only.present.metadata.tsv @@ -1,3 +1,3 @@ -variants_only1 . . Generic description of variants_only1 -variants_only1 p R3S Ref and assembly have wild type -variants_only1 p I5A Ref and reads have variant so report +variants_only1 . . . Generic description of variants_only1 +variants_only1 p R3S . Ref and assembly have wild type +variants_only1 p I5A . Ref and reads have variant so report diff --git a/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.tsv b/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.tsv index 88a5889e..07c89d5c 100644 --- a/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.tsv +++ b/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.tsv @@ -1,2 +1,2 @@ -variants_only1 p C2I description of variants_only1 C2I -presabs1 . . 
presabs1 description +variants_only1 p C2I . description of variants_only1 C2I +presabs1 . . . presabs1 description diff --git a/ariba/tests/data/reference_data_filter_bad_data_metadata.expected.tsv b/ariba/tests/data/reference_data_filter_bad_data_metadata.expected.tsv index 66fae14e..4f90cbf1 100644 --- a/ariba/tests/data/reference_data_filter_bad_data_metadata.expected.tsv +++ b/ariba/tests/data/reference_data_filter_bad_data_metadata.expected.tsv @@ -1,8 +1,9 @@ -non_coding_1 . . should be in output because this field is here -non_coding_1 n C5A dna variant ok -presence_absence_1 . . should be in output because this field is here -presence_absence_2 n T4G dna variant ok -presence_absence_3 p R3S amino acid variant ok -variants_only_1 . . should be kept as a generic description of variants_only_1 -variants_only_1 p S2T amino acid variant ok -variants_only_1 n T4A dna variant ok +non_coding_1 . . . non_coding_1 description1 +non_coding_1 . . . should be in output because this field is here +non_coding_1 n C5A id1 dna variant ok +presence_absence_1 . . . should be in output because this field is here +presence_absence_2 n T4G id2 dna variant ok +presence_absence_3 p R3S . amino acid variant ok +variants_only_1 . . . should be kept as a generic description of variants_only_1 +variants_only_1 p S2T . amino acid variant ok +variants_only_1 n T4A . dna variant ok diff --git a/ariba/tests/data/reference_data_filter_bad_data_metadata.in.tsv b/ariba/tests/data/reference_data_filter_bad_data_metadata.in.tsv index 4f590de0..031a8d7a 100644 --- a/ariba/tests/data/reference_data_filter_bad_data_metadata.in.tsv +++ b/ariba/tests/data/reference_data_filter_bad_data_metadata.in.tsv @@ -1,20 +1,20 @@ -non_coding_1 . . -non_coding_1 . . should be in output because this field is here -non_coding_1 p L2K should be removed because this is non-coding, but variant is protein -non_coding_1 n C5A dna variant ok -non_coding_not_in_fasta . . 
should be removed from tsv because not in fasta -presence_absence_1 . . -presence_absence_1 . . should be in output because this field is here -presence_absence_2 n T4G dna variant ok -presence_absence_2 n A4G dna variant not ok -presence_absence_3 p R3S amino acid variant ok -presence_absence_3 p I3S amino acid variant not ok -presence_absence_not_in_fasta . . should be removed from tsv because not in fasta -variants_only_1 n T4A dna variant ok -variants_only_1 n C4G dna variant not ok -variants_only_1 p S2T amino acid variant ok -variants_only_1 p I2L amin acid variant not ok -variants_only_1 . . should be kept as a generic description of variants_only_1 -variants_only_1 . . -variants_only_not_in_fasta . . should be removed from tsv because not in fasta -variants_only_no_good_variants n A4G dna variant not ok +non_coding_1 . . . non_coding_1 description1 +non_coding_1 . . . should be in output because this field is here +non_coding_1 p L2K . should be removed because this is non-coding, but variant is protein +non_coding_1 n C5A id1 dna variant ok +non_coding_not_in_fasta . . . should be removed from tsv because not in fasta +presence_absence_1 . . . . +presence_absence_1 . . . should be in output because this field is here +presence_absence_2 n T4G id2 dna variant ok +presence_absence_2 n A4G . dna variant not ok +presence_absence_3 p R3S . amino acid variant ok +presence_absence_3 p I3S . amino acid variant not ok +presence_absence_not_in_fasta . . . should be removed from tsv because not in fasta +variants_only_1 n T4A . dna variant ok +variants_only_1 n C4G . dna variant not ok +variants_only_1 p S2T . amino acid variant ok +variants_only_1 p I2L . amin acid variant not ok +variants_only_1 . . . should be kept as a generic description of variants_only_1 +variants_only_1 . . . . +variants_only_not_in_fasta . . . should be removed from tsv because not in fasta +variants_only_no_good_variants n A4G . 
dna variant not ok diff --git a/ariba/tests/data/reference_data_init.tsv b/ariba/tests/data/reference_data_init.tsv index 612b0774..1e8f1a60 100644 --- a/ariba/tests/data/reference_data_init.tsv +++ b/ariba/tests/data/reference_data_init.tsv @@ -1,4 +1,4 @@ -gene1 n A42G free text -gene1 n A42T free text2 -gene1 n G13T confers killer rabbit resistance -gene2 p I42L removes tardigrade's space-living capability +gene1 n A42G . free text +gene1 n A42T . free text2 +gene1 n G13T . confers killer rabbit resistance +gene2 p I42L . removes tardigrade's space-living capability diff --git a/ariba/tests/data/reference_data_load_metadata_tsv.tsv b/ariba/tests/data/reference_data_load_metadata_tsv.tsv index 8d151a62..3551863f 100644 --- a/ariba/tests/data/reference_data_load_metadata_tsv.tsv +++ b/ariba/tests/data/reference_data_load_metadata_tsv.tsv @@ -1,3 +1,3 @@ -gene1 n A42G free text -gene1 n G13T confers killer rabbit resistance -gene2 p I42L removes tardigrade's space-living capability +gene1 n A42G . free text +gene1 n G13T . confers killer rabbit resistance +gene2 p I42L . removes tardigrade's space-living capability diff --git a/ariba/tests/data/reference_data_rename_sequences_metadata.tsv b/ariba/tests/data/reference_data_rename_sequences_metadata.tsv index 1dd13740..6d43433f 100644 --- a/ariba/tests/data/reference_data_rename_sequences_metadata.tsv +++ b/ariba/tests/data/reference_data_rename_sequences_metadata.tsv @@ -1,11 +1,11 @@ -noncoding1 . . original name "noncoding1" -noncoding1 blah . . original name "noncoding1 blah" -pres_abs1 foo bar spam eggs . . original name "pres_abs1 foo bar spam eggs" -pres_abs1 blah . . original name "pres_abs1 blah" -pres'abs1 . . original name "pres'abs1" -pres_abs2 . . original name "pres_abs2" -pres!abs3 . . original name "pres!abs3" -var_only1 hello . . original name "var_only1 hello" -var:only1 boo . . original name "var:only1 boo" -var_only1 . . original name "var_only1" -var_only2 . . 
original name "var_only2" +noncoding1 . . . original name "noncoding1" +noncoding1 blah . . . original name "noncoding1 blah" +pres_abs1 foo bar spam eggs . . . original name "pres_abs1 foo bar spam eggs" +pres_abs1 blah . . . original name "pres_abs1 blah" +pres'abs1 . . . original name "pres'abs1" +pres_abs2 . . . original name "pres_abs2" +pres!abs3 . . . original name "pres!abs3" +var_only1 hello . . . original name "var_only1 hello" +var:only1 boo . . . original name "var:only1 boo" +var_only1 . . . original name "var_only1" +var_only2 . . . original name "var_only2" diff --git a/ariba/tests/data/reference_data_test_all_non_wild_type_variants.tsv b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.tsv index 76f865f3..6f1defa0 100644 --- a/ariba/tests/data/reference_data_test_all_non_wild_type_variants.tsv +++ b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.tsv @@ -1,12 +1,12 @@ -var_only_gene n A8T ref has wild type A -var_only_gene n G9C ref has variant C instead of G -var_only_gene p G4I ref has wild type F -var_only_gene p F6I ref has wild type F -var_only_gene p P3Q ref has wild type P -var_only_gene p I5V ref has variant V instead of I -presence_absence_gene n A4G ref has wild type A -presence_absence_gene n A6C ref has variant C instead of A -presence_absence_gene p N2I ref has wild type N -presence_absence_gene p A4G ref has variant G instead of A -non_coding n A2C ref has wild type A -non_coding n C4T ref has variant T instead of C +var_only_gene n A8T . ref has wild type A +var_only_gene n G9C . ref has variant C instead of G +var_only_gene p G4I . ref has wild type F +var_only_gene p F6I . ref has wild type F +var_only_gene p P3Q . ref has wild type P +var_only_gene p I5V . ref has variant V instead of I +presence_absence_gene n A4G . ref has wild type A +presence_absence_gene n A6C . ref has variant C instead of A +presence_absence_gene p N2I . ref has wild type N +presence_absence_gene p A4G . 
ref has variant G instead of A +non_coding n A2C . ref has wild type A +non_coding n C4T . ref has variant T instead of C diff --git a/ariba/tests/data/reference_data_write_metadata_tsv.expected.tsv b/ariba/tests/data/reference_data_write_metadata_tsv.expected.tsv index 0faf409d..7baa4023 100644 --- a/ariba/tests/data/reference_data_write_metadata_tsv.expected.tsv +++ b/ariba/tests/data/reference_data_write_metadata_tsv.expected.tsv @@ -1,2 +1,2 @@ -gene1 . . has anybody got a bottle of orange juice? -gene2 . . we didn't burn him +gene1 . . . has anybody got a bottle of orange juice? +gene2 . . . we didn't burn him diff --git a/ariba/tests/data/reference_data_write_metadata_tsv.tsv b/ariba/tests/data/reference_data_write_metadata_tsv.tsv index 93192808..4143803e 100644 --- a/ariba/tests/data/reference_data_write_metadata_tsv.tsv +++ b/ariba/tests/data/reference_data_write_metadata_tsv.tsv @@ -1,2 +1,2 @@ -gene2 . . we didn't burn him -gene1 . . has anybody got a bottle of orange juice? +gene2 . . . we didn't burn him +gene1 . . . has anybody got a bottle of orange juice? diff --git a/ariba/tests/data/report_filter_test_init_bad.tsv b/ariba/tests/data/report_filter_test_init_bad.tsv index f3dc87a5..f93b0f57 100644 --- a/ariba/tests/data/report_filter_test_init_bad.tsv +++ b/ariba/tests/data/report_filter_test_init_bad.tsv @@ -1,4 +1,4 @@ #ef_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 1 SNP n C42T 0 . . 42 42 C 142 142 C 500 . 500 Description_of_variant.C42T free_text -cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 1 SNP n A51G 0 . . 51 51 C 151 151 C 542 . 
542 Description_of_variant.A51G free_text2 -cluster2 variants_only 179 20000 cluster2 1042 1042 42.42 cluster2.scaffold.1 1442 1 SNP p I42L 1 I42L NONSYN 112 112 C 442 442 T 300 . 290 Description_of_variant.I42L free_text3 +cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 1 SNP n C42T 0 . . 42 42 C 142 142 C 500 . 500 a:n:C42T:id1:foo free_text +cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 1 SNP n A51G 0 . . 51 51 C 151 151 C 542 . 542 a:n:A51G:id2:bar free_text2 +cluster2 variants_only 179 20000 cluster2 1042 1042 42.42 cluster2.scaffold.1 1442 1 SNP p I42L 1 I42L NONSYN 112 112 C 442 442 T 300 . 290 a:n:I42L:id3:baz free_text3 diff --git a/ariba/tests/data/report_filter_test_init_good.tsv b/ariba/tests/data/report_filter_test_init_good.tsv index 5b3368fd..c98baf86 100644 --- a/ariba/tests/data/report_filter_test_init_good.tsv +++ b/ariba/tests/data/report_filter_test_init_good.tsv @@ -1,5 +1,5 @@ #ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 10.5 1 SNP n C42T 0 . . 42 42 C 142 142 C 500 . 500 Description_of_variant.C42T free_text -cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 10.5 1 SNP n A51G 0 . . 51 51 C 151 151 C 542 . 542 Description_of_variant.A51G free_text2 -cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.2 1300 12.4 1 SNP n A51G 0 . . 51 51 C 151 151 C 542 . 542 Description_of_variant.A51G free_text3 -cluster2 variants_only 179 20000 cluster2 1042 1042 42.42 cluster2.scaffold.1 1442 20.2 1 SNP p I42L 1 I42L NONSYN 112 112 C 442 442 T 300 . 
290 Description_of_variant.I42L free_text3 +cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 10.5 1 SNP n C42T 0 . . 42 42 C 142 142 C 500 . 500 a:n:C42T:id1:foo free_text +cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 10.5 1 SNP n A51G 0 . . 51 51 C 151 151 C 542 . 542 a:n:A51G:id2:bar free_text2 +cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.2 1300 12.4 1 SNP n A51G 0 . . 51 51 C 151 151 C 542 . 542 a:n:A51G:id3:spam free_text3 +cluster2 variants_only 179 20000 cluster2 1042 1042 42.42 cluster2.scaffold.1 1442 20.2 1 SNP p I42L 1 I42L NONSYN 112 112 C 442 442 T 300 . 290 a:v:I42L:id4:eggs free_text3 diff --git a/ariba/tests/data/report_filter_test_load_report_bad.tsv b/ariba/tests/data/report_filter_test_load_report_bad.tsv index f3dc87a5..553e60ff 100644 --- a/ariba/tests/data/report_filter_test_load_report_bad.tsv +++ b/ariba/tests/data/report_filter_test_load_report_bad.tsv @@ -1,4 +1,4 @@ #ef_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 1 SNP n C42T 0 . . 42 42 C 142 142 C 500 . 500 Description_of_variant.C42T free_text -cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 1 SNP n A51G 0 . . 51 51 C 151 151 C 542 . 542 Description_of_variant.A51G free_text2 -cluster2 variants_only 179 20000 cluster2 1042 1042 42.42 cluster2.scaffold.1 1442 1 SNP p I42L 1 I42L NONSYN 112 112 C 442 442 T 300 . 290 Description_of_variant.I42L free_text3 +cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 1 SNP n C42T 0 . . 42 42 C 142 142 C 500 . 
500 a:n:C42T:id1:foo free_text +cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 1 SNP n A51G 0 . . 51 51 C 151 151 C 542 . 542 a:n:A51G:id1:bar free_text2 +cluster2 variants_only 179 20000 cluster2 1042 1042 42.42 cluster2.scaffold.1 1442 1 SNP p I42L 1 I42L NONSYN 112 112 C 442 442 T 300 . 290 a:n:I42L:id1:foo free_text3 diff --git a/ariba/tests/data/report_filter_test_load_report_good.tsv b/ariba/tests/data/report_filter_test_load_report_good.tsv index 9a0afe0d..1165ea45 100644 --- a/ariba/tests/data/report_filter_test_load_report_good.tsv +++ b/ariba/tests/data/report_filter_test_load_report_good.tsv @@ -1,5 +1,5 @@ #ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 12.2 1 SNP n C42T 0 . . 42 42 C 142 142 C 500 . 500 Description_of_variant.C42T free_text -cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 12.2 1 SNP n A51G 0 . . 51 51 C 151 151 C 542 . 542 Description_of_variant.A51G free_text2 -cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.2 1300 22.2 1 SNP n A51G 0 . . 51 51 C 151 151 C 542 . 542 Description_of_variant.A51G free_text3 -cluster2 variants_only 179 20000 cluster2 1042 1042 42.42 cluster2.scaffold.1 1442 33.3 1 SNP p I42L 1 I42L NONSYN 112 112 C 442 442 T 300 . 290 Description_of_variant.I42L free_text3 +cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 12.2 1 SNP n C42T 0 . . 42 42 C 142 142 C 500 . 500 a:n:C42T:id1:foo free_text +cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 12.2 1 SNP n A51G 0 . . 51 51 C 151 151 C 542 . 
542 a:n:A51G:id2:bar free_text2 +cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.2 1300 22.2 1 SNP n A51G 0 . . 51 51 C 151 151 C 542 . 542 a:n:A51G:id3:spam free_text3 +cluster2 variants_only 179 20000 cluster2 1042 1042 42.42 cluster2.scaffold.1 1442 33.3 1 SNP p I42L 1 I42L NONSYN 112 112 C 442 442 T 300 . 290 a:v:I42L:id4:eggs free_text3 diff --git a/ariba/tests/data/report_filter_test_run.expected.tsv b/ariba/tests/data/report_filter_test_run.expected.tsv index 0a6c41e1..a35a0cf6 100644 --- a/ariba/tests/data/report_filter_test_run.expected.tsv +++ b/ariba/tests/data/report_filter_test_run.expected.tsv @@ -1,6 +1,6 @@ #ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 12.4 1 SNP n A51G 1 . . 51 51 C 151 151 C 542 . 542 Description_of_variant.A51G free_text2 -cluster2 variants_only 179 20000 cluster2 1042 1042 99.0 cluster2.scaffold.1 1442 13.5 1 SNP p I42L 1 I42L NONSYN 112 112 C 442 442 T 300 . 290 Description_of_variant.I42L free_text3 +cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 12.4 1 SNP n A51G 1 . . 51 51 C 151 151 C 542 . 542 a:n:A51G:id2:bar free_text2 +cluster2 variants_only 179 20000 cluster2 1042 1042 99.0 cluster2.scaffold.1 1442 13.5 1 SNP p I42L 1 I42L NONSYN 112 112 C 442 442 T 300 . 290 a:n:I42L:id3:baz free_text3 cluster4 variants_only 179 20000 cluster4 1042 1042 99.0 cluster4.scaffold.1 1442 14.6 . . . . . . . . . . . . . . . . . free_text3 cluster5 presence_absence 528 1874 cluster5 1188 1097 92.43 cluster5.scaffold.1 2218 20.0 0 . p . 0 E89G NONSYN 65 265 A;A 766 766 G;C 88;90 .;. 87;90 . .' 
cluster5 presence_absence 528 1874 cluster5 1188 1097 92.43 cluster5.scaffold.1 2218 20.0 0 . p . 0 Q37fs FSHIFT 109 109 A 634 634 . 67 . 67 . . diff --git a/ariba/tests/data/report_filter_test_run.in.tsv b/ariba/tests/data/report_filter_test_run.in.tsv index 59a81cfc..f701c3a3 100644 --- a/ariba/tests/data/report_filter_test_run.in.tsv +++ b/ariba/tests/data/report_filter_test_run.in.tsv @@ -1,9 +1,9 @@ #ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -cluster1 non_coding 27 10000 cluster1 1000 0 99.42 cluster1.scaffold.1 1300 12.4 1 SNP n C42T 0 . . 42 42 C 142 142 C 500 . 500 Description_of_variant.C42T free_text -cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 12.4 1 SNP n A51G 1 . . 51 51 C 151 151 C 542 . 542 Description_of_variant.A51G free_text2 -cluster2 variants_only 179 20000 cluster2 1042 1042 99.0 cluster2.scaffold.1 1442 13.5 1 SNP p I42L 1 I42L NONSYN 112 112 C 442 442 T 300 . 290 Description_of_variant.I42L free_text3 -cluster3 variants_only 179 20000 cluster3 1042 1042 89.0 cluster2.scaffold.1 1442 13.5 1 SNP p I42L 1 I42L NONSYN 112 112 C 442 442 T 300 . 290 Description_of_variant.I42L free_text3 -cluster4 variants_only 179 20000 cluster4 1042 1042 99.0 cluster4.scaffold.1 1442 14.6 1 SNP p I42L 1 I42L SYN 112 112 C 442 442 T 300 . 290 Description_of_variant.I42L free_text3 +cluster1 non_coding 27 10000 cluster1 1000 0 99.42 cluster1.scaffold.1 1300 12.4 1 SNP n C42T 0 . . 42 42 C 142 142 C 500 . 500 a:n:C42T:id1:foo free_text +cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 12.4 1 SNP n A51G 1 . . 51 51 C 151 151 C 542 . 
542 a:n:A51G:id2:bar free_text2 +cluster2 variants_only 179 20000 cluster2 1042 1042 99.0 cluster2.scaffold.1 1442 13.5 1 SNP p I42L 1 I42L NONSYN 112 112 C 442 442 T 300 . 290 a:n:I42L:id3:baz free_text3 +cluster3 variants_only 179 20000 cluster3 1042 1042 89.0 cluster2.scaffold.1 1442 13.5 1 SNP p I42L 1 I42L NONSYN 112 112 C 442 442 T 300 . 290 a:n:I42L:id4:spam free_text3 +cluster4 variants_only 179 20000 cluster4 1042 1042 99.0 cluster4.scaffold.1 1442 14.6 1 SNP p I42L 1 I42L SYN 112 112 C 442 442 T 300 . 290 a:n:I42L:id5:eggs free_text3 cluster5 presence_absence 528 1874 cluster5 1188 1097 92.43 cluster5.scaffold.1 2218 20.0 0 . p . 0 E89G NONSYN 65 265 A;A 766 766 G;C 88;90 .;. 87;90 . .' cluster5 presence_absence 528 1874 cluster5 1188 1097 92.43 cluster5.scaffold.1 2218 20.0 0 . p . 0 Q37fs FSHIFT 109 109 A 634 634 . 67 . 67 . . cluster5 presence_absence 528 1874 cluster5 1188 1097 92.43 cluster5.scaffold.1 2218 20.0 0 . p . 0 E89G NONSYN 265 265 A;A 766 766 G;C 88;90 .;. 87;90 . . diff --git a/ariba/tests/data/report_filter_test_write_report.tsv b/ariba/tests/data/report_filter_test_write_report.tsv index e159675a..11b3ab4f 100644 --- a/ariba/tests/data/report_filter_test_write_report.tsv +++ b/ariba/tests/data/report_filter_test_write_report.tsv @@ -1,4 +1,4 @@ #ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 42.4 1 SNP n C42T 0 . . 42 42 C 142 142 C 500 . 500 Description_of_variant.C42T free_text -cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 42.4 1 SNP n A51G 0 . . 51 51 C 151 151 C 542 . 
542 Description_of_variant.A51G free_text2 -cluster2 variants_only 179 20000 cluster2 1042 1042 42.42 cluster2.scaffold.1 1442 42.4 1 SNP p I42L 1 I42L NONSYN 112 112 C 442 442 T 300 . 290 Description_of_variant.I42L free_text3 +cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 42.4 1 SNP n C42T 0 . . 42 42 C 142 142 C 500 . 500 a:n:C42T:id1:foo free_text +cluster1 non_coding 27 10000 cluster1 1000 999 99.42 cluster1.scaffold.1 1300 42.4 1 SNP n A51G 0 . . 51 51 C 151 151 C 542 . 542 a:n:A51G:id2:bar free_text2 +cluster2 variants_only 179 20000 cluster2 1042 1042 42.42 cluster2.scaffold.1 1442 42.4 1 SNP p I42L 1 I42L NONSYN 112 112 C 442 442 T 300 . 290 a:v:I42L:id3:baz free_text3 diff --git a/ariba/tests/data/summary_sample_test_column_names_tuples.tsv b/ariba/tests/data/summary_sample_test_column_names_tuples.tsv index 5e5926c6..4684c424 100644 --- a/ariba/tests/data/summary_sample_test_column_names_tuples.tsv +++ b/ariba/tests/data/summary_sample_test_column_names_tuples.tsv @@ -1,7 +1,8 @@ #ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1_n_A14T_N_ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1_n_A6G_N_variant in ref and reads so should report generic description of noncoding1 +noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 
17 noncoding1:n:A14T:.:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 0 SNP . . . G15T SNP 15 15 G 85 85 T 17 . 17 . generic description of noncoding1 +noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding2.scaffold.1 279 35.4 . . . . . . . . . . . . . . . . . generic description of noncoding2 -presence_absence1 presence_absence 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1_p_A10V_N_Ref has wild, reads have variant so report Generic description of presence_absence1 +presence_absence1 presence_absence 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 presence_absence1 presence_absence 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.2 267 35.1 . . . . . . . . . . . . . . . . . Generic description of presence_absence2 -variants_only1 variants_only 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 12;13;13 variants_only1_p_S5T_N_Ref and reads have variant so report Generic description of variants_only1 +variants_only1 variants_only 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 
12;13;13 variants_only1:p:S5T:.:Ref and reads have variant so report Generic description of variants_only1 diff --git a/ariba/tests/data/summary_sample_test_column_summary_data.tsv b/ariba/tests/data/summary_sample_test_column_summary_data.tsv index 733e0963..cf7e5b98 100644 --- a/ariba/tests/data/summary_sample_test_column_summary_data.tsv +++ b/ariba/tests/data/summary_sample_test_column_summary_data.tsv @@ -1,7 +1,8 @@ #ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1_n_A14T_N_ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1_n_A6G_N_variant in ref and reads so should report generic description of noncoding1 +noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1_n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 0 SNP . . . G15T SNP 15 15 G 85 85 T 17 . 17 . generic description of noncoding1 +noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1_n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding2.scaffold.1 279 35.4 . . . . . . . . . . . . . . . . . 
generic description of noncoding2 -presence_absence1 presence_absence 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1_p_A10V_N_Ref has wild, reads have variant so report Generic description of presence_absence1 +presence_absence1 presence_absence 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 presence_absence1 presence_absence 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.2 267 35.1 . . . . . . . . . . . . . . . . . Generic description of presence_absence2 -variants_only1 variants_only 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 12;13;13 variants_only1_p_S5T_N_Ref and reads have variant so report Generic description of variants_only1 +variants_only1 variants_only 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 12;13;13 variants_only1:p:S5T:id4:Ref and reads have variant so report Generic description of variants_only1 diff --git a/ariba/tests/data/summary_sample_test_load_file.in.tsv b/ariba/tests/data/summary_sample_test_load_file.in.tsv index 733e0963..524d3347 100644 --- a/ariba/tests/data/summary_sample_test_load_file.in.tsv +++ b/ariba/tests/data/summary_sample_test_load_file.in.tsv @@ -1,7 +1,7 @@ #ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 
17 noncoding1_n_A14T_N_ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1_n_A6G_N_variant in ref and reads so should report generic description of noncoding1 +noncoding1 non:coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 non:coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding2.scaffold.1 279 35.4 . . . . . . . . . . . . . . . . . generic description of noncoding2 -presence_absence1 presence_absence 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1_p_A10V_N_Ref has wild, reads have variant so report Generic description of presence_absence1 +presence_absence1 presence_absence 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 presence_absence1 presence_absence 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.2 267 35.1 . . . . . . . . . . . . . . . . . Generic description of presence_absence2 -variants_only1 variants_only 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 
12;13;13 variants_only1_p_S5T_N_Ref and reads have variant so report Generic description of variants_only1 +variants_only1 variants_only 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 12;13;13 variants_only1:p:S5T:id4:Ref and reads have variant so report Generic description of variants_only1 diff --git a/ariba/tests/data/summary_sample_test_non_synon_variants.tsv b/ariba/tests/data/summary_sample_test_non_synon_variants.tsv index 733e0963..b8f5753d 100644 --- a/ariba/tests/data/summary_sample_test_non_synon_variants.tsv +++ b/ariba/tests/data/summary_sample_test_non_synon_variants.tsv @@ -1,7 +1,8 @@ #ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1_n_A14T_N_ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1_n_A6G_N_variant in ref and reads so should report generic description of noncoding1 +noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 0 SNP . . . G15T SNP 15 15 G 85 85 T 17 . 17 . generic description of noncoding1 +noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 
18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding2.scaffold.1 279 35.4 . . . . . . . . . . . . . . . . . generic description of noncoding2 -presence_absence1 presence_absence 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1_p_A10V_N_Ref has wild, reads have variant so report Generic description of presence_absence1 +presence_absence1 presence_absence 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 presence_absence1 presence_absence 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.2 267 35.1 . . . . . . . . . . . . . . . . . Generic description of presence_absence2 -variants_only1 variants_only 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 12;13;13 variants_only1_p_S5T_N_Ref and reads have variant so report Generic description of variants_only1 +variants_only1 variants_only 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 
12;13;13 variants_only1:p:S5T:id4:Ref and reads have variant so report Generic description of variants_only1 diff --git a/ariba/tests/data/summary_sample_test_var_groups.tsv b/ariba/tests/data/summary_sample_test_var_groups.tsv new file mode 100644 index 00000000..056296ab --- /dev/null +++ b/ariba/tests/data/summary_sample_test_var_groups.tsv @@ -0,0 +1,7 @@ +#ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text +noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 +noncoding1 non_coding 19 78 cluster.n 120 120 98.33 noncoding2.scaffold.1 279 35.4 . . . . . . . . . . . . . . . . . generic description of noncoding2 +presence_absence1 presence_absence 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 +presence_absence1 presence_absence 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.2 267 35.1 . . . . . . . . . . . . . . . . . Generic description of presence_absence2 +variants_only1 variants_only 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 
12;13;13 variants_only1:p:S5T:id4:Ref and reads have variant so report Generic description of variants_only1 diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv index f88dd14e..6ec23eca 100644 --- a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv +++ b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv @@ -1,3 +1,3 @@ #ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 non_coding 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1_n_A14T_N_ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 presence_absence 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1_p_A10V_N_Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 non_coding 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence1 presence_absence 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 
29 presence_absence1:p:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv index 3e322baa..322f9656 100644 --- a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv +++ b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv @@ -1,5 +1,5 @@ #ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 non_coding 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1_n_A14T_N_ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 non_coding 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1_n_A6G_N_variant in ref and reads so should report generic description of noncoding1 -presence_absence1 presence_absence 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1_p_A10V_N_Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 non_coding 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 non_coding 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 
18 noncoding1:n:A6G:id3:variant in ref and reads so should report generic description of noncoding1 +presence_absence1 presence_absence 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1 variants_only1 variants_only 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . . diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv index 5848d5d7..2d068427 100644 --- a/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv +++ b/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv @@ -1,3 +1,3 @@ #ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 non_coding 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1_n_A14T_N_ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 presence_absence 27 88 cluster.p.1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1_p_A10V_N_Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 non_coding 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence1 presence_absence 27 88 cluster.p.1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 
29 presence_absence1:p:A10V:id2:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv index a80d7582..0058b231 100644 --- a/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv +++ b/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv @@ -1,5 +1,5 @@ #ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 non_coding 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1_n_A14T_N_ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 non_coding 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1_n_A6G_N_variant in ref and reads so should report generic description of noncoding1 -presence_absence1 presence_absence 27 88 cluster.p.2 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1_p_A10V_N_Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 non_coding 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 non_coding 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 
18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 +presence_absence1 presence_absence 27 88 cluster.p.2 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 variants_only1 variants_only 64 12 cluster.v.1 90 . . . . . . . . . . . . . . . . . . . . . . . diff --git a/ariba/tests/data/summary_test_get_all_var_groups.1.tsv b/ariba/tests/data/summary_test_get_all_var_groups.1.tsv new file mode 100644 index 00000000..e3465e4e --- /dev/null +++ b/ariba/tests/data/summary_test_get_all_var_groups.1.tsv @@ -0,0 +1,3 @@ +#ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text +noncoding1 non_coding 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence1 presence_absence 27 88 cluster.p.1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 
29 presence_absence1:p:A10V:id4:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_get_all_var_groups.2.tsv b/ariba/tests/data/summary_test_get_all_var_groups.2.tsv new file mode 100644 index 00000000..0058b231 --- /dev/null +++ b/ariba/tests/data/summary_test_get_all_var_groups.2.tsv @@ -0,0 +1,5 @@ +#ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text +noncoding1 non_coding 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 non_coding 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 +presence_absence1 presence_absence 27 88 cluster.p.2 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 +variants_only1 variants_only 64 12 cluster.v.1 90 . . . . . . . . . . . . . . . . . . . . . . . 
diff --git a/ariba/tests/data/summary_test_load_input_files.1.tsv b/ariba/tests/data/summary_test_load_input_files.1.tsv index ffee4cdb..e1bc25f8 100644 --- a/ariba/tests/data/summary_test_load_input_files.1.tsv +++ b/ariba/tests/data/summary_test_load_input_files.1.tsv @@ -1,3 +1,3 @@ #ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 non_coding 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1_n_A14T_N_ref has wild type, reads have variant so should report generic description of noncoding1 -presence_absence1 presence_absence 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1_p_A10V_N_Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 non_coding 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +presence_absence1 presence_absence 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 
29 presence_absence1:p:A10V:id2:Ref has wild, reads have variant so report Generic description of presence_absence1 diff --git a/ariba/tests/data/summary_test_load_input_files.2.tsv b/ariba/tests/data/summary_test_load_input_files.2.tsv index b4dcb0e8..ff47b223 100644 --- a/ariba/tests/data/summary_test_load_input_files.2.tsv +++ b/ariba/tests/data/summary_test_load_input_files.2.tsv @@ -1,5 +1,5 @@ #ref_name ref_type flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text -noncoding1 non_coding 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1_n_A14T_N_ref has wild type, reads have variant so should report generic description of noncoding1 -noncoding1 non_coding 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1_n_A6G_N_variant in ref and reads so should report generic description of noncoding1 -presence_absence1 presence_absence 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1_p_A10V_N_Ref has wild, reads have variant so report Generic description of presence_absence1 +noncoding1 non_coding 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1 +noncoding1 non_coding 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 
18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1 +presence_absence1 presence_absence 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1 variants_only1 variants_only 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . . diff --git a/ariba/tests/data/vfdb_parser_test_run.out.tsv b/ariba/tests/data/vfdb_parser_test_run.out.tsv index 83652b41..242514cb 100644 --- a/ariba/tests/data/vfdb_parser_test_run.out.tsv +++ b/ariba/tests/data/vfdb_parser_test_run.out.tsv @@ -1,2 +1,2 @@ -abcD.VF123(gi:1234).genus1_species1 . . foobar description1 [abc] -efgH.VF234(gi:2345).genus2_species2 . . spam eggs description2 [abc] +abcD.VF123(gi:1234).genus1_species1 . . . foobar description1 [abc] +efgH.VF234(gi:2345).genus2_species2 . . . spam eggs description2 [abc] diff --git a/ariba/tests/reference_data_test.py b/ariba/tests/reference_data_test.py index f01a7ccd..cb64077f 100644 --- a/ariba/tests/reference_data_test.py +++ b/ariba/tests/reference_data_test.py @@ -30,10 +30,10 @@ def test_init_ok(self): '''Test init with good input''' tsv_file = os.path.join(data_dir, 'reference_data_init.tsv') presence_absence_fa = os.path.join(data_dir, 'reference_data_init_presence_absence.fa') - meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\tfree text') - meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tA42T\tfree text2') - meta3 = sequence_metadata.SequenceMetadata('gene1\tn\tG13T\tconfers killer rabbit resistance') - meta4 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\tremoves tardigrade's space-living capability") + meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\t.\tfree text') + meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tA42T\t.\tfree text2') + meta3 = 
sequence_metadata.SequenceMetadata('gene1\tn\tG13T\t.\tconfers killer rabbit resistance') + meta4 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\t.\tremoves tardigrade's space-living capability") expected_metadata = { 'gene1': { @@ -83,9 +83,9 @@ def test_get_filename(self): def test_load_metadata_tsv(self): '''Test _load_metadata_tsv''' - meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\tfree text') - meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tG13T\tconfers killer rabbit resistance') - meta3 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\tremoves tardigrade's space-living capability") + meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\t.\tfree text') + meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tG13T\t.\tconfers killer rabbit resistance') + meta3 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\t.\tremoves tardigrade's space-living capability") expected = { 'gene1': { 'n': {12: {meta2}, 41: {meta1}}, @@ -284,13 +284,13 @@ def test_rename_names_in_seq_dicts(self): def test_rename_metadata_set(self): '''Test _rename_metadata_set''' metaset = { - sequence_metadata.SequenceMetadata('foo 1\t.\t.\tdescription'), - sequence_metadata.SequenceMetadata('foo 1\tp\tI42L\tspam eggs') + sequence_metadata.SequenceMetadata('foo 1\t.\t.\t.\tdescription'), + sequence_metadata.SequenceMetadata('foo 1\tp\tI42L\t.\tspam eggs') } expected = { - sequence_metadata.SequenceMetadata('new_name\t.\t.\tdescription'), - sequence_metadata.SequenceMetadata('new_name\tp\tI42L\tspam eggs') + sequence_metadata.SequenceMetadata('new_name\t.\t.\t.\tdescription'), + sequence_metadata.SequenceMetadata('new_name\tp\tI42L\t.\tspam eggs') } got = reference_data.ReferenceData._rename_metadata_set(metaset, 'new_name') self.assertEqual(expected, got) @@ -298,15 +298,15 @@ def test_rename_metadata_set(self): def test_rename_names_in_metadata(self): '''Test _rename_names_in_metadata''' - meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\tfree 
text') - meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tA42T\tfree text2') - meta3 = sequence_metadata.SequenceMetadata('gene1\t.\t.\tfree text3') - meta4 = sequence_metadata.SequenceMetadata('gene1\tn\tG13T\tconfers killer rabbit resistance') - meta5 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\tremoves tardigrade's space-living capability") - meta1rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tA42G\tfree text') - meta2rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tA42T\tfree text2') - meta3rename = sequence_metadata.SequenceMetadata('new_gene1\t.\t.\tfree text3') - meta4rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tG13T\tconfers killer rabbit resistance') + meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\t.\tfree text') + meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tA42T\t.\tfree text2') + meta3 = sequence_metadata.SequenceMetadata('gene1\t.\t.\t.\tfree text3') + meta4 = sequence_metadata.SequenceMetadata('gene1\tn\tG13T\t.\tconfers killer rabbit resistance') + meta5 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\t.\tremoves tardigrade's space-living capability") + meta1rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tA42G\t.\tfree text') + meta2rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tA42T\t.\tfree text2') + meta3rename = sequence_metadata.SequenceMetadata('new_gene1\t.\t.\t.\tfree text3') + meta4rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tG13T\t.\tconfers killer rabbit resistance') metadata = { 'gene1': { @@ -357,17 +357,17 @@ def test_rename_sequences(self): self.assertTrue(filecmp.cmp(expected_file, tmp_out, shallow=False)) os.unlink(tmp_out) - meta1 = sequence_metadata.SequenceMetadata('noncoding1\t.\t.\toriginal name "noncoding1"') - meta2 = sequence_metadata.SequenceMetadata('noncoding1_1\t.\t.\toriginal name "noncoding1 blah"') - meta3 = sequence_metadata.SequenceMetadata('pres_abs1_2\t.\t.\toriginal name "pres_abs1 foo bar spam eggs"') 
- meta4 = sequence_metadata.SequenceMetadata('pres_abs1_1\t.\t.\toriginal name "pres_abs1 blah"') - meta5 = sequence_metadata.SequenceMetadata('pres_abs1\t.\t.\toriginal name "pres\'abs1"') - meta6 = sequence_metadata.SequenceMetadata('pres_abs2\t.\t.\toriginal name "pres_abs2"') - meta7 = sequence_metadata.SequenceMetadata('pres_abs3\t.\t.\toriginal name "pres!abs3"') - meta8 = sequence_metadata.SequenceMetadata('var_only1_2\t.\t.\toriginal name "var_only1 hello"') - meta9 = sequence_metadata.SequenceMetadata('var_only1\t.\t.\toriginal name "var:only1 boo"') - meta10 = sequence_metadata.SequenceMetadata('var_only1_1\t.\t.\toriginal name "var_only1"') - meta11 = sequence_metadata.SequenceMetadata('var_only2\t.\t.\toriginal name "var_only2"') + meta1 = sequence_metadata.SequenceMetadata('noncoding1\t.\t.\t.\toriginal name "noncoding1"') + meta2 = sequence_metadata.SequenceMetadata('noncoding1_1\t.\t.\t.\toriginal name "noncoding1 blah"') + meta3 = sequence_metadata.SequenceMetadata('pres_abs1_2\t.\t.\t.\toriginal name "pres_abs1 foo bar spam eggs"') + meta4 = sequence_metadata.SequenceMetadata('pres_abs1_1\t.\t.\t.\toriginal name "pres_abs1 blah"') + meta5 = sequence_metadata.SequenceMetadata('pres_abs1\t.\t.\t.\toriginal name "pres\'abs1"') + meta6 = sequence_metadata.SequenceMetadata('pres_abs2\t.\t.\t.\toriginal name "pres_abs2"') + meta7 = sequence_metadata.SequenceMetadata('pres_abs3\t.\t.\t.\toriginal name "pres!abs3"') + meta8 = sequence_metadata.SequenceMetadata('var_only1_2\t.\t.\t.\toriginal name "var_only1 hello"') + meta9 = sequence_metadata.SequenceMetadata('var_only1\t.\t.\t.\toriginal name "var:only1 boo"') + meta10 = sequence_metadata.SequenceMetadata('var_only1_1\t.\t.\t.\toriginal name "var_only1"') + meta11 = sequence_metadata.SequenceMetadata('var_only2\t.\t.\t.\toriginal name "var_only2"') expected_meta = { 'noncoding1': {'n': {}, 'p': {}, '.': {meta1}}, @@ -476,18 +476,18 @@ def test_all_non_wild_type_variants(self): metadata_tsv=tsv_file ) - 
v1 = sequence_metadata.SequenceMetadata('var_only_gene\tn\tA8T\tref has wild type A') - v2 = sequence_metadata.SequenceMetadata('var_only_gene\tn\tG9C\tref has variant C instead of G') - v3 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tP3Q\tref has wild type P') - v4 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tG4I\tref has wild type F') - v5 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tI5V\tref has variant V instead of I') - v6 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tF6I\tref has wild type F') - p1 = sequence_metadata.SequenceMetadata('presence_absence_gene\tn\tA4G\tref has wild type A') - p2 = sequence_metadata.SequenceMetadata('presence_absence_gene\tn\tA6C\tref has variant C instead of A') - p3 = sequence_metadata.SequenceMetadata('presence_absence_gene\tp\tN2I\tref has wild type N') - p4 = sequence_metadata.SequenceMetadata('presence_absence_gene\tp\tA4G\tref has variant G instead of A') - n1 = sequence_metadata.SequenceMetadata('non_coding\tn\tA2C\tref has wild type A') - n2 = sequence_metadata.SequenceMetadata('non_coding\tn\tC4T\tref has variant T instead of C') + v1 = sequence_metadata.SequenceMetadata('var_only_gene\tn\tA8T\t.\tref has wild type A') + v2 = sequence_metadata.SequenceMetadata('var_only_gene\tn\tG9C\t.\tref has variant C instead of G') + v3 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tP3Q\t.\tref has wild type P') + v4 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tG4I\t.\tref has wild type F') + v5 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tI5V\t.\tref has variant V instead of I') + v6 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tF6I\t.\tref has wild type F') + p1 = sequence_metadata.SequenceMetadata('presence_absence_gene\tn\tA4G\t.\tref has wild type A') + p2 = sequence_metadata.SequenceMetadata('presence_absence_gene\tn\tA6C\t.\tref has variant C instead of A') + p3 = sequence_metadata.SequenceMetadata('presence_absence_gene\tp\tN2I\t.\tref has 
wild type N') + p4 = sequence_metadata.SequenceMetadata('presence_absence_gene\tp\tA4G\t.\tref has variant G instead of A') + n1 = sequence_metadata.SequenceMetadata('non_coding\tn\tA2C\t.\tref has wild type A') + n2 = sequence_metadata.SequenceMetadata('non_coding\tn\tC4T\t.\tref has variant T instead of C') var_only_expected = { 'n': {7: {v1}, 8: {v2}}, diff --git a/ariba/tests/report_filter_test.py b/ariba/tests/report_filter_test.py index 5c8919b4..7725de67 100644 --- a/ariba/tests/report_filter_test.py +++ b/ariba/tests/report_filter_test.py @@ -13,10 +13,10 @@ def test_init_good_file(self): '''test __init__ on good input file''' infile = os.path.join(data_dir, 'report_filter_test_init_good.tsv') rf = report_filter.ReportFilter(infile=infile) - line1 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '10.5', '1', 'SNP', 'n', 'C42T', '0', '.', '.', '42', '42', 'C', '142', '142', 'C', '500', '.', '500', 'Description_of_variant.C42T', 'free_text']) - line2 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '10.5', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'Description_of_variant.A51G', 'free_text2']) - line3 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.2', '1300', '12.4', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'Description_of_variant.A51G', 'free_text3']) - line4 = '\t'.join(['cluster2', 'variants_only', '179', '20000', 'cluster2', '1042', '1042', '42.42', 'cluster2.scaffold.1', '1442', '20.2', '1', 'SNP', 'p', 'I42L', '1', 'I42L', 'NONSYN', '112', '112', 'C', '442', '442', 'T', '300', '.', '290', 'Description_of_variant.I42L', 'free_text3']) + line1 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 
'cluster1.scaffold.1', '1300', '10.5', '1', 'SNP', 'n', 'C42T', '0', '.', '.', '42', '42', 'C', '142', '142', 'C', '500', '.', '500', 'a:n:C42T:id1:foo', 'free_text']) + line2 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '10.5', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'a:n:A51G:id2:bar', 'free_text2']) + line3 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.2', '1300', '12.4', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'a:n:A51G:id3:spam', 'free_text3']) + line4 = '\t'.join(['cluster2', 'variants_only', '179', '20000', 'cluster2', '1042', '1042', '42.42', 'cluster2.scaffold.1', '1442', '20.2', '1', 'SNP', 'p', 'I42L', '1', 'I42L', 'NONSYN', '112', '112', 'C', '442', '442', 'T', '300', '.', '290', 'a:v:I42L:id4:eggs', 'free_text3']) expected = { 'cluster1': { @@ -39,7 +39,7 @@ def test_init_bad_file(self): def test_report_line_to_dict(self): - line = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t99.42\tcluster1.scaffold.1\t999\t23.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text' + line = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t99.42\tcluster1.scaffold.1\t999\t23.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\ta:n:C42T:id1:foo\tfree text' expected = { 'ref_name': 'cluster1', 'ref_type': 'non_coding', @@ -68,7 +68,7 @@ def test_report_line_to_dict(self): 'smtls_total_depth': '500', 'smtls_alt_nt': '.', 'smtls_alt_depth': '500', - 'var_description': 'Description_of_variant C42T', + 'var_description': 'a:n:C42T:id1:foo', 'free_text': 'free text', } @@ -108,11 +108,11 @@ def test_dict_to_report_line(self): 'smtls_total_depth': '500', 'smtls_alt_nt': '.', 'smtls_alt_depth': '500', - 'var_description': 'Description_of_variant C42T', + 
'var_description': 'a:n:C42T:id1:foo', 'free_text': 'free text', } - expected = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t99.42\tcluster1.scaffold.1\t1300\t42.4\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text' + expected = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t99.42\tcluster1.scaffold.1\t1300\t42.4\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\ta:n:C42T:id1:foo\tfree text' self.assertEqual(expected, report_filter.ReportFilter._dict_to_report_line(report_dict)) @@ -120,10 +120,10 @@ def test_load_report(self): good_infile = os.path.join(data_dir, 'report_filter_test_load_report_good.tsv') bad_infile = os.path.join(data_dir, 'report_filter_test_load_report_bad.tsv') - line1 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '12.2', '1', 'SNP', 'n', 'C42T', '0', '.', '.', '42', '42', 'C', '142', '142', 'C', '500', '.', '500', 'Description_of_variant.C42T', 'free_text']) - line2 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '12.2', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'Description_of_variant.A51G', 'free_text2']) - line3 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.2', '1300', '22.2', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'Description_of_variant.A51G', 'free_text3']) - line4 = '\t'.join(['cluster2', 'variants_only', '179', '20000', 'cluster2', '1042', '1042', '42.42', 'cluster2.scaffold.1', '1442', '33.3', '1', 'SNP', 'p', 'I42L', '1', 'I42L', 'NONSYN', '112', '112', 'C', '442', '442', 'T', '300', '.', '290', 'Description_of_variant.I42L', 'free_text3']) + line1 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 
'cluster1.scaffold.1', '1300', '12.2', '1', 'SNP', 'n', 'C42T', '0', '.', '.', '42', '42', 'C', '142', '142', 'C', '500', '.', '500', 'a:n:C42T:id1:foo', 'free_text']) + line2 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '12.2', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'a:n:A51G:id2:bar', 'free_text2']) + line3 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.2', '1300', '22.2', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'a:n:A51G:id3:spam', 'free_text3']) + line4 = '\t'.join(['cluster2', 'variants_only', '179', '20000', 'cluster2', '1042', '1042', '42.42', 'cluster2.scaffold.1', '1442', '33.3', '1', 'SNP', 'p', 'I42L', '1', 'I42L', 'NONSYN', '112', '112', 'C', '442', '442', 'T', '300', '.', '290', 'a:v:I42L:id4:eggs', 'free_text3']) expected = { 'cluster1': { diff --git a/ariba/tests/sequence_metadata_test.py b/ariba/tests/sequence_metadata_test.py index aad7eddb..ed4fe473 100644 --- a/ariba/tests/sequence_metadata_test.py +++ b/ariba/tests/sequence_metadata_test.py @@ -13,34 +13,42 @@ def test_init_fails_on_bad_lines(self): lines = [ 'only one column. 
There can NOT be only one\n', 'two\tcolumns is not enough\n', - 'five\tcolumns\tis\ttoo\tmany\n', + 'three\tcolumns\tis still not enough\n', + 'four\tcolumns\tis\tis also not enough\n', + 'six\tcolumns\tis\tone\ttoo\tmany\n', ] for line in lines: with self.assertRaises(sequence_metadata.Error): sequence_metadata.SequenceMetadata(line) - with self.assertRaises(sequence_variant.Error): - sequence_metadata.SequenceMetadata('gene\tx\tI42L\n') + lines = [ + 'gene\tx\tI42L\tid\tfoo\n', + ] + + for line in lines: + with self.assertRaises(sequence_variant.Error): + sequence_metadata.SequenceMetadata(line) def test_init_on_good_input(self): '''test init ok on good input''' - data = sequence_metadata.SequenceMetadata('gene\tn\tI42L\tspam spam wonderful spam') + data = sequence_metadata.SequenceMetadata('gene\tn\tI42L\tid\tspam spam wonderful spam') self.assertEqual(data.name, 'gene') self.assertEqual(data.variant_type, 'n') self.assertEqual(data.variant.wild_value, 'I') self.assertEqual(data.variant.variant_value, 'L') + self.assertEqual(data.variant.identifier, 'id') self.assertEqual(data.free_text, 'spam spam wonderful spam') def test_str(self): '''test __str__''' lines = [ - 'gene1\tn\tA42G\tspam', - 'gene2\t.\t.', - 'gene3\t.\t.\teggs', - 'gene4\tp\tI42K\tthis mutation kills tardigrades', + 'gene1\tn\tA42G\tid1\tspam', + 'gene2\t.\t.\t.\t.', + 'gene3\t.\t.\t.\teggs', + 'gene4\tp\tI42K\tid\tthis mutation kills tardigrades', ] for line in lines: @@ -50,11 +58,11 @@ def test_str(self): def test_has_variant(self): '''test has_variant''' tests = [ - ('gene1\t.\t.', False), - ('gene1\tn\tA2T', True), - ('gene1\tn\tT2A', False), - ('gene1\tp\tI2Y', True), - ('gene1\tp\tY2I', False), + ('gene1\t.\t.\t.\t.', False), + ('gene1\tn\tA2T\t.\t.', True), + ('gene1\tn\tT2A\t.\t.', False), + ('gene1\tp\tI2Y\t.\t.', True), + ('gene1\tp\tY2I\t.\t.', False), ] seq = pyfastaq.sequences.Fasta('name', 'ATGTATTGCTGA') # translation: MYC* @@ -62,3 +70,20 @@ def test_has_variant(self): for line, 
expected in tests: metadata = sequence_metadata.SequenceMetadata(line) self.assertEqual(expected, metadata.has_variant(seq)) + + + def test_to_string(self): + '''test to_string''' + lines = [ + ('gene1', 'n', 'A42G', 'id1', 'spam'), + ('gene2', '.', '.', '.', '.'), + ('gene3', '.', '.', '.', 'eggs'), + ('gene4', 'p', 'I42K', 'id', 'this mutation kills tardigrades'), + ] + + for line in lines: + m = sequence_metadata.SequenceMetadata('\t'.join(line)) + for separator in ('_', '\t'): + expected = separator.join(line) + self.assertEqual(expected, m.to_string(separator=separator)) + diff --git a/ariba/tests/sequence_variant_test.py b/ariba/tests/sequence_variant_test.py index 49a55c53..1c199156 100644 --- a/ariba/tests/sequence_variant_test.py +++ b/ariba/tests/sequence_variant_test.py @@ -21,18 +21,22 @@ def test_init_fails_on_bad_variant_strings(self): for var in bad_variants: with self.assertRaises(sequence_variant.Error): - v = sequence_variant.Variant('p', var) + v = sequence_variant.Variant('p', var, '.') def test_init_ok(self): '''Test init ok''' - variants = ['I42K', 'i42k', 'I42k', 'i42K'] + variants = [('I42K', '.'), ('i42k', 'id1'), ('I42k', 'id2'), ('i42K', 'id3')] - for var in variants: - aa_var = sequence_variant.Variant('p', var) + for var, identifier in variants: + aa_var = sequence_variant.Variant('p', var, identifier) self.assertEqual(41, aa_var.position) self.assertEqual('I', aa_var.wild_value) self.assertEqual('K', aa_var.variant_value) + if identifier == '.': + self.assertIsNone(aa_var.identifier) + else: + self.assertEqual(identifier, aa_var.identifier) def test_init_str(self): @@ -41,7 +45,7 @@ def test_init_str(self): expected = 'I42K' for var in variants: - self.assertEqual(expected, str(sequence_variant.Variant('p', var))) + self.assertEqual(expected, str(sequence_variant.Variant('p', var, '.'))) def test_sanity_check_against_seq_no_translate(self): @@ -55,7 +59,7 @@ def test_sanity_check_against_seq_no_translate(self): ] for var, expected in 
tests: - variant = sequence_variant.Variant('p', var) + variant = sequence_variant.Variant('p', var, '.') self.assertEqual(expected, variant.sanity_check_against_seq(seq)) @@ -70,7 +74,7 @@ def test_sanity_check_against_seq_translate(self): ] for var, expected in tests: - variant = sequence_variant.Variant('p', var) + variant = sequence_variant.Variant('p', var, '.') self.assertEqual(expected, variant.sanity_check_against_seq(seq, translate_seq=True)) @@ -78,10 +82,10 @@ def test_has_variant(self): '''test has_variant''' seq = pyfastaq.sequences.Fasta('name', 'ATGTATTGCTGA') # translation: MYC* tests = [ - (sequence_variant.Variant('n', 'A2T'), True), - (sequence_variant.Variant('n', 'T2A'), False), - (sequence_variant.Variant('p', 'I2Y'), True), - (sequence_variant.Variant('p', 'Y2I'), False), + (sequence_variant.Variant('n', 'A2T', '.'), True), + (sequence_variant.Variant('n', 'T2A', '.'), False), + (sequence_variant.Variant('p', 'I2Y', '.'), True), + (sequence_variant.Variant('p', 'Y2I', '.'), False), ] for var, expected in tests: @@ -90,7 +94,7 @@ def test_has_variant(self): def test_nucleotide_range(self): '''test nucleotide_range''' - sv = sequence_variant.Variant('n', 'A2T') + sv = sequence_variant.Variant('n', 'A2T', '.') self.assertEqual((1, 1), sv.nucleotide_range()) - sv = sequence_variant.Variant('p', 'I42L') + sv = sequence_variant.Variant('p', 'I42L', '.') self.assertEqual((123, 125), sv.nucleotide_range()) diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py index f6acfccb..4219af7f 100644 --- a/ariba/tests/summary_cluster_test.py +++ b/ariba/tests/summary_cluster_test.py @@ -10,7 +10,7 @@ class TestSummaryCluster(unittest.TestCase): def test_line2dict(self): '''Test _line2dict''' - line = 'refname\treftype\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text' + line = 
'refname\treftype\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:var_group1:ref has wild type, foo bar\tsome free text' expected = { 'ref_name': 'refname', @@ -40,7 +40,8 @@ def test_line2dict(self): 'smtls_total_depth': '17', 'smtls_alt_nt': '.', 'smtls_alt_depth': '17', - 'var_description': 'noncoding1_n_A14T_N_ref has wild type, foo bar', + 'var_description': 'noncoding1:n:A14T:var_group1:ref has wild type, foo bar', + 'var_group': 'var_group1', 'free_text': 'some free text' } @@ -51,9 +52,9 @@ def test_add_data_dict(self): '''Test add_data_dict''' cluster = summary_cluster.SummaryCluster() self.assertTrue(cluster.name is None) - line1 = 'refname\treftype\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text' - line2 = 'refname\treftype\t19\t78\tcluster2\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text' - line3 = 'refname2\treftype\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text' + line1 = 'refname\treftype\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text' + line2 = 'refname\treftype\t19\t78\tcluster2\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id2:ref has wild type, foo bar\tsome free text' + line3 = 'refname2\treftype\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id3:ref has wild type, foo 
bar\tsome free text' data_dict1 = summary_cluster.SummaryCluster.line2dict(line1) data_dict2 = summary_cluster.SummaryCluster.line2dict(line2) data_dict3 = summary_cluster.SummaryCluster.line2dict(line3) @@ -71,9 +72,9 @@ def test_pc_id_of_longest(self): '''Test pc_id_of_longest''' cluster = summary_cluster.SummaryCluster() self.assertTrue(cluster.name is None) - line1 = 'refname\treftype\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text' - line2 = 'refname\treftype\t19\t78\tcluster\t120\t119\t98.20\tctg_name2\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text' - line3 = 'refname\treftype\t19\t78\tcluster\t120\t114\t98.32\tctg_name3\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text' + line1 = 'refname\treftype\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text' + line2 = 'refname\treftype\t19\t78\tcluster\t120\t119\t98.20\tctg_name2\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text' + line3 = 'refname\treftype\t19\t78\tcluster\t120\t114\t98.32\tctg_name3\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text' data_dict1 = summary_cluster.SummaryCluster.line2dict(line1) data_dict2 = summary_cluster.SummaryCluster.line2dict(line2) data_dict3 = summary_cluster.SummaryCluster.line2dict(line3) @@ -85,7 +86,7 @@ def test_pc_id_of_longest(self): def test_to_cluster_summary_number(self): '''Test _to_cluster_summary_assembled''' - line = 
'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text' + line = 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text' data_dict = summary_cluster.SummaryCluster.line2dict(line) tests = [ @@ -122,9 +123,9 @@ def test_to_cluster_summary_number(self): def test_has_known_variant(self): '''Test _has_known_variant''' lines = [ - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', 
'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.', 'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.' ] @@ -139,9 +140,9 @@ def test_has_known_variant(self): def test_has_any_known_variant(self): lines = [ - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', 'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.', 
'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.' ] @@ -159,10 +160,10 @@ def test_has_any_known_variant(self): def test_has_nonsynonymous(self): '''Test _has_nonsynonymous''' lines = [ - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', 
'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.', 'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.' ] @@ -178,11 +179,11 @@ def test_has_nonsynonymous(self): def test_has_any_nonsynonymous(self): '''Test _has_any_nonsynonymous''' lines = [ - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 
'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', ] expected = ['no', 'yes', 'no', 'yes', 'yes'] @@ -198,9 +199,9 @@ def test_has_any_nonsynonymous(self): def test_has_novel_nonsynonymous(self): '''Test _has_novel_nonsynonymous''' lines = [ - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 
'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', 'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.', 'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.' ] @@ -216,9 +217,9 @@ def test_has_novel_nonsynonymous(self): def test_has_any_novel_nonsynonymous(self): '''Test _has_any_novel_nonsynonymous''' lines = [ - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', 
'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.', 'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.' ] @@ -236,11 +237,11 @@ def test_has_any_novel_nonsynonymous(self): def test_to_cluster_summary_has_known_nonsynonymous(self): '''Test _to_cluster_summary_has_known_nonsynonymous''' lines = [ - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free 
text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', ] expected = ['yes', 'yes', 'no', 'no', 'no'] @@ -257,11 +258,11 @@ def test_to_cluster_summary_has_known_nonsynonymous(self): def test_to_cluster_summary_has_novel_nonsynonymous(self): '''Test _to_cluster_summary_has_novel_nonsynonymous''' lines = [ - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', + 
'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', ] expected = ['no', 'no', 'no', 'yes', 'yes'] @@ -278,11 +279,11 @@ def test_to_cluster_summary_has_novel_nonsynonymous(self): def test_to_cluster_summary_has_nonsynonymous(self): '''Test _to_cluster_summary_has_nonsynonymous''' lines = [ - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 
'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', ] expected = ['no', 'yes', 'no', 'yes', 'yes'] @@ -306,45 +307,51 @@ def test_get_nonsynonymous_var(self): 'known_var': '0', 'ref_ctg_change': '.', 'ref_ctg_effect': '.', - 'var_seq_type': '.' 
+ 'var_seq_type': '.', + 'var_group': '.', } self.assertEqual(None, summary_cluster.SummaryCluster._get_nonsynonymous_var(d)) d['var_type'] = 'p' + d['known_var'] = '1' d['has_known_var'] = '1' with self.assertRaises(summary_cluster.Error): summary_cluster.SummaryCluster._get_nonsynonymous_var(d) d['known_var_change'] = 'I42L' - self.assertEqual('I42L', summary_cluster.SummaryCluster._get_nonsynonymous_var(d)) + self.assertEqual(('ref', 'I42L', 'ungrouped', None), summary_cluster.SummaryCluster._get_nonsynonymous_var(d)) + + d['var_group'] = 'vgroup' + self.assertEqual(('ref', 'I42L', 'grouped', 'vgroup'), summary_cluster.SummaryCluster._get_nonsynonymous_var(d)) + d['var_group'] = '.' d['ref_ctg_change'] = 'P43Q' with self.assertRaises(summary_cluster.Error): summary_cluster.SummaryCluster._get_nonsynonymous_var(d) d['known_var_change'] = '.' - self.assertEqual('P43Q', summary_cluster.SummaryCluster._get_nonsynonymous_var(d)) + self.assertEqual(('ref', 'P43Q', 'novel', None), summary_cluster.SummaryCluster._get_nonsynonymous_var(d)) d['ref_ctg_change'] = '.' 
with self.assertRaises(summary_cluster.Error): summary_cluster.SummaryCluster._get_nonsynonymous_var(d) d['ref_ctg_effect'] = 'MULTIPLE' - self.assertEqual('MULTIPLE', summary_cluster.SummaryCluster._get_nonsynonymous_var(d)) + self.assertEqual(('ref', 'MULTIPLE', 'novel', None), summary_cluster.SummaryCluster._get_nonsynonymous_var(d)) def test_has_resistance(self): '''Test _has_resistance''' lines = [ - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', - 
'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', ] expected = ['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'no'] @@ -359,10 
+366,30 @@ def test_has_resistance(self): self.assertEqual('no', cluster._has_resistance(assembled_summary)) + def test_has_var_groups(self): + '''Test has_var_groups''' + lines = [ + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text', + 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id2:ref has wild type, foo bar\tsome free text', + 'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id3:ref has wild type, foo bar\tsome free text', + 'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id4:ref has wild type, foo bar\tsome free text', + 'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id5:ref has wild type, foo bar\tsome free text', + 'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id6:ref has wild type, foo bar\tsome free text', + 'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id7:ref has wild type, foo bar\tsome free text', + 'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id7:ref has wild type, foo bar\tsome free text', + ] + dicts = [summary_cluster.SummaryCluster.line2dict(line) for line in lines] + cluster = 
summary_cluster.SummaryCluster() + for d in dicts: + cluster.add_data_dict(d) + got = cluster.has_var_groups() + expected = {'id1', 'id3', 'id6'} + self.assertEqual(expected, got) + def test_column_summary_data(self): '''Test column_summary_data''' - line1 = 'ref1\tnon_coding\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tfoo bar\tspam eggs' + line1 = 'ref1\tnon_coding\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs' line2 = 'ref1\tnon_coding\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text' data_dict1 = summary_cluster.SummaryCluster.line2dict(line1) @@ -384,7 +411,7 @@ def test_column_summary_data(self): def test_non_synon_variants(self): '''Test non_synon_variants''' - line1 = 'ref1\tnon_coding\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tfoo bar\tspam eggs' + line1 = 'ref1\tnon_coding\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs' line2 = 'ref1\tnon_coding\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text' data_dict1 = summary_cluster.SummaryCluster.line2dict(line1) @@ -393,5 +420,5 @@ def test_non_synon_variants(self): cluster.add_data_dict(data_dict1) cluster.add_data_dict(data_dict2) got = cluster.non_synon_variants() - expected = {'A14T'} + expected = {('ref1', 'A14T', 'grouped', 'id1')} self.assertEqual(expected, got) diff --git a/ariba/tests/summary_sample_test.py b/ariba/tests/summary_sample_test.py index a318270e..3c5b2bef 100644 --- a/ariba/tests/summary_sample_test.py +++ b/ariba/tests/summary_sample_test.py @@ -45,7 +45,7 @@ 
def test_column_summary_data(self): 'has_res': 'yes', 'ref_seq': 'noncoding1', 'known_var': 'yes', - 'novel_var': 'no', + 'novel_var': 'yes', 'pct_id': '98.33' }, 'cluster.p': { @@ -70,17 +70,17 @@ def test_column_summary_data(self): self.assertEqual(expected, got) - def test_non_synon_variants(self): - '''Test _non_synon_variants''' - infile = os.path.join(data_dir, 'summary_sample_test_non_synon_variants.tsv') + def test_var_groups(self): + '''test _var_groups''' + infile = os.path.join(data_dir, 'summary_sample_test_var_groups.tsv') sample_summary = summary_sample.SummarySample(infile) sample_summary.clusters = sample_summary._load_file(infile, 90) + got = sample_summary._var_groups() expected = { - 'cluster.n': {'A14T', 'A6G'}, - 'cluster.p': {'A10V'}, - 'cluster.v': {'S5T'} + 'cluster.n': {'id1', 'id2'}, + 'cluster.p': {'id3'}, + 'cluster.v': {'id4'} } - got = sample_summary._non_synon_variants() self.assertEqual(expected, got) @@ -90,11 +90,14 @@ def test_variant_column_names_tuples(self): sample_summary = summary_sample.SummarySample(infile) sample_summary.clusters = sample_summary._load_file(infile, 90) sample_summary.column_summary_data = sample_summary._column_summary_data() - sample_summary.variants = sample_summary._non_synon_variants() expected = { - 'cluster.v': {('variants_only1', 'S5T', 'known')}, - 'cluster.n': {('noncoding1', 'A6G', 'known'), ('noncoding1', 'A14T', 'known')}, - 'cluster.p': {('presence_absence1', 'A10V', 'unknown')} + 'cluster.v': {('variants_only1', 'S5T', 'ungrouped', None)}, + 'cluster.n': { + ('noncoding1', 'A6G', 'grouped', 'id2'), + ('noncoding1', 'A14T', 'ungrouped', None), + ('noncoding1', 'G15T', 'novel', None) + }, + 'cluster.p': {('presence_absence1', 'A10V', 'grouped', 'id3')} } got = sample_summary._variant_column_names_tuples() self.assertEqual(expected, got) diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 1fa454bc..6cfd1e63 100644 --- a/ariba/tests/summary_test.py +++ 
b/ariba/tests/summary_test.py @@ -44,6 +44,35 @@ def test_determine_cluster_cols(self): self.assertEqual(expected[i], summary.Summary._determine_cluster_cols(col_strings[i])) + def test_determine_var_cols(self): + col_strings = [ + 'groups,grouped,ungrouped,novel', + 'groups,grouped,ungrouped', + 'grouped,novel', + 'ungrouped,novel', + 'grouped', + 'ungrouped', + 'novel', + '' + ] + + expected = [ + {'groups': True, 'grouped': True, 'ungrouped': True, 'novel': True}, + {'groups': True, 'grouped': True, 'ungrouped': True, 'novel': False}, + {'groups': False, 'grouped': True, 'ungrouped': False, 'novel': True}, + {'groups': False, 'grouped': False, 'ungrouped': True, 'novel': True}, + {'groups': False, 'grouped': True, 'ungrouped': False, 'novel': False}, + {'groups': False, 'grouped': False, 'ungrouped': True, 'novel': False}, + {'groups': False, 'grouped': False, 'ungrouped': False, 'novel': True}, + {'groups': False, 'grouped': False, 'ungrouped': False, 'novel': False}, + ] + + assert len(col_strings) == len(expected) + + for i in range(len(col_strings)): + self.assertEqual(expected[i], summary.Summary._determine_var_cols(col_strings[i])) + + def test_load_input_files(self): '''Test _load_input_files''' file1 = os.path.join(data_dir, 'summary_test_load_input_files.1.tsv') @@ -74,9 +103,24 @@ def test_get_all_variant_columns(self): samples = summary.Summary._load_input_files([file1, file2], 90) got = summary.Summary._get_all_variant_columns(samples) expected = { - 'cluster.p.2': {('presence_absence1', 'A10V', 'known')}, - 'cluster.n.1': {('noncoding1', 'A6G', 'known'), ('noncoding1', 'A14T', 'known')}, - 'cluster.p.1': {('presence_absence1', 'A10V', 'known')}, + 'cluster.p.2': {('presence_absence1', 'A10V', 'grouped', 'id3')}, + 'cluster.n.1': {('noncoding1', 'A6G', 'grouped', 'id2'), ('noncoding1', 'A14T', 'grouped', 'id1')}, + 'cluster.p.1': {('presence_absence1', 'A10V', 'grouped', 'id2')}, + } + self.assertEqual(expected, got) + + + def 
test_get_all_var_groups(self): + '''test _get_all_var_groups''' + file1 = os.path.join(data_dir, 'summary_test_get_all_var_groups.1.tsv') + file2 = os.path.join(data_dir, 'summary_test_get_all_var_groups.2.tsv') + samples = summary.Summary._load_input_files([file1, file2], 90) + got = summary.Summary._get_all_var_groups(samples) + expected = { + 'cluster.p.1': {'id4'}, + 'cluster.p.2': {'id3'}, + 'cluster.v.1': set(), + 'cluster.n.1': {'id1', 'id2'} } self.assertEqual(expected, got) @@ -87,7 +131,7 @@ def test_gather_output_rows(self): os.path.join(data_dir, 'summary_test_gather_output_rows.in.1.tsv'), os.path.join(data_dir, 'summary_test_gather_output_rows.in.2.tsv') ] - s = summary.Summary('out', filenames=infiles, include_all_known_variant_columns=False) + s = summary.Summary('out', filenames=infiles, variant_cols=None) s.samples = summary.Summary._load_input_files(infiles, 90) expected = { infiles[0]: { @@ -146,7 +190,17 @@ def test_gather_output_rows(self): got = s._gather_output_rows() self.assertEqual(expected, got) - s.include_all_known_variant_columns = True + s.var_columns['groups'] = True + expected[infiles[0]]['noncoding1']['vgroup.id1'] = 'yes' + expected[infiles[0]]['noncoding1']['vgroup.id3'] = 'no' + expected[infiles[1]]['noncoding1']['vgroup.id1'] = 'yes' + expected[infiles[1]]['noncoding1']['vgroup.id3'] = 'yes' + got = s._gather_output_rows() + self.assertEqual(expected, got) + + + s.var_columns['grouped'] = True + s.var_columns['ungrouped'] = True expected[infiles[0]]['noncoding1']['noncoding1.A14T'] = 'yes' expected[infiles[0]]['noncoding1']['noncoding1.A6G'] = 'no' expected[infiles[1]]['noncoding1']['noncoding1.A14T'] = 'yes' @@ -154,17 +208,19 @@ def test_gather_output_rows(self): got = s._gather_output_rows() self.assertEqual(expected, got) - s.include_all_novel_variant_columns = True + s.var_columns['novel'] = True expected[infiles[0]]['presence_absence1']['presence_absence1.A10V'] = 'yes' 
expected[infiles[1]]['presence_absence1']['presence_absence1.A10V'] = 'yes' got = s._gather_output_rows() self.assertEqual(expected, got) for filename in expected: + del expected[filename]['noncoding1']['vgroup.id1'] + del expected[filename]['noncoding1']['vgroup.id3'] for gene_type in expected[filename]: del expected[filename][gene_type]['ref_seq'] - s = summary.Summary('out', filenames=infiles, cluster_cols='assembled,has_res,pct_id,known_var,novel_var', include_all_novel_variant_columns=True) + s = summary.Summary('out', filenames=infiles, cluster_cols='assembled,has_res,pct_id,known_var,novel_var', variant_cols='ungrouped,grouped,novel') s.samples = summary.Summary._load_input_files(infiles, 90) s.include_all_variant_columns = True got = s._gather_output_rows() diff --git a/ariba/vfdb_parser.py b/ariba/vfdb_parser.py index 052d2f7b..9e0dab80 100644 --- a/ariba/vfdb_parser.py +++ b/ariba/vfdb_parser.py @@ -38,7 +38,7 @@ def run(self): for seq in file_reader: seq.id, description = self._fa_header_to_name_and_metadata(seq.id) if description is not None: - print(seq.id, '.', '.', description, sep='\t', file=tsv_out) + print(seq.id, '.', '.', '.', description, sep='\t', file=tsv_out) print(seq, file=fa_out) pyfastaq.utils.close(fa_out) diff --git a/scripts/ariba b/scripts/ariba index a4bbeac3..28d16786 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -9,6 +9,7 @@ tasks = { 'run': 'Run the ARIBA local assembly pipeline', 'summary': 'Summarise multiple reports made by "run"', 'flag': 'Translate the meaning of a flag output by the pipeline', + 'aln2meta': 'Make metadata input to preparef, using multialignment and SNPs', 'test': 'Run on small test dataset', 'version': 'Print version and exit', } @@ -21,6 +22,7 @@ ordered_tasks = [ 'reportfilter', 'summary', 'flag', + 'aln2meta', 'test', 'version', ]