diff --git a/ariba/__init__.py b/ariba/__init__.py
index 68461b4f..7c85be26 100644
--- a/ariba/__init__.py
+++ b/ariba/__init__.py
@@ -7,6 +7,7 @@
 
 
 __all__ = [
+    'aln_to_metadata',
     'assembly',
     'assembly_compare',
     'assembly_variants',
diff --git a/ariba/aln_to_metadata.py b/ariba/aln_to_metadata.py
new file mode 100644
index 00000000..555b69d6
--- /dev/null
+++ b/ariba/aln_to_metadata.py
@@ -0,0 +1,269 @@
+import os
+import re
+import sys
+import shutil
+import pyfastaq
+from ariba import sequence_variant
+
+class Error (Exception): pass  # raised by AlnToMetadata when its input files, sequences or variants fail a check
+
+class AlnToMetadata:
+    def __init__(self, aln_file, vars_file, refs_are_coding, cluster_rep_name, genetic_code=11):
+        '''Loads the alignment and variants files and stores the settings that run() uses.
+
+        aln_file: sequence file loaded with pyfastaq.tasks.file_to_dict
+        vars_file: tab-delimited variants file, loaded by _load_vars_file
+        refs_are_coding: if True variants get type 'p', otherwise type 'n'
+        cluster_rep_name: sequence name written first in the .cluster file
+        genetic_code: set as pyfastaq.sequences.genetic_code for the duration of run()'''
+        self.cluster_rep_name, self.genetic_code = cluster_rep_name, genetic_code
+        self.refs_are_coding = refs_are_coding
+        self.padded_seqs = AlnToMetadata._load_aln_file(aln_file)
+        self.variants = AlnToMetadata._load_vars_file(vars_file, refs_are_coding)
+
+
+    @classmethod
+    def _load_aln_file(cls, aln_file):
+        seqs = {}  # filled in place by file_to_dict; run() looks sequence names up in it
+        pyfastaq.tasks.file_to_dict(aln_file, seqs)
+        return seqs
+
+
+    @classmethod
+    def _load_vars_file(cls, vars_file, refs_are_coding):
+        '''Loads the variants file. Returns dict: sequence name -> list of (Variant, description).
+
+        Every line must have exactly four tab-delimited columns: sequence name,
+        variant, identifier, description. Variants are made with type 'p' if
+        refs_are_coding, otherwise 'n'. Raises Error on a line that cannot be parsed.'''
+        var_type = 'p' if refs_are_coding else 'n'
+        variants = {}
+        f = pyfastaq.utils.open_file_read(vars_file)
+        try:
+            for line in f:
+                try:
+                    ref_name, variant, identifier, description = line.rstrip().split('\t')
+                    variant = sequence_variant.Variant(var_type, variant, identifier)
+                except Exception as err:  # a bare except would also trap KeyboardInterrupt/SystemExit
+                    raise Error('Error in this line of variants file:\n' + line) from err
+                variants.setdefault(ref_name, []).append((variant, description))
+        finally:
+            pyfastaq.utils.close(f)  # handle is released on success and on every error path
+        return variants
+
+
+    @classmethod
+    def _make_unpadded_seqs(cls, padded_seqs):
+        '''Returns dict: sequence id -> new Fasta with the same id and every "-" removed from the sequence'''
+        def without_gaps(fasta):
+            return pyfastaq.sequences.Fasta(fasta.id, fasta.seq.replace('-', ''))
+        return {fasta.id: without_gaps(fasta) for fasta in padded_seqs.values()}
+
+
+    @classmethod
+    def _check_seq_lengths_same(cls, seqs):  # raises Error when the values of seqs differ in length
+        distinct_lengths = {len(seq) for seq in seqs.values()}
+        if len(distinct_lengths) > 1:
+            raise Error('Input sequences must all be the same length. Cannot continue. Lengths found: ' + ','.join(map(str, distinct_lengths)))
+        return len(distinct_lengths) == 1  # False only when seqs is empty
+
+
+    @classmethod
+    def _insertion_coords(cls, sequence):
+        '''Returns a list of pyfastaq Intervals, one per run of "-" in sequence.seq, ordered by position.
+
+        Each Interval is made from the index of the first and the index of the last "-" of the run.'''
+        gap_spans = (match.span() for match in re.finditer('-+', sequence.seq))
+        return [pyfastaq.intervals.Interval(first, end - 1) for first, end in gap_spans]
+
+
+    @classmethod
+    def _make_unpadded_insertion_coords(cls, unpadded_sequences):  # NOTE: run() passes the padded sequences here; sequences without "-" give empty lists
+        return {x.id: AlnToMetadata._insertion_coords(x) for x in unpadded_sequences.values()}
+
+
+    @classmethod
+    def _check_insertion_coords(cls, sequence):
+        '''Returns True if every run of "-" in sequence starts at an index that is a
+        multiple of 3 and has a length that is a multiple of 3. Otherwise raises Error.'''
+        for gap in AlnToMetadata._insertion_coords(sequence):
+            if gap.start % 3 != 0:
+                raise Error('Insertion does not start in frame in sequence "' + sequence.id + '". Cannot continue')
+            if len(gap) % 3 != 0:
+                raise Error('Insertion of length not a multiple of 3 in sequence "' + sequence.id + '". Cannot continue')
+        return True
+
+
+    @classmethod
+    def _check_coding_seq(cls, sequence, genetic_code=11):
+        '''Returns True if sequence has length a multiple of 3, starts with a start codon of genetic_code and translates with exactly one "*", at the end. Otherwise raises Error.'''
+        if len(sequence) % 3 != 0:
+            raise Error('Length of sequence ' + sequence.id + ' is ' + str(len(sequence)) + ', which is not a multiple of 3. Cannot continue')
+        original_code = pyfastaq.sequences.genetic_code
+        pyfastaq.sequences.genetic_code = genetic_code
+        try:
+            protein_seq = sequence.translate()
+            start_ok = sequence.seq[0:3].upper() in pyfastaq.genetic_codes.starts[genetic_code]
+        finally:
+            pyfastaq.sequences.genetic_code = original_code  # module-wide setting: restore it even if translating fails
+        if not start_ok:
+            raise Error('Sequence "' + sequence.id + '" does not start with a start codon. Cannot continue')
+        elif protein_seq[-1] != '*':
+            raise Error('Sequence "' + sequence.id + '" does not end with a stop codon. Cannot continue')
+        elif '*' in protein_seq[:-1]:
+            raise Error('Sequence "' + sequence.id + '" has an internal stop codon. Cannot continue')
+        return True
+
+
+    @classmethod
+    def _check_sequences(cls, padded_sequences, unpadded_sequences, seqs_are_coding, genetic_code=11):
+        '''Returns True if all sequences pass the checks, otherwise raises Error. Padded lengths must agree; when coding, gaps must be in frame and each unpadded sequence must be a complete coding sequence.'''
+        AlnToMetadata._check_seq_lengths_same(padded_sequences)
+        if seqs_are_coding:
+            for sequence in padded_sequences.values():  # gaps only exist in the padded sequences, so check those
+                AlnToMetadata._check_insertion_coords(sequence)
+            for sequence in unpadded_sequences.values():
+                AlnToMetadata._check_coding_seq(sequence, genetic_code=genetic_code)
+        return True
+
+
+    @classmethod
+    def _check_variants_match_sequences(cls, unpadded_sequences, variants, seqs_are_coding, genetic_code=11):
+        '''Returns True if every variant names a sequence in unpadded_sequences and passes its sanity_check_against_seq, otherwise raises Error. pyfastaq.sequences.genetic_code is set to genetic_code while checking.'''
+        original_code = pyfastaq.sequences.genetic_code
+        pyfastaq.sequences.genetic_code = genetic_code
+        try:
+            for seqname, variant_list in variants.items():
+                if seqname not in unpadded_sequences:
+                    raise Error('Sequence name "' + seqname + '" given in variants file, but sequence not found')
+                for variant, description in variant_list:
+                    if not variant.sanity_check_against_seq(unpadded_sequences[seqname], translate_seq=seqs_are_coding):
+                        raise Error('Variant "' + str(variant) + '" for sequence "' + seqname + '" does not match sequence. cannot continue')
+        finally:
+            pyfastaq.sequences.genetic_code = original_code  # one restore point covers every exit, including unexpected exceptions
+        return True
+
+
+    @classmethod
+    def _variant_ids_are_unique(cls, variants):
+        '''Returns True if no variant identifier occurs twice over all sequences in variants, otherwise raises Error'''
+        ids_so_far = set()
+        for variants_of_one_seq in variants.values():
+            for variant, _ in variants_of_one_seq:
+                if variant.identifier in ids_so_far:
+                    raise Error('Variant identifier "' + variant.identifier + '" found more than once. Cannot continue')
+                ids_so_far.add(variant.identifier)
+
+        return True
+
+
+    @classmethod
+    def _unpadded_to_padded_nt_position(cls, position, insertions):
+        '''Converts position (index in the gap-free sequence) to the matching index in the padded sequence.
+
+        insertions: the gap Intervals of that sequence, as made by _insertion_coords'''
+        for gap in insertions:
+            if gap.start > position:
+                break
+            position += len(gap)  # a gap starting at or before the running position pushes it right
+
+        return position
+
+
+    @classmethod
+    def _padded_to_unpadded_nt_position(cls, position, insertions):
+        '''Converts position (index in the padded sequence) to the matching index in the gap-free sequence.
+
+        insertions: the gap Intervals of that sequence, as made by _insertion_coords.
+        Returns None when the first gap that does not end before position
+        reports distance_to_point(position) == 0.'''
+        gap_total = 0
+
+        for gap in insertions:
+            if gap.end >= position:
+                # first gap not wholly before position: it decides the result
+                return None if gap.distance_to_point(position) == 0 else position - gap_total
+            gap_total += len(gap)
+        return position - gap_total
+
+
+    @classmethod
+    def _variants_to_tsv_lines(cls, variants, unpadded_sequences, padded_sequences, insertions, seqs_are_coding):
+        '''Returns a list of tab-delimited lines: sequence name, variant type, variant, identifier, description.
+
+        Each variant gives one line for the sequence it was supplied for, followed
+        by one line for every other sequence of padded_sequences (in sorted name
+        order), where the position is converted through the alignment. A sequence
+        with "-" at the variant's alignment position gets no line, only a warning
+        on stderr.'''
+        if seqs_are_coding:
+            residues = {name: unpadded_sequences[name].translate() for name in unpadded_sequences}
+        else:
+            residues = unpadded_sequences
+
+        nts_per_position = 3 if seqs_are_coding else 1  # variant positions count amino acids when coding
+        tsv_lines = []
+
+        for refname in sorted(variants):
+            for variant, description in variants[refname]:
+                padded_nt_position = AlnToMetadata._unpadded_to_padded_nt_position(nts_per_position * variant.position, insertions[refname])
+                tsv_lines.append('\t'.join([refname, variant.variant_type, str(variant), variant.identifier, description]))
+
+                for seqname in sorted(padded_sequences):
+                    if seqname == refname:
+                        continue
+
+                    if padded_sequences[seqname][padded_nt_position] == '-':
+                        print('Warning: position has a gap in sequence ', seqname, 'corresponding to variant', variant, '(' + variant.identifier + ') in sequence ', refname, '... Ignoring for ' + seqname, file=sys.stderr)
+                        continue
+
+                    unpadded_nt_position = AlnToMetadata._padded_to_unpadded_nt_position(padded_nt_position, insertions[seqname])
+                    assert unpadded_nt_position is not None
+                    if seqs_are_coding:
+                        assert unpadded_nt_position % 3 == 0
+
+                    position = unpadded_nt_position // nts_per_position
+                    residue = residues[seqname][position]
+                    # sequences carrying the wild type or the variant residue are reported
+                    # with the wild type; anything else is reported as it is
+                    if residue in {variant.wild_value, variant.variant_value}:
+                        residue = variant.wild_value
+
+                    # the variant string is written with index + 1
+                    variant_string = residue + str(position + 1) + variant.variant_value
+                    tsv_lines.append('\t'.join([seqname, variant.variant_type, variant_string, variant.identifier, description]))
+
+        return tsv_lines
+
+
+    @classmethod
+    def _make_cluster_file(cls, cluster_name, sequences, filename):
+        '''Writes one tab-delimited line to filename: cluster_name, then the other names in sequences, sorted. Raises Error if cluster_name is not in sequences.'''
+        if cluster_name not in sequences:
+            raise Error('Sequence name "' + cluster_name + '" to be used as cluster representative not found. Cannot continue')
+        other_names = sorted(name for name in sequences if name != cluster_name)
+        with open(filename, 'w') as f:
+            print(cluster_name, *other_names, sep='\t', file=f)
+
+
+    def run(self, outprefix):
+        '''Checks the sequences and variants, then writes outprefix.tsv, outprefix.fa and outprefix.cluster. Raises Error if a check fails. pyfastaq.sequences.genetic_code is set to self.genetic_code meanwhile and always restored.'''
+        if self.cluster_rep_name not in self.padded_seqs:
+            raise Error('Sequence name "' + self.cluster_rep_name + '" to be used as cluster representative not found. Cannot continue')
+        original_code = pyfastaq.sequences.genetic_code
+        pyfastaq.sequences.genetic_code = self.genetic_code
+        try:
+            unpadded_seqs = AlnToMetadata._make_unpadded_seqs(self.padded_seqs)
+            insertions = AlnToMetadata._make_unpadded_insertion_coords(self.padded_seqs)
+            AlnToMetadata._check_sequences(self.padded_seqs, unpadded_seqs, self.refs_are_coding, genetic_code=self.genetic_code)
+            AlnToMetadata._variant_ids_are_unique(self.variants)
+            AlnToMetadata._check_variants_match_sequences(unpadded_seqs, self.variants, self.refs_are_coding, genetic_code=self.genetic_code)
+            tsv_lines = AlnToMetadata._variants_to_tsv_lines(self.variants, unpadded_seqs, self.padded_seqs, insertions, self.refs_are_coding)
+            with open(outprefix + '.tsv', 'w') as f:
+                print(*tsv_lines, sep='\n', file=f)
+            with open(outprefix + '.fa', 'w') as f:
+                for seqname in sorted(unpadded_seqs):
+                    print(unpadded_seqs[seqname], sep='\n', file=f)
+            AlnToMetadata._make_cluster_file(self.cluster_rep_name, unpadded_seqs, outprefix + '.cluster')
+        finally:
+            pyfastaq.sequences.genetic_code = original_code  # the checks raise Error on bad input; never leave the module-wide code changed
diff --git a/ariba/assembly_variants.py b/ariba/assembly_variants.py
index ab8658fa..137cb233 100644
--- a/ariba/assembly_variants.py
+++ b/ariba/assembly_variants.py
@@ -193,7 +193,7 @@ def _get_one_variant_for_one_contig_coding(ref_sequence, refdata_var_dict, mumme
         # if this variant is at the same position as a known variant in the reference
         if refdata_var_dict is not None and aa_var_position in refdata_var_dict['p']:
             if aa_var_effect == 'NONSYN':
-                aa_variant = sequence_variant.Variant('p', aa_var_string)
+                aa_variant = sequence_variant.Variant('p', aa_var_string, '.')
                 variants_at_this_position = {x for x in refdata_var_dict['p'][aa_variant.position]}
                 matching_variants = {x for x in variants_at_this_position if aa_variant.variant_value == x.variant.variant_value}
                 not_interesting_variants = {x for x in variants_at_this_position if aa_variant.variant_value == x.variant.wild_value}
diff --git a/ariba/cdhit.py b/ariba/cdhit.py
index 1450c9a7..c49750cd 100644
--- a/ariba/cdhit.py
+++ b/ariba/cdhit.py
@@ -84,7 +84,7 @@ def run_get_clusters_from_file(self, infile):
         f = pyfastaq.utils.open_file_write(tmp_fa)
 
         for seq in seq_reader:
-            if seq.id in clusters:
+            if seq.id in clusters and seq.id in clusters[seq.id]:
                 pyfastaq.utils.close(f)
                 shutil.rmtree(tmpdir)
                 raise Error('Sequence name "' + seq.id + '" not unique. Cannot continue')
diff --git a/ariba/ref_genes_getter.py b/ariba/ref_genes_getter.py
index 82114ec0..07f097bb 100644
--- a/ariba/ref_genes_getter.py
+++ b/ariba/ref_genes_getter.py
@@ -126,15 +126,15 @@ def _get_from_card(self, outprefix):
                 else:
                     fasta_filehandle = f_out_var_only
 
-                print(fasta.id, '.', '.', data['ARO_name'], sep='\t', file=f_out_tsv)
+                print(fasta.id, '.', '.', '.', data['ARO_name'], sep='\t', file=f_out_tsv)
 
                 if len(data['snps']) == 0:
                     print(fasta, file=fasta_filehandle)
-                    print(fasta.id, '.', '.', data['ARO_description'], sep='\t', file=f_out_tsv)
+                    print(fasta.id, '.', '.', '.', data['ARO_description'], sep='\t', file=f_out_tsv)
                 else:
                     print(fasta, file=fasta_filehandle)
                     for snp in data['snps']:
-                        print(fasta.id, variant_type, snp, data['ARO_description'], sep='\t', file=f_out_tsv)
+                        print(fasta.id, variant_type, snp, '.', data['ARO_description'], sep='\t', file=f_out_tsv)
 
 
         pyfastaq.utils.close(f_out_tsv)
diff --git a/ariba/reference_data.py b/ariba/reference_data.py
index f43ab119..3251a059 100644
--- a/ariba/reference_data.py
+++ b/ariba/reference_data.py
@@ -132,7 +132,7 @@ def _write_metadata_tsv(metadata, filename):
         f = pyfastaq.utils.open_file_write(filename)
 
         for gene_name, data_dict in sorted(metadata.items()):
-            for meta in data_dict['.']:
+            for meta in sorted([str(x) for x in data_dict['.']]):
                 print(meta, file=f)
 
             variants = []
@@ -190,7 +190,7 @@ def _filter_bad_variant_data(self, out_prefix, presence_absence_removed, variant
             to_remove = []
 
             for metadata in metadata_dict['.']:
-                if metadata.free_text is None:
+                if metadata.free_text == '.':
                     print(gene_name, 'metadata has no info. Just gene name given. Removing. Line of file was:', metadata, file=log_fh)
                     to_remove.append(metadata)
 
diff --git a/ariba/report.py b/ariba/report.py
index df06311c..f32234e3 100644
--- a/ariba/report.py
+++ b/ariba/report.py
@@ -1,6 +1,8 @@
 import sys
 import pymummer
 
+class Error (Exception): pass
+
 columns = [
     'ref_name',              # 0  name of reference sequence
     'ref_type',              # 1  type of reference sequence (presence/absence, variants only, noncoding)
@@ -165,7 +167,7 @@ def _report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, pymum
                 known_var_change = 'unknown'
                 var_type = 'SNP'
                 has_known_var = '1'
-                matching_vars_column = ';;;'.join([x.to_string(separator='_') for x in matching_vars_set])
+                matching_vars_column = ';;;'.join([x.to_string(separator=':') for x in matching_vars_set])
             else:
                 is_known_var = '0'
                 known_var_change = '.'
@@ -253,8 +255,10 @@ def report_lines(cluster):
 
     for line in lines:
         if len(line.split('\t')) != len(columns):
-            print('Error making report - wrong number of columns. Expected', len(columns), 'but got', len(line.split('\t')), file=sys.stderr)
-            print(line, file=sys.stderr)
+            cols = line.split('\t')
+            print('Error making report - wrong number of columns. Expected', len(columns), 'but got', len(cols), file=sys.stderr)
+            for i in range(len(cols)):
+                print(i, cols[i], sep='\t', file=sys.stderr)
             lines_ok = False
 
     if not lines_ok:
diff --git a/ariba/sequence_metadata.py b/ariba/sequence_metadata.py
index a1efde2e..5b888a29 100644
--- a/ariba/sequence_metadata.py
+++ b/ariba/sequence_metadata.py
@@ -6,23 +6,16 @@ class Error (Exception): pass
 class SequenceMetadata:
     def __init__(self, line):
         try:
-            self.name, variant_type, variant_string, *extra_columns = line.rstrip().split('\t')
+            self.name, variant_type, variant_string, identifier, self.free_text = line.rstrip().split('\t')
         except:
             raise Error('Error parsing line of file:\n' + line)
 
-        if len(extra_columns) == 0:
-            self.free_text = None
-        elif len(extra_columns) == 1:
-            self.free_text = extra_columns[0]
-        else:
-            raise Error('Too many columns in this line:\n' + line)
-
         self.variant_type = variant_type
 
         if self.variant_type == '.':
             self.variant = None
         else:
-            self.variant = sequence_variant.Variant(self.variant_type, variant_string)
+            self.variant = sequence_variant.Variant(self.variant_type, variant_string, identifier)
 
 
     def __eq__(self, other):
@@ -42,16 +35,13 @@ def __str__(self):
 
 
     def to_string(self, separator='\t'):
-        fields = [self.name, self.variant_type]
-        if self.variant is None:
-            fields.append('.')
-        else:
-            fields.append(str(self.variant))
-
-        if self.free_text:
-            return separator.join(fields + [self.free_text])
-        else:
-            return separator.join(fields)
+        return separator.join([
+            self.name,
+            self.variant_type,
+            '.' if self.variant is None else str(self.variant),
+            '.' if (self.variant is None or self.variant.identifier is None) else self.variant.identifier,
+            self.free_text
+        ])
 
 
     def has_variant(self, seq):
diff --git a/ariba/sequence_variant.py b/ariba/sequence_variant.py
index fa475c92..7e32515d 100644
--- a/ariba/sequence_variant.py
+++ b/ariba/sequence_variant.py
@@ -7,12 +7,13 @@ class Error (Exception): pass
 allowed_variant_types = {'n', 'p'}
 
 class Variant:
-    def __init__(self, variant_type, variant_string):
+    def __init__(self, variant_type, variant_string, identifier):
         if variant_type not in allowed_variant_types:
             raise Error('Error! Variant type "' + variant_type + '" not recognised.\n' + \
                         'Must be one of:' + ', '.join(allowed_variant_types))
 
         self.variant_type = variant_type
+        self.identifier = None if identifier == '.' else identifier
 
 
         m = re.match('^([A-Z])([0-9]+)([A-Z])$', variant_string.upper())
diff --git a/ariba/summary.py b/ariba/summary.py
index 8e7133bb..94604863 100644
--- a/ariba/summary.py
+++ b/ariba/summary.py
@@ -15,12 +15,11 @@ def __init__(
       outprefix,
       filenames=None,
       fofn=None,
-      include_all_known_variant_columns=True,
-      include_all_novel_variant_columns=False,
       filter_rows=True,
       filter_columns=True,
       min_id=90.0,
       cluster_cols='assembled,has_res,ref_seq,pct_id,known_var,novel_var',
+      variant_cols='groups,grouped,ungrouped,novel',
       verbose=False,
     ):
         if filenames is None and fofn is None:
@@ -35,8 +34,7 @@ def __init__(
             self.filenames.extend(self._load_fofn(fofn))
 
         self.cluster_columns = self._determine_cluster_cols(cluster_cols)
-        self.include_all_known_variant_columns = include_all_known_variant_columns
-        self.include_all_novel_variant_columns = include_all_novel_variant_columns
+        self.var_columns = self._determine_var_cols(variant_cols)
         self.filter_rows = filter_rows
         self.filter_columns = filter_columns
         self.min_id = min_id
@@ -44,17 +42,28 @@ def __init__(
         self.verbose = verbose
 
 
-    @staticmethod
-    def _determine_cluster_cols(cols_string):
-        allowed_cols = {'assembled', 'has_res', 'ref_seq', 'pct_id', 'known_var', 'novel_var'}
+    @classmethod
+    def _determine_cols(cls, cols_string, allowed_cols, error_string):
         if cols_string == '' or cols_string is None:
             return {x: False for x in allowed_cols}
         wanted_cols = set(cols_string.split(','))
         if not wanted_cols.issubset(allowed_cols):
-            raise Error('Error in cluster names. Allowed values are: ' + str(','.join(list(allowed_cols))) + '. Got: ' + cols_string)
+            raise Error('Error in ' + error_string + '. Allowed values are: ' + str(','.join(list(allowed_cols))) + '. Got: ' + cols_string)
         return {x: x in wanted_cols for x in allowed_cols}
 
 
+    @staticmethod
+    def _determine_cluster_cols(cols_string):
+        '''Returns dict: cluster column name -> True if it is listed in the comma-separated cols_string. Raises Error on an unknown name.'''
+        return Summary._determine_cols(cols_string, {'assembled', 'has_res', 'ref_seq', 'pct_id', 'known_var', 'novel_var'}, 'cluster columns')
+
+
+    @staticmethod
+    def _determine_var_cols(cols_string):
+        '''Returns dict: variant column name -> True if it is listed in the comma-separated cols_string. Raises Error on an unknown name.'''
+        return Summary._determine_cols(cols_string, {'groups', 'grouped', 'ungrouped', 'novel'}, 'variant columns')
+
+
     def _load_fofn(self, fofn):
         f = pyfastaq.utils.open_file_read(fofn)
         filenames = [x.rstrip() for x in f.readlines()]
@@ -103,9 +112,24 @@ def _get_all_variant_columns(cls, samples_dict):
         return columns
 
 
+    @classmethod
+    def _get_all_var_groups(cls, samples_dict):
+        '''Returns dict: key of sample.var_groups -> union of the sets stored
+        under that key, taken over every sample in samples_dict'''
+        merged = {}
+        for sample in samples_dict.values():
+            for name, name_set in sample.var_groups.items():
+                merged.setdefault(name, set()).update(name_set)
+        return merged
+
+
     def _gather_output_rows(self):
         all_cluster_names = Summary._get_all_cluster_names(self.samples)
         all_var_columns = Summary._get_all_variant_columns(self.samples)
+        if self.var_columns['groups']:
+            var_groups = Summary._get_all_var_groups(self.samples)
+        else:
+            var_groups = set()
         rows = {}
 
         for filename, sample in self.samples.items():
@@ -126,21 +150,22 @@ def _gather_output_rows(self):
                         'pct_id': 'NA'
                     }
 
-                wanted_var_types = set()
-                if self.include_all_known_variant_columns:
-                    wanted_var_types.add('known')
-                if self.include_all_novel_variant_columns:
-                    wanted_var_types.add('unknown')
+                if self.var_columns['groups']:
+                    for group_name in var_groups[cluster]:
+                        if cluster in sample.var_groups and group_name in sample.var_groups[cluster]:
+                            rows[filename][cluster]['vgroup.' + group_name] = 'yes'
+                        else:
+                            rows[filename][cluster]['vgroup.' + group_name] = 'no'
 
-                if len(wanted_var_types) and cluster in all_var_columns:
-                    for (ref_name, variant, known_or_unknown) in all_var_columns[cluster]:
-                        if known_or_unknown not in wanted_var_types:
+                if cluster in all_var_columns:
+                    for (ref_name, variant, grouped_or_novel, group_name) in all_var_columns[cluster]:
+                        if not self.var_columns[grouped_or_novel]:
                             continue
 
                         key = ref_name + '.' + variant
                         if rows[filename][cluster]['assembled'] == 'no':
                             rows[filename][cluster][key] = 'NA'
-                        elif cluster in sample.variant_column_names_tuples and (ref_name, variant, known_or_unknown) in sample.variant_column_names_tuples[cluster]:
+                        elif cluster in sample.variant_column_names_tuples and (ref_name, variant, grouped_or_novel, group_name) in sample.variant_column_names_tuples[cluster]:
                             rows[filename][cluster][key] = 'yes'
                         else:
                             rows[filename][cluster][key] = 'no'
diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py
index dab808d0..f1bc7bbb 100644
--- a/ariba/summary_cluster.py
+++ b/ariba/summary_cluster.py
@@ -52,6 +52,14 @@ def line2dict(cls, line):
             except:
                 assert d[key] == '.'
 
+        if d['var_description'] == '.':
+            d['var_group'] = '.'
+        else:
+            try:
+                d['var_group'] = d['var_description'].split(':')[3]
+            except:
+                raise Error('Error getting variant group from the following line:\n' + line)
+
         return d
 
 
@@ -193,13 +201,20 @@ def _get_nonsynonymous_var(data_dict):
               data_dict['known_var_change'] != data_dict['ref_ctg_change']:
                 raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change != ref_ctg_change. Cannot continue')
 
-            if data_dict['known_var_change'] != '.':
-                return data_dict['known_var_change']
+            var_group = 'novel', None
+
+            if data_dict['known_var'] == '1' and data_dict['known_var_change'] != '.':
+                var_change = data_dict['known_var_change']
+                if data_dict['var_group'] == '.':
+                    var_group = 'ungrouped', None
+                else:
+                    var_group = 'grouped', data_dict['var_group']
             elif data_dict['ref_ctg_change'] != '.':
-                return data_dict['ref_ctg_change']
+                var_change = data_dict['ref_ctg_change']
             else:
-                return data_dict['ref_ctg_effect']
+                var_change = data_dict['ref_ctg_effect']
 
+            return (data_dict['ref_name'], var_change) + var_group
 
     def _has_resistance(self, assembled_summary):
         '''assembled_summary should be output of _to_cluster_summary_assembled'''
@@ -212,6 +227,15 @@ def _has_resistance(self, assembled_summary):
             return 'no'
 
 
+    def has_var_groups(self):
+        '''Returns the set of var_group values (other than ".") of the rows in self.data for which _has_known_variant is true'''
+        return {
+            row['var_group']
+            for row in self.data
+            if self._has_known_variant(row) and row['var_group'] != '.'
+        }
+
+
     def column_summary_data(self):
         '''Returns a dictionary of column name -> value, for cluster-level results'''
         assembled_summary = self._to_cluster_summary_assembled()
diff --git a/ariba/summary_sample.py b/ariba/summary_sample.py
index 0a994b45..fcf05331 100644
--- a/ariba/summary_sample.py
+++ b/ariba/summary_sample.py
@@ -40,7 +40,11 @@ def _column_summary_data(self):
         return {c: self.clusters[c].column_summary_data() for c in self.clusters}
 
 
-    def _non_synon_variants(self):
+    def _var_groups(self):
+        return {c: self.clusters[c].has_var_groups() for c in self.clusters}
+
+
+    def _variant_column_names_tuples(self):
         variants = {}
         for cluster_name, cluster in self.clusters.items():
             cluster_vars = cluster.non_synon_variants()
@@ -49,25 +53,9 @@ def _non_synon_variants(self):
         return variants
 
 
-    def _variant_column_names_tuples(self):
-        # assumes this has been run:
-        # self.column_summary_data = self._column_summary_data()
-        # self.variants = self._non_synon_variants()
-        columns = {}
-        for cluster_name, variants in self.variants.items():
-           ref_name = self.column_summary_data[cluster_name]['ref_seq']
-           columns[cluster_name] = set()
-           for var in variants:
-               if self.column_summary_data[cluster_name]['known_var'] == 'yes':
-                   columns[cluster_name].add((ref_name, var, 'known'))
-               else:
-                   columns[cluster_name].add((ref_name, var, 'unknown'))
-        return columns
-
-
     def run(self):
         self.clusters = self._load_file(self.report_tsv, self.min_pc_id)
         self.column_summary_data = self._column_summary_data()
-        self.variants = self._non_synon_variants()
         self.variant_column_names_tuples = self._variant_column_names_tuples()
+        self.var_groups = self._var_groups()
 
diff --git a/ariba/tasks/aln2meta.py b/ariba/tasks/aln2meta.py
new file mode 100644
index 00000000..f0b655f1
--- /dev/null
+++ b/ariba/tasks/aln2meta.py
@@ -0,0 +1,28 @@
+import argparse
+from ariba import aln_to_metadata
+
+
+def run():
+    '''Entry point of "ariba aln2meta": parses the command line, then runs aln_to_metadata.AlnToMetadata'''
+    coding_choices = ['coding', 'noncoding']
+    parser = argparse.ArgumentParser(
+        description = 'Converts multi-alignment fasta and SNP info to metadata',
+        usage = 'ariba aln2meta [options] <aln_fasta> <variants_tsv> <(non)coding> <cluster_rep> <outprefix>'
+    )
+
+    parser.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
+    parser.add_argument('aln_fasta', help='Multi-fasta file of alignments')
+    parser.add_argument('variants_tsv', help='TSV file of variants information')
+    parser.add_argument('coding_or_non', help='Sequences are coding or noncoding. Must be one of: ' + ' '.join(coding_choices), choices=coding_choices, metavar='(non)coding')
+    parser.add_argument('cluster_rep', help='Name of sequence to be used as cluster representative. Must exactly match a sequence in aln_fasta file')
+    parser.add_argument('outprefix', help='Prefix of output filenames')
+    args = parser.parse_args()
+
+    aln_to_metadata.AlnToMetadata(
+      args.aln_fasta,
+      args.variants_tsv,
+      args.coding_or_non == 'coding',
+      args.cluster_rep,
+      genetic_code=args.genetic_code
+    ).run(args.outprefix)
+
diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py
index 0083be96..d4a22f43 100644
--- a/ariba/tasks/summary.py
+++ b/ariba/tasks/summary.py
@@ -9,43 +9,64 @@ def use_preset(options):
     preset_to_vals = {
         'minimal': {
             'cluster_cols': 'has_res',
+            'variant_cols': '',
             'col_filter': 'y',
             'row_filter': 'y',
+            'var_groups': 'n',
             'known_vars': 'n',
             'novel_vars': 'n'
         },
         'cluster_small': {
             'cluster_cols': 'assembled,has_res,ref_seq,known_var',
+            'variant_cols': '',
             'col_filter': 'y',
             'row_filter': 'y',
+            'var_groups': 'n',
             'known_vars': 'n',
             'novel_vars': 'n'
         },
         'cluster_all': {
             'cluster_cols': 'assembled,has_res,ref_seq,pct_id,known_var,novel_var',
+            'variant_cols': '',
             'col_filter': 'y',
             'row_filter': 'y',
+            'var_groups': 'n',
+            'known_vars': 'n',
+            'novel_vars': 'n'
+        },
+        'cluster_var_groups': {
+            'cluster_cols': 'assembled,has_res,ref_seq,pct_id,known_var,novel_var',
+            'variant_cols': 'groups',
+            'col_filter': 'y',
+            'row_filter': 'y',
+            'var_groups': 'y',
             'known_vars': 'n',
             'novel_vars': 'n'
         },
         'cluster_known_vars': {
             'cluster_cols': 'assembled,has_res,ref_seq,pct_id,known_var,novel_var',
+            'variant_cols': 'groups,grouped,ungrouped',
             'col_filter': 'y',
             'row_filter': 'y',
+            'var_groups': 'y',
             'known_vars': 'y',
             'novel_vars': 'n'
         },
         'all': {
             'cluster_cols': 'assembled,has_res,ref_seq,pct_id,known_var,novel_var',
+            'variant_cols': 'groups,grouped,ungrouped,novel',
             'col_filter': 'y',
             'row_filter': 'y',
+            'var_groups': 'y',
             'known_vars': 'y',
             'novel_vars': 'y'
         },
         'all_no_filter': {
             'cluster_cols': 'assembled,has_res,ref_seq,pct_id,known_var,novel_var',
+            'variant_cols': 'groups,grouped,ungrouped,novel',
             'col_filter': 'n',
             'row_filter': 'n',
+            'var_groups': 'y',
             'known_vars': 'y',
             'novel_vars': 'y'
         },
@@ -60,7 +81,7 @@ def use_preset(options):
 
 
 def run():
-    presets = ['minimal', 'cluster_small', 'cluster_all', 'cluster_known_vars', 'all', 'all_no_filter']
+    presets = ['minimal', 'cluster_small', 'cluster_all', 'cluster_var_groups', 'cluster_known_vars', 'all', 'all_no_filter']
 
     parser = argparse.ArgumentParser(
         description = 'Make a summary of ARIBA report files, and Phandango files',
@@ -71,8 +92,7 @@ def run():
     parser.add_argument('--cluster_cols', help='Comma separated list of cluster columns to include. Choose from: assembled, has_res, ref_seq, pct_id, known_var, novel_var [%(default)s]', default='has_res', metavar='col1,col2,...')
     parser.add_argument('--col_filter', choices=['y', 'n'], default='y', help='Choose whether columns where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n')
     parser.add_argument('--row_filter', choices=['y', 'n'], default='y', help='Choose whether rows where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n')
-    parser.add_argument('--known_vars', choices=['y', 'n'], default='n', help='Output a column for every known variant [%(default)s]', metavar='y|n')
-    parser.add_argument('--novel_vars', choices=['y', 'n'], default='n', help='Output a column for every novel variant [%(default)s]', metavar='y|n')
+    parser.add_argument('--var_cols', help='Comma separated list of variant columns to include. Choose from: groups, grouped, ungrouped, novel [none by default]', metavar='col1,col2,...', default='')
     parser.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT')
     parser.add_argument('--verbose', action='store_true', help='Be verbose')
     parser.add_argument('outprefix', help='Prefix of output files')
@@ -87,12 +107,11 @@ def run():
         options.outprefix,
         fofn=options.fofn,
         filenames=options.infiles,
-        include_all_known_variant_columns=options.known_vars == 'y',
-        include_all_novel_variant_columns=options.novel_vars == 'y',
         filter_rows=options.col_filter == 'y',
         filter_columns=options.row_filter == 'y',
         min_id=options.min_id,
         cluster_cols=options.cluster_cols,
+        variant_cols=options.var_cols,
         verbose=options.verbose
     )
     s.run()
diff --git a/ariba/test_run_data/metadata.tsv b/ariba/test_run_data/metadata.tsv
index f9c3fb7e..04b61078 100644
--- a/ariba/test_run_data/metadata.tsv
+++ b/ariba/test_run_data/metadata.tsv
@@ -1,14 +1,14 @@
-presence_absence1	.	.	Generic description of presence_absence1
-presence_absence1	p	R3S	Ref and assembly have wild type, so do not report
-presence_absence1	p	A10V	Ref has wild, reads have variant so report
-presence_absence1	p	I5A	Ref and reads have variant so report
-variants_only1	.	.	Generic description of variants_only1
-variants_only1	p	I3L	Ref and assembly have wild type, so do not report
-variants_only1	p	S5T	Ref and reads have variant so report
-variants_only2	p	R3I	Ref and reads have wild so do not report
-variants_only2	.	.	Generic description of variants_only2
-noncoding1	.	.	generic description of noncoding1
-noncoding1	n	A6G	variant in ref and reads so should report
-noncoding1	n	G9T	wild type in ref and reads so should not report
-noncoding1	n	A14T	ref has wild type, reads have variant so should report
-noncoding1	n	A40C	ref has variant, reads have wild type so should not report
+presence_absence1	.	.	.	Generic description of presence_absence1
+presence_absence1	p	R3S	.	Ref and assembly have wild type, so do not report
+presence_absence1	p	A10V	.	Ref has wild, reads have variant so report
+presence_absence1	p	I5A	.	Ref and reads have variant so report
+variants_only1	.	.	.	Generic description of variants_only1
+variants_only1	p	I3L	.	Ref and assembly have wild type, so do not report
+variants_only1	p	S5T	.	Ref and reads have variant so report
+variants_only2	p	R3I	.	Ref and reads have wild so do not report
+variants_only2	.	.	.	Generic description of variants_only2
+noncoding1	.	.	.	generic description of noncoding1
+noncoding1	n	A6G	.	variant in ref and reads so should report
+noncoding1	n	G9T	.	wild type in ref and reads so should not report
+noncoding1	n	A14T	noncoding_group1	ref has wild type, reads have variant so should report
+noncoding1	n	A40C	.	ref has variant, reads have wild type so should not report
diff --git a/ariba/tests/aln_to_metadata_test.py b/ariba/tests/aln_to_metadata_test.py
new file mode 100644
index 00000000..7f55dd3f
--- /dev/null
+++ b/ariba/tests/aln_to_metadata_test.py
@@ -0,0 +1,411 @@
+import unittest
+import os
+import copy
+import shutil
+import filecmp
+import pyfastaq
+from ariba import aln_to_metadata, sequence_variant
+
+modules_dir = os.path.dirname(os.path.abspath(aln_to_metadata.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestAlnToMetadata(unittest.TestCase):
+    def test_load_aln_file(self):
+        '''test _load_aln_file'''
+        aln_file = os.path.join(data_dir, 'aln_to_metadata_load_aln_file.in.fa')
+        expected = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'ABC-DE'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'ABCQDE'),
+        }
+        got = aln_to_metadata.AlnToMetadata._load_aln_file(aln_file)
+        self.assertEqual(expected, got)
+
+
+    def test_load_vars_file_good_file(self):
+        '''test _load_vars_file good input file'''
+        infile = os.path.join(data_dir, 'aln_to_metadata_load_vars_file_good.tsv')
+        variant1 = sequence_variant.Variant('p', 'A42B', 'id1')
+        variant2 = sequence_variant.Variant('p', 'C43D', 'id2')
+        variant3 = sequence_variant.Variant('p', 'E100F', 'id3')
+        expected = {
+            'seq1': [(variant1, 'description 1')],
+            'seq2': [(variant2, 'description 2'), (variant3, 'description 3')]
+        }
+        got = aln_to_metadata.AlnToMetadata._load_vars_file(infile, True)
+        self.assertEqual(expected, got)
+
+
+    def test_load_vars_bad_files(self):
+        '''test _load_vars_file bad input files'''
+        infiles = [
+            os.path.join(data_dir, 'aln_to_metadata_load_vars_file_bad.1.tsv'),
+            os.path.join(data_dir, 'aln_to_metadata_load_vars_file_bad.2.tsv')
+        ]
+
+        for infile in infiles:
+            with self.assertRaises(aln_to_metadata.Error):
+                aln_to_metadata.AlnToMetadata._load_vars_file(infile, True)
+
+
+    def test_make_unpadded_seqs(self):
+        '''test _make_unpadded_seqs'''
+        padded = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'acg---t'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', '---a-cgt-'),
+        }
+        expected = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'acgt'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'acgt'),
+        }
+        got = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded)
+        self.assertEqual(expected, got)
+
+
+    def test_check_seq_lengths_same(self):
+        '''test _check_seq_lengths_same'''
+        seqs = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'acgt'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'acgt'),
+        }
+
+        self.assertTrue(aln_to_metadata.AlnToMetadata._check_seq_lengths_same(seqs))
+        seqs['seq1'].seq = 'a'
+        with self.assertRaises(aln_to_metadata.Error):
+            aln_to_metadata.AlnToMetadata._check_seq_lengths_same(seqs)
+
+
+    def test_insertion_coords(self):
+        '''test _insertion_coords'''
+        ivl = pyfastaq.intervals.Interval
+        tests = [
+            ('acgt', []),
+            ('-a', [pyfastaq.intervals.Interval(0, 0)]),
+            ('a---cgt--', [pyfastaq.intervals.Interval(1, 3), pyfastaq.intervals.Interval(7, 8)]),
+        ]
+
+        for seq, expected in tests:
+            fa = pyfastaq.sequences.Fasta('x', seq)
+            got = aln_to_metadata.AlnToMetadata._insertion_coords(fa)
+            self.assertEqual(expected, got)
+
+
+    def test_make_unpadded_insertion_coords(self):
+        '''test _make_unpadded_insertion_coords'''
+        seqs = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'acgt'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'ac-gt'),
+            'seq3': pyfastaq.sequences.Fasta('seq3', '--acg-t'),
+        }
+
+        expected = {
+            'seq1': [],
+            'seq2': [pyfastaq.intervals.Interval(2, 2)],
+            'seq3': [pyfastaq.intervals.Interval(0, 1), pyfastaq.intervals.Interval(5, 5)],
+
+        }
+        got = aln_to_metadata.AlnToMetadata._make_unpadded_insertion_coords(seqs)
+        self.assertEqual(expected, got)
+
+
+    def test_check_insertion_coords(self):
+        '''test _check_insertion_coords'''
+        seq = pyfastaq.sequences.Fasta('name', 'AAA---GGG------TTT---')
+        self.assertTrue(aln_to_metadata.AlnToMetadata._check_insertion_coords(seq))
+
+        bad_seqs = [
+            pyfastaq.sequences.Fasta('name', 'AAA--GGG'),  # bad length
+            pyfastaq.sequences.Fasta('name', 'A---AA'),  # bad start position
+            pyfastaq.sequences.Fasta('name', 'AA---AA'), # bad start position
+        ]
+
+        for seq in bad_seqs:
+            with self.assertRaises(aln_to_metadata.Error):
+                aln_to_metadata.AlnToMetadata._check_insertion_coords(seq)
+
+
+    def test_check_coding_seq(self):
+        '''test _check_coding_seq'''
+        seq = pyfastaq.sequences.Fasta('name', 'ATGCTTTAG')
+        self.assertTrue(aln_to_metadata.AlnToMetadata._check_coding_seq(seq))
+
+        bad_seqs = [
+            pyfastaq.sequences.Fasta('name', 'TTGCTTAG'), # length not a multiple of 3
+            pyfastaq.sequences.Fasta('name', 'TTTCTTTAG'), # no start codon
+            pyfastaq.sequences.Fasta('name', 'ATGTAGCTTTAG'), # stop codon in middle
+            pyfastaq.sequences.Fasta('name', 'TTGCTTTTT'), # no stop at end
+        ]
+
+        for seq in bad_seqs:
+            with self.assertRaises(aln_to_metadata.Error):
+                aln_to_metadata.AlnToMetadata._check_coding_seq(seq)
+
+
+    def test_check_sequences_non_coding(self):
+        '''test _check_sequences with noncoding seqs'''
+        padded_sequences = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'AC-T')
+        }
+
+        unpadded_sequences = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_sequences)
+        self.assertTrue(aln_to_metadata.AlnToMetadata._check_sequences(padded_sequences, unpadded_sequences, False))
+        padded_sequences['seq2'] = pyfastaq.sequences.Fasta('seq2', 'AC-')
+        unpadded_sequences = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_sequences)
+        with self.assertRaises(aln_to_metadata.Error):
+            aln_to_metadata.AlnToMetadata._check_sequences(padded_sequences, unpadded_sequences, False)
+
+
+    def test_check_sequences_coding(self):
+        '''test _check_sequences with coding seqs'''
+        padded_sequences = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'ATGCTTTAG'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'ATG---TAG')
+        }
+
+        unpadded_sequences = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_sequences)
+
+        self.assertTrue(aln_to_metadata.AlnToMetadata._check_sequences(padded_sequences, unpadded_sequences, True))
+
+        bad_seqs = [
+            'ATGCTTAG', # length not a multiple of 3
+            'TTTCTTTAG', # no start codon
+            'ATGTAGCTTTAG', # stop codon in middle
+            'ATGTTTTTT', # no stop at end
+            'ATGC---TTTAG', # bad insertion
+            'ATGCT---TTAG', # bad insertion
+            'ATG-CTTTAG', # bad insertion
+            'ATG--CTTTAG', # bad insertion
+            'ATG----CTTTAG', # bad insertion
+        ]
+
+        for seq in bad_seqs:
+            padded_sequences['seq2'] = pyfastaq.sequences.Fasta('seq2', seq)
+            unpadded_sequences = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_sequences)
+            with self.assertRaises(aln_to_metadata.Error):
+                aln_to_metadata.AlnToMetadata._check_sequences(padded_sequences, unpadded_sequences, True)
+
+
+    def test_check_variants_match_sequences(self):
+        '''test _check_variants_match_sequences'''
+        seqs = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'ATGCTTTAG'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'ATGCTTCTTTAG'),
+            'seq3': pyfastaq.sequences.Fasta('seq3', 'ATG---TAG')
+        }
+
+        variants = {'seq1': [(sequence_variant.Variant('p', 'L2M', 'id1'), 'description1')]}
+        self.assertTrue(aln_to_metadata.AlnToMetadata._check_variants_match_sequences(seqs, variants, True))
+        variants = {'seq1': [(sequence_variant.Variant('p', 'M2L', 'id1'), 'description1')]}
+        self.assertTrue(aln_to_metadata.AlnToMetadata._check_variants_match_sequences(seqs, variants, True))
+
+        variants = {'seq1': [(sequence_variant.Variant('p', 'A2M', 'id1'), 'description1')]}
+        with self.assertRaises(aln_to_metadata.Error):
+            self.assertTrue(aln_to_metadata.AlnToMetadata._check_variants_match_sequences(seqs, variants, True))
+
+        variants = {'seq4': [(sequence_variant.Variant('p', 'A2M', 'id1'), 'description1')]}
+        with self.assertRaises(aln_to_metadata.Error):
+            self.assertTrue(aln_to_metadata.AlnToMetadata._check_variants_match_sequences(seqs, variants, True))
+
+
+    def test_variant_ids_are_unique(self):
+        '''test _variant_ids_are_unique'''
+        variants = {
+            'seq1': [(sequence_variant.Variant('p', 'L2M', 'id1'), 'description1')],
+            'seq2': [(sequence_variant.Variant('p', 'L2M', 'id2'), 'description2')]
+        }
+
+        self.assertTrue(aln_to_metadata.AlnToMetadata._variant_ids_are_unique(variants))
+        variants['seq2'].append((sequence_variant.Variant('p', 'I3K', 'id1'), 'description3'))
+        with self.assertRaises(aln_to_metadata.Error):
+            self.assertTrue(aln_to_metadata.AlnToMetadata._variant_ids_are_unique(variants))
+
+
+    def test_unpadded_to_padded_nt_position(self):
+        '''test _unpadded_to_padded_nt_position'''
+        ivl = pyfastaq.intervals.Interval
+
+        tests = [
+            (0, [], 0),
+            (1, [], 1),
+            (2, [], 2),
+            (0, [ivl(3, 5)], 0),
+            (1, [ivl(3, 5)], 1),
+            (2, [ivl(3, 5)], 2),
+            (3, [ivl(3, 5)], 6),
+            (4, [ivl(3, 5)], 7),
+            (5, [ivl(3, 5)], 8),
+            (0, [ivl(3, 5), ivl(9,14)], 0),
+            (1, [ivl(3, 5), ivl(9,14)], 1),
+            (2, [ivl(3, 5), ivl(9,14)], 2),
+            (3, [ivl(3, 5), ivl(9,14)], 6),
+            (4, [ivl(3, 5), ivl(9,14)], 7),
+            (5, [ivl(3, 5), ivl(9,14)], 8),
+            (6, [ivl(3, 5), ivl(9,14)], 15),
+            (7, [ivl(3, 5), ivl(9,14)], 16),
+            (8, [ivl(3, 5), ivl(9,14)], 17),
+        ]
+
+        for position, insertions, expected in tests:
+            got = aln_to_metadata.AlnToMetadata._unpadded_to_padded_nt_position(position, insertions)
+            self.assertEqual(expected, got)
+
+
+    def test_padded_to_unpadded_nt_position(self):
+        '''test _padded_to_unpadded_nt_position'''
+        ivl = pyfastaq.intervals.Interval
+
+        tests = [
+            (0, [], 0),
+            (1, [], 1),
+            (2, [], 2),
+            (0, [ivl(3, 5)], 0),
+            (1, [ivl(3, 5)], 1),
+            (2, [ivl(3, 5)], 2),
+            (3, [ivl(3, 5)], None),
+            (4, [ivl(3, 5)], None),
+            (5, [ivl(3, 5)], None),
+            (6, [ivl(3, 5)], 3),
+            (7, [ivl(3, 5)], 4),
+            (8, [ivl(3, 5)], 5),
+            (0, [ivl(3, 5), ivl(7,10)], 0),
+            (1, [ivl(3, 5), ivl(7,10)], 1),
+            (2, [ivl(3, 5), ivl(7,10)], 2),
+            (3, [ivl(3, 5), ivl(7,10)], None),
+            (4, [ivl(3, 5), ivl(7,10)], None),
+            (5, [ivl(3, 5), ivl(7,10)], None),
+            (6, [ivl(3, 5), ivl(7,10)], 3),
+            (7, [ivl(3, 5), ivl(7,10)], None),
+            (8, [ivl(3, 5), ivl(7,10)], None),
+            (9, [ivl(3, 5), ivl(7,10)], None),
+            (10, [ivl(3, 5), ivl(7,10)], None),
+            (11, [ivl(3, 5), ivl(7,10)], 4),
+            (12, [ivl(3, 5), ivl(7,10)], 5),
+        ]
+
+        for position, insertions, expected in tests:
+            got = aln_to_metadata.AlnToMetadata._padded_to_unpadded_nt_position(position, insertions)
+            self.assertEqual(expected, got)
+
+
+    def test_variants_to_tsv_lines_coding(self):
+        '''test _variants_to_tsv_lines coding sequences'''
+        padded_seqs = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'ATG---GCTAATTAG'), # M-AN*
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'ATG---GCTAATTAG'), # M-AN*
+            'seq3': pyfastaq.sequences.Fasta('seq3', 'ATGTTT---AATTAG'), # MF-N*
+            'seq4': pyfastaq.sequences.Fasta('seq4', 'ATGTTTTGTAATTAG'), # MFCN*
+            'seq5': pyfastaq.sequences.Fasta('seq5', 'ATGTTTGATAATTAG'), # MFDN*
+        }
+
+        unpadded_seqs = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_seqs)
+        insertions = aln_to_metadata.AlnToMetadata._make_unpadded_insertion_coords(padded_seqs)
+
+        variant1 = sequence_variant.Variant('p', 'A2D', 'id1')
+        variant2 = sequence_variant.Variant('p', 'F2E', 'id2')
+        variants = {
+            'seq1': [(variant1, 'description 1')],
+            'seq5': [(variant2, 'description 2')],
+        }
+
+        expected = [
+            'seq1\tp\tA2D\tid1\tdescription 1',
+            'seq2\tp\tA2D\tid1\tdescription 1',
+            'seq4\tp\tC3D\tid1\tdescription 1',
+            'seq5\tp\tA3D\tid1\tdescription 1',
+            'seq5\tp\tF2E\tid2\tdescription 2',
+            'seq3\tp\tF2E\tid2\tdescription 2',
+            'seq4\tp\tF2E\tid2\tdescription 2',
+        ]
+
+        got = aln_to_metadata.AlnToMetadata._variants_to_tsv_lines(variants, unpadded_seqs, padded_seqs, insertions, True)
+        self.assertEqual(expected, got)
+
+
+    def test_variants_to_tsv_lines_noncoding(self):
+        '''test _variants_to_tsv_lines noncoding sequences'''
+        padded_seqs = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'ATG---GCTAATTAG'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'ATG---GCTAATTAG'),
+            'seq3': pyfastaq.sequences.Fasta('seq3', 'ATGTAT---AATTAG'),
+            'seq4': pyfastaq.sequences.Fasta('seq4', 'ATGTGTTGTAATTAG'),
+            'seq5': pyfastaq.sequences.Fasta('seq5', 'ATGTTTGATAATTAG'),
+        }
+
+        unpadded_seqs = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_seqs)
+        unpadded_aa_seqs = {x: unpadded_seqs[x].translate() for x in unpadded_seqs}
+        insertions = aln_to_metadata.AlnToMetadata._make_unpadded_insertion_coords(padded_seqs)
+
+        variant1 = sequence_variant.Variant('n', 'C5T', 'id1')
+        variant2 = sequence_variant.Variant('n', 'A5T', 'id2')
+        variants = {
+            'seq1': [(variant1, 'description 1')],
+            'seq5': [(variant2, 'description 2')],
+        }
+
+        expected = [
+            'seq1\tn\tC5T\tid1\tdescription 1',
+            'seq2\tn\tC5T\tid1\tdescription 1',
+            'seq4\tn\tG8T\tid1\tdescription 1',
+            'seq5\tn\tA8T\tid1\tdescription 1',
+            'seq5\tn\tA5T\tid2\tdescription 2',
+            'seq3\tn\tA5T\tid2\tdescription 2',
+            'seq4\tn\tG5T\tid2\tdescription 2',
+        ]
+
+        got = aln_to_metadata.AlnToMetadata._variants_to_tsv_lines(variants, unpadded_seqs, padded_seqs, insertions, False)
+        self.assertEqual(expected, got)
+
+
+    def test_make_cluster_file(self):
+        '''test _make_cluster_file'''
+        seqs = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'a'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'c'),
+            'seq3': pyfastaq.sequences.Fasta('seq3', 'g'),
+        }
+        tmpfile = 'tmp.aln_to_meta_test_make_cluster_file.out'
+        expected_file = os.path.join(data_dir, 'aln_to_metadata_make_cluster_file.out')
+
+        with self.assertRaises(aln_to_metadata.Error):
+            aln_to_metadata.AlnToMetadata._make_cluster_file('not_found', seqs, tmpfile)
+
+        aln_to_metadata.AlnToMetadata._make_cluster_file('seq2', seqs, tmpfile)
+        self.assertTrue(filecmp.cmp(expected_file, tmpfile, shallow=False))
+        os.unlink(tmpfile)
+
+
+    def test_run_coding(self):
+        '''test run coding sequences'''
+        fa_in = os.path.join(data_dir, 'aln_to_metadata_run_coding.in.fa')
+        fa_expected = os.path.join(data_dir, 'aln_to_metadata_run_coding.out.fa')
+        tsv_in = os.path.join(data_dir, 'aln_to_metadata_run_coding.in.tsv')
+        tsv_expected = os.path.join(data_dir, 'aln_to_metadata_run_coding.out.tsv')
+        cluster_expected = os.path.join(data_dir, 'aln_to_metadata_run_coding.out.cluster')
+        a_to_m = aln_to_metadata.AlnToMetadata(fa_in, tsv_in, True, 'seq3')
+        outprefix = 'tmp.test.aln_to_metadata.run_coding'
+        a_to_m.run(outprefix)
+        self.assertTrue(filecmp.cmp(tsv_expected, outprefix + '.tsv', shallow=False))
+        self.assertTrue(filecmp.cmp(fa_expected, outprefix + '.fa', shallow=False))
+        self.assertTrue(filecmp.cmp(cluster_expected, outprefix + '.cluster', shallow=False))
+        os.unlink(outprefix + '.tsv')
+        os.unlink(outprefix + '.fa')
+        os.unlink(outprefix + '.cluster')
+
+
+    def test_run_noncoding(self):
+        '''test run noncoding sequences'''
+        fa_in = os.path.join(data_dir, 'aln_to_metadata_run_noncoding.in.fa')
+        fa_expected = os.path.join(data_dir, 'aln_to_metadata_run_noncoding.out.fa')
+        tsv_in = os.path.join(data_dir, 'aln_to_metadata_run_noncoding.in.tsv')
+        tsv_expected = os.path.join(data_dir, 'aln_to_metadata_run_noncoding.out.tsv')
+        cluster_expected = os.path.join(data_dir, 'aln_to_metadata_run_noncoding.out.cluster')
+        a_to_m = aln_to_metadata.AlnToMetadata(fa_in, tsv_in, False, 'seq2')
+        outprefix = 'tmp.test.aln_to_metadata.run_noncoding'
+        a_to_m.run(outprefix)
+        self.assertTrue(filecmp.cmp(tsv_expected, outprefix + '.tsv', shallow=False))
+        self.assertTrue(filecmp.cmp(fa_expected, outprefix + '.fa', shallow=False))
+        self.assertTrue(filecmp.cmp(cluster_expected, outprefix + '.cluster', shallow=False))
+        os.unlink(outprefix + '.tsv')
+        os.unlink(outprefix + '.fa')
+        os.unlink(outprefix + '.cluster')
+
diff --git a/ariba/tests/assembly_variants_test.py b/ariba/tests/assembly_variants_test.py
index c8dea8bb..6061b292 100644
--- a/ariba/tests/assembly_variants_test.py
+++ b/ariba/tests/assembly_variants_test.py
@@ -120,8 +120,8 @@ def test_get_one_variant_for_one_contig_non_coding(self):
         # ref has T at position 5, which is wild type. This gives contig variant type A. Should report
         v2 = pymummer.variant.Variant(pymummer.snp.Snp('5\tT\tA\t5\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig'))
 
-        meta0 = sequence_metadata.SequenceMetadata('non_coding\tn\tC3A\tref has variant type A')
-        meta2 = sequence_metadata.SequenceMetadata('non_coding\tn\tT5A\tref has wild type T')
+        meta0 = sequence_metadata.SequenceMetadata('non_coding\tn\tC3A\tid1\tref has variant type A')
+        meta2 = sequence_metadata.SequenceMetadata('non_coding\tn\tT5A\tid1\tref has wild type T')
 
         mummer_variants = [v0, v1, v2]
 
@@ -188,8 +188,8 @@ def test_get_one_variant_for_one_contig_coding(self):
 
         mummer_variants = [[v0], [v1], [v2], [v3], [v4], [v5], [v6], [v7], [v8], [v9], [v10]]
 
-        meta0 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD2E\tref has wild type D (GAT=D, GAA=E)')
-        meta4 = sequence_metadata.SequenceMetadata('presence_absence\tp\tS3R\tref has variant type R (AGA=R, AGT=S)')
+        meta0 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD2E\tid1\tref has wild type D (GAT=D, GAA=E)')
+        meta4 = sequence_metadata.SequenceMetadata('presence_absence\tp\tS3R\tid1\tref has variant type R (AGA=R, AGT=S)')
 
         expected_tuples = [
             (1, 'p', 'D2E', 'NONSYN', [v0], {meta0}, set()),    #0
@@ -230,13 +230,13 @@ def test_get_one_variant_for_one_contig_coding(self):
 
     def test_get_remaining_known_ref_variants_amino_acids(self):
         '''test _get_remaining_known_ref_variants with amino acids'''
-        ref_var1 = sequence_metadata.SequenceMetadata('gene1\tp\tD2E\tfoo bar')
-        ref_var2 = sequence_metadata.SequenceMetadata('gene1\tp\tD3E\tfoo bar baz')
-        ref_var3 = sequence_metadata.SequenceMetadata('gene1\tp\tD3I\tfoo bar baz spam')
-        ref_var4 = sequence_metadata.SequenceMetadata('gene1\tp\tD10E\tfoo bar baz spam egg')
-        ref_var5 = sequence_metadata.SequenceMetadata('gene1\tp\tD14E\tfoo bar baz spam egg chips')
-        ref_var6 = sequence_metadata.SequenceMetadata('gene1\tp\tD15E\tfoo bar baz spam egg chips')
-        ref_var7 = sequence_metadata.SequenceMetadata('gene1\tp\tD40E\tfoo bar baz spam egg chips')
+        ref_var1 = sequence_metadata.SequenceMetadata('gene1\tp\tD2E\tid1\tfoo bar')
+        ref_var2 = sequence_metadata.SequenceMetadata('gene1\tp\tD3E\tid1\tfoo bar baz')
+        ref_var3 = sequence_metadata.SequenceMetadata('gene1\tp\tD3I\tid1\tfoo bar baz spam')
+        ref_var4 = sequence_metadata.SequenceMetadata('gene1\tp\tD10E\tid1\tfoo bar baz spam egg')
+        ref_var5 = sequence_metadata.SequenceMetadata('gene1\tp\tD14E\tid1\tfoo bar baz spam egg chips')
+        ref_var6 = sequence_metadata.SequenceMetadata('gene1\tp\tD15E\tid1\tfoo bar baz spam egg chips')
+        ref_var7 = sequence_metadata.SequenceMetadata('gene1\tp\tD40E\tid1\tfoo bar baz spam egg chips')
 
         known_ref_variants = {
             1: {ref_var1},
@@ -261,13 +261,13 @@ def test_get_remaining_known_ref_variants_amino_acids(self):
 
     def test_get_remaining_known_ref_variants_nucleotides(self):
         '''test _get_remaining_known_ref_variants with nucleotides'''
-        ref_var1 = sequence_metadata.SequenceMetadata('gene1\tn\tA2C\tfoo bar')
-        ref_var2 = sequence_metadata.SequenceMetadata('gene1\tn\tA3C\tfoo bar baz')
-        ref_var3 = sequence_metadata.SequenceMetadata('gene1\tn\tA3T\tfoo bar baz spam')
-        ref_var4 = sequence_metadata.SequenceMetadata('gene1\tn\tA10C\tfoo bar baz spam egg')
-        ref_var5 = sequence_metadata.SequenceMetadata('gene1\tn\tA14C\tfoo bar baz spam egg chips')
-        ref_var6 = sequence_metadata.SequenceMetadata('gene1\tn\tA15C\tfoo bar baz spam egg chips')
-        ref_var7 = sequence_metadata.SequenceMetadata('gene1\tn\tA40C\tfoo bar baz spam egg chips')
+        ref_var1 = sequence_metadata.SequenceMetadata('gene1\tn\tA2C\tid1\tfoo bar')
+        ref_var2 = sequence_metadata.SequenceMetadata('gene1\tn\tA3C\tid1\tfoo bar baz')
+        ref_var3 = sequence_metadata.SequenceMetadata('gene1\tn\tA3T\tid1\tfoo bar baz spam')
+        ref_var4 = sequence_metadata.SequenceMetadata('gene1\tn\tA10C\tid1\tfoo bar baz spam egg')
+        ref_var5 = sequence_metadata.SequenceMetadata('gene1\tn\tA14C\tid1\tfoo bar baz spam egg chips')
+        ref_var6 = sequence_metadata.SequenceMetadata('gene1\tn\tA15C\tid1\tfoo bar baz spam egg chips')
+        ref_var7 = sequence_metadata.SequenceMetadata('gene1\tn\tA40C\tid1\tfoo bar baz spam egg chips')
 
         known_ref_variants = {
             1: {ref_var1},
@@ -292,11 +292,11 @@ def test_get_remaining_known_ref_variants_nucleotides(self):
 
     def test_get_variants_presence_absence(self):
         '''test get_variants presence absence genes'''
-        meta1 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD2E\tref has wild type D, contig has var (GAT=D, GAA=E)')
-        meta2 = sequence_metadata.SequenceMetadata('presence_absence\tp\tS3R\tref has variant type R, contig has wild (AGA=R, AGT=S)')
-        meta3 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD4E\tref has variant type E, contig has var (GAA=E, GAC=D)')
-        meta4 = sequence_metadata.SequenceMetadata('presence_absence\tp\tA5D\tref has wild type A, contig has var (GCG=A, GAC=D)')
-        meta5 = sequence_metadata.SequenceMetadata('presence_absence\tp\tR13S\tref and qry have wild type')
+        meta1 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD2E\tid1\tref has wild type D, contig has var (GAT=D, GAA=E)')
+        meta2 = sequence_metadata.SequenceMetadata('presence_absence\tp\tS3R\tid1\tref has variant type R, contig has wild (AGA=R, AGT=S)')
+        meta3 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD4E\tid1\tref has variant type E, contig has var (GAA=E, GAC=D)')
+        meta4 = sequence_metadata.SequenceMetadata('presence_absence\tp\tA5D\tid1\tref has wild type A, contig has var (GCG=A, GAC=D)')
+        meta5 = sequence_metadata.SequenceMetadata('presence_absence\tp\tR13S\tid1\tref and qry have wild type')
 
         metadata_tsv = 'tmp.test_get_variants_presence_absence.metadata.tsv'
         with open(metadata_tsv, 'w') as f:
@@ -343,9 +343,9 @@ def test_get_variants_presence_absence(self):
 
     def test_get_variants_variants_only(self):
         '''test get_variants variants only'''
-        meta1 = sequence_metadata.SequenceMetadata('variants_only\tp\tD2E\tref has wild type D (GAT=D, GAA=E)')
-        meta2 = sequence_metadata.SequenceMetadata('variants_only\tp\tS3R\tref has variant type R (AGA=R, AGT=S)')
-        meta3 = sequence_metadata.SequenceMetadata('variants_only\tp\tD4E\tref has variant type E (GAA=E, GAC=D)')
+        meta1 = sequence_metadata.SequenceMetadata('variants_only\tp\tD2E\tid1\tref has wild type D (GAT=D, GAA=E)')
+        meta2 = sequence_metadata.SequenceMetadata('variants_only\tp\tS3R\tid1\tref has variant type R (AGA=R, AGT=S)')
+        meta3 = sequence_metadata.SequenceMetadata('variants_only\tp\tD4E\tid1\tref has variant type E (GAA=E, GAC=D)')
 
         metadata_tsv = 'tmp.test_get_variants_variants_only.metadata.tsv'
         with open(metadata_tsv, 'w') as f:
diff --git a/ariba/tests/cluster_test.py b/ariba/tests/cluster_test.py
index 7ccdd61d..6e959eb0 100644
--- a/ariba/tests/cluster_test.py
+++ b/ariba/tests/cluster_test.py
@@ -151,12 +151,12 @@ def test_full_run_ok_non_coding(self):
         c.run()
 
         expected = [
-            'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t73\t73\tT\t19\t.\t19\tnoncoding1_n_A14T_ref has wild type, reads has variant so should report\tgeneric description of noncoding1',
+            'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t73\t73\tT\t19\t.\t19\tnoncoding1:n:A14T:.:ref has wild type, reads has variant so should report\tgeneric description of noncoding1',
             'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t0\t.\tn\t.\t0\tG61T\tSNP\t60\t60\tG\t120\t120\tT\t24\t.\t24\t.\tgeneric description of noncoding1',
             'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t0\t.\tn\t.\t0\t.82C\tINS\t81\t81\t.\t142\t142\tC\t23\t.\t23\t.\tgeneric description of noncoding1',
             'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t0\t.\tn\t.\t0\tT108.\tDEL\t107\t107\tT\t167\t167\t.\t17\t.\t17\t.\tgeneric description of noncoding1',
-            'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t1\tSNP\tn\tA6G\t1\t.\t.\t6\t6\tG\t66\t66\tG\t19\t.\t19\tnoncoding1_n_A6G_variant in ref and reads so should report\tgeneric description of noncoding1',
-            'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t1\tSNP\tn\tG9T\t0\t.\t.\t9\t9\tG\t69\t69\tG\t19\t.\t19\tnoncoding1_n_G9T_wild type in ref and reads\tgeneric description of noncoding1'
+            'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t1\tSNP\tn\tA6G\t1\t.\t.\t6\t6\tG\t66\t66\tG\t19\t.\t19\tnoncoding1:n:A6G:.:variant in ref and reads so should report\tgeneric description of noncoding1',
+            'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t1\tSNP\tn\tG9T\t0\t.\t.\t9\t9\tG\t69\t69\tG\t19\t.\t19\tnoncoding1:n:G9T:.:wild type in ref and reads\tgeneric description of noncoding1'
         ]
 
         self.assertEqual(expected, c.report_lines)
@@ -177,12 +177,12 @@ def test_full_run_ok_presence_absence(self):
         c.run()
 
         expected = [
-            'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t1\tSNP\tp\tA10V\t1\tA10V\tNONSYN\t28\t28\tC\t83\t83\tT\t22\t.\t22\tpresence_absence1_p_A10V_Ref has wild, reads have variant so report\tGeneric description of presence_absence1',
+            'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t1\tSNP\tp\tA10V\t1\tA10V\tNONSYN\t28\t28\tC\t83\t83\tT\t22\t.\t22\tpresence_absence1:p:A10V:.:Ref has wild, reads have variant so report\tGeneric description of presence_absence1',
             'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t0\t.\tp\t.\t0\t.\tSYN\t53\t53\tT\t108\t108\tC\t32\t.\t32\t.\tGeneric description of presence_absence1',
 
-            'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t62\t64\tC;G;C\t18;17;17\t.;.;.\t18;17;17\tpresence_absence1_p_R3S_Ref and assembly have wild type\tGeneric description of presence_absence1',
+            'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t62\t64\tC;G;C\t18;17;17\t.;.;.\t18;17;17\tpresence_absence1:p:R3S:.:Ref and assembly have wild type\tGeneric description of presence_absence1',
 
-            'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tG;C;G\t68\t70\tG;C;G\t18;20;20\t.;.;.\t18;20;20\tpresence_absence1_p_I5A_Ref and reads have variant so report\tGeneric description of presence_absence1',
+            'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tG;C;G\t68\t70\tG;C;G\t18;20;20\t.;.;.\t18;20;20\tpresence_absence1:p:I5A:.:Ref and reads have variant so report\tGeneric description of presence_absence1',
         ]
 
         self.assertEqual(expected, c.report_lines)
@@ -202,7 +202,7 @@ def test_full_run_ok_variants_only_variant_not_present(self):
         c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=66, total_reads_bases=3300)
         c.run()
         expected = [
-            'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t65\t67\tC;G;C\t18;18;19\t.;.;.\t18;18;19\tvariants_only1_p_R3S_Ref and assembly have wild type, so do not report\tGeneric description of variants_only1'
+            'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t65\t67\tC;G;C\t18;18;19\t.;.;.\t18;18;19\tvariants_only1:p:R3S:.:Ref and assembly have wild type, so do not report\tGeneric description of variants_only1'
         ]
         self.assertEqual(expected, c.report_lines)
         shutil.rmtree(tmpdir)
@@ -221,7 +221,7 @@ def test_full_run_ok_variants_only_variant_not_present_always_report(self):
         c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=66, total_reads_bases=3300)
         c.run()
         expected = [
-            'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t65\t67\tC;G;C\t18;18;19\t.;.;.\t18;18;19\tvariants_only1_p_R3S_Ref and assembly have wild type, but always report anyway\tGeneric description of variants_only1'
+            'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t65\t67\tC;G;C\t18;18;19\t.;.;.\t18;18;19\tvariants_only1:p:R3S:.:Ref and assembly have wild type, but always report anyway\tGeneric description of variants_only1'
         ]
         self.assertEqual(expected, c.report_lines)
         shutil.rmtree(tmpdir)
@@ -241,8 +241,8 @@ def test_full_run_ok_variants_only_variant_is_present(self):
         c.run()
 
         expected = [
-            'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t65\t67\tC;G;C\t18;18;19\t.;.;.\t18;18;19\tvariants_only1_p_R3S_Ref and assembly have wild type\tGeneric description of variants_only1',
-            'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tG;C;G\t71\t73\tG;C;G\t17;17;17\t.;.;.\t17;17;17\tvariants_only1_p_I5A_Ref and reads have variant so report\tGeneric description of variants_only1',
+            'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t65\t67\tC;G;C\t18;18;19\t.;.;.\t18;18;19\tvariants_only1:p:R3S:.:Ref and assembly have wild type\tGeneric description of variants_only1',
+            'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tG;C;G\t71\t73\tG;C;G\t17;17;17\t.;.;.\t17;17;17\tvariants_only1:p:I5A:.:Ref and reads have variant so report\tGeneric description of variants_only1',
         ]
         self.assertEqual(expected, c.report_lines)
         shutil.rmtree(tmpdir)
diff --git a/ariba/tests/clusters_test.py b/ariba/tests/clusters_test.py
index a263de0e..60ec0c41 100644
--- a/ariba/tests/clusters_test.py
+++ b/ariba/tests/clusters_test.py
@@ -63,14 +63,14 @@ def test_load_reference_data_from_dir(self):
 
         expected_metadata = {
             'presabs1': {
-                '.': {sequence_metadata.SequenceMetadata('presabs1\t.\t.\tpresabs1 description')},
+                '.': {sequence_metadata.SequenceMetadata('presabs1\t.\t.\t.\tpresabs1 description')},
                 'n': {},
                 'p': {}
             },
             'variants_only1': {
                 '.': set(),
                 'n': {},
-                'p': {1: {sequence_metadata.SequenceMetadata('variants_only1\tp\tC2I\tdescription of variants_only1 C2I')}}
+                'p': {1: {sequence_metadata.SequenceMetadata('variants_only1\tp\tC2I\t.\tdescription of variants_only1 C2I')}}
             }
         }
         self.assertEqual(expected_metadata, got_refdata.metadata)
diff --git a/ariba/tests/data/aln_to_metadata_load_aln_file.in.fa b/ariba/tests/data/aln_to_metadata_load_aln_file.in.fa
new file mode 100644
index 00000000..4c7c0fe7
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_load_aln_file.in.fa
@@ -0,0 +1,4 @@
+>seq1
+ABC-DE
+>seq2
+ABCQDE
diff --git a/ariba/tests/data/aln_to_metadata_load_vars_file_bad.1.tsv b/ariba/tests/data/aln_to_metadata_load_vars_file_bad.1.tsv
new file mode 100644
index 00000000..6152c4a8
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_load_vars_file_bad.1.tsv
@@ -0,0 +1,2 @@
+seq1	A42B	id1	description 1
+seq2	C43D	id2
diff --git a/ariba/tests/data/aln_to_metadata_load_vars_file_bad.2.tsv b/ariba/tests/data/aln_to_metadata_load_vars_file_bad.2.tsv
new file mode 100644
index 00000000..da6dd350
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_load_vars_file_bad.2.tsv
@@ -0,0 +1,2 @@
+seq1	A42B	id1	description 1
+seq2	wrong_format	id2	description 2
diff --git a/ariba/tests/data/aln_to_metadata_load_vars_file_good.tsv b/ariba/tests/data/aln_to_metadata_load_vars_file_good.tsv
new file mode 100644
index 00000000..058b1dee
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_load_vars_file_good.tsv
@@ -0,0 +1,3 @@
+seq1	A42B	id1	description 1
+seq2	C43D	id2	description 2
+seq2	E100F	id3	description 3
diff --git a/ariba/tests/data/aln_to_metadata_make_cluster_file.out b/ariba/tests/data/aln_to_metadata_make_cluster_file.out
new file mode 100644
index 00000000..529a3cc6
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_make_cluster_file.out
@@ -0,0 +1 @@
+seq2	seq1	seq3
diff --git a/ariba/tests/data/aln_to_metadata_run_coding.in.fa b/ariba/tests/data/aln_to_metadata_run_coding.in.fa
new file mode 100644
index 00000000..c71f8c11
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_coding.in.fa
@@ -0,0 +1,10 @@
+>seq1
+ATG---GCTAATTAG
+>seq2
+ATG---GCTAATTAG
+>seq3
+ATGTTT---AATTAG
+>seq4
+ATGTTTTGTAATTAG
+>seq5
+ATGTTTGATAATTAG
diff --git a/ariba/tests/data/aln_to_metadata_run_coding.in.tsv b/ariba/tests/data/aln_to_metadata_run_coding.in.tsv
new file mode 100644
index 00000000..552e7a51
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_coding.in.tsv
@@ -0,0 +1,2 @@
+seq1	A2D	id1	description 1
+seq5	F2E	id2	description 2
diff --git a/ariba/tests/data/aln_to_metadata_run_coding.out.cluster b/ariba/tests/data/aln_to_metadata_run_coding.out.cluster
new file mode 100644
index 00000000..6df8ac7b
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_coding.out.cluster
@@ -0,0 +1 @@
+seq3	seq1	seq2	seq4	seq5
diff --git a/ariba/tests/data/aln_to_metadata_run_coding.out.fa b/ariba/tests/data/aln_to_metadata_run_coding.out.fa
new file mode 100644
index 00000000..97d0f121
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_coding.out.fa
@@ -0,0 +1,10 @@
+>seq1
+ATGGCTAATTAG
+>seq2
+ATGGCTAATTAG
+>seq3
+ATGTTTAATTAG
+>seq4
+ATGTTTTGTAATTAG
+>seq5
+ATGTTTGATAATTAG
diff --git a/ariba/tests/data/aln_to_metadata_run_coding.out.tsv b/ariba/tests/data/aln_to_metadata_run_coding.out.tsv
new file mode 100644
index 00000000..ee957fa6
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_coding.out.tsv
@@ -0,0 +1,7 @@
+seq1	p	A2D	id1	description 1
+seq2	p	A2D	id1	description 1
+seq4	p	C3D	id1	description 1
+seq5	p	A3D	id1	description 1
+seq5	p	F2E	id2	description 2
+seq3	p	F2E	id2	description 2
+seq4	p	F2E	id2	description 2
diff --git a/ariba/tests/data/aln_to_metadata_run_noncoding.in.fa b/ariba/tests/data/aln_to_metadata_run_noncoding.in.fa
new file mode 100644
index 00000000..2bc56571
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_noncoding.in.fa
@@ -0,0 +1,10 @@
+>seq1
+ATG---GCTAATTAG
+>seq2
+ATG---GCTAATTAG
+>seq3
+ATGTAT---AATTAG
+>seq4
+ATGTGTTGTAATTAG
+>seq5
+ATGTTTGATAATTAG
diff --git a/ariba/tests/data/aln_to_metadata_run_noncoding.in.tsv b/ariba/tests/data/aln_to_metadata_run_noncoding.in.tsv
new file mode 100644
index 00000000..3d32d779
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_noncoding.in.tsv
@@ -0,0 +1,2 @@
+seq1	C5T	id1	description 1
+seq5	A5T	id2	description 2
diff --git a/ariba/tests/data/aln_to_metadata_run_noncoding.out.cluster b/ariba/tests/data/aln_to_metadata_run_noncoding.out.cluster
new file mode 100644
index 00000000..aee4e5a9
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_noncoding.out.cluster
@@ -0,0 +1 @@
+seq2	seq1	seq3	seq4	seq5
diff --git a/ariba/tests/data/aln_to_metadata_run_noncoding.out.fa b/ariba/tests/data/aln_to_metadata_run_noncoding.out.fa
new file mode 100644
index 00000000..e737be69
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_noncoding.out.fa
@@ -0,0 +1,10 @@
+>seq1
+ATGGCTAATTAG
+>seq2
+ATGGCTAATTAG
+>seq3
+ATGTATAATTAG
+>seq4
+ATGTGTTGTAATTAG
+>seq5
+ATGTTTGATAATTAG
diff --git a/ariba/tests/data/aln_to_metadata_run_noncoding.out.tsv b/ariba/tests/data/aln_to_metadata_run_noncoding.out.tsv
new file mode 100644
index 00000000..7ba82bf3
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_noncoding.out.tsv
@@ -0,0 +1,7 @@
+seq1	n	C5T	id1	description 1
+seq2	n	C5T	id1	description 1
+seq4	n	G8T	id1	description 1
+seq5	n	A8T	id1	description 1
+seq5	n	A5T	id2	description 2
+seq3	n	A5T	id2	description 2
+seq4	n	G5T	id2	description 2
diff --git a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv
index 5d0fd041..f1e3583e 100644
--- a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv
+++ b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv
@@ -1,11 +1,11 @@
-presence_absence	p	D2E	ref has wild type D (GAT=D, GAA=E)
-presence_absence	p	S3R	ref has variant type R (AGA=R, AGT=S)
-presence_absence	p	D4E	ref has variant type E (GAA=E, GAC=D)
-presence_absence	p	A5D	ref has wild type A (GCG=A)
-variants_only	p	D2E	ref has wild type D (GAT=D, GAA=E)
-variants_only	p	S3R	ref has variant type R (AGA=R, AGT=S)
-variants_only	p	D4E	ref has variant type E (GAA=E, GAC=D)
-variants_only	p	A5D	ref has wild type A (GCG=A)
-non_coding	n	C3A	ref has variant type A
-non_coding	n	T5A	ref has wild type T
-non_coding	n	C6G	ref has variant type G
+presence_absence	p	D2E	id1	ref has wild type D (GAT=D, GAA=E)
+presence_absence	p	S3R	id1	ref has variant type R (AGA=R, AGT=S)
+presence_absence	p	D4E	id1	ref has variant type E (GAA=E, GAC=D)
+presence_absence	p	A5D	id1	ref has wild type A (GCG=A)
+variants_only	p	D2E	id1	ref has wild type D (GAT=D, GAA=E)
+variants_only	p	S3R	id1	ref has variant type R (AGA=R, AGT=S)
+variants_only	p	D4E	id1	ref has variant type E (GAA=E, GAC=D)
+variants_only	p	A5D	id1	ref has wild type A (GCG=A)
+non_coding	n	C3A	id1	ref has variant type A
+non_coding	n	T5A	id1	ref has wild type T
+non_coding	n	C6G	id1	ref has variant type G
diff --git a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv
index 5d0fd041..f1e3583e 100644
--- a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv
+++ b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv
@@ -1,11 +1,11 @@
-presence_absence	p	D2E	ref has wild type D (GAT=D, GAA=E)
-presence_absence	p	S3R	ref has variant type R (AGA=R, AGT=S)
-presence_absence	p	D4E	ref has variant type E (GAA=E, GAC=D)
-presence_absence	p	A5D	ref has wild type A (GCG=A)
-variants_only	p	D2E	ref has wild type D (GAT=D, GAA=E)
-variants_only	p	S3R	ref has variant type R (AGA=R, AGT=S)
-variants_only	p	D4E	ref has variant type E (GAA=E, GAC=D)
-variants_only	p	A5D	ref has wild type A (GCG=A)
-non_coding	n	C3A	ref has variant type A
-non_coding	n	T5A	ref has wild type T
-non_coding	n	C6G	ref has variant type G
+presence_absence	p	D2E	id1	ref has wild type D (GAT=D, GAA=E)
+presence_absence	p	S3R	id1	ref has variant type R (AGA=R, AGT=S)
+presence_absence	p	D4E	id1	ref has variant type E (GAA=E, GAC=D)
+presence_absence	p	A5D	id1	ref has wild type A (GCG=A)
+variants_only	p	D2E	id1	ref has wild type D (GAT=D, GAA=E)
+variants_only	p	S3R	id1	ref has variant type R (AGA=R, AGT=S)
+variants_only	p	D4E	id1	ref has variant type E (GAA=E, GAC=D)
+variants_only	p	A5D	id1	ref has wild type A (GCG=A)
+non_coding	n	C3A	id1	ref has variant type A
+non_coding	n	T5A	id1	ref has wild type T
+non_coding	n	C6G	id1	ref has variant type G
diff --git a/ariba/tests/data/cluster_test_full_run_ok_non_coding.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_non_coding.metadata.tsv
index ba79712a..97cfd32f 100644
--- a/ariba/tests/data/cluster_test_full_run_ok_non_coding.metadata.tsv
+++ b/ariba/tests/data/cluster_test_full_run_ok_non_coding.metadata.tsv
@@ -1,5 +1,5 @@
-noncoding1	.	.	generic description of noncoding1
-noncoding1	n	A6G	variant in ref and reads so should report
-noncoding1	n	G9T	wild type in ref and reads
-noncoding1	n	A14T	ref has wild type, reads has variant so should report
-noncoding1	n	A40C	ref has variant, reads has wild type
+noncoding1	.	.	.	generic description of noncoding1
+noncoding1	n	A6G	.	variant in ref and reads so should report
+noncoding1	n	G9T	.	wild type in ref and reads
+noncoding1	n	A14T	.	ref has wild type, reads has variant so should report
+noncoding1	n	A40C	.	ref has variant, reads has wild type
diff --git a/ariba/tests/data/cluster_test_full_run_ok_presence_absence.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_presence_absence.metadata.tsv
index 8adc93cf..bc5a3d97 100644
--- a/ariba/tests/data/cluster_test_full_run_ok_presence_absence.metadata.tsv
+++ b/ariba/tests/data/cluster_test_full_run_ok_presence_absence.metadata.tsv
@@ -1,4 +1,4 @@
-presence_absence1	.	.	Generic description of presence_absence1
-presence_absence1	p	R3S	Ref and assembly have wild type
-presence_absence1	p	A10V	Ref has wild, reads have variant so report
-presence_absence1	p	I5A	Ref and reads have variant so report
+presence_absence1	.	.	.	Generic description of presence_absence1
+presence_absence1	p	R3S	.	Ref and assembly have wild type
+presence_absence1	p	A10V	.	Ref has wild, reads have variant so report
+presence_absence1	p	I5A	.	Ref and reads have variant so report
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv
index b0ee54de..7e193f69 100644
--- a/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv
+++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv
@@ -1,2 +1,2 @@
-variants_only1	.	.	Generic description of variants_only1
-variants_only1	p	R3S	Ref and assembly have wild type, but always report anyway
+variants_only1	.	.	.	Generic description of variants_only1
+variants_only1	p	R3S	.	Ref and assembly have wild type, but always report anyway
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.metadata.tsv
index c314c207..de14a1b3 100644
--- a/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.metadata.tsv
+++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.metadata.tsv
@@ -1,2 +1,2 @@
-variants_only1	.	.	Generic description of variants_only1
-variants_only1	p	R3S	Ref and assembly have wild type, so do not report
+variants_only1	.	.	.	Generic description of variants_only1
+variants_only1	p	R3S	.	Ref and assembly have wild type, so do not report
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only.present.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_variants_only.present.metadata.tsv
index f4b198da..621f2c90 100644
--- a/ariba/tests/data/cluster_test_full_run_ok_variants_only.present.metadata.tsv
+++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only.present.metadata.tsv
@@ -1,3 +1,3 @@
-variants_only1	.	.	Generic description of variants_only1
-variants_only1	p	R3S	Ref and assembly have wild type
-variants_only1	p	I5A	Ref and reads have variant so report
+variants_only1	.	.	.	Generic description of variants_only1
+variants_only1	p	R3S	.	Ref and assembly have wild type
+variants_only1	p	I5A	.	Ref and reads have variant so report
diff --git a/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.tsv b/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.tsv
index 88a5889e..07c89d5c 100644
--- a/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.tsv
+++ b/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.tsv
@@ -1,2 +1,2 @@
-variants_only1	p	C2I	description of variants_only1 C2I
-presabs1	.	.	presabs1 description
+variants_only1	p	C2I	.	description of variants_only1 C2I
+presabs1	.	.	.	presabs1 description
diff --git a/ariba/tests/data/reference_data_filter_bad_data_metadata.expected.tsv b/ariba/tests/data/reference_data_filter_bad_data_metadata.expected.tsv
index 66fae14e..4f90cbf1 100644
--- a/ariba/tests/data/reference_data_filter_bad_data_metadata.expected.tsv
+++ b/ariba/tests/data/reference_data_filter_bad_data_metadata.expected.tsv
@@ -1,8 +1,9 @@
-non_coding_1	.	.	should be in output because this field is here
-non_coding_1	n	C5A	dna variant ok
-presence_absence_1	.	.	should be in output because this field is here
-presence_absence_2	n	T4G	dna variant ok
-presence_absence_3	p	R3S	amino acid variant ok
-variants_only_1	.	.	should be kept as a generic description of variants_only_1
-variants_only_1	p	S2T	amino acid variant ok
-variants_only_1	n	T4A	dna variant ok
+non_coding_1	.	.	.	non_coding_1 description1
+non_coding_1	.	.	.	should be in output because this field is here
+non_coding_1	n	C5A	id1	dna variant ok
+presence_absence_1	.	.	.	should be in output because this field is here
+presence_absence_2	n	T4G	id2	dna variant ok
+presence_absence_3	p	R3S	.	amino acid variant ok
+variants_only_1	.	.	.	should be kept as a generic description of variants_only_1
+variants_only_1	p	S2T	.	amino acid variant ok
+variants_only_1	n	T4A	.	dna variant ok
diff --git a/ariba/tests/data/reference_data_filter_bad_data_metadata.in.tsv b/ariba/tests/data/reference_data_filter_bad_data_metadata.in.tsv
index 4f590de0..031a8d7a 100644
--- a/ariba/tests/data/reference_data_filter_bad_data_metadata.in.tsv
+++ b/ariba/tests/data/reference_data_filter_bad_data_metadata.in.tsv
@@ -1,20 +1,20 @@
-non_coding_1	.	.
-non_coding_1	.	.	should be in output because this field is here
-non_coding_1	p	L2K	should be removed because this is non-coding, but variant is protein
-non_coding_1	n	C5A	dna variant ok
-non_coding_not_in_fasta	.	. should be removed from tsv because not in fasta
-presence_absence_1	.	.
-presence_absence_1	.	.	should be in output because this field is here
-presence_absence_2	n	T4G	dna variant ok
-presence_absence_2	n	A4G	dna variant not ok
-presence_absence_3	p	R3S	amino acid variant ok
-presence_absence_3	p	I3S	amino acid variant not ok
-presence_absence_not_in_fasta	.	. should be removed from tsv because not in fasta
-variants_only_1	n	T4A	dna variant ok
-variants_only_1	n	C4G	dna variant not ok
-variants_only_1	p	S2T	amino acid variant ok
-variants_only_1	p	I2L	amin acid variant not ok
-variants_only_1	.	.	should be kept as a generic description of variants_only_1
-variants_only_1	.	.
-variants_only_not_in_fasta	.	.	should be removed from tsv because not in fasta
-variants_only_no_good_variants	n	A4G	dna variant not ok
+non_coding_1	.	.	.	non_coding_1 description1
+non_coding_1	.	.	.	should be in output because this field is here
+non_coding_1	p	L2K	.	should be removed because this is non-coding, but variant is protein
+non_coding_1	n	C5A	id1	dna variant ok
+non_coding_not_in_fasta	.	.	.	should be removed from tsv because not in fasta
+presence_absence_1	.	.	.	.
+presence_absence_1	.	.	.	should be in output because this field is here
+presence_absence_2	n	T4G	id2	dna variant ok
+presence_absence_2	n	A4G	.	dna variant not ok
+presence_absence_3	p	R3S	.	amino acid variant ok
+presence_absence_3	p	I3S	.	amino acid variant not ok
+presence_absence_not_in_fasta	.	.	.	should be removed from tsv because not in fasta
+variants_only_1	n	T4A	.	dna variant ok
+variants_only_1	n	C4G	.	dna variant not ok
+variants_only_1	p	S2T	.	amino acid variant ok
+variants_only_1	p	I2L	.	amin acid variant not ok
+variants_only_1	.	.	.	should be kept as a generic description of variants_only_1
+variants_only_1	.	.	.	.
+variants_only_not_in_fasta	.	.	.	should be removed from tsv because not in fasta
+variants_only_no_good_variants	n	A4G	.	dna variant not ok
diff --git a/ariba/tests/data/reference_data_init.tsv b/ariba/tests/data/reference_data_init.tsv
index 612b0774..1e8f1a60 100644
--- a/ariba/tests/data/reference_data_init.tsv
+++ b/ariba/tests/data/reference_data_init.tsv
@@ -1,4 +1,4 @@
-gene1	n	A42G	free text
-gene1	n	A42T	free text2
-gene1	n	G13T	confers killer rabbit resistance
-gene2	p	I42L	removes tardigrade's space-living capability
+gene1	n	A42G	.	free text
+gene1	n	A42T	.	free text2
+gene1	n	G13T	.	confers killer rabbit resistance
+gene2	p	I42L	.	removes tardigrade's space-living capability
diff --git a/ariba/tests/data/reference_data_load_metadata_tsv.tsv b/ariba/tests/data/reference_data_load_metadata_tsv.tsv
index 8d151a62..3551863f 100644
--- a/ariba/tests/data/reference_data_load_metadata_tsv.tsv
+++ b/ariba/tests/data/reference_data_load_metadata_tsv.tsv
@@ -1,3 +1,3 @@
-gene1	n	A42G	free text
-gene1	n	G13T	confers killer rabbit resistance
-gene2	p	I42L	removes tardigrade's space-living capability
+gene1	n	A42G	.	free text
+gene1	n	G13T	.	confers killer rabbit resistance
+gene2	p	I42L	.	removes tardigrade's space-living capability
diff --git a/ariba/tests/data/reference_data_rename_sequences_metadata.tsv b/ariba/tests/data/reference_data_rename_sequences_metadata.tsv
index 1dd13740..6d43433f 100644
--- a/ariba/tests/data/reference_data_rename_sequences_metadata.tsv
+++ b/ariba/tests/data/reference_data_rename_sequences_metadata.tsv
@@ -1,11 +1,11 @@
-noncoding1	.	.	original name "noncoding1"
-noncoding1 blah	.	.	original name "noncoding1 blah"
-pres_abs1 foo bar spam eggs	.	.	original name "pres_abs1 foo bar spam eggs"
-pres_abs1 blah	.	.	original name "pres_abs1 blah"
-pres'abs1	.	.	original name "pres'abs1"
-pres_abs2	.	.	original name "pres_abs2"
-pres!abs3	.	.	original name "pres!abs3"
-var_only1 hello	.	.	original name "var_only1 hello"
-var:only1 boo	.	.	original name "var:only1 boo"
-var_only1	.	.	original name "var_only1"
-var_only2	.	.	original name "var_only2"
+noncoding1	.	.	.	original name "noncoding1"
+noncoding1 blah	.	.	.	original name "noncoding1 blah"
+pres_abs1 foo bar spam eggs	.	.	.	original name "pres_abs1 foo bar spam eggs"
+pres_abs1 blah	.	.	.	original name "pres_abs1 blah"
+pres'abs1	.	.	.	original name "pres'abs1"
+pres_abs2	.	.	.	original name "pres_abs2"
+pres!abs3	.	.	.	original name "pres!abs3"
+var_only1 hello	.	.	.	original name "var_only1 hello"
+var:only1 boo	.	.	.	original name "var:only1 boo"
+var_only1	.	.	.	original name "var_only1"
+var_only2	.	.	.	original name "var_only2"
diff --git a/ariba/tests/data/reference_data_test_all_non_wild_type_variants.tsv b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.tsv
index 76f865f3..6f1defa0 100644
--- a/ariba/tests/data/reference_data_test_all_non_wild_type_variants.tsv
+++ b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.tsv
@@ -1,12 +1,12 @@
-var_only_gene	n	A8T	ref has wild type A
-var_only_gene	n	G9C	ref has variant C instead of G
-var_only_gene	p	G4I	ref has wild type F
-var_only_gene	p	F6I	ref has wild type F
-var_only_gene	p	P3Q	ref has wild type P
-var_only_gene	p	I5V	ref has variant V instead of I
-presence_absence_gene	n	A4G	ref has wild type A
-presence_absence_gene	n	A6C	ref has variant C instead of A
-presence_absence_gene	p	N2I	ref has wild type N
-presence_absence_gene	p	A4G	ref has variant G instead of A
-non_coding	n	A2C	ref has wild type A
-non_coding	n	C4T	ref has variant T instead of C
+var_only_gene	n	A8T	.	ref has wild type A
+var_only_gene	n	G9C	.	ref has variant C instead of G
+var_only_gene	p	G4I	.	ref has wild type F
+var_only_gene	p	F6I	.	ref has wild type F
+var_only_gene	p	P3Q	.	ref has wild type P
+var_only_gene	p	I5V	.	ref has variant V instead of I
+presence_absence_gene	n	A4G	.	ref has wild type A
+presence_absence_gene	n	A6C	.	ref has variant C instead of A
+presence_absence_gene	p	N2I	.	ref has wild type N
+presence_absence_gene	p	A4G	.	ref has variant G instead of A
+non_coding	n	A2C	.	ref has wild type A
+non_coding	n	C4T	.	ref has variant T instead of C
diff --git a/ariba/tests/data/reference_data_write_metadata_tsv.expected.tsv b/ariba/tests/data/reference_data_write_metadata_tsv.expected.tsv
index 0faf409d..7baa4023 100644
--- a/ariba/tests/data/reference_data_write_metadata_tsv.expected.tsv
+++ b/ariba/tests/data/reference_data_write_metadata_tsv.expected.tsv
@@ -1,2 +1,2 @@
-gene1	.	.	has anybody got a bottle of orange juice?
-gene2	.	.	we didn't burn him
+gene1	.	.	.	has anybody got a bottle of orange juice?
+gene2	.	.	.	we didn't burn him
diff --git a/ariba/tests/data/reference_data_write_metadata_tsv.tsv b/ariba/tests/data/reference_data_write_metadata_tsv.tsv
index 93192808..4143803e 100644
--- a/ariba/tests/data/reference_data_write_metadata_tsv.tsv
+++ b/ariba/tests/data/reference_data_write_metadata_tsv.tsv
@@ -1,2 +1,2 @@
-gene2	.	.	we didn't burn him
-gene1	.	.	has anybody got a bottle of orange juice?
+gene2	.	.	.	we didn't burn him
+gene1	.	.	.	has anybody got a bottle of orange juice?
diff --git a/ariba/tests/data/report_filter_test_init_bad.tsv b/ariba/tests/data/report_filter_test_init_bad.tsv
index f3dc87a5..f93b0f57 100644
--- a/ariba/tests/data/report_filter_test_init_bad.tsv
+++ b/ariba/tests/data/report_filter_test_init_bad.tsv
@@ -1,4 +1,4 @@
 #ef_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	Description_of_variant.C42T	free_text
-cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	Description_of_variant.A51G	free_text2
-cluster2	variants_only	179	20000	cluster2	1042	1042	42.42	cluster2.scaffold.1	1442	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	Description_of_variant.I42L	free_text3
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	a:n:C42T:id1:foo	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id2:bar	free_text2
+cluster2	variants_only	179	20000	cluster2	1042	1042	42.42	cluster2.scaffold.1	1442	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	a:n:I42L:id3:baz	free_text3
diff --git a/ariba/tests/data/report_filter_test_init_good.tsv b/ariba/tests/data/report_filter_test_init_good.tsv
index 5b3368fd..c98baf86 100644
--- a/ariba/tests/data/report_filter_test_init_good.tsv
+++ b/ariba/tests/data/report_filter_test_init_good.tsv
@@ -1,5 +1,5 @@
 #ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	10.5	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	Description_of_variant.C42T	free_text
-cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	10.5	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	Description_of_variant.A51G	free_text2
-cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.2	1300	12.4	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	Description_of_variant.A51G	free_text3
-cluster2	variants_only	179	20000	cluster2	1042	1042	42.42	cluster2.scaffold.1	1442	20.2	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	Description_of_variant.I42L	free_text3
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	10.5	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	a:n:C42T:id1:foo	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	10.5	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id2:bar	free_text2
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.2	1300	12.4	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id3:spam	free_text3
+cluster2	variants_only	179	20000	cluster2	1042	1042	42.42	cluster2.scaffold.1	1442	20.2	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	a:v:I42L:id4:eggs	free_text3
diff --git a/ariba/tests/data/report_filter_test_load_report_bad.tsv b/ariba/tests/data/report_filter_test_load_report_bad.tsv
index f3dc87a5..553e60ff 100644
--- a/ariba/tests/data/report_filter_test_load_report_bad.tsv
+++ b/ariba/tests/data/report_filter_test_load_report_bad.tsv
@@ -1,4 +1,4 @@
 #ef_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	Description_of_variant.C42T	free_text
-cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	Description_of_variant.A51G	free_text2
-cluster2	variants_only	179	20000	cluster2	1042	1042	42.42	cluster2.scaffold.1	1442	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	Description_of_variant.I42L	free_text3
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	a:n:C42T:id1:foo	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id1:bar	free_text2
+cluster2	variants_only	179	20000	cluster2	1042	1042	42.42	cluster2.scaffold.1	1442	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	a:n:I42L:id1:foo	free_text3
diff --git a/ariba/tests/data/report_filter_test_load_report_good.tsv b/ariba/tests/data/report_filter_test_load_report_good.tsv
index 9a0afe0d..1165ea45 100644
--- a/ariba/tests/data/report_filter_test_load_report_good.tsv
+++ b/ariba/tests/data/report_filter_test_load_report_good.tsv
@@ -1,5 +1,5 @@
 #ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	12.2	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	Description_of_variant.C42T	free_text
-cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	12.2	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	Description_of_variant.A51G	free_text2
-cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.2	1300	22.2	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	Description_of_variant.A51G	free_text3
-cluster2	variants_only	179	20000	cluster2	1042	1042	42.42	cluster2.scaffold.1	1442	33.3	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	Description_of_variant.I42L	free_text3
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	12.2	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	a:n:C42T:id1:foo	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	12.2	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id2:bar	free_text2
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.2	1300	22.2	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id3:spam	free_text3
+cluster2	variants_only	179	20000	cluster2	1042	1042	42.42	cluster2.scaffold.1	1442	33.3	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	a:v:I42L:id4:eggs	free_text3
diff --git a/ariba/tests/data/report_filter_test_run.expected.tsv b/ariba/tests/data/report_filter_test_run.expected.tsv
index 0a6c41e1..a35a0cf6 100644
--- a/ariba/tests/data/report_filter_test_run.expected.tsv
+++ b/ariba/tests/data/report_filter_test_run.expected.tsv
@@ -1,6 +1,6 @@
 #ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	12.4	1	SNP	n	A51G	1	.	.	51	51	C	151	151	C	542	.	542	Description_of_variant.A51G	free_text2
-cluster2	variants_only	179	20000	cluster2	1042	1042	99.0	cluster2.scaffold.1	1442	13.5	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	Description_of_variant.I42L	free_text3
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	12.4	1	SNP	n	A51G	1	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id2:bar	free_text2
+cluster2	variants_only	179	20000	cluster2	1042	1042	99.0	cluster2.scaffold.1	1442	13.5	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	a:n:I42L:id3:baz	free_text3
 cluster4	variants_only	179	20000	cluster4	1042	1042	99.0	cluster4.scaffold.1	1442	14.6	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	free_text3
 cluster5	presence_absence	528	1874	cluster5	1188	1097	92.43	cluster5.scaffold.1	2218	20.0	0	.	p	.	0	E89G	NONSYN	65	265	A;A	766	766	G;C	88;90	.;.	87;90	.	.'
 cluster5	presence_absence	528	1874	cluster5	1188	1097	92.43	cluster5.scaffold.1	2218	20.0	0	.	p	.	0	Q37fs	FSHIFT	109	109	A	634	634	.	67	.	67	.	.
diff --git a/ariba/tests/data/report_filter_test_run.in.tsv b/ariba/tests/data/report_filter_test_run.in.tsv
index 59a81cfc..f701c3a3 100644
--- a/ariba/tests/data/report_filter_test_run.in.tsv
+++ b/ariba/tests/data/report_filter_test_run.in.tsv
@@ -1,9 +1,9 @@
 #ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-cluster1	non_coding	27	10000	cluster1	1000	0	99.42	cluster1.scaffold.1	1300	12.4	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	Description_of_variant.C42T	free_text
-cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	12.4	1	SNP	n	A51G	1	.	.	51	51	C	151	151	C	542	.	542	Description_of_variant.A51G	free_text2
-cluster2	variants_only	179	20000	cluster2	1042	1042	99.0	cluster2.scaffold.1	1442	13.5	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	Description_of_variant.I42L	free_text3
-cluster3	variants_only	179	20000	cluster3	1042	1042	89.0	cluster2.scaffold.1	1442	13.5	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	Description_of_variant.I42L	free_text3
-cluster4	variants_only	179	20000	cluster4	1042	1042	99.0	cluster4.scaffold.1	1442	14.6	1	SNP	p	I42L	1	I42L	SYN	112	112	C	442	442	T	300	.	290	Description_of_variant.I42L	free_text3
+cluster1	non_coding	27	10000	cluster1	1000	0	99.42	cluster1.scaffold.1	1300	12.4	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	a:n:C42T:id1:foo	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	12.4	1	SNP	n	A51G	1	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id2:bar	free_text2
+cluster2	variants_only	179	20000	cluster2	1042	1042	99.0	cluster2.scaffold.1	1442	13.5	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	a:n:I42L:id3:baz	free_text3
+cluster3	variants_only	179	20000	cluster3	1042	1042	89.0	cluster2.scaffold.1	1442	13.5	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	a:n:I42L:id4:spam	free_text3
+cluster4	variants_only	179	20000	cluster4	1042	1042	99.0	cluster4.scaffold.1	1442	14.6	1	SNP	p	I42L	1	I42L	SYN	112	112	C	442	442	T	300	.	290	a:n:I42L:id5:eggs	free_text3
 cluster5	presence_absence	528	1874	cluster5	1188	1097	92.43	cluster5.scaffold.1	2218	20.0	0	.	p	.	0	E89G	NONSYN	65	265	A;A	766	766	G;C	88;90	.;.	87;90	.	.'
 cluster5	presence_absence	528	1874	cluster5	1188	1097	92.43	cluster5.scaffold.1	2218	20.0	0	.	p	.	0	Q37fs	FSHIFT	109	109	A	634	634	.	67	.	67	.	.
 cluster5	presence_absence	528	1874	cluster5	1188	1097	92.43	cluster5.scaffold.1	2218	20.0	0	.	p	.	0	E89G	NONSYN	265	265	A;A	766	766	G;C	88;90	.;.	87;90	.	.
diff --git a/ariba/tests/data/report_filter_test_write_report.tsv b/ariba/tests/data/report_filter_test_write_report.tsv
index e159675a..11b3ab4f 100644
--- a/ariba/tests/data/report_filter_test_write_report.tsv
+++ b/ariba/tests/data/report_filter_test_write_report.tsv
@@ -1,4 +1,4 @@
 #ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	42.4	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	Description_of_variant.C42T	free_text
-cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	42.4	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	Description_of_variant.A51G	free_text2
-cluster2	variants_only	179	20000	cluster2	1042	1042	42.42	cluster2.scaffold.1	1442	42.4	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	Description_of_variant.I42L	free_text3
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	42.4	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	a:n:C42T:id1:foo	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	42.4	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id2:bar	free_text2
+cluster2	variants_only	179	20000	cluster2	1042	1042	42.42	cluster2.scaffold.1	1442	42.4	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	a:v:I42L:id3:baz	free_text3
diff --git a/ariba/tests/data/summary_sample_test_column_names_tuples.tsv b/ariba/tests/data/summary_sample_test_column_names_tuples.tsv
index 5e5926c6..4684c424 100644
--- a/ariba/tests/data/summary_sample_test_column_names_tuples.tsv
+++ b/ariba/tests/data/summary_sample_test_column_names_tuples.tsv
@@ -1,7 +1,8 @@
 #ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1_n_A14T_N_ref has wild type, reads have variant so should report	generic description of noncoding1
-noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1_n_A6G_N_variant in ref and reads so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:.:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	0	SNP	.	.	.	G15T	SNP	15	15	G	85	85	T	17	.	17	.	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
 noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding2.scaffold.1	279	35.4	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	generic description of noncoding2
-presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	0	SNP	p	A10V	.	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1_p_A10V_N_Ref has wild, reads have variant so report	Generic description of presence_absence1
+presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
 presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.2	267	35.1	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	Generic description of presence_absence2
-variants_only1	variants_only	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1_p_S5T_N_Ref and reads have variant so report	Generic description of variants_only1
+variants_only1	variants_only	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1:p:S5T:.:Ref and reads have variant so report	Generic description of variants_only1
diff --git a/ariba/tests/data/summary_sample_test_column_summary_data.tsv b/ariba/tests/data/summary_sample_test_column_summary_data.tsv
index 733e0963..cf7e5b98 100644
--- a/ariba/tests/data/summary_sample_test_column_summary_data.tsv
+++ b/ariba/tests/data/summary_sample_test_column_summary_data.tsv
@@ -1,7 +1,8 @@
 #ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1_n_A14T_N_ref has wild type, reads have variant so should report	generic description of noncoding1
-noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1_n_A6G_N_variant in ref and reads so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1_n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	0	SNP	.	.	.	G15T	SNP	15	15	G	85	85	T	17	.	17	.	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1_n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
 noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding2.scaffold.1	279	35.4	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	generic description of noncoding2
-presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1_p_A10V_N_Ref has wild, reads have variant so report	Generic description of presence_absence1
+presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
 presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.2	267	35.1	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	Generic description of presence_absence2
-variants_only1	variants_only	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1_p_S5T_N_Ref and reads have variant so report	Generic description of variants_only1
+variants_only1	variants_only	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1:p:S5T:id4:Ref and reads have variant so report	Generic description of variants_only1
diff --git a/ariba/tests/data/summary_sample_test_load_file.in.tsv b/ariba/tests/data/summary_sample_test_load_file.in.tsv
index 733e0963..524d3347 100644
--- a/ariba/tests/data/summary_sample_test_load_file.in.tsv
+++ b/ariba/tests/data/summary_sample_test_load_file.in.tsv
@@ -1,7 +1,7 @@
 #ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1_n_A14T_N_ref has wild type, reads have variant so should report	generic description of noncoding1
-noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1_n_A6G_N_variant in ref and reads so should report	generic description of noncoding1
+noncoding1	non:coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non:coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
 noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding2.scaffold.1	279	35.4	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	generic description of noncoding2
-presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1_p_A10V_N_Ref has wild, reads have variant so report	Generic description of presence_absence1
+presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
 presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.2	267	35.1	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	Generic description of presence_absence2
-variants_only1	variants_only	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1_p_S5T_N_Ref and reads have variant so report	Generic description of variants_only1
+variants_only1	variants_only	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1:p:S5T:id4:Ref and reads have variant so report	Generic description of variants_only1
diff --git a/ariba/tests/data/summary_sample_test_non_synon_variants.tsv b/ariba/tests/data/summary_sample_test_non_synon_variants.tsv
index 733e0963..b8f5753d 100644
--- a/ariba/tests/data/summary_sample_test_non_synon_variants.tsv
+++ b/ariba/tests/data/summary_sample_test_non_synon_variants.tsv
@@ -1,7 +1,8 @@
 #ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1_n_A14T_N_ref has wild type, reads have variant so should report	generic description of noncoding1
-noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1_n_A6G_N_variant in ref and reads so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	0	SNP	.	.	.	G15T	SNP	15	15	G	85	85	T	17	.	17	.	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
 noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding2.scaffold.1	279	35.4	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	generic description of noncoding2
-presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1_p_A10V_N_Ref has wild, reads have variant so report	Generic description of presence_absence1
+presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
 presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.2	267	35.1	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	Generic description of presence_absence2
-variants_only1	variants_only	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1_p_S5T_N_Ref and reads have variant so report	Generic description of variants_only1
+variants_only1	variants_only	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1:p:S5T:id4:Ref and reads have variant so report	Generic description of variants_only1
diff --git a/ariba/tests/data/summary_sample_test_var_groups.tsv b/ariba/tests/data/summary_sample_test_var_groups.tsv
new file mode 100644
index 00000000..056296ab
--- /dev/null
+++ b/ariba/tests/data/summary_sample_test_var_groups.tsv
@@ -0,0 +1,7 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding2.scaffold.1	279	35.4	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	generic description of noncoding2
+presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
+presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.2	267	35.1	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	Generic description of presence_absence2
+variants_only1	variants_only	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1:p:S5T:id4:Ref and reads have variant so report	Generic description of variants_only1
diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv
index f88dd14e..6ec23eca 100644
--- a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv
+++ b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv
@@ -1,3 +1,3 @@
 #ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	10.0	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1_n_A14T_N_ref has wild type, reads have variant so should report	generic description of noncoding1
-presence_absence1	presence_absence	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	20.1	0	SNP	p	A10V	.	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1_p_A10V_N_Ref has wild, reads have variant so report	Generic description of presence_absence1
+noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	10.0	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	20.1	0	SNP	p	A10V	.	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:.:Ref has wild, reads have variant so report	Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv
index 3e322baa..322f9656 100644
--- a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv
+++ b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv
@@ -1,5 +1,5 @@
 #ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1_n_A14T_N_ref has wild type, reads have variant so should report	generic description of noncoding1
-noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1_n_A6G_N_variant in ref and reads so should report	generic description of noncoding1
-presence_absence1	presence_absence	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	51.1	0	SNP	p	A10V	.	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1_p_A10V_N_Ref has wild, reads have variant so report	Generic description of presence_absence1
+noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id3:variant in ref and reads so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	51.1	0	SNP	p	A10V	.	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:.:Ref has wild, reads have variant so report	Generic description of presence_absence1
 variants_only1	variants_only	64	12	variants_only1	90	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.
diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv
index 5848d5d7..2d068427 100644
--- a/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv
+++ b/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv
@@ -1,3 +1,3 @@
 #ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	non_coding	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	10.0	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1_n_A14T_N_ref has wild type, reads have variant so should report	generic description of noncoding1
-presence_absence1	presence_absence	27	88	cluster.p.1	96	96	98.96	presence_absence1.scaffold.1	267	20.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1_p_A10V_N_Ref has wild, reads have variant so report	Generic description of presence_absence1
+noncoding1	non_coding	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	10.0	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	cluster.p.1	96	96	98.96	presence_absence1.scaffold.1	267	20.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id2:Ref has wild, reads have variant so report	Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv
index a80d7582..0058b231 100644
--- a/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv
+++ b/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv
@@ -1,5 +1,5 @@
 #ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	non_coding	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1_n_A14T_N_ref has wild type, reads have variant so should report	generic description of noncoding1
-noncoding1	non_coding	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1_n_A6G_N_variant in ref and reads so should report	generic description of noncoding1
-presence_absence1	presence_absence	27	88	cluster.p.2	96	96	98.96	presence_absence1.scaffold.1	267	51.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1_p_A10V_N_Ref has wild, reads have variant so report	Generic description of presence_absence1
+noncoding1	non_coding	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	cluster.p.2	96	96	98.96	presence_absence1.scaffold.1	267	51.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
 variants_only1	variants_only	64	12	cluster.v.1	90	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.
diff --git a/ariba/tests/data/summary_test_get_all_var_groups.1.tsv b/ariba/tests/data/summary_test_get_all_var_groups.1.tsv
new file mode 100644
index 00000000..e3465e4e
--- /dev/null
+++ b/ariba/tests/data/summary_test_get_all_var_groups.1.tsv
@@ -0,0 +1,3 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	10.0	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	cluster.p.1	96	96	98.96	presence_absence1.scaffold.1	267	20.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id4:Ref has wild, reads have variant so report	Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_test_get_all_var_groups.2.tsv b/ariba/tests/data/summary_test_get_all_var_groups.2.tsv
new file mode 100644
index 00000000..0058b231
--- /dev/null
+++ b/ariba/tests/data/summary_test_get_all_var_groups.2.tsv
@@ -0,0 +1,5 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	cluster.p.2	96	96	98.96	presence_absence1.scaffold.1	267	51.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
+variants_only1	variants_only	64	12	cluster.v.1	90	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.
diff --git a/ariba/tests/data/summary_test_load_input_files.1.tsv b/ariba/tests/data/summary_test_load_input_files.1.tsv
index ffee4cdb..e1bc25f8 100644
--- a/ariba/tests/data/summary_test_load_input_files.1.tsv
+++ b/ariba/tests/data/summary_test_load_input_files.1.tsv
@@ -1,3 +1,3 @@
 #ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	10.0	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1_n_A14T_N_ref has wild type, reads have variant so should report	generic description of noncoding1
-presence_absence1	presence_absence	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	20.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1_p_A10V_N_Ref has wild, reads have variant so report	Generic description of presence_absence1
+noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	10.0	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	20.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id2:Ref has wild, reads have variant so report	Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_test_load_input_files.2.tsv b/ariba/tests/data/summary_test_load_input_files.2.tsv
index b4dcb0e8..ff47b223 100644
--- a/ariba/tests/data/summary_test_load_input_files.2.tsv
+++ b/ariba/tests/data/summary_test_load_input_files.2.tsv
@@ -1,5 +1,5 @@
 #ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1_n_A14T_N_ref has wild type, reads have variant so should report	generic description of noncoding1
-noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1_n_A6G_N_variant in ref and reads so should report	generic description of noncoding1
-presence_absence1	presence_absence	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	51.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1_p_A10V_N_Ref has wild, reads have variant so report	Generic description of presence_absence1
+noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	51.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
 variants_only1	variants_only	64	12	variants_only1	90	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.
diff --git a/ariba/tests/data/vfdb_parser_test_run.out.tsv b/ariba/tests/data/vfdb_parser_test_run.out.tsv
index 83652b41..242514cb 100644
--- a/ariba/tests/data/vfdb_parser_test_run.out.tsv
+++ b/ariba/tests/data/vfdb_parser_test_run.out.tsv
@@ -1,2 +1,2 @@
-abcD.VF123(gi:1234).genus1_species1	.	.	foobar description1 [abc]
-efgH.VF234(gi:2345).genus2_species2	.	.	spam eggs description2 [abc]
+abcD.VF123(gi:1234).genus1_species1	.	.	.	foobar description1 [abc]
+efgH.VF234(gi:2345).genus2_species2	.	.	.	spam eggs description2 [abc]
diff --git a/ariba/tests/reference_data_test.py b/ariba/tests/reference_data_test.py
index f01a7ccd..cb64077f 100644
--- a/ariba/tests/reference_data_test.py
+++ b/ariba/tests/reference_data_test.py
@@ -30,10 +30,10 @@ def test_init_ok(self):
         '''Test init with good input'''
         tsv_file = os.path.join(data_dir, 'reference_data_init.tsv')
         presence_absence_fa = os.path.join(data_dir, 'reference_data_init_presence_absence.fa')
-        meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\tfree text')
-        meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tA42T\tfree text2')
-        meta3 = sequence_metadata.SequenceMetadata('gene1\tn\tG13T\tconfers killer rabbit resistance')
-        meta4 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\tremoves tardigrade's space-living capability")
+        meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\t.\tfree text')
+        meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tA42T\t.\tfree text2')
+        meta3 = sequence_metadata.SequenceMetadata('gene1\tn\tG13T\t.\tconfers killer rabbit resistance')
+        meta4 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\t.\tremoves tardigrade's space-living capability")
 
         expected_metadata = {
             'gene1': {
@@ -83,9 +83,9 @@ def test_get_filename(self):
 
     def test_load_metadata_tsv(self):
         '''Test _load_metadata_tsv'''
-        meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\tfree text')
-        meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tG13T\tconfers killer rabbit resistance')
-        meta3 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\tremoves tardigrade's space-living capability")
+        meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\t.\tfree text')
+        meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tG13T\t.\tconfers killer rabbit resistance')
+        meta3 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\t.\tremoves tardigrade's space-living capability")
         expected = {
             'gene1': {
                 'n': {12: {meta2}, 41: {meta1}},
@@ -284,13 +284,13 @@ def test_rename_names_in_seq_dicts(self):
     def test_rename_metadata_set(self):
         '''Test _rename_metadata_set'''
         metaset = {
-            sequence_metadata.SequenceMetadata('foo 1\t.\t.\tdescription'),
-            sequence_metadata.SequenceMetadata('foo 1\tp\tI42L\tspam eggs')
+            sequence_metadata.SequenceMetadata('foo 1\t.\t.\t.\tdescription'),
+            sequence_metadata.SequenceMetadata('foo 1\tp\tI42L\t.\tspam eggs')
         }
 
         expected = {
-            sequence_metadata.SequenceMetadata('new_name\t.\t.\tdescription'),
-            sequence_metadata.SequenceMetadata('new_name\tp\tI42L\tspam eggs')
+            sequence_metadata.SequenceMetadata('new_name\t.\t.\t.\tdescription'),
+            sequence_metadata.SequenceMetadata('new_name\tp\tI42L\t.\tspam eggs')
         }
         got = reference_data.ReferenceData._rename_metadata_set(metaset, 'new_name')
         self.assertEqual(expected, got)
@@ -298,15 +298,15 @@ def test_rename_metadata_set(self):
 
     def test_rename_names_in_metadata(self):
         '''Test _rename_names_in_metadata'''
-        meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\tfree text')
-        meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tA42T\tfree text2')
-        meta3 = sequence_metadata.SequenceMetadata('gene1\t.\t.\tfree text3')
-        meta4 = sequence_metadata.SequenceMetadata('gene1\tn\tG13T\tconfers killer rabbit resistance')
-        meta5 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\tremoves tardigrade's space-living capability")
-        meta1rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tA42G\tfree text')
-        meta2rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tA42T\tfree text2')
-        meta3rename = sequence_metadata.SequenceMetadata('new_gene1\t.\t.\tfree text3')
-        meta4rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tG13T\tconfers killer rabbit resistance')
+        meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\t.\tfree text')
+        meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tA42T\t.\tfree text2')
+        meta3 = sequence_metadata.SequenceMetadata('gene1\t.\t.\t.\tfree text3')
+        meta4 = sequence_metadata.SequenceMetadata('gene1\tn\tG13T\t.\tconfers killer rabbit resistance')
+        meta5 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\t.\tremoves tardigrade's space-living capability")
+        meta1rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tA42G\t.\tfree text')
+        meta2rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tA42T\t.\tfree text2')
+        meta3rename = sequence_metadata.SequenceMetadata('new_gene1\t.\t.\t.\tfree text3')
+        meta4rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tG13T\t.\tconfers killer rabbit resistance')
 
         metadata = {
             'gene1': {
@@ -357,17 +357,17 @@ def test_rename_sequences(self):
         self.assertTrue(filecmp.cmp(expected_file, tmp_out, shallow=False))
         os.unlink(tmp_out)
 
-        meta1 = sequence_metadata.SequenceMetadata('noncoding1\t.\t.\toriginal name "noncoding1"')
-        meta2 = sequence_metadata.SequenceMetadata('noncoding1_1\t.\t.\toriginal name "noncoding1 blah"')
-        meta3 = sequence_metadata.SequenceMetadata('pres_abs1_2\t.\t.\toriginal name "pres_abs1 foo bar spam eggs"')
-        meta4 = sequence_metadata.SequenceMetadata('pres_abs1_1\t.\t.\toriginal name "pres_abs1 blah"')
-        meta5 = sequence_metadata.SequenceMetadata('pres_abs1\t.\t.\toriginal name "pres\'abs1"')
-        meta6 = sequence_metadata.SequenceMetadata('pres_abs2\t.\t.\toriginal name "pres_abs2"')
-        meta7 = sequence_metadata.SequenceMetadata('pres_abs3\t.\t.\toriginal name "pres!abs3"')
-        meta8 = sequence_metadata.SequenceMetadata('var_only1_2\t.\t.\toriginal name "var_only1 hello"')
-        meta9 = sequence_metadata.SequenceMetadata('var_only1\t.\t.\toriginal name "var:only1 boo"')
-        meta10 = sequence_metadata.SequenceMetadata('var_only1_1\t.\t.\toriginal name "var_only1"')
-        meta11 = sequence_metadata.SequenceMetadata('var_only2\t.\t.\toriginal name "var_only2"')
+        meta1 = sequence_metadata.SequenceMetadata('noncoding1\t.\t.\t.\toriginal name "noncoding1"')
+        meta2 = sequence_metadata.SequenceMetadata('noncoding1_1\t.\t.\t.\toriginal name "noncoding1 blah"')
+        meta3 = sequence_metadata.SequenceMetadata('pres_abs1_2\t.\t.\t.\toriginal name "pres_abs1 foo bar spam eggs"')
+        meta4 = sequence_metadata.SequenceMetadata('pres_abs1_1\t.\t.\t.\toriginal name "pres_abs1 blah"')
+        meta5 = sequence_metadata.SequenceMetadata('pres_abs1\t.\t.\t.\toriginal name "pres\'abs1"')
+        meta6 = sequence_metadata.SequenceMetadata('pres_abs2\t.\t.\t.\toriginal name "pres_abs2"')
+        meta7 = sequence_metadata.SequenceMetadata('pres_abs3\t.\t.\t.\toriginal name "pres!abs3"')
+        meta8 = sequence_metadata.SequenceMetadata('var_only1_2\t.\t.\t.\toriginal name "var_only1 hello"')
+        meta9 = sequence_metadata.SequenceMetadata('var_only1\t.\t.\t.\toriginal name "var:only1 boo"')
+        meta10 = sequence_metadata.SequenceMetadata('var_only1_1\t.\t.\t.\toriginal name "var_only1"')
+        meta11 = sequence_metadata.SequenceMetadata('var_only2\t.\t.\t.\toriginal name "var_only2"')
 
         expected_meta = {
             'noncoding1': {'n': {}, 'p': {}, '.': {meta1}},
@@ -476,18 +476,18 @@ def test_all_non_wild_type_variants(self):
             metadata_tsv=tsv_file
         )
 
-        v1 = sequence_metadata.SequenceMetadata('var_only_gene\tn\tA8T\tref has wild type A')
-        v2 = sequence_metadata.SequenceMetadata('var_only_gene\tn\tG9C\tref has variant C instead of G')
-        v3 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tP3Q\tref has wild type P')
-        v4 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tG4I\tref has wild type F')
-        v5 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tI5V\tref has variant V instead of I')
-        v6 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tF6I\tref has wild type F')
-        p1 = sequence_metadata.SequenceMetadata('presence_absence_gene\tn\tA4G\tref has wild type A')
-        p2 = sequence_metadata.SequenceMetadata('presence_absence_gene\tn\tA6C\tref has variant C instead of A')
-        p3 = sequence_metadata.SequenceMetadata('presence_absence_gene\tp\tN2I\tref has wild type N')
-        p4 = sequence_metadata.SequenceMetadata('presence_absence_gene\tp\tA4G\tref has variant G instead of A')
-        n1 = sequence_metadata.SequenceMetadata('non_coding\tn\tA2C\tref has wild type A')
-        n2 = sequence_metadata.SequenceMetadata('non_coding\tn\tC4T\tref has variant T instead of C')
+        v1 = sequence_metadata.SequenceMetadata('var_only_gene\tn\tA8T\t.\tref has wild type A')
+        v2 = sequence_metadata.SequenceMetadata('var_only_gene\tn\tG9C\t.\tref has variant C instead of G')
+        v3 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tP3Q\t.\tref has wild type P')
+        v4 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tG4I\t.\tref has wild type F')
+        v5 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tI5V\t.\tref has variant V instead of I')
+        v6 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tF6I\t.\tref has wild type F')
+        p1 = sequence_metadata.SequenceMetadata('presence_absence_gene\tn\tA4G\t.\tref has wild type A')
+        p2 = sequence_metadata.SequenceMetadata('presence_absence_gene\tn\tA6C\t.\tref has variant C instead of A')
+        p3 = sequence_metadata.SequenceMetadata('presence_absence_gene\tp\tN2I\t.\tref has wild type N')
+        p4 = sequence_metadata.SequenceMetadata('presence_absence_gene\tp\tA4G\t.\tref has variant G instead of A')
+        n1 = sequence_metadata.SequenceMetadata('non_coding\tn\tA2C\t.\tref has wild type A')
+        n2 = sequence_metadata.SequenceMetadata('non_coding\tn\tC4T\t.\tref has variant T instead of C')
 
         var_only_expected = {
              'n': {7: {v1}, 8: {v2}},
diff --git a/ariba/tests/report_filter_test.py b/ariba/tests/report_filter_test.py
index 5c8919b4..7725de67 100644
--- a/ariba/tests/report_filter_test.py
+++ b/ariba/tests/report_filter_test.py
@@ -13,10 +13,10 @@ def test_init_good_file(self):
         '''test __init__ on good input file'''
         infile = os.path.join(data_dir, 'report_filter_test_init_good.tsv')
         rf = report_filter.ReportFilter(infile=infile)
-        line1 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '10.5', '1', 'SNP', 'n', 'C42T', '0', '.', '.', '42', '42', 'C', '142', '142', 'C', '500', '.', '500', 'Description_of_variant.C42T', 'free_text'])
-        line2 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '10.5', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'Description_of_variant.A51G', 'free_text2'])
-        line3 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.2', '1300', '12.4', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'Description_of_variant.A51G', 'free_text3'])
-        line4 = '\t'.join(['cluster2', 'variants_only', '179', '20000', 'cluster2', '1042', '1042', '42.42', 'cluster2.scaffold.1', '1442', '20.2', '1', 'SNP', 'p', 'I42L', '1', 'I42L', 'NONSYN', '112', '112', 'C', '442', '442', 'T', '300', '.', '290', 'Description_of_variant.I42L', 'free_text3'])
+        line1 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '10.5', '1', 'SNP', 'n', 'C42T', '0', '.', '.', '42', '42', 'C', '142', '142', 'C', '500', '.', '500', 'a:n:C42T:id1:foo', 'free_text'])
+        line2 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '10.5', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'a:n:A51G:id2:bar', 'free_text2'])
+        line3 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.2', '1300', '12.4', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'a:n:A51G:id3:spam', 'free_text3'])
+        line4 = '\t'.join(['cluster2', 'variants_only', '179', '20000', 'cluster2', '1042', '1042', '42.42', 'cluster2.scaffold.1', '1442', '20.2', '1', 'SNP', 'p', 'I42L', '1', 'I42L', 'NONSYN', '112', '112', 'C', '442', '442', 'T', '300', '.', '290', 'a:v:I42L:id4:eggs', 'free_text3'])
 
         expected = {
             'cluster1': {
@@ -39,7 +39,7 @@ def test_init_bad_file(self):
 
 
     def test_report_line_to_dict(self):
-        line = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t99.42\tcluster1.scaffold.1\t999\t23.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
+        line = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t99.42\tcluster1.scaffold.1\t999\t23.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\ta:n:C42T:id1:foo\tfree text'
         expected = {
             'ref_name':           'cluster1',
             'ref_type':           'non_coding',
@@ -68,7 +68,7 @@ def test_report_line_to_dict(self):
             'smtls_total_depth':  '500',
             'smtls_alt_nt':       '.',
             'smtls_alt_depth':    '500',
-            'var_description':    'Description_of_variant C42T',
+            'var_description':    'a:n:C42T:id1:foo',
             'free_text':          'free text',
         }
 
@@ -108,11 +108,11 @@ def test_dict_to_report_line(self):
             'smtls_total_depth':  '500',
             'smtls_alt_nt':       '.',
             'smtls_alt_depth':    '500',
-            'var_description':    'Description_of_variant C42T',
+            'var_description':    'a:n:C42T:id1:foo',
             'free_text':          'free text',
         }
 
-        expected = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t99.42\tcluster1.scaffold.1\t1300\t42.4\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
+        expected = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t99.42\tcluster1.scaffold.1\t1300\t42.4\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\ta:n:C42T:id1:foo\tfree text'
         self.assertEqual(expected, report_filter.ReportFilter._dict_to_report_line(report_dict))
 
 
@@ -120,10 +120,10 @@ def test_load_report(self):
         good_infile = os.path.join(data_dir, 'report_filter_test_load_report_good.tsv')
         bad_infile = os.path.join(data_dir, 'report_filter_test_load_report_bad.tsv')
 
-        line1 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '12.2', '1', 'SNP', 'n', 'C42T', '0', '.', '.', '42', '42', 'C', '142', '142', 'C', '500', '.', '500', 'Description_of_variant.C42T', 'free_text'])
-        line2 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '12.2', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'Description_of_variant.A51G', 'free_text2'])
-        line3 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.2', '1300', '22.2', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'Description_of_variant.A51G', 'free_text3'])
-        line4 = '\t'.join(['cluster2', 'variants_only', '179', '20000', 'cluster2', '1042', '1042', '42.42', 'cluster2.scaffold.1', '1442', '33.3', '1', 'SNP', 'p', 'I42L', '1', 'I42L', 'NONSYN', '112', '112', 'C', '442', '442', 'T', '300', '.', '290', 'Description_of_variant.I42L', 'free_text3'])
+        line1 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '12.2', '1', 'SNP', 'n', 'C42T', '0', '.', '.', '42', '42', 'C', '142', '142', 'C', '500', '.', '500', 'a:n:C42T:id1:foo', 'free_text'])
+        line2 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '12.2', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'a:n:A51G:id2:bar', 'free_text2'])
+        line3 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.2', '1300', '22.2', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'a:n:A51G:id3:spam', 'free_text3'])
+        line4 = '\t'.join(['cluster2', 'variants_only', '179', '20000', 'cluster2', '1042', '1042', '42.42', 'cluster2.scaffold.1', '1442', '33.3', '1', 'SNP', 'p', 'I42L', '1', 'I42L', 'NONSYN', '112', '112', 'C', '442', '442', 'T', '300', '.', '290', 'a:v:I42L:id4:eggs', 'free_text3'])
 
         expected = {
             'cluster1': {
diff --git a/ariba/tests/sequence_metadata_test.py b/ariba/tests/sequence_metadata_test.py
index aad7eddb..ed4fe473 100644
--- a/ariba/tests/sequence_metadata_test.py
+++ b/ariba/tests/sequence_metadata_test.py
@@ -13,34 +13,42 @@ def test_init_fails_on_bad_lines(self):
         lines = [
             'only one column. There can NOT be only one\n',
             'two\tcolumns is not enough\n',
-            'five\tcolumns\tis\ttoo\tmany\n',
+            'three\tcolumns\tis still not enough\n',
+            'four\tcolumns\tis\talso not enough\n',
+            'six\tcolumns\tis\tone\ttoo\tmany\n',
         ]
 
         for line in lines:
             with self.assertRaises(sequence_metadata.Error):
                 sequence_metadata.SequenceMetadata(line)
 
-        with self.assertRaises(sequence_variant.Error):
-            sequence_metadata.SequenceMetadata('gene\tx\tI42L\n')
+        lines = [
+            'gene\tx\tI42L\tid\tfoo\n',
+        ]
+
+        for line in lines:
+            with self.assertRaises(sequence_variant.Error):
+                sequence_metadata.SequenceMetadata(line)
 
 
     def test_init_on_good_input(self):
         '''test init ok on good input'''
-        data = sequence_metadata.SequenceMetadata('gene\tn\tI42L\tspam spam wonderful spam')
+        data = sequence_metadata.SequenceMetadata('gene\tn\tI42L\tid\tspam spam wonderful spam')
         self.assertEqual(data.name, 'gene')
         self.assertEqual(data.variant_type, 'n')
         self.assertEqual(data.variant.wild_value, 'I')
         self.assertEqual(data.variant.variant_value, 'L')
+        self.assertEqual(data.variant.identifier, 'id')
         self.assertEqual(data.free_text, 'spam spam wonderful spam')
 
 
     def test_str(self):
         '''test __str__'''
         lines = [
-            'gene1\tn\tA42G\tspam',
-            'gene2\t.\t.',
-            'gene3\t.\t.\teggs',
-            'gene4\tp\tI42K\tthis mutation kills tardigrades',
+            'gene1\tn\tA42G\tid1\tspam',
+            'gene2\t.\t.\t.\t.',
+            'gene3\t.\t.\t.\teggs',
+            'gene4\tp\tI42K\tid\tthis mutation kills tardigrades',
         ]
 
         for line in lines:
@@ -50,11 +58,11 @@ def test_str(self):
     def test_has_variant(self):
         '''test has_variant'''
         tests = [
-            ('gene1\t.\t.', False),
-            ('gene1\tn\tA2T', True),
-            ('gene1\tn\tT2A', False),
-            ('gene1\tp\tI2Y', True),
-            ('gene1\tp\tY2I', False),
+            ('gene1\t.\t.\t.\t.', False),
+            ('gene1\tn\tA2T\t.\t.', True),
+            ('gene1\tn\tT2A\t.\t.', False),
+            ('gene1\tp\tI2Y\t.\t.', True),
+            ('gene1\tp\tY2I\t.\t.', False),
         ]
 
         seq = pyfastaq.sequences.Fasta('name', 'ATGTATTGCTGA') # translation: MYC*
@@ -62,3 +70,20 @@ def test_has_variant(self):
         for line, expected in tests:
             metadata = sequence_metadata.SequenceMetadata(line)
             self.assertEqual(expected, metadata.has_variant(seq))
+
+
+    def test_to_string(self):
+        '''test to_string'''
+        lines = [
+            ('gene1', 'n', 'A42G', 'id1', 'spam'),
+            ('gene2', '.', '.', '.', '.'),
+            ('gene3', '.', '.', '.', 'eggs'),
+            ('gene4', 'p', 'I42K', 'id', 'this mutation kills tardigrades'),
+        ]
+
+        for line in lines:
+            m = sequence_metadata.SequenceMetadata('\t'.join(line))
+            for separator in ('_', '\t'):
+                expected = separator.join(line)
+                self.assertEqual(expected, m.to_string(separator=separator))
+
diff --git a/ariba/tests/sequence_variant_test.py b/ariba/tests/sequence_variant_test.py
index 49a55c53..1c199156 100644
--- a/ariba/tests/sequence_variant_test.py
+++ b/ariba/tests/sequence_variant_test.py
@@ -21,18 +21,22 @@ def test_init_fails_on_bad_variant_strings(self):
 
         for var in bad_variants:
             with self.assertRaises(sequence_variant.Error):
-                v = sequence_variant.Variant('p', var)
+                v = sequence_variant.Variant('p', var, '.')
 
 
     def test_init_ok(self):
         '''Test init ok'''
-        variants = ['I42K', 'i42k', 'I42k', 'i42K']
+        variants = [('I42K', '.'), ('i42k', 'id1'), ('I42k', 'id2'), ('i42K', 'id3')]
 
-        for var in variants:
-            aa_var = sequence_variant.Variant('p', var)
+        for var, identifier in variants:
+            aa_var = sequence_variant.Variant('p', var, identifier)
             self.assertEqual(41, aa_var.position)
             self.assertEqual('I', aa_var.wild_value)
             self.assertEqual('K', aa_var.variant_value)
+            if identifier == '.':
+                self.assertIsNone(aa_var.identifier)
+            else:
+                self.assertEqual(identifier, aa_var.identifier)
 
 
     def test_init_str(self):
@@ -41,7 +45,7 @@ def test_init_str(self):
         expected = 'I42K'
 
         for var in variants:
-            self.assertEqual(expected, str(sequence_variant.Variant('p', var)))
+            self.assertEqual(expected, str(sequence_variant.Variant('p', var, '.')))
 
 
     def test_sanity_check_against_seq_no_translate(self):
@@ -55,7 +59,7 @@ def test_sanity_check_against_seq_no_translate(self):
         ]
 
         for var, expected in tests:
-            variant = sequence_variant.Variant('p', var)
+            variant = sequence_variant.Variant('p', var, '.')
             self.assertEqual(expected, variant.sanity_check_against_seq(seq))
 
 
@@ -70,7 +74,7 @@ def test_sanity_check_against_seq_translate(self):
         ]
 
         for var, expected in tests:
-            variant = sequence_variant.Variant('p', var)
+            variant = sequence_variant.Variant('p', var, '.')
             self.assertEqual(expected, variant.sanity_check_against_seq(seq, translate_seq=True))
 
 
@@ -78,10 +82,10 @@ def test_has_variant(self):
         '''test has_variant'''
         seq = pyfastaq.sequences.Fasta('name', 'ATGTATTGCTGA') # translation: MYC*
         tests = [
-            (sequence_variant.Variant('n', 'A2T'), True),
-            (sequence_variant.Variant('n', 'T2A'), False),
-            (sequence_variant.Variant('p', 'I2Y'), True),
-            (sequence_variant.Variant('p', 'Y2I'), False),
+            (sequence_variant.Variant('n', 'A2T', '.'), True),
+            (sequence_variant.Variant('n', 'T2A', '.'), False),
+            (sequence_variant.Variant('p', 'I2Y', '.'), True),
+            (sequence_variant.Variant('p', 'Y2I', '.'), False),
         ]
 
         for var, expected in tests:
@@ -90,7 +94,7 @@ def test_has_variant(self):
 
     def test_nucleotide_range(self):
         '''test nucleotide_range'''
-        sv = sequence_variant.Variant('n', 'A2T')
+        sv = sequence_variant.Variant('n', 'A2T', '.')
         self.assertEqual((1, 1), sv.nucleotide_range())
-        sv = sequence_variant.Variant('p', 'I42L')
+        sv = sequence_variant.Variant('p', 'I42L', '.')
         self.assertEqual((123, 125), sv.nucleotide_range())
diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py
index f6acfccb..4219af7f 100644
--- a/ariba/tests/summary_cluster_test.py
+++ b/ariba/tests/summary_cluster_test.py
@@ -10,7 +10,7 @@
 class TestSummaryCluster(unittest.TestCase):
     def test_line2dict(self):
         '''Test _line2dict'''
-        line = 'refname\treftype\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text'
+        line = 'refname\treftype\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:var_group1:ref has wild type, foo bar\tsome free text'
 
         expected = {
             'ref_name': 'refname',
@@ -40,7 +40,8 @@ def test_line2dict(self):
             'smtls_total_depth': '17',
             'smtls_alt_nt': '.',
             'smtls_alt_depth': '17',
-            'var_description': 'noncoding1_n_A14T_N_ref has wild type, foo bar',
+            'var_description': 'noncoding1:n:A14T:var_group1:ref has wild type, foo bar',
+            'var_group': 'var_group1',
             'free_text': 'some free text'
         }
 
@@ -51,9 +52,9 @@ def test_add_data_dict(self):
         '''Test add_data_dict'''
         cluster = summary_cluster.SummaryCluster()
         self.assertTrue(cluster.name is None)
-        line1 = 'refname\treftype\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text'
-        line2 = 'refname\treftype\t19\t78\tcluster2\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text'
-        line3 = 'refname2\treftype\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text'
+        line1 = 'refname\treftype\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
+        line2 = 'refname\treftype\t19\t78\tcluster2\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id2:ref has wild type, foo bar\tsome free text'
+        line3 = 'refname2\treftype\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id3:ref has wild type, foo bar\tsome free text'
         data_dict1 = summary_cluster.SummaryCluster.line2dict(line1)
         data_dict2 = summary_cluster.SummaryCluster.line2dict(line2)
         data_dict3 = summary_cluster.SummaryCluster.line2dict(line3)
@@ -71,9 +72,9 @@ def test_pc_id_of_longest(self):
         '''Test pc_id_of_longest'''
         cluster = summary_cluster.SummaryCluster()
         self.assertTrue(cluster.name is None)
-        line1 = 'refname\treftype\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text'
-        line2 = 'refname\treftype\t19\t78\tcluster\t120\t119\t98.20\tctg_name2\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text'
-        line3 = 'refname\treftype\t19\t78\tcluster\t120\t114\t98.32\tctg_name3\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text'
+        line1 = 'refname\treftype\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
+        line2 = 'refname\treftype\t19\t78\tcluster\t120\t119\t98.20\tctg_name2\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
+        line3 = 'refname\treftype\t19\t78\tcluster\t120\t114\t98.32\tctg_name3\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
         data_dict1 = summary_cluster.SummaryCluster.line2dict(line1)
         data_dict2 = summary_cluster.SummaryCluster.line2dict(line2)
         data_dict3 = summary_cluster.SummaryCluster.line2dict(line3)
@@ -85,7 +86,7 @@ def test_pc_id_of_longest(self):
 
     def test_to_cluster_summary_number(self):
         '''Test _to_cluster_summary_assembled'''
-        line = 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text'
+        line = 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
         data_dict = summary_cluster.SummaryCluster.line2dict(line)
 
         tests = [
@@ -122,9 +123,9 @@ def test_to_cluster_summary_number(self):
     def test_has_known_variant(self):
         '''Test _has_known_variant'''
         lines = [
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
             'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
             'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
         ]
@@ -139,9 +140,9 @@ def test_has_known_variant(self):
 
     def test_has_any_known_variant(self):
         lines = [
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
             'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
             'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
         ]
@@ -159,10 +160,10 @@ def test_has_any_known_variant(self):
     def test_has_nonsynonymous(self):
         '''Test _has_nonsynonymous'''
         lines = [
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
             'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
             'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
         ]
@@ -178,11 +179,11 @@ def test_has_nonsynonymous(self):
     def test_has_any_nonsynonymous(self):
         '''Test _has_any_nonsynonymous'''
         lines = [
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
         ]
 
         expected = ['no', 'yes', 'no', 'yes', 'yes']
@@ -198,9 +199,9 @@ def test_has_any_nonsynonymous(self):
     def test_has_novel_nonsynonymous(self):
         '''Test _has_novel_nonsynonymous'''
         lines = [
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
             'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
             'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
         ]
@@ -216,9 +217,9 @@ def test_has_novel_nonsynonymous(self):
     def test_has_any_novel_nonsynonymous(self):
         '''Test _has_any_novel_nonsynonymous'''
         lines = [
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
             'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
             'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
         ]
@@ -236,11 +237,11 @@ def test_has_any_novel_nonsynonymous(self):
     def test_to_cluster_summary_has_known_nonsynonymous(self):
         '''Test _to_cluster_summary_has_known_nonsynonymous'''
         lines = [
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
         ]
 
         expected = ['yes', 'yes', 'no', 'no', 'no']
@@ -257,11 +258,11 @@ def test_to_cluster_summary_has_known_nonsynonymous(self):
     def test_to_cluster_summary_has_novel_nonsynonymous(self):
         '''Test _to_cluster_summary_has_novel_nonsynonymous'''
         lines = [
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
         ]
 
         expected = ['no', 'no', 'no', 'yes', 'yes']
@@ -278,11 +279,11 @@ def test_to_cluster_summary_has_novel_nonsynonymous(self):
     def test_to_cluster_summary_has_nonsynonymous(self):
         '''Test _to_cluster_summary_has_nonsynonymous'''
         lines = [
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
         ]
 
         expected = ['no', 'yes', 'no', 'yes', 'yes']
@@ -306,45 +307,51 @@ def test_get_nonsynonymous_var(self):
             'known_var': '0',
             'ref_ctg_change': '.',
             'ref_ctg_effect': '.',
-            'var_seq_type': '.'
+            'var_seq_type': '.',
+            'var_group': '.',
         }
 
         self.assertEqual(None, summary_cluster.SummaryCluster._get_nonsynonymous_var(d))
 
         d['var_type'] = 'p'
+        d['known_var'] = '1'
         d['has_known_var'] = '1'
         with self.assertRaises(summary_cluster.Error):
             summary_cluster.SummaryCluster._get_nonsynonymous_var(d)
 
         d['known_var_change'] = 'I42L'
-        self.assertEqual('I42L', summary_cluster.SummaryCluster._get_nonsynonymous_var(d))
+        self.assertEqual(('ref', 'I42L', 'ungrouped', None), summary_cluster.SummaryCluster._get_nonsynonymous_var(d))
+
+        d['var_group'] = 'vgroup'
+        self.assertEqual(('ref', 'I42L', 'grouped', 'vgroup'), summary_cluster.SummaryCluster._get_nonsynonymous_var(d))
+        d['var_group'] = '.'
 
         d['ref_ctg_change'] = 'P43Q'
         with self.assertRaises(summary_cluster.Error):
             summary_cluster.SummaryCluster._get_nonsynonymous_var(d)
 
         d['known_var_change'] = '.'
-        self.assertEqual('P43Q', summary_cluster.SummaryCluster._get_nonsynonymous_var(d))
+        self.assertEqual(('ref', 'P43Q', 'novel', None), summary_cluster.SummaryCluster._get_nonsynonymous_var(d))
 
         d['ref_ctg_change'] = '.'
         with self.assertRaises(summary_cluster.Error):
             summary_cluster.SummaryCluster._get_nonsynonymous_var(d)
 
         d['ref_ctg_effect'] = 'MULTIPLE'
-        self.assertEqual('MULTIPLE', summary_cluster.SummaryCluster._get_nonsynonymous_var(d))
+        self.assertEqual(('ref', 'MULTIPLE', 'novel', None), summary_cluster.SummaryCluster._get_nonsynonymous_var(d))
 
 
     def test_has_resistance(self):
         '''Test _has_resistance'''
         lines = [
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
-            'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
         ]
 
         expected = ['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'no']
@@ -359,10 +366,30 @@ def test_has_resistance(self):
                 self.assertEqual('no', cluster._has_resistance(assembled_summary))
 
 
+    def test_has_var_groups(self):
+        '''Test has_var_groups'''
+        lines = [
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id2:ref has wild type, foo bar\tsome free text',
+            'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id3:ref has wild type, foo bar\tsome free text',
+            'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id4:ref has wild type, foo bar\tsome free text',
+            'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id5:ref has wild type, foo bar\tsome free text',
+            'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id6:ref has wild type, foo bar\tsome free text',
+            'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id7:ref has wild type, foo bar\tsome free text',
+            'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id7:ref has wild type, foo bar\tsome free text',
+        ]
+        dicts = [summary_cluster.SummaryCluster.line2dict(line) for line in lines]
+        cluster = summary_cluster.SummaryCluster()
+        for d in dicts:
+            cluster.add_data_dict(d)
+        got = cluster.has_var_groups()
+        expected = {'id1', 'id3', 'id6'}
+        self.assertEqual(expected, got)
+
 
     def test_column_summary_data(self):
         '''Test column_summary_data'''
-        line1 = 'ref1\tnon_coding\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tfoo bar\tspam eggs'
+        line1 = 'ref1\tnon_coding\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs'
         line2 = 'ref1\tnon_coding\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text'
 
         data_dict1 = summary_cluster.SummaryCluster.line2dict(line1)
@@ -384,7 +411,7 @@ def test_column_summary_data(self):
 
     def test_non_synon_variants(self):
         '''Test non_synon_variants'''
-        line1 = 'ref1\tnon_coding\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tfoo bar\tspam eggs'
+        line1 = 'ref1\tnon_coding\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs'
         line2 = 'ref1\tnon_coding\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text'
 
         data_dict1 = summary_cluster.SummaryCluster.line2dict(line1)
@@ -393,5 +420,5 @@ def test_non_synon_variants(self):
         cluster.add_data_dict(data_dict1)
         cluster.add_data_dict(data_dict2)
         got = cluster.non_synon_variants()
-        expected = {'A14T'}
+        expected = {('ref1', 'A14T', 'grouped', 'id1')}
         self.assertEqual(expected, got)
diff --git a/ariba/tests/summary_sample_test.py b/ariba/tests/summary_sample_test.py
index a318270e..3c5b2bef 100644
--- a/ariba/tests/summary_sample_test.py
+++ b/ariba/tests/summary_sample_test.py
@@ -45,7 +45,7 @@ def test_column_summary_data(self):
                 'has_res': 'yes',
                 'ref_seq': 'noncoding1',
                 'known_var': 'yes',
-                'novel_var': 'no',
+                'novel_var': 'yes',
                 'pct_id': '98.33'
             },
             'cluster.p': {
@@ -70,17 +70,17 @@ def test_column_summary_data(self):
         self.assertEqual(expected, got)
 
 
-    def test_non_synon_variants(self):
-        '''Test _non_synon_variants'''
-        infile = os.path.join(data_dir, 'summary_sample_test_non_synon_variants.tsv')
+    def test_var_groups(self):
+        '''Test _var_groups'''
+        infile = os.path.join(data_dir, 'summary_sample_test_var_groups.tsv')
         sample_summary = summary_sample.SummarySample(infile)
         sample_summary.clusters = sample_summary._load_file(infile, 90)
+        got = sample_summary._var_groups()
         expected = {
-            'cluster.n': {'A14T', 'A6G'},
-            'cluster.p': {'A10V'},
-            'cluster.v': {'S5T'}
+            'cluster.n': {'id1', 'id2'},
+            'cluster.p': {'id3'},
+            'cluster.v': {'id4'}
         }
-        got = sample_summary._non_synon_variants()
         self.assertEqual(expected, got)
 
 
@@ -90,11 +90,14 @@ def test_variant_column_names_tuples(self):
         sample_summary = summary_sample.SummarySample(infile)
         sample_summary.clusters = sample_summary._load_file(infile, 90)
         sample_summary.column_summary_data = sample_summary._column_summary_data()
-        sample_summary.variants = sample_summary._non_synon_variants()
         expected = {
-            'cluster.v': {('variants_only1', 'S5T', 'known')},
-            'cluster.n': {('noncoding1', 'A6G', 'known'), ('noncoding1', 'A14T', 'known')},
-            'cluster.p': {('presence_absence1', 'A10V', 'unknown')}
+            'cluster.v': {('variants_only1', 'S5T', 'ungrouped', None)},
+            'cluster.n': {
+                ('noncoding1', 'A6G', 'grouped', 'id2'),
+                ('noncoding1', 'A14T', 'ungrouped', None),
+                ('noncoding1', 'G15T', 'novel', None)
+             },
+            'cluster.p': {('presence_absence1', 'A10V', 'grouped', 'id3')}
         }
         got = sample_summary._variant_column_names_tuples()
         self.assertEqual(expected, got)
diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py
index 1fa454bc..6cfd1e63 100644
--- a/ariba/tests/summary_test.py
+++ b/ariba/tests/summary_test.py
@@ -44,6 +44,35 @@ def test_determine_cluster_cols(self):
             self.assertEqual(expected[i], summary.Summary._determine_cluster_cols(col_strings[i]))
 
 
+    def test_determine_var_cols(self):
+        col_strings = [
+            'groups,grouped,ungrouped,novel',
+            'groups,grouped,ungrouped',
+            'grouped,novel',
+            'ungrouped,novel',
+            'grouped',
+            'ungrouped',
+            'novel',
+            ''
+        ]
+
+        expected = [
+            {'groups': True, 'grouped': True, 'ungrouped': True, 'novel': True},
+            {'groups': True, 'grouped': True, 'ungrouped': True, 'novel': False},
+            {'groups': False, 'grouped': True, 'ungrouped': False, 'novel': True},
+            {'groups': False, 'grouped': False, 'ungrouped': True, 'novel': True},
+            {'groups': False, 'grouped': True, 'ungrouped': False, 'novel': False},
+            {'groups': False, 'grouped': False, 'ungrouped': True, 'novel': False},
+            {'groups': False, 'grouped': False, 'ungrouped': False, 'novel': True},
+            {'groups': False, 'grouped': False, 'ungrouped': False, 'novel': False},
+        ]
+
+        assert len(col_strings) == len(expected)
+
+        for i in range(len(col_strings)):
+            self.assertEqual(expected[i], summary.Summary._determine_var_cols(col_strings[i]))
+
+
     def test_load_input_files(self):
         '''Test _load_input_files'''
         file1 = os.path.join(data_dir, 'summary_test_load_input_files.1.tsv')
@@ -74,9 +103,24 @@ def test_get_all_variant_columns(self):
         samples = summary.Summary._load_input_files([file1, file2], 90)
         got = summary.Summary._get_all_variant_columns(samples)
         expected = {
-            'cluster.p.2': {('presence_absence1', 'A10V', 'known')},
-            'cluster.n.1': {('noncoding1', 'A6G', 'known'), ('noncoding1', 'A14T', 'known')},
-            'cluster.p.1': {('presence_absence1', 'A10V', 'known')},
+            'cluster.p.2': {('presence_absence1', 'A10V', 'grouped', 'id3')},
+            'cluster.n.1': {('noncoding1', 'A6G', 'grouped', 'id2'), ('noncoding1', 'A14T', 'grouped', 'id1')},
+            'cluster.p.1': {('presence_absence1', 'A10V', 'grouped', 'id2')},
+        }
+        self.assertEqual(expected, got)
+
+
+    def test_get_all_var_groups(self):
+        '''Test _get_all_var_groups'''
+        file1 = os.path.join(data_dir, 'summary_test_get_all_var_groups.1.tsv')
+        file2 = os.path.join(data_dir, 'summary_test_get_all_var_groups.2.tsv')
+        samples = summary.Summary._load_input_files([file1, file2], 90)
+        got = summary.Summary._get_all_var_groups(samples)
+        expected = {
+            'cluster.p.1': {'id4'},
+            'cluster.p.2': {'id3'},
+            'cluster.v.1': set(),
+            'cluster.n.1': {'id1', 'id2'}
         }
         self.assertEqual(expected, got)
 
@@ -87,7 +131,7 @@ def test_gather_output_rows(self):
             os.path.join(data_dir, 'summary_test_gather_output_rows.in.1.tsv'),
             os.path.join(data_dir, 'summary_test_gather_output_rows.in.2.tsv')
         ]
-        s = summary.Summary('out', filenames=infiles, include_all_known_variant_columns=False)
+        s = summary.Summary('out', filenames=infiles, variant_cols=None)
         s.samples = summary.Summary._load_input_files(infiles, 90)
         expected = {
             infiles[0]: {
@@ -146,7 +190,17 @@ def test_gather_output_rows(self):
         got = s._gather_output_rows()
         self.assertEqual(expected, got)
 
-        s.include_all_known_variant_columns = True
+        s.var_columns['groups'] = True
+        expected[infiles[0]]['noncoding1']['vgroup.id1'] = 'yes'
+        expected[infiles[0]]['noncoding1']['vgroup.id3'] = 'no'
+        expected[infiles[1]]['noncoding1']['vgroup.id1'] = 'yes'
+        expected[infiles[1]]['noncoding1']['vgroup.id3'] = 'yes'
+        got = s._gather_output_rows()
+        self.assertEqual(expected, got)
+
+
+        s.var_columns['grouped'] = True
+        s.var_columns['ungrouped'] = True
         expected[infiles[0]]['noncoding1']['noncoding1.A14T'] = 'yes'
         expected[infiles[0]]['noncoding1']['noncoding1.A6G'] = 'no'
         expected[infiles[1]]['noncoding1']['noncoding1.A14T'] = 'yes'
@@ -154,17 +208,19 @@ def test_gather_output_rows(self):
         got = s._gather_output_rows()
         self.assertEqual(expected, got)
 
-        s.include_all_novel_variant_columns = True
+        s.var_columns['novel'] = True
         expected[infiles[0]]['presence_absence1']['presence_absence1.A10V'] = 'yes'
         expected[infiles[1]]['presence_absence1']['presence_absence1.A10V'] = 'yes'
         got = s._gather_output_rows()
         self.assertEqual(expected, got)
 
         for filename in expected:
+            del expected[filename]['noncoding1']['vgroup.id1']
+            del expected[filename]['noncoding1']['vgroup.id3']
             for gene_type in expected[filename]:
                 del expected[filename][gene_type]['ref_seq']
 
-        s = summary.Summary('out', filenames=infiles, cluster_cols='assembled,has_res,pct_id,known_var,novel_var', include_all_novel_variant_columns=True)
+        s = summary.Summary('out', filenames=infiles, cluster_cols='assembled,has_res,pct_id,known_var,novel_var', variant_cols='ungrouped,grouped,novel')
         s.samples = summary.Summary._load_input_files(infiles, 90)
         s.include_all_variant_columns = True
         got = s._gather_output_rows()
diff --git a/ariba/vfdb_parser.py b/ariba/vfdb_parser.py
index 052d2f7b..9e0dab80 100644
--- a/ariba/vfdb_parser.py
+++ b/ariba/vfdb_parser.py
@@ -38,7 +38,7 @@ def run(self):
         for seq in file_reader:
             seq.id, description = self._fa_header_to_name_and_metadata(seq.id)
             if description is not None:
-                print(seq.id, '.', '.', description, sep='\t', file=tsv_out)
+                print(seq.id, '.', '.', '.', description, sep='\t', file=tsv_out)
             print(seq, file=fa_out)
 
         pyfastaq.utils.close(fa_out)
diff --git a/scripts/ariba b/scripts/ariba
index a4bbeac3..28d16786 100755
--- a/scripts/ariba
+++ b/scripts/ariba
@@ -9,6 +9,7 @@ tasks = {
     'run': 'Run the ARIBA local assembly pipeline',
     'summary': 'Summarise multiple reports made by "run"',
     'flag': 'Translate the meaning of a flag output by the pipeline',
+    'aln2meta': 'Make metadata input to prepareref, using multialignment and SNPs',
     'test': 'Run on small test dataset',
     'version': 'Print version and exit',
 }
@@ -21,6 +22,7 @@ ordered_tasks = [
     'reportfilter',
     'summary',
     'flag',
+    'aln2meta',
     'test',
     'version',
 ]