From 02ed0674f7d2d03ebb02d20bc11b024ce44bdbe9 Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Mon, 4 Dec 2023 15:27:33 +1300
Subject: [PATCH 1/7] [ancestral] add general (simple-genome) tests

While we had tests that (probably?) covered the code here, using a
simple genome is much easier to reason with and to validate every
mutation. In my opinion it's also important to see tests using "big"
datasets (e.g. those bigger than we can manually verify each and every
mutation) as not necessarily testing the accuracy of the command, but
rather testing for regressions / changes in behaviour.

VCF-input tests using the same data will be added in a future commit
because (spoiler!) there are bugs which need fixing.
---
 tests/functional/ancestral/cram/general.t     | 39 ++++++++++++++
 .../data/simple-genome/nt_muts.ref-seq.json   | 54 +++++++++++++++++++
 .../data/simple-genome/reference.fasta        |  2 +
 .../data/simple-genome/sequences.fasta        |  6 +++
 .../ancestral/data/simple-genome/tree.nwk     |  1 +
 5 files changed, 102 insertions(+)
 create mode 100644 tests/functional/ancestral/cram/general.t
 create mode 100644 tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json
 create mode 100644 tests/functional/ancestral/data/simple-genome/reference.fasta
 create mode 100644 tests/functional/ancestral/data/simple-genome/sequences.fasta
 create mode 100644 tests/functional/ancestral/data/simple-genome/tree.nwk

diff --git a/tests/functional/ancestral/cram/general.t b/tests/functional/ancestral/cram/general.t
new file mode 100644
index 000000000..abe7c7d7f
--- /dev/null
+++ b/tests/functional/ancestral/cram/general.t
@@ -0,0 +1,39 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+General running of augur ancestral.
+- Note that the (parsimonious) C18T on the branch of Sample_C is inferred by augur
+as a root mutation of C18T + T18C reversion on Node_AB. This is reflected in the
+node-data JSON we diff against.
+- We supply the refererence sequence so mutations are called at the root.
+- Ambiguous nucleotides (there are 3 Ns in sample_C) are inferred (default)
+
+  $ ${AUGUR} ancestral \
+  >  --tree "$TESTDIR/../data/simple-genome/tree.nwk" \
+  >  --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \
+  >  --root-sequence "$TESTDIR/../data/simple-genome/reference.fasta" \
+  >  --output-node-data "nt_muts.ref-seq.json" \
+  >  --inference marginal > /dev/null
+
+
+  $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
+  >   "$TESTDIR/../data/simple-genome/nt_muts.ref-seq.json" \
+  >   "nt_muts.ref-seq.json"
+  {}
+
+Same as above but without a root-sequence so no mutations inferred on the root node
+(and thus the inferred reference will be different)
+
+  $ ${AUGUR} ancestral \
+  >  --tree "$TESTDIR/../data/simple-genome/tree.nwk" \
+  >  --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \
+  >  --output-node-data "nt_muts.no-ref-seq.json" \
+  >  --inference marginal > /dev/null
+
+
+  $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
+  >   "$TESTDIR/../data/simple-genome/nt_muts.ref-seq.json" \
+  >   "nt_muts.no-ref-seq.json" \
+  >   --exclude-paths "root['reference']['nuc']" "root['nodes']['node_root']['muts']"
+  {}
diff --git a/tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json b/tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json
new file mode 100644
index 000000000..e3c7b3870
--- /dev/null
+++ b/tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json
@@ -0,0 +1,54 @@
+{
+  "annotations": {
+    "nuc": {
+      "end": 50,
+      "start": 1,
+      "strand": "+",
+      "type": "source"
+    }
+  },
+  "generated_by": {
+    "program": "augur",
+    "version": "23.1.1"
+  },
+  "mask": "00000000000000000000000000000000000000000000000000",
+  "nodes": {
+    "node_AB": {
+      "muts": [
+        "A7G",
+        "C14T",
+        "T18C",
+        "A33G",
+        "A43T"
+      ],
+      "sequence": "AAAACAGAAATGCTCTGCGGGTAAAAAAAAAAGAACTACTTGTCCATAAA"
+    },
+    "node_root": {
+      "muts": [
+        "A5C",
+        "C18T"
+      ],
+      "sequence": "AAAACAAAAATGCCCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA"
+    },
+    "sample_A": {
+      "muts": [
+        "G33C",
+        "C39T"
+      ],
+      "sequence": "AAAACAGAAATGCTCTGCGGGTAAAAAAAAAACAACTATTTGTCCATAAA"
+    },
+    "sample_B": {
+      "muts": [
+        "G42A"
+      ],
+      "sequence": "AAAACAGAAATGCTCTGCGGGTAAAAAAAAAAGAACTACTTATCCATAAA"
+    },
+    "sample_C": {
+      "muts": [],
+      "sequence": "AAAACAAAAATGCCCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA"
+    }
+  },
+  "reference": {
+    "nuc": "AAAAAAAAAATGCCCTGCGGGTAAAAAAAAAAAAACTACTTGACCATAAA"
+  }
+}
\ No newline at end of file
diff --git a/tests/functional/ancestral/data/simple-genome/reference.fasta b/tests/functional/ancestral/data/simple-genome/reference.fasta
new file mode 100644
index 000000000..b45333e53
--- /dev/null
+++ b/tests/functional/ancestral/data/simple-genome/reference.fasta
@@ -0,0 +1,2 @@
+>reference_name
+AAAAAAAAAATGCCCTGCGGGTAAAAAAAAAAAAACTACTTGACCATAAA
diff --git a/tests/functional/ancestral/data/simple-genome/sequences.fasta b/tests/functional/ancestral/data/simple-genome/sequences.fasta
new file mode 100644
index 000000000..c515fd272
--- /dev/null
+++ b/tests/functional/ancestral/data/simple-genome/sequences.fasta
@@ -0,0 +1,6 @@
+>sample_A
+AAAACAGAAATGCTCTGCGGGTAAAAAAAAAACAACTATTTGTCCATAAA
+>sample_B
+AAAACAGAAATGCTCTGCGGGTAAAAAAAAAAGAACTACTTATCCATAAA
+>sample_C
+AAAACAAAAATGCCCTGTGGGTAAAAANNNAAAAACTACTTGACCATAAA
diff --git a/tests/functional/ancestral/data/simple-genome/tree.nwk b/tests/functional/ancestral/data/simple-genome/tree.nwk
new file mode 100644
index 000000000..b847e97e4
--- /dev/null
+++ b/tests/functional/ancestral/data/simple-genome/tree.nwk
@@ -0,0 +1 @@
+(sample_C:0.02,(sample_B:0.02,sample_A:0.02)node_AB:0.06)node_root:0.02;
\ No newline at end of file

From 0987264f9b172b41942ec4a7c5092df100a386e2 Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Mon, 11 Dec 2023 10:29:34 +1300
Subject: [PATCH 2/7] [translate] update test syntax / format

The move to "Creating files in the initial working directory" is
motivated by
<https://github.com/nextstrain/augur/pull/1344#discussion_r1416467185>
and <https://github.com/nextstrain/augur/issues/1176>.

Additionally, I remove the pushd commands which were confusing (there
were multiple!) and use variables to refer to common directories to
improve readability.
---
 tests/functional/translate/cram/_setup.sh     |  2 --
 .../translate/cram/translate-with-genbank.t   | 19 ++++++++------
 .../cram/translate-with-gff-and-gene-name.t   | 23 +++++++++--------
 .../cram/translate-with-gff-and-gene.t        | 24 ++++++++++--------
 .../cram/translate-with-gff-and-locus-tag.t   | 25 +++++++++++--------
 5 files changed, 50 insertions(+), 43 deletions(-)
 delete mode 100644 tests/functional/translate/cram/_setup.sh

diff --git a/tests/functional/translate/cram/_setup.sh b/tests/functional/translate/cram/_setup.sh
deleted file mode 100644
index 19d57534d..000000000
--- a/tests/functional/translate/cram/_setup.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-pushd "$TESTDIR/../../" > /dev/null
-export AUGUR="${AUGUR:-../../bin/augur}"
diff --git a/tests/functional/translate/cram/translate-with-genbank.t b/tests/functional/translate/cram/translate-with-genbank.t
index d4dec07e7..9115b6f31 100644
--- a/tests/functional/translate/cram/translate-with-genbank.t
+++ b/tests/functional/translate/cram/translate-with-genbank.t
@@ -1,18 +1,21 @@
 Setup
 
-  $ pushd "$TESTDIR" > /dev/null
-  $ source _setup.sh
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"
+  $ export DATA="$TESTDIR/../data"
+  $ export SCRIPTS="$TESTDIR/../../../../scripts"
 
 Translate amino acids for genes using a GenBank file.
 
   $ ${AUGUR} translate \
-  >   --tree translate/data/zika/tree.nwk \
-  >   --ancestral-sequences translate/data/zika/nt_muts.json \
-  >   --reference-sequence translate/data/zika/zika_outgroup.gb \
+  >   --tree "$DATA/zika/tree.nwk" \
+  >   --ancestral-sequences "$DATA/zika/nt_muts.json" \
+  >   --reference-sequence "$DATA/zika/zika_outgroup.gb" \
   >   --genes CA PRO \
-  >   --output-node-data $TMP/aa_muts.json
-  Validating schema of 'translate/data/zika/nt_muts.json'...
+  >   --output-node-data aa_muts.json
+  Validating schema of '.+nt_muts.json'... (re)
   Read in 3 features from reference sequence file
   amino acid mutations written to .* (re)
-  $ python3 "../../scripts/diff_jsons.py" translate/data/zika/aa_muts_genbank.json $TMP/aa_muts.json
+
+  $ python3 "$SCRIPTS/diff_jsons.py" $DATA/zika/aa_muts_genbank.json aa_muts.json \
+  >  --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]"
   {}
diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t
index 8796d35c4..62e6acf38 100644
--- a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t
+++ b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t
@@ -1,11 +1,12 @@
 Setup
 
-  $ pushd "$TESTDIR" > /dev/null
-  $ source _setup.sh
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"
+  $ export DATA="$TESTDIR/../data"
+  $ export SCRIPTS="$TESTDIR/../../../../scripts"
 
 Translate amino acids for genes using a GFF3 file where the gene names are stored in a qualifier named "gene_name".
 
-  $ cat >$TMP/genemap.gff <<~~
+  $ cat >genemap.gff <<~~
   > ##gff-version 3
   > ##sequence-region PF13/251013_18 1 10769
   > PF13/251013_18	GenBank	gene	91	456	.	+	.	gene_name="CA"
@@ -13,19 +14,19 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
   > ~~
 
   $ ${AUGUR} translate \
-  >   --tree translate/data/zika/tree.nwk \
-  >   --ancestral-sequences translate/data/zika/nt_muts.json \
-  >   --reference-sequence "$TMP/genemap.gff" \
-  >   --output-node-data $TMP/aa_muts.json
-  Validating schema of 'translate/data/zika/nt_muts.json'...
+  >   --tree "${DATA}/zika/tree.nwk" \
+  >   --ancestral-sequences "${DATA}/zika/nt_muts.json" \
+  >   --reference-sequence "genemap.gff" \
+  >   --output-node-data aa_muts.json
+  Validating schema of '.+/nt_muts.json'... (re)
   Read in 2 features from reference sequence file
   amino acid mutations written to .* (re)
 
 Other than the sequence ids which will include a temporary path, the JSONs
 should be identical.
 
-  $ python3 "../../scripts/diff_jsons.py" \
+  $ python3 "${SCRIPTS}/diff_jsons.py" \
   >  --exclude-regex-paths "['seqid']" -- \
-  >  translate/data/zika/aa_muts_gff.json \
-  >  $TMP/aa_muts.json
+  >  "${DATA}/zika/aa_muts_gff.json" \
+  >  aa_muts.json
   {}
diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene.t b/tests/functional/translate/cram/translate-with-gff-and-gene.t
index 703240e8c..ca32639ed 100644
--- a/tests/functional/translate/cram/translate-with-gff-and-gene.t
+++ b/tests/functional/translate/cram/translate-with-gff-and-gene.t
@@ -1,11 +1,12 @@
 Setup
 
-  $ pushd "$TESTDIR" > /dev/null
-  $ source _setup.sh
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"
+  $ export DATA="$TESTDIR/../data"
+  $ export SCRIPTS="$TESTDIR/../../../../scripts"
 
 Translate amino acids for genes using a GFF3 file where the gene names are stored in a qualifier named "gene".
 
-  $ cat >$TMP/genemap.gff <<~~
+  $ cat >genemap.gff <<~~
   > ##gff-version 3
   > ##sequence-region PF13/251013_18 1 10769
   > PF13/251013_18	GenBank	gene	91	456	.	+	.	gene="CA"
@@ -13,15 +14,16 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
   > ~~
 
   $ ${AUGUR} translate \
-  >   --tree translate/data/zika/tree.nwk \
-  >   --ancestral-sequences translate/data/zika/nt_muts.json \
-  >   --reference-sequence "$TMP/genemap.gff" \
-  >   --output-node-data $TMP/aa_muts.json
-  Validating schema of 'translate/data/zika/nt_muts.json'...
+  >   --tree "${DATA}/zika/tree.nwk" \
+  >   --ancestral-sequences "${DATA}/zika/nt_muts.json" \
+  >   --reference-sequence genemap.gff \
+  >   --output-node-data aa_muts.json
+  Validating schema of '.+/nt_muts.json'... (re)
   Read in 2 features from reference sequence file
   amino acid mutations written to .* (re)
-  $ python3 "../../scripts/diff_jsons.py" \
+
+  $ python3 "${SCRIPTS}/diff_jsons.py" \
   >  --exclude-regex-paths "['seqid']" -- \
-  >  translate/data/zika/aa_muts_gff.json \
-  >  $TMP/aa_muts.json
+  >  "${DATA}/zika/aa_muts_gff.json" \
+  >  aa_muts.json
   {}
diff --git a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t
index 96d263266..8f7ede91c 100644
--- a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t
+++ b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t
@@ -1,23 +1,26 @@
 Setup
 
-  $ pushd "$TESTDIR" > /dev/null
-  $ source _setup.sh
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"
+  $ export DATA="$TESTDIR/../data"
+  $ export SCRIPTS="$TESTDIR/../../../../scripts"
 
 Translate amino acids for genes using a GFF3 file where the gene names are stored in a qualifier named "locus_tag".
 
   $ ${AUGUR} translate \
-  >   --tree translate/data/tb/tree.nwk \
-  >   --genes translate/data/tb/genes.txt \
-  >   --vcf-reference translate/data/tb/ref.fasta \
-  >   --ancestral-sequences translate/data/tb/nt_muts.vcf \
-  >   --reference-sequence translate/data/tb/Mtb_H37Rv_NCBI_Annot.gff \
-  >   --output-node-data $TMP/aa_muts.json \
-  >   --alignment-output $TMP/translations.vcf \
-  >   --vcf-reference-output $TMP/translations_reference.fasta
+  >   --tree "${DATA}/tb/tree.nwk" \
+  >   --genes "${DATA}/tb/genes.txt" \
+  >   --vcf-reference "${DATA}/tb/ref.fasta" \
+  >   --ancestral-sequences "${DATA}/tb/nt_muts.vcf" \
+  >   --reference-sequence "${DATA}/tb/Mtb_H37Rv_NCBI_Annot.gff" \
+  >   --output-node-data aa_muts.json \
+  >   --alignment-output translations.vcf \
+  >   --vcf-reference-output translations_reference.fasta
   Gene length of rrs_Rvnr01 is not a multiple of 3. will pad with N
   Read in 187 specified genes to translate.
   Read in 187 features from reference sequence file
   162 genes had no mutations and so have been be excluded.
   amino acid mutations written to .* (re)
-  $ python3 "../../scripts/diff_jsons.py" translate/data/tb/aa_muts.json $TMP/aa_muts.json
+
+  $ python3 "${SCRIPTS}/diff_jsons.py" "${DATA}/tb/aa_muts.json" aa_muts.json \
+  >  --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]"
   {}

From d5899d9f471b6fe4a63d14d07a8b3a50777f0c82 Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Mon, 4 Dec 2023 20:28:31 +1300
Subject: [PATCH 3/7] [translate] add general tests

Uses the same approach as an earlier commit introduced for `augur
ancestral`, but this time for `augur translate`. Tests the `--genes`
argument as well for good measure. The exact format of GFF/GenBank files
here is extremely important, but future commits will hopefully relax
this.
---
 tests/functional/translate/cram/general.t     | 38 ++++++++++
 tests/functional/translate/cram/genes.t       | 48 +++++++++++++
 .../translate/data/simple-genome/aa_muts.json | 70 +++++++++++++++++++
 .../translate/data/simple-genome/reference.gb | 15 ++++
 .../data/simple-genome/reference.source.gff   |  7 ++
 5 files changed, 178 insertions(+)
 create mode 100644 tests/functional/translate/cram/general.t
 create mode 100644 tests/functional/translate/cram/genes.t
 create mode 100644 tests/functional/translate/data/simple-genome/aa_muts.json
 create mode 100644 tests/functional/translate/data/simple-genome/reference.gb
 create mode 100644 tests/functional/translate/data/simple-genome/reference.source.gff

diff --git a/tests/functional/translate/cram/general.t b/tests/functional/translate/cram/general.t
new file mode 100644
index 000000000..fa389dbcb
--- /dev/null
+++ b/tests/functional/translate/cram/general.t
@@ -0,0 +1,38 @@
+Setup
+
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"
+  $ export SCRIPTS="$TESTDIR/../../../../scripts"
+  $ export ANC_DATA="$TESTDIR/../../ancestral/data/simple-genome"
+  $ export DATA="$TESTDIR/../data/simple-genome"
+
+General running of augur translate. See the cram test general.t for `augur ancestral`
+which uses many of the same files.
+NOTE: The GFF reference here contains a 'source' ID because without this downstream commands
+which validate the output will fail as it's missing a 'nuc' annotation.
+
+  $ ${AUGUR} translate \
+  >  --tree "$ANC_DATA/tree.nwk" \
+  >  --ancestral-sequences "$ANC_DATA/nt_muts.ref-seq.json" \
+  >  --reference-sequence "$DATA/reference.source.gff" \
+  >  --output-node-data "aa_muts.json" > /dev/null
+
+  $ python3 "$SCRIPTS/diff_jsons.py" \
+  >   "$DATA/aa_muts.json" \
+  >   "aa_muts.json" \
+  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]"
+  {}
+
+Same as above but using a GenBank file. This changes the 'type' of the annotations,
+but this is irrelevant for Auspice's use and simply reflects the reference source.
+
+  $ ${AUGUR} translate \
+  >  --tree "$ANC_DATA/tree.nwk" \
+  >  --ancestral-sequences "$ANC_DATA/nt_muts.ref-seq.json" \
+  >  --reference-sequence "$DATA/reference.gb" \
+  >  --output-node-data "aa_muts.genbank.json" > /dev/null
+
+  $ python3 "$SCRIPTS/diff_jsons.py" \
+  >   "$DATA/aa_muts.json" \
+  >   "aa_muts.genbank.json" \
+  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" "root\['annotations'\]\['.+'\]\['type'\]" 
+  {}
diff --git a/tests/functional/translate/cram/genes.t b/tests/functional/translate/cram/genes.t
new file mode 100644
index 000000000..aac2e8748
--- /dev/null
+++ b/tests/functional/translate/cram/genes.t
@@ -0,0 +1,48 @@
+Setup
+
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"
+  $ export SCRIPTS="$TESTDIR/../../../../scripts"
+  $ export ANC_DATA="$TESTDIR/../../ancestral/data/simple-genome"
+  $ export DATA="$TESTDIR/../data/simple-genome"
+
+Similar tests to those in `general.t` but here testing the --genes argument.
+Note that the output is a little misleading, as it's counting the 'source' GFF ID
+as a feature ('nuc' in this case)
+
+  $ ${AUGUR} translate \
+  >  --tree "$ANC_DATA/tree.nwk" \
+  >  --ancestral-sequences "$ANC_DATA/nt_muts.ref-seq.json" \
+  >  --reference-sequence "$DATA/reference.source.gff" \
+  >  --genes gene2 gene3 \
+  >  --output-node-data "aa_muts.genes-args.json"
+  Validating schema of .+ (re)
+  Couldn't find gene gene3 in GFF or GenBank file
+  Read in 2 features from reference sequence file
+  amino acid mutations written to .+ (re)
+
+  $ python3 "$SCRIPTS/diff_jsons.py" \
+  >   "$DATA/aa_muts.json" \
+  >   "aa_muts.genes-args.json" \
+  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]"
+  {'dictionary_item_removed': [root['annotations']['gene1'], root['nodes']['node_AB']['aa_muts']['gene1'], root['nodes']['node_root']['aa_sequences']['gene1'], root['nodes']['sample_A']['aa_muts']['gene1'], root['nodes']['sample_B']['aa_muts']['gene1'], root['nodes']['sample_C']['aa_muts']['gene1'], root['reference']['gene1']]}
+
+Using a text file rather than command line arguments
+
+  $ echo -e "#comment\ngene2\ngene3"> "genes.txt"
+
+  $ ${AUGUR} translate \
+  >  --tree "$ANC_DATA/tree.nwk" \
+  >  --ancestral-sequences "$ANC_DATA/nt_muts.ref-seq.json" \
+  >  --reference-sequence "$DATA/reference.source.gff" \
+  >  --genes "genes.txt" \
+  >  --output-node-data "aa_muts.genes-txt.json"
+  Read in 2 specified genes to translate.
+  Validating schema of .+ (re)
+  Couldn't find gene gene3 in GFF or GenBank file
+  Read in 2 features from reference sequence file
+  amino acid mutations written to .+ (re)
+
+  $ python3 "$SCRIPTS/diff_jsons.py" \
+  >   "aa_muts.genes-args.json" \
+  >   "aa_muts.genes-txt.json"
+  {}
diff --git a/tests/functional/translate/data/simple-genome/aa_muts.json b/tests/functional/translate/data/simple-genome/aa_muts.json
new file mode 100644
index 000000000..aa3c82801
--- /dev/null
+++ b/tests/functional/translate/data/simple-genome/aa_muts.json
@@ -0,0 +1,70 @@
+{
+  "annotations": {
+    "gene1": {
+      "end": 24,
+      "seqid": "data/reference.source.gff",
+      "start": 10,
+      "strand": "+",
+      "type": "gene"
+    },
+    "gene2": {
+      "end": 47,
+      "seqid": "data/reference.source.gff",
+      "start": 36,
+      "strand": "-",
+      "type": "gene"
+    },
+    "nuc": {
+      "end": 50,
+      "seqid": "data/reference.source.gff",
+      "start": 1,
+      "strand": "+",
+      "type": "source"
+    }
+  },
+  "generated_by": {
+    "program": "augur",
+    "version": "23.1.1"
+  },
+  "nodes": {
+    "node_AB": {
+      "aa_muts": {
+        "gene1": [
+          "P2L"
+        ],
+        "gene2": [
+          "V2D"
+        ]
+      }
+    },
+    "node_root": {
+      "aa_muts": {},
+      "aa_sequences": {
+        "gene1": "MPCG*",
+        "gene2": "MVK*"
+      }
+    },
+    "sample_A": {
+      "aa_muts": {
+        "gene1": [],
+        "gene2": []
+      }
+    },
+    "sample_B": {
+      "aa_muts": {
+        "gene1": [],
+        "gene2": []
+      }
+    },
+    "sample_C": {
+      "aa_muts": {
+        "gene1": [],
+        "gene2": []
+      }
+    }
+  },
+  "reference": {
+    "gene1": "MPCG*",
+    "gene2": "MVK*"
+  }
+}
\ No newline at end of file
diff --git a/tests/functional/translate/data/simple-genome/reference.gb b/tests/functional/translate/data/simple-genome/reference.gb
new file mode 100644
index 000000000..81560bc86
--- /dev/null
+++ b/tests/functional/translate/data/simple-genome/reference.gb
@@ -0,0 +1,15 @@
+LOCUS       Simple-genome             50 bp    DNA     linear   VRL 05-DEC-2023
+DEFINITION  Synthetic genome used for Nextstrain testing
+KEYWORDS    .
+FEATURES             Location/Qualifiers
+     source          1..50
+                     /strain="Nextstrain/Simple-genome"
+     CDS             10..24
+                     /gene="gene1"
+                     /translation="MPCG"
+     CDS             complement(36..47)
+                     /gene="gene2"
+                     /translation="MVK"
+ORIGIN
+        1 AAAAAAAAAA TGCCCTGCGG GTAAAAAAAA AAAAACTACT TGACCATAAA
+//
\ No newline at end of file
diff --git a/tests/functional/translate/data/simple-genome/reference.source.gff b/tests/functional/translate/data/simple-genome/reference.source.gff
new file mode 100644
index 000000000..bbe1084a7
--- /dev/null
+++ b/tests/functional/translate/data/simple-genome/reference.source.gff
@@ -0,0 +1,7 @@
+##gff-version 3
+##created by james hadfield for testing NextStrain (December 2023)
+##sequence-region reference_name 1 50
+reference_name	RefSeq	region	1	50	.	+	.	ID=reference_name
+reference_name	RefSeq	source	1	50	.	+	.	ID=reference_name;locus_tag="https://github.com/nextstrain/augur/issues/1349";Note1="Source isn't really a GFF ID, but is required for Nextstrain to function correctly"
+reference_name	RefSeq	gene	10	24	.	+	.	Name=gene1;gene=gene1
+reference_name	RefSeq	gene	36	47	.	-	.	Name=gene2;gene=gene2

From 112e73affb83a356bbf3407b998412976a1aa460 Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Mon, 4 Dec 2023 17:21:42 +1300
Subject: [PATCH 4/7] [translate] make required arguments required

---
 CHANGES.md         | 7 +++++++
 augur/translate.py | 4 ++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 2ab4c8d8c..663d53990 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -9,7 +9,14 @@
      * Incompatible arguments are now checked, especially related to VCF vs FASTA inputs. 
      * `--vcf-reference` and `--root-sequence` are now mutually exclusive.
 
+
+### Bug Fixes
+
+* translate: Improvements to command line arguments.  [#1348][] (@jameshadfield)
+    * `--tree` and `--ancestral-sequences` are now required arguments.
+
 [#1344]: https://github.com/nextstrain/augur/pull/1344
+[#1348]: https://github.com/nextstrain/augur/pull/1348
 
 ## 23.1.1 (7 November 2023)
 
diff --git a/augur/translate.py b/augur/translate.py
index 568293137..06e71b987 100644
--- a/augur/translate.py
+++ b/augur/translate.py
@@ -317,8 +317,8 @@ def get_genes_from_file(fname):
 
 def register_parser(parent_subparsers):
     parser = parent_subparsers.add_parser("translate", help=__doc__)
-    parser.add_argument('--tree', help="prebuilt Newick -- no tree will be built if provided")
-    parser.add_argument('--ancestral-sequences', type=str, help='JSON (fasta input) or VCF (VCF input) containing ancestral and tip sequences')
+    parser.add_argument('--tree', required=True, help="prebuilt Newick -- no tree will be built if provided")
+    parser.add_argument('--ancestral-sequences', required=True, type=str, help='JSON (fasta input) or VCF (VCF input) containing ancestral and tip sequences')
     parser.add_argument('--reference-sequence', required=True,
                         help='GenBank or GFF file containing the annotation')
     parser.add_argument('--genes', nargs='+', help="genes to translate (list or file containing list)")

From efb43b098ad7013a198d3599a5600cda98161d7b Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Mon, 4 Dec 2023 19:52:08 +1300
Subject: [PATCH 5/7] [translate] separate out VCF-specific arguments

This helps readability of the --help content. We enforce this behaviour
by adding a check in the code following the example in `augur ancestral`
---
 CHANGES.md         |  1 +
 augur/translate.py | 22 ++++++++++++++++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 663d53990..571fe95ea 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -14,6 +14,7 @@
 
 * translate: Improvements to command line arguments.  [#1348][] (@jameshadfield)
     * `--tree` and `--ancestral-sequences` are now required arguments.
+    * separate VCF-only arguments into their own group
 
 [#1344]: https://github.com/nextstrain/augur/pull/1344
 [#1348]: https://github.com/nextstrain/augur/pull/1348
diff --git a/augur/translate.py b/augur/translate.py
index 06e71b987..521982dc6 100644
--- a/augur/translate.py
+++ b/augur/translate.py
@@ -19,6 +19,7 @@
 from .io.vcf import write_VCF_translation
 from .utils import read_node_data, load_features, write_json, get_json_name
 from treetime.vcf_utils import read_vcf
+from augur.errors import AugurError
 
 class MissingNodeError(Exception):
     pass
@@ -326,10 +327,25 @@ def register_parser(parent_subparsers):
     parser.add_argument('--alignment-output', type=str, help="write out translated gene alignments. "
                                    "If a VCF-input, a .vcf or .vcf.gz will be output here (depending on file ending). If fasta-input, specify the file name "
                                    "like so: 'my_alignment_%%GENE.fasta', where '%%GENE' will be replaced by the name of the gene")
-    parser.add_argument('--vcf-reference-output', type=str, help="fasta file where reference sequence translations for VCF input will be written")
-    parser.add_argument('--vcf-reference', type=str, help='fasta file of the sequence the VCF was mapped to')
+    vcf_only = parser.add_argument_group(
+        title="VCF specific",
+        description="These arguments are only applicable if the input (--ancestral-sequences) is in VCF format."
+    )
+    vcf_only.add_argument('--vcf-reference', type=str, help='fasta file of the sequence the VCF was mapped to')
+    vcf_only.add_argument('--vcf-reference-output', type=str, help="fasta file where reference sequence translations for VCF input will be written")
+
     return parser
 
+def check_arg_combinations(args, is_vcf):
+    """
+    Check that provided arguments are compatible.
+    Where possible we use argparse built-ins, but they don't cover everything we want to check.
+    This checking shouldn't be used by downstream code to assume arguments exist, however by checking for
+    invalid combinations up-front we can exit quickly.
+    """
+    if not is_vcf and (args.vcf_reference or args.vcf_reference_output):
+        raise AugurError("Arguments '--vcf-reference' and/or '--vcf-reference-output' are only applicable if the input ('--ancestral-sequences') is VCF")
+
 
 def run(args):
     ## read tree and data, if reading data fails, return with error code
@@ -362,6 +378,8 @@ def run(args):
             if 'sequence' in v:
                 sequences[k] = v['sequence']
 
+    check_arg_combinations(args, is_vcf)
+
     ## load features; only requested features if genes given
     features = load_features(args.reference_sequence, genes)
     if features is None:

From 0a01c596b0567fb2bcc615e961fc38402d9a012b Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Thu, 30 Nov 2023 17:12:31 +1300
Subject: [PATCH 6/7] [translate] refactor sequence extraction

Simplifies the reading of the main function, and follows the established
pattern in this file of using a pair of functions (one for VCF, one for
JSON/FASTA). Added docstrings. The only code change is to throw an
AugurError rather than `return 1`, but the result is unchanged.
---
 augur/translate.py | 53 +++++++++++++++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 19 deletions(-)

diff --git a/augur/translate.py b/augur/translate.py
index 521982dc6..fbde1ab8c 100644
--- a/augur/translate.py
+++ b/augur/translate.py
@@ -315,6 +315,34 @@ def get_genes_from_file(fname):
 
     return unique_genes
 
+def sequences_vcf(reference_fasta, vcf):
+    """
+    Extract the nucleotide variation in the VCF
+    Returns a tuple
+    [0] The sequences as a dict of dicts. sequences → <NODE_NAME> → <POS> → <ALT_NUC> where <POS> is a 0-based int
+    [1] The sequence of the provided `reference_fasta` (string)
+    """
+    if not reference_fasta:
+        raise AugurError("A reference Fasta is required with VCF-format input")
+    compress_seq = read_vcf(vcf, reference_fasta)
+    sequences = compress_seq['sequences']
+    ref = compress_seq['reference']
+    return (sequences, ref)
+
+def sequences_json(node_data_json, tree):
+    """
+    Extract the full nuc sequence for each node in the provided node-data JSON.
+    Returns a dict, keys are node names and values are a string of the genome sequence (nuc)
+    """
+    node_data = read_node_data(node_data_json, tree)
+    if node_data is None:
+        raise AugurError("could not read node data (incl sequences)")
+    # extract sequences from node meta data
+    sequences = {}
+    for k,v in node_data['nodes'].items():
+        if 'sequence' in v:
+            sequences[k] = v['sequence']
+    return sequences
 
 def register_parser(parent_subparsers):
     parser = parent_subparsers.add_parser("translate", help=__doc__)
@@ -358,27 +386,14 @@ def run(args):
         genes = args.genes
 
     ## check file format and read in sequences
-    is_vcf = False
-    if any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
-        if not args.vcf_reference:
-            print("ERROR: a reference Fasta is required with VCF-format input")
-            return 1
-        compress_seq = read_vcf(args.ancestral_sequences, args.vcf_reference)
-        sequences = compress_seq['sequences']
-        ref = compress_seq['reference']
-        is_vcf = True
+    is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']])
+    check_arg_combinations(args, is_vcf)
+
+    if is_vcf:
+        (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences)
     else:
-        node_data = read_node_data(args.ancestral_sequences, args.tree)
-        if node_data is None:
-            print("ERROR: could not read node data (incl sequences)")
-            return 1
-        # extract sequences from node meta data
-        sequences = {}
-        for k,v in node_data['nodes'].items():
-            if 'sequence' in v:
-                sequences[k] = v['sequence']
+        sequences = sequences_json(args.ancestral_sequences, args.tree)
 
-    check_arg_combinations(args, is_vcf)
 
     ## load features; only requested features if genes given
     features = load_features(args.reference_sequence, genes)

From 03c19657c12383cfffc81a7e933d269ade99ab99 Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Fri, 1 Dec 2023 12:14:29 +1300
Subject: [PATCH 7/7] [translate] refactor & improve readability

There should be no functional changes. Co-locating the sequence reading
& feature extraction is easier to read, and were Python's scoping to be
different it'd be even nicer as we wouldn't leave around variables which
are never re-used.

As part of this `translate_vcf_feature` has changed from using an
undocumented `return None` to raising a (documented) error which (IMO)
is easier to reason with.
---
 CHANGES.md                                    |  3 +-
 augur/translate.py                            | 72 ++++++++++---------
 tests/functional/translate/cram/genes.t       |  4 +-
 .../translate/cram/translate-with-genbank.t   |  2 +-
 .../cram/translate-with-gff-and-gene-name.t   |  2 +-
 .../cram/translate-with-gff-and-gene.t        |  2 +-
 .../cram/translate-with-gff-and-locus-tag.t   |  2 +-
 7 files changed, 48 insertions(+), 39 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 571fe95ea..02ad665f3 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -8,10 +8,11 @@
 * ancestral: Improvements to command line arguments. [#1344][] (@jameshadfield)
      * Incompatible arguments are now checked, especially related to VCF vs FASTA inputs. 
      * `--vcf-reference` and `--root-sequence` are now mutually exclusive.
-
+* translate: Tree nodes are checked against the node-data JSON input to ensure sequences are present. [#1348][] (@jameshadfield)
 
 ### Bug Fixes
 
+* translate: The 'source' ID for GFF files is now ignored as a potential gene feature (it is still used for overall nuc coords). [#1348][] (@jameshadfield)
 * translate: Improvements to command line arguments.  [#1348][] (@jameshadfield)
     * `--tree` and `--ancestral-sequences` are now required arguments.
     * separate VCF-only arguments into their own group
diff --git a/augur/translate.py b/augur/translate.py
index fbde1ab8c..8c7253c39 100644
--- a/augur/translate.py
+++ b/augur/translate.py
@@ -20,6 +20,7 @@
 from .utils import read_node_data, load_features, write_json, get_json_name
 from treetime.vcf_utils import read_vcf
 from augur.errors import AugurError
+from textwrap import dedent
 
 class MissingNodeError(Exception):
     pass
@@ -27,6 +28,9 @@ class MissingNodeError(Exception):
 class MismatchNodeError(Exception):
     pass
 
+class NoVariationError(Exception):
+    pass
+
 def safe_translate(sequence, report_exceptions=False):
     """Returns an amino acid translation of the given nucleotide sequence accounting
     for gaps in the given sequence.
@@ -125,7 +129,7 @@ def translate_feature(aln, feature):
     return translations
 
 
-def translate_vcf_feature(sequences, ref, feature):
+def translate_vcf_feature(sequences, ref, feature, feature_name):
     '''Translates a subsequence of input nucleotide sequences.
 
     Parameters
@@ -145,6 +149,9 @@ def translate_vcf_feature(sequences, ref, feature):
         translated reference gene, positions of AA differences, and AA
         differences indexed by node name
 
+    Raises
+    ------
+    NoVariationError : if no variable sites within this feature (across all sequences)
     '''
     def str_reverse_comp(str_seq):
         #gets reverse-compliment of a string and returns it as a string
@@ -161,7 +168,7 @@ def str_reverse_comp(str_seq):
     # Need to get ref translation to store. check if multiple of 3 for sanity.
     # will be padded in safe_translate if not
     if len(refNuc)%3:
-        print("Gene length of {} is not a multiple of 3. will pad with N".format(feature.qualifiers['Name'][0]), file=sys.stderr)
+        print(f"Gene length of {feature_name!r} is not a multiple of 3. will pad with N", file=sys.stderr)
 
     ref_aa_seq = safe_translate(refNuc)
     prot['reference'] = ref_aa_seq
@@ -205,11 +212,10 @@ def str_reverse_comp(str_seq):
 
     prot['positions'].sort()
 
-    #if no variable sites, exclude this gene
+    # raise an error if no variable sites observed
     if len(prot['positions']) == 0:
-        return None
-    else:
-        return prot
+        raise NoVariationError()
+    return prot
 
 def construct_mut(start, pos, end):
     return str(start) + str(pos) + str(end)
@@ -334,7 +340,7 @@ def sequences_json(node_data_json, tree):
     Extract the full nuc sequence for each node in the provided node-data JSON.
     Returns a dict, keys are node names and values are a string of the genome sequence (nuc)
     """
-    node_data = read_node_data(node_data_json, tree)
+    node_data = read_node_data(node_data_json)
     if node_data is None:
         raise AugurError("could not read node data (incl sequences)")
     # extract sequences from node meta data
@@ -342,6 +348,14 @@ def sequences_json(node_data_json, tree):
     for k,v in node_data['nodes'].items():
         if 'sequence' in v:
             sequences[k] = v['sequence']
+    tree_nodes = {c.name for c in tree.find_clades()}
+    tree_nodes_missing_sequences = tree_nodes - set(sequences.keys())
+    if len(tree_nodes_missing_sequences):
+        raise AugurError(dedent(f"""\
+            {len(tree_nodes_missing_sequences)} nodes on the tree are missing nucleotide sequences in the node-data JSON.
+            These must be present under 'nodes' → <node_name> → 'sequence'.
+            This error may originate from using 'augur ancestral' with VCF input; in this case try using VCF output from that command here.
+            """))
     return sequences
 
 def register_parser(parent_subparsers):
@@ -378,6 +392,8 @@ def check_arg_combinations(args, is_vcf):
 def run(args):
     ## read tree and data, if reading data fails, return with error code
     tree = Phylo.read(args.tree, 'newick')
+    is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']])
+    check_arg_combinations(args, is_vcf)
 
     # If genes is a file, read in the genes to translate
     if args.genes and len(args.genes) == 1 and os.path.isfile(args.genes[0]):
@@ -385,16 +401,6 @@ def run(args):
     else:
         genes = args.genes
 
-    ## check file format and read in sequences
-    is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']])
-    check_arg_combinations(args, is_vcf)
-
-    if is_vcf:
-        (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences)
-    else:
-        sequences = sequences_json(args.ancestral_sequences, args.tree)
-
-
     ## load features; only requested features if genes given
     features = load_features(args.reference_sequence, genes)
     if features is None:
@@ -402,22 +408,24 @@ def run(args):
         return 1
     print("Read in {} features from reference sequence file".format(len(features)))
 
-    ### translate every feature - but not 'nuc'!
+    ## Read in sequences & for each sequence translate each feature _except for_ the source (nuc) feature
+    ## Note that `load_features` _only_ extracts {'gene', 'source'} for GFF files, {'CDS', 'source'} for GenBank.
     translations = {}
-    deleted = []
-    for fname, feat in features.items():
-        if is_vcf:
-            trans = translate_vcf_feature(sequences, ref, feat)
-            if trans:
-                translations[fname] = trans
-            else:
-                deleted.append(fname)
-        else:
-            if feat.type != 'source':
-                translations[fname] = translate_feature(sequences, feat)
-
-    if len(deleted) != 0:
-        print("{} genes had no mutations and so have been be excluded.".format(len(deleted)))
+    if is_vcf:
+        (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences)
+        features_without_variation = []
+        for fname, feat in features.items():
+            if feat.type=='source':
+                continue
+            try:
+                translations[fname] = translate_vcf_feature(sequences, ref, feat, fname)
+            except NoVariationError:
+                features_without_variation.append(fname)
+        if len(features_without_variation):
+            print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation)))  
+    else:
+        sequences = sequences_json(args.ancestral_sequences, tree)
+        translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if feat.type != 'source'}
 
     ## glob the annotations for later auspice export
     #
diff --git a/tests/functional/translate/cram/genes.t b/tests/functional/translate/cram/genes.t
index aac2e8748..3a5b866bd 100644
--- a/tests/functional/translate/cram/genes.t
+++ b/tests/functional/translate/cram/genes.t
@@ -15,9 +15,9 @@ as a feature ('nuc' in this case)
   >  --reference-sequence "$DATA/reference.source.gff" \
   >  --genes gene2 gene3 \
   >  --output-node-data "aa_muts.genes-args.json"
-  Validating schema of .+ (re)
   Couldn't find gene gene3 in GFF or GenBank file
   Read in 2 features from reference sequence file
+  Validating schema of .+ (re)
   amino acid mutations written to .+ (re)
 
   $ python3 "$SCRIPTS/diff_jsons.py" \
@@ -37,9 +37,9 @@ Using a text file rather than command line arguments
   >  --genes "genes.txt" \
   >  --output-node-data "aa_muts.genes-txt.json"
   Read in 2 specified genes to translate.
-  Validating schema of .+ (re)
   Couldn't find gene gene3 in GFF or GenBank file
   Read in 2 features from reference sequence file
+  Validating schema of .+ (re)
   amino acid mutations written to .+ (re)
 
   $ python3 "$SCRIPTS/diff_jsons.py" \
diff --git a/tests/functional/translate/cram/translate-with-genbank.t b/tests/functional/translate/cram/translate-with-genbank.t
index 9115b6f31..b68b134f9 100644
--- a/tests/functional/translate/cram/translate-with-genbank.t
+++ b/tests/functional/translate/cram/translate-with-genbank.t
@@ -12,8 +12,8 @@ Translate amino acids for genes using a GenBank file.
   >   --reference-sequence "$DATA/zika/zika_outgroup.gb" \
   >   --genes CA PRO \
   >   --output-node-data aa_muts.json
-  Validating schema of '.+nt_muts.json'... (re)
   Read in 3 features from reference sequence file
+  Validating schema of '.+nt_muts.json'... (re)
   amino acid mutations written to .* (re)
 
   $ python3 "$SCRIPTS/diff_jsons.py" $DATA/zika/aa_muts_genbank.json aa_muts.json \
diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t
index 62e6acf38..9cb273bc5 100644
--- a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t
+++ b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t
@@ -18,8 +18,8 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
   >   --ancestral-sequences "${DATA}/zika/nt_muts.json" \
   >   --reference-sequence "genemap.gff" \
   >   --output-node-data aa_muts.json
-  Validating schema of '.+/nt_muts.json'... (re)
   Read in 2 features from reference sequence file
+  Validating schema of '.+/nt_muts.json'... (re)
   amino acid mutations written to .* (re)
 
 Other than the sequence ids which will include a temporary path, the JSONs
diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene.t b/tests/functional/translate/cram/translate-with-gff-and-gene.t
index ca32639ed..2c7d1d016 100644
--- a/tests/functional/translate/cram/translate-with-gff-and-gene.t
+++ b/tests/functional/translate/cram/translate-with-gff-and-gene.t
@@ -18,8 +18,8 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
   >   --ancestral-sequences "${DATA}/zika/nt_muts.json" \
   >   --reference-sequence genemap.gff \
   >   --output-node-data aa_muts.json
-  Validating schema of '.+/nt_muts.json'... (re)
   Read in 2 features from reference sequence file
+  Validating schema of '.+/nt_muts.json'... (re)
   amino acid mutations written to .* (re)
 
   $ python3 "${SCRIPTS}/diff_jsons.py" \
diff --git a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t
index 8f7ede91c..e58ea8979 100644
--- a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t
+++ b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t
@@ -15,7 +15,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
   >   --output-node-data aa_muts.json \
   >   --alignment-output translations.vcf \
   >   --vcf-reference-output translations_reference.fasta
-  Gene length of rrs_Rvnr01 is not a multiple of 3. will pad with N
+  Gene length of 'rrs' is not a multiple of 3. will pad with N
   Read in 187 specified genes to translate.
   Read in 187 features from reference sequence file
   162 genes had no mutations and so have been be excluded.