From 02ed0674f7d2d03ebb02d20bc11b024ce44bdbe9 Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Mon, 4 Dec 2023 15:27:33 +1300 Subject: [PATCH 1/7] [ancestral] add general (simple-genome) tests While we had tests that (probably?) covered the code here, using a simple genome is much easier to reason with and to validate every mutation. In my opinion it's also important to see tests using "big" datasets (e.g. those bigger than we can manually verify each and every mutation) as not necessarily testing the accuracy of the command, but rather testing for regressions / changes in behaviour. VCF-input tests using the same data will be added in a future commit because (spoiler!) there are bugs which need fixing. --- tests/functional/ancestral/cram/general.t | 39 ++++++++++++++ .../data/simple-genome/nt_muts.ref-seq.json | 54 +++++++++++++++++++ .../data/simple-genome/reference.fasta | 2 + .../data/simple-genome/sequences.fasta | 6 +++ .../ancestral/data/simple-genome/tree.nwk | 1 + 5 files changed, 102 insertions(+) create mode 100644 tests/functional/ancestral/cram/general.t create mode 100644 tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json create mode 100644 tests/functional/ancestral/data/simple-genome/reference.fasta create mode 100644 tests/functional/ancestral/data/simple-genome/sequences.fasta create mode 100644 tests/functional/ancestral/data/simple-genome/tree.nwk diff --git a/tests/functional/ancestral/cram/general.t b/tests/functional/ancestral/cram/general.t new file mode 100644 index 000000000..abe7c7d7f --- /dev/null +++ b/tests/functional/ancestral/cram/general.t @@ -0,0 +1,39 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +General running of augur ancestral. +- Note that the (parsimonious) C18T on the branch of Sample_C is inferred by augur +as a root mutation of C18T + T18C reversion on Node_AB. This is reflected in the +node-data JSON we diff against. +- We supply the refererence sequence so mutations are called at the root. +- Ambiguous nucleotides (there are 3 Ns in sample_C) are inferred (default) + + $ ${AUGUR} ancestral \ + > --tree "$TESTDIR/../data/simple-genome/tree.nwk" \ + > --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \ + > --root-sequence "$TESTDIR/../data/simple-genome/reference.fasta" \ + > --output-node-data "nt_muts.ref-seq.json" \ + > --inference marginal > /dev/null + + + $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \ + > "$TESTDIR/../data/simple-genome/nt_muts.ref-seq.json" \ + > "nt_muts.ref-seq.json" + {} + +Same as above but without a root-sequence so no mutations inferred on the root node +(and thus the inferred reference will be different) + + $ ${AUGUR} ancestral \ + > --tree "$TESTDIR/../data/simple-genome/tree.nwk" \ + > --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \ + > --output-node-data "nt_muts.no-ref-seq.json" \ + > --inference marginal > /dev/null + + + $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \ + > "$TESTDIR/../data/simple-genome/nt_muts.ref-seq.json" \ + > "nt_muts.no-ref-seq.json" \ + > --exclude-paths "root['reference']['nuc']" "root['nodes']['node_root']['muts']" + {} diff --git a/tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json b/tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json new file mode 100644 index 000000000..e3c7b3870 --- /dev/null +++ b/tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json @@ -0,0 +1,54 @@ +{ + "annotations": { + "nuc": { + "end": 50, + "start": 1, + "strand": "+", + "type": "source" + } + }, + "generated_by": { + "program": "augur", + "version": "23.1.1" + }, + "mask": "00000000000000000000000000000000000000000000000000", + "nodes": { + "node_AB": { + "muts": [ + "A7G", + "C14T", + "T18C", + "A33G", + "A43T" + ], + "sequence": "AAAACAGAAATGCTCTGCGGGTAAAAAAAAAAGAACTACTTGTCCATAAA" + }, + "node_root": { + "muts": [ + "A5C", + "C18T" + ], + "sequence": "AAAACAAAAATGCCCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA" + }, + "sample_A": { + "muts": [ + "G33C", + "C39T" + ], + "sequence": "AAAACAGAAATGCTCTGCGGGTAAAAAAAAAACAACTATTTGTCCATAAA" + }, + "sample_B": { + "muts": [ + "G42A" + ], + "sequence": "AAAACAGAAATGCTCTGCGGGTAAAAAAAAAAGAACTACTTATCCATAAA" + }, + "sample_C": { + "muts": [], + "sequence": "AAAACAAAAATGCCCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA" + } + }, + "reference": { + "nuc": "AAAAAAAAAATGCCCTGCGGGTAAAAAAAAAAAAACTACTTGACCATAAA" + } +} \ No newline at end of file diff --git a/tests/functional/ancestral/data/simple-genome/reference.fasta b/tests/functional/ancestral/data/simple-genome/reference.fasta new file mode 100644 index 000000000..b45333e53 --- /dev/null +++ b/tests/functional/ancestral/data/simple-genome/reference.fasta @@ -0,0 +1,2 @@ +>reference_name +AAAAAAAAAATGCCCTGCGGGTAAAAAAAAAAAAACTACTTGACCATAAA diff --git a/tests/functional/ancestral/data/simple-genome/sequences.fasta b/tests/functional/ancestral/data/simple-genome/sequences.fasta new file mode 100644 index 000000000..c515fd272 --- /dev/null +++ b/tests/functional/ancestral/data/simple-genome/sequences.fasta @@ -0,0 +1,6 @@ +>sample_A +AAAACAGAAATGCTCTGCGGGTAAAAAAAAAACAACTATTTGTCCATAAA +>sample_B +AAAACAGAAATGCTCTGCGGGTAAAAAAAAAAGAACTACTTATCCATAAA +>sample_C +AAAACAAAAATGCCCTGTGGGTAAAAANNNAAAAACTACTTGACCATAAA diff --git a/tests/functional/ancestral/data/simple-genome/tree.nwk b/tests/functional/ancestral/data/simple-genome/tree.nwk new file mode 100644 index 000000000..b847e97e4 --- /dev/null +++ b/tests/functional/ancestral/data/simple-genome/tree.nwk @@ -0,0 +1 @@ +(sample_C:0.02,(sample_B:0.02,sample_A:0.02)node_AB:0.06)node_root:0.02; \ No newline at end of file From 0987264f9b172b41942ec4a7c5092df100a386e2 Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Mon, 11 Dec 2023 10:29:34 +1300 Subject: [PATCH 2/7] [translate] update test syntax / format The move to "Creating files in the initial working directory" is motivated by and . Additionally, I remove the pushd commands which were confusing (there were multiple!) and use variables to refer to common directories to improve readability. --- tests/functional/translate/cram/_setup.sh | 2 -- .../translate/cram/translate-with-genbank.t | 19 ++++++++------ .../cram/translate-with-gff-and-gene-name.t | 23 +++++++++-------- .../cram/translate-with-gff-and-gene.t | 24 ++++++++++-------- .../cram/translate-with-gff-and-locus-tag.t | 25 +++++++++++-------- 5 files changed, 50 insertions(+), 43 deletions(-) delete mode 100644 tests/functional/translate/cram/_setup.sh diff --git a/tests/functional/translate/cram/_setup.sh b/tests/functional/translate/cram/_setup.sh deleted file mode 100644 index 19d57534d..000000000 --- a/tests/functional/translate/cram/_setup.sh +++ /dev/null @@ -1,2 +0,0 @@ -pushd "$TESTDIR/../../" > /dev/null -export AUGUR="${AUGUR:-../../bin/augur}" diff --git a/tests/functional/translate/cram/translate-with-genbank.t b/tests/functional/translate/cram/translate-with-genbank.t index d4dec07e7..9115b6f31 100644 --- a/tests/functional/translate/cram/translate-with-genbank.t +++ b/tests/functional/translate/cram/translate-with-genbank.t @@ -1,18 +1,21 @@ Setup - $ pushd "$TESTDIR" > /dev/null - $ source _setup.sh + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + $ export DATA="$TESTDIR/../data" + $ export SCRIPTS="$TESTDIR/../../../../scripts" Translate amino acids for genes using a GenBank file. $ ${AUGUR} translate \ - > --tree translate/data/zika/tree.nwk \ - > --ancestral-sequences translate/data/zika/nt_muts.json \ - > --reference-sequence translate/data/zika/zika_outgroup.gb \ + > --tree "$DATA/zika/tree.nwk" \ + > --ancestral-sequences "$DATA/zika/nt_muts.json" \ + > --reference-sequence "$DATA/zika/zika_outgroup.gb" \ > --genes CA PRO \ - > --output-node-data $TMP/aa_muts.json - Validating schema of 'translate/data/zika/nt_muts.json'... + > --output-node-data aa_muts.json + Validating schema of '.+nt_muts.json'... (re) Read in 3 features from reference sequence file amino acid mutations written to .* (re) - $ python3 "../../scripts/diff_jsons.py" translate/data/zika/aa_muts_genbank.json $TMP/aa_muts.json + + $ python3 "$SCRIPTS/diff_jsons.py" $DATA/zika/aa_muts_genbank.json aa_muts.json \ + > --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" {} diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t index 8796d35c4..62e6acf38 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t +++ b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t @@ -1,11 +1,12 @@ Setup - $ pushd "$TESTDIR" > /dev/null - $ source _setup.sh + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + $ export DATA="$TESTDIR/../data" + $ export SCRIPTS="$TESTDIR/../../../../scripts" Translate amino acids for genes using a GFF3 file where the gene names are stored in a qualifier named "gene_name". - $ cat >$TMP/genemap.gff <<~~ + $ cat >genemap.gff <<~~ > ##gff-version 3 > ##sequence-region PF13/251013_18 1 10769 > PF13/251013_18 GenBank gene 91 456 . + . gene_name="CA" @@ -13,19 +14,19 @@ Translate amino acids for genes using a GFF3 file where the gene names are store > ~~ $ ${AUGUR} translate \ - > --tree translate/data/zika/tree.nwk \ - > --ancestral-sequences translate/data/zika/nt_muts.json \ - > --reference-sequence "$TMP/genemap.gff" \ - > --output-node-data $TMP/aa_muts.json - Validating schema of 'translate/data/zika/nt_muts.json'... + > --tree "${DATA}/zika/tree.nwk" \ + > --ancestral-sequences "${DATA}/zika/nt_muts.json" \ + > --reference-sequence "genemap.gff" \ + > --output-node-data aa_muts.json + Validating schema of '.+/nt_muts.json'... (re) Read in 2 features from reference sequence file amino acid mutations written to .* (re) Other than the sequence ids which will include a temporary path, the JSONs should be identical. - $ python3 "../../scripts/diff_jsons.py" \ + $ python3 "${SCRIPTS}/diff_jsons.py" \ > --exclude-regex-paths "['seqid']" -- \ - > translate/data/zika/aa_muts_gff.json \ - > $TMP/aa_muts.json + > "${DATA}/zika/aa_muts_gff.json" \ + > aa_muts.json {} diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene.t b/tests/functional/translate/cram/translate-with-gff-and-gene.t index 703240e8c..ca32639ed 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-gene.t +++ b/tests/functional/translate/cram/translate-with-gff-and-gene.t @@ -1,11 +1,12 @@ Setup - $ pushd "$TESTDIR" > /dev/null - $ source _setup.sh + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + $ export DATA="$TESTDIR/../data" + $ export SCRIPTS="$TESTDIR/../../../../scripts" Translate amino acids for genes using a GFF3 file where the gene names are stored in a qualifier named "gene". - $ cat >$TMP/genemap.gff <<~~ + $ cat >genemap.gff <<~~ > ##gff-version 3 > ##sequence-region PF13/251013_18 1 10769 > PF13/251013_18 GenBank gene 91 456 . + . gene="CA" @@ -13,15 +14,16 @@ Translate amino acids for genes using a GFF3 file where the gene names are store > ~~ $ ${AUGUR} translate \ - > --tree translate/data/zika/tree.nwk \ - > --ancestral-sequences translate/data/zika/nt_muts.json \ - > --reference-sequence "$TMP/genemap.gff" \ - > --output-node-data $TMP/aa_muts.json - Validating schema of 'translate/data/zika/nt_muts.json'... + > --tree "${DATA}/zika/tree.nwk" \ + > --ancestral-sequences "${DATA}/zika/nt_muts.json" \ + > --reference-sequence genemap.gff \ + > --output-node-data aa_muts.json + Validating schema of '.+/nt_muts.json'... (re) Read in 2 features from reference sequence file amino acid mutations written to .* (re) - $ python3 "../../scripts/diff_jsons.py" \ + + $ python3 "${SCRIPTS}/diff_jsons.py" \ > --exclude-regex-paths "['seqid']" -- \ - > translate/data/zika/aa_muts_gff.json \ - > $TMP/aa_muts.json + > "${DATA}/zika/aa_muts_gff.json" \ + > aa_muts.json {} diff --git a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t index 96d263266..8f7ede91c 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t +++ b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t @@ -1,23 +1,26 @@ Setup - $ pushd "$TESTDIR" > /dev/null - $ source _setup.sh + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + $ export DATA="$TESTDIR/../data" + $ export SCRIPTS="$TESTDIR/../../../../scripts" Translate amino acids for genes using a GFF3 file where the gene names are stored in a qualifier named "locus_tag". $ ${AUGUR} translate \ - > --tree translate/data/tb/tree.nwk \ - > --genes translate/data/tb/genes.txt \ - > --vcf-reference translate/data/tb/ref.fasta \ - > --ancestral-sequences translate/data/tb/nt_muts.vcf \ - > --reference-sequence translate/data/tb/Mtb_H37Rv_NCBI_Annot.gff \ - > --output-node-data $TMP/aa_muts.json \ - > --alignment-output $TMP/translations.vcf \ - > --vcf-reference-output $TMP/translations_reference.fasta + > --tree "${DATA}/tb/tree.nwk" \ + > --genes "${DATA}/tb/genes.txt" \ + > --vcf-reference "${DATA}/tb/ref.fasta" \ + > --ancestral-sequences "${DATA}/tb/nt_muts.vcf" \ + > --reference-sequence "${DATA}/tb/Mtb_H37Rv_NCBI_Annot.gff" \ + > --output-node-data aa_muts.json \ + > --alignment-output translations.vcf \ + > --vcf-reference-output translations_reference.fasta Gene length of rrs_Rvnr01 is not a multiple of 3. will pad with N Read in 187 specified genes to translate. Read in 187 features from reference sequence file 162 genes had no mutations and so have been be excluded. amino acid mutations written to .* (re) - $ python3 "../../scripts/diff_jsons.py" translate/data/tb/aa_muts.json $TMP/aa_muts.json + + $ python3 "${SCRIPTS}/diff_jsons.py" "${DATA}/tb/aa_muts.json" aa_muts.json \ + > --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" {} From d5899d9f471b6fe4a63d14d07a8b3a50777f0c82 Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Mon, 4 Dec 2023 20:28:31 +1300 Subject: [PATCH 3/7] [translate] add general tests Uses the same approach as an earlier commit introduced for `augur ancestral`, but this time for `augur translate`. Tests the `--genes` argument as well for good measure. The exact format of GFF/GenBank files here is extremely important, but future commits will hopefully relax this. --- tests/functional/translate/cram/general.t | 38 ++++++++++ tests/functional/translate/cram/genes.t | 48 +++++++++++++ .../translate/data/simple-genome/aa_muts.json | 70 +++++++++++++++++++ .../translate/data/simple-genome/reference.gb | 15 ++++ .../data/simple-genome/reference.source.gff | 7 ++ 5 files changed, 178 insertions(+) create mode 100644 tests/functional/translate/cram/general.t create mode 100644 tests/functional/translate/cram/genes.t create mode 100644 tests/functional/translate/data/simple-genome/aa_muts.json create mode 100644 tests/functional/translate/data/simple-genome/reference.gb create mode 100644 tests/functional/translate/data/simple-genome/reference.source.gff diff --git a/tests/functional/translate/cram/general.t b/tests/functional/translate/cram/general.t new file mode 100644 index 000000000..fa389dbcb --- /dev/null +++ b/tests/functional/translate/cram/general.t @@ -0,0 +1,38 @@ +Setup + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + $ export SCRIPTS="$TESTDIR/../../../../scripts" + $ export ANC_DATA="$TESTDIR/../../ancestral/data/simple-genome" + $ export DATA="$TESTDIR/../data/simple-genome" + +General running of augur translate. See the cram test general.t for `augur ancestral` +which uses many of the same files. +NOTE: The GFF reference here contains a 'source' ID because without this downstream commands +which validate the output will fail as it's missing a 'nuc' annotation. + + $ ${AUGUR} translate \ + > --tree "$ANC_DATA/tree.nwk" \ + > --ancestral-sequences "$ANC_DATA/nt_muts.ref-seq.json" \ + > --reference-sequence "$DATA/reference.source.gff" \ + > --output-node-data "aa_muts.json" > /dev/null + + $ python3 "$SCRIPTS/diff_jsons.py" \ + > "$DATA/aa_muts.json" \ + > "aa_muts.json" \ + > --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" + {} + +Same as above but using a GenBank file. This changes the 'type' of the annotations, +but this is irrelevant for Auspice's use and simply reflects the reference source. + + $ ${AUGUR} translate \ + > --tree "$ANC_DATA/tree.nwk" \ + > --ancestral-sequences "$ANC_DATA/nt_muts.ref-seq.json" \ + > --reference-sequence "$DATA/reference.gb" \ + > --output-node-data "aa_muts.genbank.json" > /dev/null + + $ python3 "$SCRIPTS/diff_jsons.py" \ + > "$DATA/aa_muts.json" \ + > "aa_muts.genbank.json" \ + > --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" "root\['annotations'\]\['.+'\]\['type'\]" + {} diff --git a/tests/functional/translate/cram/genes.t b/tests/functional/translate/cram/genes.t new file mode 100644 index 000000000..aac2e8748 --- /dev/null +++ b/tests/functional/translate/cram/genes.t @@ -0,0 +1,48 @@ +Setup + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + $ export SCRIPTS="$TESTDIR/../../../../scripts" + $ export ANC_DATA="$TESTDIR/../../ancestral/data/simple-genome" + $ export DATA="$TESTDIR/../data/simple-genome" + +Similar tests to those in `general.t` but here testing the --genes argument. +Note that the output is a little misleading, as it's counting the 'source' GFF ID +as a feature ('nuc' in this case) + + $ ${AUGUR} translate \ + > --tree "$ANC_DATA/tree.nwk" \ + > --ancestral-sequences "$ANC_DATA/nt_muts.ref-seq.json" \ + > --reference-sequence "$DATA/reference.source.gff" \ + > --genes gene2 gene3 \ + > --output-node-data "aa_muts.genes-args.json" + Validating schema of .+ (re) + Couldn't find gene gene3 in GFF or GenBank file + Read in 2 features from reference sequence file + amino acid mutations written to .+ (re) + + $ python3 "$SCRIPTS/diff_jsons.py" \ + > "$DATA/aa_muts.json" \ + > "aa_muts.genes-args.json" \ + > --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" + {'dictionary_item_removed': [root['annotations']['gene1'], root['nodes']['node_AB']['aa_muts']['gene1'], root['nodes']['node_root']['aa_sequences']['gene1'], root['nodes']['sample_A']['aa_muts']['gene1'], root['nodes']['sample_B']['aa_muts']['gene1'], root['nodes']['sample_C']['aa_muts']['gene1'], root['reference']['gene1']]} + +Using a text file rather than command line arguments + + $ echo -e "#comment\ngene2\ngene3"> "genes.txt" + + $ ${AUGUR} translate \ + > --tree "$ANC_DATA/tree.nwk" \ + > --ancestral-sequences "$ANC_DATA/nt_muts.ref-seq.json" \ + > --reference-sequence "$DATA/reference.source.gff" \ + > --genes "genes.txt" \ + > --output-node-data "aa_muts.genes-txt.json" + Read in 2 specified genes to translate. + Validating schema of .+ (re) + Couldn't find gene gene3 in GFF or GenBank file + Read in 2 features from reference sequence file + amino acid mutations written to .+ (re) + + $ python3 "$SCRIPTS/diff_jsons.py" \ + > "aa_muts.genes-args.json" \ + > "aa_muts.genes-txt.json" + {} diff --git a/tests/functional/translate/data/simple-genome/aa_muts.json b/tests/functional/translate/data/simple-genome/aa_muts.json new file mode 100644 index 000000000..aa3c82801 --- /dev/null +++ b/tests/functional/translate/data/simple-genome/aa_muts.json @@ -0,0 +1,70 @@ +{ + "annotations": { + "gene1": { + "end": 24, + "seqid": "data/reference.source.gff", + "start": 10, + "strand": "+", + "type": "gene" + }, + "gene2": { + "end": 47, + "seqid": "data/reference.source.gff", + "start": 36, + "strand": "-", + "type": "gene" + }, + "nuc": { + "end": 50, + "seqid": "data/reference.source.gff", + "start": 1, + "strand": "+", + "type": "source" + } + }, + "generated_by": { + "program": "augur", + "version": "23.1.1" + }, + "nodes": { + "node_AB": { + "aa_muts": { + "gene1": [ + "P2L" + ], + "gene2": [ + "V2D" + ] + } + }, + "node_root": { + "aa_muts": {}, + "aa_sequences": { + "gene1": "MPCG*", + "gene2": "MVK*" + } + }, + "sample_A": { + "aa_muts": { + "gene1": [], + "gene2": [] + } + }, + "sample_B": { + "aa_muts": { + "gene1": [], + "gene2": [] + } + }, + "sample_C": { + "aa_muts": { + "gene1": [], + "gene2": [] + } + } + }, + "reference": { + "gene1": "MPCG*", + "gene2": "MVK*" + } +} \ No newline at end of file diff --git a/tests/functional/translate/data/simple-genome/reference.gb b/tests/functional/translate/data/simple-genome/reference.gb new file mode 100644 index 000000000..81560bc86 --- /dev/null +++ b/tests/functional/translate/data/simple-genome/reference.gb @@ -0,0 +1,15 @@ +LOCUS Simple-genome 50 bp DNA linear VRL 05-DEC-2023 +DEFINITION Synthetic genome used for Nextstrain testing +KEYWORDS . +FEATURES Location/Qualifiers + source 1..50 + /strain="Nextstrain/Simple-genome" + CDS 10..24 + /gene="gene1" + /translation="MPCG" + CDS complement(36..47) + /gene="gene2" + /translation="MVK" +ORIGIN + 1 AAAAAAAAAA TGCCCTGCGG GTAAAAAAAA AAAAACTACT TGACCATAAA +// \ No newline at end of file diff --git a/tests/functional/translate/data/simple-genome/reference.source.gff b/tests/functional/translate/data/simple-genome/reference.source.gff new file mode 100644 index 000000000..bbe1084a7 --- /dev/null +++ b/tests/functional/translate/data/simple-genome/reference.source.gff @@ -0,0 +1,7 @@ +##gff-version 3 +##created by james hadfield for testing NextStrain (December 2023) +##sequence-region reference_name 1 50 +reference_name RefSeq region 1 50 . + . ID=reference_name +reference_name RefSeq source 1 50 . + . ID=reference_name;locus_tag="https://github.com/nextstrain/augur/issues/1349";Note1="Source isn't really a GFF ID, but is required for Nextstrain to function correctly" +reference_name RefSeq gene 10 24 . + . Name=gene1;gene=gene1 +reference_name RefSeq gene 36 47 . - . Name=gene2;gene=gene2 From 112e73affb83a356bbf3407b998412976a1aa460 Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Mon, 4 Dec 2023 17:21:42 +1300 Subject: [PATCH 4/7] [translate] make required arguments required --- CHANGES.md | 7 +++++++ augur/translate.py | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 2ab4c8d8c..663d53990 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -9,7 +9,14 @@ * Incompatible arguments are now checked, especially related to VCF vs FASTA inputs. * `--vcf-reference` and `--root-sequence` are now mutually exclusive. + +### Bug Fixes + +* translate: Improvements to command line arguments. [#1348][] (@jameshadfield) + * `--tree` and `--ancestral-sequences` are now required arguments. + [#1344]: https://github.com/nextstrain/augur/pull/1344 +[#1348]: https://github.com/nextstrain/augur/pull/1348 ## 23.1.1 (7 November 2023) diff --git a/augur/translate.py b/augur/translate.py index 568293137..06e71b987 100644 --- a/augur/translate.py +++ b/augur/translate.py @@ -317,8 +317,8 @@ def get_genes_from_file(fname): def register_parser(parent_subparsers): parser = parent_subparsers.add_parser("translate", help=__doc__) - parser.add_argument('--tree', help="prebuilt Newick -- no tree will be built if provided") - parser.add_argument('--ancestral-sequences', type=str, help='JSON (fasta input) or VCF (VCF input) containing ancestral and tip sequences') + parser.add_argument('--tree', required=True, help="prebuilt Newick -- no tree will be built if provided") + parser.add_argument('--ancestral-sequences', required=True, type=str, help='JSON (fasta input) or VCF (VCF input) containing ancestral and tip sequences') parser.add_argument('--reference-sequence', required=True, help='GenBank or GFF file containing the annotation') parser.add_argument('--genes', nargs='+', help="genes to translate (list or file containing list)") From efb43b098ad7013a198d3599a5600cda98161d7b Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Mon, 4 Dec 2023 19:52:08 +1300 Subject: [PATCH 5/7] [translate] separate out VCF-specific arguments This helps readability of the --help content. We enforce this behaviour by adding a check in the code following the example in `augur ancestral` --- CHANGES.md | 1 + augur/translate.py | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 663d53990..571fe95ea 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -14,6 +14,7 @@ * translate: Improvements to command line arguments. [#1348][] (@jameshadfield) * `--tree` and `--ancestral-sequences` are now required arguments. + * separate VCF-only arguments into their own group [#1344]: https://github.com/nextstrain/augur/pull/1344 [#1348]: https://github.com/nextstrain/augur/pull/1348 diff --git a/augur/translate.py b/augur/translate.py index 06e71b987..521982dc6 100644 --- a/augur/translate.py +++ b/augur/translate.py @@ -19,6 +19,7 @@ from .io.vcf import write_VCF_translation from .utils import read_node_data, load_features, write_json, get_json_name from treetime.vcf_utils import read_vcf +from augur.errors import AugurError class MissingNodeError(Exception): pass @@ -326,10 +327,25 @@ def register_parser(parent_subparsers): parser.add_argument('--alignment-output', type=str, help="write out translated gene alignments. " "If a VCF-input, a .vcf or .vcf.gz will be output here (depending on file ending). If fasta-input, specify the file name " "like so: 'my_alignment_%%GENE.fasta', where '%%GENE' will be replaced by the name of the gene") - parser.add_argument('--vcf-reference-output', type=str, help="fasta file where reference sequence translations for VCF input will be written") - parser.add_argument('--vcf-reference', type=str, help='fasta file of the sequence the VCF was mapped to') + vcf_only = parser.add_argument_group( + title="VCF specific", + description="These arguments are only applicable if the input (--ancestral-sequences) is in VCF format." + ) + vcf_only.add_argument('--vcf-reference', type=str, help='fasta file of the sequence the VCF was mapped to') + vcf_only.add_argument('--vcf-reference-output', type=str, help="fasta file where reference sequence translations for VCF input will be written") + return parser +def check_arg_combinations(args, is_vcf): + """ + Check that provided arguments are compatible. + Where possible we use argparse built-ins, but they don't cover everything we want to check. + This checking shouldn't be used by downstream code to assume arguments exist, however by checking for + invalid combinations up-front we can exit quickly. + """ + if not is_vcf and (args.vcf_reference or args.vcf_reference_output): + raise AugurError("Arguments '--vcf-reference' and/or '--vcf-reference-output' are only applicable if the input ('--ancestral-sequences') is VCF") + def run(args): ## read tree and data, if reading data fails, return with error code @@ -362,6 +378,8 @@ def run(args): if 'sequence' in v: sequences[k] = v['sequence'] + check_arg_combinations(args, is_vcf) + ## load features; only requested features if genes given features = load_features(args.reference_sequence, genes) if features is None: From 0a01c596b0567fb2bcc615e961fc38402d9a012b Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Thu, 30 Nov 2023 17:12:31 +1300 Subject: [PATCH 6/7] [translate] refactor sequence extraction Simplifies the reading of the main function, and follows the established pattern in this file of using a pair of functions (one for VCF, one for JSON/FASTA). Added docstrings. The only code change is to throw an AugurError rather than `return 1`, but the result is unchanged. --- augur/translate.py | 53 +++++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/augur/translate.py b/augur/translate.py index 521982dc6..fbde1ab8c 100644 --- a/augur/translate.py +++ b/augur/translate.py @@ -315,6 +315,34 @@ def get_genes_from_file(fname): return unique_genes +def sequences_vcf(reference_fasta, vcf): + """ + Extract the nucleotide variation in the VCF + Returns a tuple + [0] The sequences as a dict of dicts. sequences → where is a 0-based int + [1] The sequence of the provided `reference_fasta` (string) + """ + if not reference_fasta: + raise AugurError("A reference Fasta is required with VCF-format input") + compress_seq = read_vcf(vcf, reference_fasta) + sequences = compress_seq['sequences'] + ref = compress_seq['reference'] + return (sequences, ref) + +def sequences_json(node_data_json, tree): + """ + Extract the full nuc sequence for each node in the provided node-data JSON. + Returns a dict, keys are node names and values are a string of the genome sequence (nuc) + """ + node_data = read_node_data(node_data_json, tree) + if node_data is None: + raise AugurError("could not read node data (incl sequences)") + # extract sequences from node meta data + sequences = {} + for k,v in node_data['nodes'].items(): + if 'sequence' in v: + sequences[k] = v['sequence'] + return sequences def register_parser(parent_subparsers): parser = parent_subparsers.add_parser("translate", help=__doc__) @@ -358,27 +386,14 @@ def run(args): genes = args.genes ## check file format and read in sequences - is_vcf = False - if any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]): - if not args.vcf_reference: - print("ERROR: a reference Fasta is required with VCF-format input") - return 1 - compress_seq = read_vcf(args.ancestral_sequences, args.vcf_reference) - sequences = compress_seq['sequences'] - ref = compress_seq['reference'] - is_vcf = True + is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]) + check_arg_combinations(args, is_vcf) + + if is_vcf: + (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences) else: - node_data = read_node_data(args.ancestral_sequences, args.tree) - if node_data is None: - print("ERROR: could not read node data (incl sequences)") - return 1 - # extract sequences from node meta data - sequences = {} - for k,v in node_data['nodes'].items(): - if 'sequence' in v: - sequences[k] = v['sequence'] + sequences = sequences_json(args.ancestral_sequences, args.tree) - check_arg_combinations(args, is_vcf) ## load features; only requested features if genes given features = load_features(args.reference_sequence, genes) From 03c19657c12383cfffc81a7e933d269ade99ab99 Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Fri, 1 Dec 2023 12:14:29 +1300 Subject: [PATCH 7/7] [translate] refactor & improve readability There should be no functional changes. Co-locating the sequence reading & feature extraction is easier to read, and were Python's scoping to be different it'd be even nicer as we wouldn't leave around variables which are never re-used. As part of this `translate_vcf_feature` has changed from using an undocumented `return None` to raising a (documented) error which (IMO) is easier to reason with. --- CHANGES.md | 3 +- augur/translate.py | 72 ++++++++++--------- tests/functional/translate/cram/genes.t | 4 +- .../translate/cram/translate-with-genbank.t | 2 +- .../cram/translate-with-gff-and-gene-name.t | 2 +- .../cram/translate-with-gff-and-gene.t | 2 +- .../cram/translate-with-gff-and-locus-tag.t | 2 +- 7 files changed, 48 insertions(+), 39 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 571fe95ea..02ad665f3 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -8,10 +8,11 @@ * ancestral: Improvements to command line arguments. [#1344][] (@jameshadfield) * Incompatible arguments are now checked, especially related to VCF vs FASTA inputs. * `--vcf-reference` and `--root-sequence` are now mutually exclusive. - +* translate: Tree nodes are checked against the node-data JSON input to ensure sequences are present. [#1348][] (@jameshadfield) ### Bug Fixes +* translate: The 'source' ID for GFF files is now ignored as a potential gene feature (it is still used for overall nuc coords). [#1348][] (@jameshadfield) * translate: Improvements to command line arguments. [#1348][] (@jameshadfield) * `--tree` and `--ancestral-sequences` are now required arguments. * separate VCF-only arguments into their own group diff --git a/augur/translate.py b/augur/translate.py index fbde1ab8c..8c7253c39 100644 --- a/augur/translate.py +++ b/augur/translate.py @@ -20,6 +20,7 @@ from .utils import read_node_data, load_features, write_json, get_json_name from treetime.vcf_utils import read_vcf from augur.errors import AugurError +from textwrap import dedent class MissingNodeError(Exception): pass @@ -27,6 +28,9 @@ class MissingNodeError(Exception): class MismatchNodeError(Exception): pass +class NoVariationError(Exception): + pass + def safe_translate(sequence, report_exceptions=False): """Returns an amino acid translation of the given nucleotide sequence accounting for gaps in the given sequence. @@ -125,7 +129,7 @@ def translate_feature(aln, feature): return translations -def translate_vcf_feature(sequences, ref, feature): +def translate_vcf_feature(sequences, ref, feature, feature_name): '''Translates a subsequence of input nucleotide sequences. Parameters @@ -145,6 +149,9 @@ def translate_vcf_feature(sequences, ref, feature): translated reference gene, positions of AA differences, and AA differences indexed by node name + Raises + ------ + NoVariationError : if no variable sites within this feature (across all sequences) ''' def str_reverse_comp(str_seq): #gets reverse-compliment of a string and returns it as a string @@ -161,7 +168,7 @@ def str_reverse_comp(str_seq): # Need to get ref translation to store. check if multiple of 3 for sanity. # will be padded in safe_translate if not if len(refNuc)%3: - print("Gene length of {} is not a multiple of 3. will pad with N".format(feature.qualifiers['Name'][0]), file=sys.stderr) + print(f"Gene length of {feature_name!r} is not a multiple of 3. will pad with N", file=sys.stderr) ref_aa_seq = safe_translate(refNuc) prot['reference'] = ref_aa_seq @@ -205,11 +212,10 @@ def str_reverse_comp(str_seq): prot['positions'].sort() - #if no variable sites, exclude this gene + # raise an error if no variable sites observed if len(prot['positions']) == 0: - return None - else: - return prot + raise NoVariationError() + return prot def construct_mut(start, pos, end): return str(start) + str(pos) + str(end) @@ -334,7 +340,7 @@ def sequences_json(node_data_json, tree): Extract the full nuc sequence for each node in the provided node-data JSON. Returns a dict, keys are node names and values are a string of the genome sequence (nuc) """ - node_data = read_node_data(node_data_json, tree) + node_data = read_node_data(node_data_json) if node_data is None: raise AugurError("could not read node data (incl sequences)") # extract sequences from node meta data @@ -342,6 +348,14 @@ def sequences_json(node_data_json, tree): for k,v in node_data['nodes'].items(): if 'sequence' in v: sequences[k] = v['sequence'] + tree_nodes = {c.name for c in tree.find_clades()} + tree_nodes_missing_sequences = tree_nodes - set(sequences.keys()) + if len(tree_nodes_missing_sequences): + raise AugurError(dedent(f"""\ + {len(tree_nodes_missing_sequences)} nodes on the tree are missing nucleotide sequences in the node-data JSON. + These must be present under 'nodes' → → 'sequence'. + This error may originate from using 'augur ancestral' with VCF input; in this case try using VCF output from that command here. + """)) return sequences def register_parser(parent_subparsers): @@ -378,6 +392,8 @@ def check_arg_combinations(args, is_vcf): def run(args): ## read tree and data, if reading data fails, return with error code tree = Phylo.read(args.tree, 'newick') + is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]) + check_arg_combinations(args, is_vcf) # If genes is a file, read in the genes to translate if args.genes and len(args.genes) == 1 and os.path.isfile(args.genes[0]): @@ -385,16 +401,6 @@ def run(args): else: genes = args.genes - ## check file format and read in sequences - is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]) - check_arg_combinations(args, is_vcf) - - if is_vcf: - (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences) - else: - sequences = sequences_json(args.ancestral_sequences, args.tree) - - ## load features; only requested features if genes given features = load_features(args.reference_sequence, genes) if features is None: @@ -402,22 +408,24 @@ def run(args): return 1 print("Read in {} features from reference sequence file".format(len(features))) - ### translate every feature - but not 'nuc'! + ## Read in sequences & for each sequence translate each feature _except for_ the source (nuc) feature + ## Note that `load_features` _only_ extracts {'gene', 'source'} for GFF files, {'CDS', 'source'} for GenBank. translations = {} - deleted = [] - for fname, feat in features.items(): - if is_vcf: - trans = translate_vcf_feature(sequences, ref, feat) - if trans: - translations[fname] = trans - else: - deleted.append(fname) - else: - if feat.type != 'source': - translations[fname] = translate_feature(sequences, feat) - - if len(deleted) != 0: - print("{} genes had no mutations and so have been be excluded.".format(len(deleted))) + if is_vcf: + (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences) + features_without_variation = [] + for fname, feat in features.items(): + if feat.type=='source': + continue + try: + translations[fname] = translate_vcf_feature(sequences, ref, feat, fname) + except NoVariationError: + features_without_variation.append(fname) + if len(features_without_variation): + print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation))) + else: + sequences = sequences_json(args.ancestral_sequences, tree) + translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if feat.type != 'source'} ## glob the annotations for later auspice export # diff --git a/tests/functional/translate/cram/genes.t b/tests/functional/translate/cram/genes.t index aac2e8748..3a5b866bd 100644 --- a/tests/functional/translate/cram/genes.t +++ b/tests/functional/translate/cram/genes.t @@ -15,9 +15,9 @@ as a feature ('nuc' in this case) > --reference-sequence "$DATA/reference.source.gff" \ > --genes gene2 gene3 \ > --output-node-data "aa_muts.genes-args.json" - Validating schema of .+ (re) Couldn't find gene gene3 in GFF or GenBank file Read in 2 features from reference sequence file + Validating schema of .+ (re) amino acid mutations written to .+ (re) $ python3 "$SCRIPTS/diff_jsons.py" \ @@ -37,9 +37,9 @@ Using a text file rather than command line arguments > --genes "genes.txt" \ > --output-node-data "aa_muts.genes-txt.json" Read in 2 specified genes to translate. - Validating schema of .+ (re) Couldn't find gene gene3 in GFF or GenBank file Read in 2 features from reference sequence file + Validating schema of .+ (re) amino acid mutations written to .+ (re) $ python3 "$SCRIPTS/diff_jsons.py" \ diff --git a/tests/functional/translate/cram/translate-with-genbank.t b/tests/functional/translate/cram/translate-with-genbank.t index 9115b6f31..b68b134f9 100644 --- a/tests/functional/translate/cram/translate-with-genbank.t +++ b/tests/functional/translate/cram/translate-with-genbank.t @@ -12,8 +12,8 @@ Translate amino acids for genes using a GenBank file. > --reference-sequence "$DATA/zika/zika_outgroup.gb" \ > --genes CA PRO \ > --output-node-data aa_muts.json - Validating schema of '.+nt_muts.json'... (re) Read in 3 features from reference sequence file + Validating schema of '.+nt_muts.json'... (re) amino acid mutations written to .* (re) $ python3 "$SCRIPTS/diff_jsons.py" $DATA/zika/aa_muts_genbank.json aa_muts.json \ diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t index 62e6acf38..9cb273bc5 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t +++ b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t @@ -18,8 +18,8 @@ Translate amino acids for genes using a GFF3 file where the gene names are store > --ancestral-sequences "${DATA}/zika/nt_muts.json" \ > --reference-sequence "genemap.gff" \ > --output-node-data aa_muts.json - Validating schema of '.+/nt_muts.json'... (re) Read in 2 features from reference sequence file + Validating schema of '.+/nt_muts.json'... (re) amino acid mutations written to .* (re) Other than the sequence ids which will include a temporary path, the JSONs diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene.t b/tests/functional/translate/cram/translate-with-gff-and-gene.t index ca32639ed..2c7d1d016 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-gene.t +++ b/tests/functional/translate/cram/translate-with-gff-and-gene.t @@ -18,8 +18,8 @@ Translate amino acids for genes using a GFF3 file where the gene names are store > --ancestral-sequences "${DATA}/zika/nt_muts.json" \ > --reference-sequence genemap.gff \ > --output-node-data aa_muts.json - Validating schema of '.+/nt_muts.json'... (re) Read in 2 features from reference sequence file + Validating schema of '.+/nt_muts.json'... (re) amino acid mutations written to .* (re) $ python3 "${SCRIPTS}/diff_jsons.py" \ diff --git a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t index 8f7ede91c..e58ea8979 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t +++ b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t @@ -15,7 +15,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store > --output-node-data aa_muts.json \ > --alignment-output translations.vcf \ > --vcf-reference-output translations_reference.fasta - Gene length of rrs_Rvnr01 is not a multiple of 3. will pad with N + Gene length of 'rrs' is not a multiple of 3. will pad with N Read in 187 specified genes to translate. Read in 187 features from reference sequence file 162 genes had no mutations and so have been be excluded.