diff --git a/CHANGES.md b/CHANGES.md index 2ab4c8d8c..02ad665f3 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -8,8 +8,17 @@ * ancestral: Improvements to command line arguments. [#1344][] (@jameshadfield) * Incompatible arguments are now checked, especially related to VCF vs FASTA inputs. * `--vcf-reference` and `--root-sequence` are now mutually exclusive. +* translate: Tree nodes are checked against the node-data JSON input to ensure sequences are present. [#1348][] (@jameshadfield) + +### Bug Fixes + +* translate: The 'source' ID for GFF files is now ignored as a potential gene feature (it is still used for overall nuc coords). [#1348][] (@jameshadfield) +* translate: Improvements to command line arguments. [#1348][] (@jameshadfield) + * `--tree` and `--ancestral-sequences` are now required arguments. + * separate VCF-only arguments into their own group [#1344]: https://github.com/nextstrain/augur/pull/1344 +[#1348]: https://github.com/nextstrain/augur/pull/1348 ## 23.1.1 (7 November 2023) diff --git a/augur/translate.py b/augur/translate.py index 568293137..8c7253c39 100644 --- a/augur/translate.py +++ b/augur/translate.py @@ -19,6 +19,8 @@ from .io.vcf import write_VCF_translation from .utils import read_node_data, load_features, write_json, get_json_name from treetime.vcf_utils import read_vcf +from augur.errors import AugurError +from textwrap import dedent class MissingNodeError(Exception): pass @@ -26,6 +28,9 @@ class MissingNodeError(Exception): class MismatchNodeError(Exception): pass +class NoVariationError(Exception): + pass + def safe_translate(sequence, report_exceptions=False): """Returns an amino acid translation of the given nucleotide sequence accounting for gaps in the given sequence. @@ -124,7 +129,7 @@ def translate_feature(aln, feature): return translations -def translate_vcf_feature(sequences, ref, feature): +def translate_vcf_feature(sequences, ref, feature, feature_name): '''Translates a subsequence of input nucleotide sequences. Parameters @@ -144,6 +149,9 @@ def translate_vcf_feature(sequences, ref, feature): translated reference gene, positions of AA differences, and AA differences indexed by node name + Raises + ------ + NoVariationError : if no variable sites within this feature (across all sequences) ''' def str_reverse_comp(str_seq): #gets reverse-compliment of a string and returns it as a string @@ -160,7 +168,7 @@ def str_reverse_comp(str_seq): # Need to get ref translation to store. check if multiple of 3 for sanity. # will be padded in safe_translate if not if len(refNuc)%3: - print("Gene length of {} is not a multiple of 3. will pad with N".format(feature.qualifiers['Name'][0]), file=sys.stderr) + print(f"Gene length of {feature_name!r} is not a multiple of 3. will pad with N", file=sys.stderr) ref_aa_seq = safe_translate(refNuc) prot['reference'] = ref_aa_seq @@ -204,11 +212,10 @@ def str_reverse_comp(str_seq): prot['positions'].sort() - #if no variable sites, exclude this gene + # raise an error if no variable sites observed if len(prot['positions']) == 0: - return None - else: - return prot + raise NoVariationError() + return prot def construct_mut(start, pos, end): return str(start) + str(pos) + str(end) @@ -314,11 +321,47 @@ def get_genes_from_file(fname): return unique_genes +def sequences_vcf(reference_fasta, vcf): + """ + Extract the nucleotide variation in the VCF + Returns a tuple + [0] The sequences as a dict of dicts. sequences → where is a 0-based int + [1] The sequence of the provided `reference_fasta` (string) + """ + if not reference_fasta: + raise AugurError("A reference Fasta is required with VCF-format input") + compress_seq = read_vcf(vcf, reference_fasta) + sequences = compress_seq['sequences'] + ref = compress_seq['reference'] + return (sequences, ref) + +def sequences_json(node_data_json, tree): + """ + Extract the full nuc sequence for each node in the provided node-data JSON. + Returns a dict, keys are node names and values are a string of the genome sequence (nuc) + """ + node_data = read_node_data(node_data_json) + if node_data is None: + raise AugurError("could not read node data (incl sequences)") + # extract sequences from node meta data + sequences = {} + for k,v in node_data['nodes'].items(): + if 'sequence' in v: + sequences[k] = v['sequence'] + tree_nodes = {c.name for c in tree.find_clades()} + tree_nodes_missing_sequences = tree_nodes - set(sequences.keys()) + if len(tree_nodes_missing_sequences): + raise AugurError(dedent(f"""\ + {len(tree_nodes_missing_sequences)} nodes on the tree are missing nucleotide sequences in the node-data JSON. + These must be present under 'nodes' → → 'sequence'. + This error may originate from using 'augur ancestral' with VCF input; in this case try using VCF output from that command here. + """)) + return sequences def register_parser(parent_subparsers): parser = parent_subparsers.add_parser("translate", help=__doc__) - parser.add_argument('--tree', help="prebuilt Newick -- no tree will be built if provided") - parser.add_argument('--ancestral-sequences', type=str, help='JSON (fasta input) or VCF (VCF input) containing ancestral and tip sequences') + parser.add_argument('--tree', required=True, help="prebuilt Newick -- no tree will be built if provided") + parser.add_argument('--ancestral-sequences', required=True, type=str, help='JSON (fasta input) or VCF (VCF input) containing ancestral and tip sequences') parser.add_argument('--reference-sequence', required=True, help='GenBank or GFF file containing the annotation') parser.add_argument('--genes', nargs='+', help="genes to translate (list or file containing list)") @@ -326,14 +369,31 @@ def register_parser(parent_subparsers): parser.add_argument('--alignment-output', type=str, help="write out translated gene alignments. " "If a VCF-input, a .vcf or .vcf.gz will be output here (depending on file ending). If fasta-input, specify the file name " "like so: 'my_alignment_%%GENE.fasta', where '%%GENE' will be replaced by the name of the gene") - parser.add_argument('--vcf-reference-output', type=str, help="fasta file where reference sequence translations for VCF input will be written") - parser.add_argument('--vcf-reference', type=str, help='fasta file of the sequence the VCF was mapped to') + vcf_only = parser.add_argument_group( + title="VCF specific", + description="These arguments are only applicable if the input (--ancestral-sequences) is in VCF format." + ) + vcf_only.add_argument('--vcf-reference', type=str, help='fasta file of the sequence the VCF was mapped to') + vcf_only.add_argument('--vcf-reference-output', type=str, help="fasta file where reference sequence translations for VCF input will be written") + return parser +def check_arg_combinations(args, is_vcf): + """ + Check that provided arguments are compatible. + Where possible we use argparse built-ins, but they don't cover everything we want to check. + This checking shouldn't be used by downstream code to assume arguments exist, however by checking for + invalid combinations up-front we can exit quickly. + """ + if not is_vcf and (args.vcf_reference or args.vcf_reference_output): + raise AugurError("Arguments '--vcf-reference' and/or '--vcf-reference-output' are only applicable if the input ('--ancestral-sequences') is VCF") + def run(args): ## read tree and data, if reading data fails, return with error code tree = Phylo.read(args.tree, 'newick') + is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]) + check_arg_combinations(args, is_vcf) # If genes is a file, read in the genes to translate if args.genes and len(args.genes) == 1 and os.path.isfile(args.genes[0]): @@ -341,27 +401,6 @@ def run(args): else: genes = args.genes - ## check file format and read in sequences - is_vcf = False - if any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]): - if not args.vcf_reference: - print("ERROR: a reference Fasta is required with VCF-format input") - return 1 - compress_seq = read_vcf(args.ancestral_sequences, args.vcf_reference) - sequences = compress_seq['sequences'] - ref = compress_seq['reference'] - is_vcf = True - else: - node_data = read_node_data(args.ancestral_sequences, args.tree) - if node_data is None: - print("ERROR: could not read node data (incl sequences)") - return 1 - # extract sequences from node meta data - sequences = {} - for k,v in node_data['nodes'].items(): - if 'sequence' in v: - sequences[k] = v['sequence'] - ## load features; only requested features if genes given features = load_features(args.reference_sequence, genes) if features is None: @@ -369,22 +408,24 @@ def run(args): return 1 print("Read in {} features from reference sequence file".format(len(features))) - ### translate every feature - but not 'nuc'! + ## Read in sequences & for each sequence translate each feature _except for_ the source (nuc) feature + ## Note that `load_features` _only_ extracts {'gene', 'source'} for GFF files, {'CDS', 'source'} for GenBank. translations = {} - deleted = [] - for fname, feat in features.items(): - if is_vcf: - trans = translate_vcf_feature(sequences, ref, feat) - if trans: - translations[fname] = trans - else: - deleted.append(fname) - else: - if feat.type != 'source': - translations[fname] = translate_feature(sequences, feat) - - if len(deleted) != 0: - print("{} genes had no mutations and so have been be excluded.".format(len(deleted))) + if is_vcf: + (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences) + features_without_variation = [] + for fname, feat in features.items(): + if feat.type=='source': + continue + try: + translations[fname] = translate_vcf_feature(sequences, ref, feat, fname) + except NoVariationError: + features_without_variation.append(fname) + if len(features_without_variation): + print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation))) + else: + sequences = sequences_json(args.ancestral_sequences, tree) + translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if feat.type != 'source'} ## glob the annotations for later auspice export # diff --git a/tests/functional/ancestral/cram/general.t b/tests/functional/ancestral/cram/general.t new file mode 100644 index 000000000..abe7c7d7f --- /dev/null +++ b/tests/functional/ancestral/cram/general.t @@ -0,0 +1,39 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +General running of augur ancestral. +- Note that the (parsimonious) C18T on the branch of Sample_C is inferred by augur +as a root mutation of C18T + T18C reversion on Node_AB. This is reflected in the +node-data JSON we diff against. +- We supply the refererence sequence so mutations are called at the root. +- Ambiguous nucleotides (there are 3 Ns in sample_C) are inferred (default) + + $ ${AUGUR} ancestral \ + > --tree "$TESTDIR/../data/simple-genome/tree.nwk" \ + > --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \ + > --root-sequence "$TESTDIR/../data/simple-genome/reference.fasta" \ + > --output-node-data "nt_muts.ref-seq.json" \ + > --inference marginal > /dev/null + + + $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \ + > "$TESTDIR/../data/simple-genome/nt_muts.ref-seq.json" \ + > "nt_muts.ref-seq.json" + {} + +Same as above but without a root-sequence so no mutations inferred on the root node +(and thus the inferred reference will be different) + + $ ${AUGUR} ancestral \ + > --tree "$TESTDIR/../data/simple-genome/tree.nwk" \ + > --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \ + > --output-node-data "nt_muts.no-ref-seq.json" \ + > --inference marginal > /dev/null + + + $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \ + > "$TESTDIR/../data/simple-genome/nt_muts.ref-seq.json" \ + > "nt_muts.no-ref-seq.json" \ + > --exclude-paths "root['reference']['nuc']" "root['nodes']['node_root']['muts']" + {} diff --git a/tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json b/tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json new file mode 100644 index 000000000..e3c7b3870 --- /dev/null +++ b/tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json @@ -0,0 +1,54 @@ +{ + "annotations": { + "nuc": { + "end": 50, + "start": 1, + "strand": "+", + "type": "source" + } + }, + "generated_by": { + "program": "augur", + "version": "23.1.1" + }, + "mask": "00000000000000000000000000000000000000000000000000", + "nodes": { + "node_AB": { + "muts": [ + "A7G", + "C14T", + "T18C", + "A33G", + "A43T" + ], + "sequence": "AAAACAGAAATGCTCTGCGGGTAAAAAAAAAAGAACTACTTGTCCATAAA" + }, + "node_root": { + "muts": [ + "A5C", + "C18T" + ], + "sequence": "AAAACAAAAATGCCCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA" + }, + "sample_A": { + "muts": [ + "G33C", + "C39T" + ], + "sequence": "AAAACAGAAATGCTCTGCGGGTAAAAAAAAAACAACTATTTGTCCATAAA" + }, + "sample_B": { + "muts": [ + "G42A" + ], + "sequence": "AAAACAGAAATGCTCTGCGGGTAAAAAAAAAAGAACTACTTATCCATAAA" + }, + "sample_C": { + "muts": [], + "sequence": "AAAACAAAAATGCCCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA" + } + }, + "reference": { + "nuc": "AAAAAAAAAATGCCCTGCGGGTAAAAAAAAAAAAACTACTTGACCATAAA" + } +} \ No newline at end of file diff --git a/tests/functional/ancestral/data/simple-genome/reference.fasta b/tests/functional/ancestral/data/simple-genome/reference.fasta new file mode 100644 index 000000000..b45333e53 --- /dev/null +++ b/tests/functional/ancestral/data/simple-genome/reference.fasta @@ -0,0 +1,2 @@ +>reference_name +AAAAAAAAAATGCCCTGCGGGTAAAAAAAAAAAAACTACTTGACCATAAA diff --git a/tests/functional/ancestral/data/simple-genome/sequences.fasta b/tests/functional/ancestral/data/simple-genome/sequences.fasta new file mode 100644 index 000000000..c515fd272 --- /dev/null +++ b/tests/functional/ancestral/data/simple-genome/sequences.fasta @@ -0,0 +1,6 @@ +>sample_A +AAAACAGAAATGCTCTGCGGGTAAAAAAAAAACAACTATTTGTCCATAAA +>sample_B +AAAACAGAAATGCTCTGCGGGTAAAAAAAAAAGAACTACTTATCCATAAA +>sample_C +AAAACAAAAATGCCCTGTGGGTAAAAANNNAAAAACTACTTGACCATAAA diff --git a/tests/functional/ancestral/data/simple-genome/tree.nwk b/tests/functional/ancestral/data/simple-genome/tree.nwk new file mode 100644 index 000000000..b847e97e4 --- /dev/null +++ b/tests/functional/ancestral/data/simple-genome/tree.nwk @@ -0,0 +1 @@ +(sample_C:0.02,(sample_B:0.02,sample_A:0.02)node_AB:0.06)node_root:0.02; \ No newline at end of file diff --git a/tests/functional/translate/cram/_setup.sh b/tests/functional/translate/cram/_setup.sh deleted file mode 100644 index 19d57534d..000000000 --- a/tests/functional/translate/cram/_setup.sh +++ /dev/null @@ -1,2 +0,0 @@ -pushd "$TESTDIR/../../" > /dev/null -export AUGUR="${AUGUR:-../../bin/augur}" diff --git a/tests/functional/translate/cram/general.t b/tests/functional/translate/cram/general.t new file mode 100644 index 000000000..fa389dbcb --- /dev/null +++ b/tests/functional/translate/cram/general.t @@ -0,0 +1,38 @@ +Setup + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + $ export SCRIPTS="$TESTDIR/../../../../scripts" + $ export ANC_DATA="$TESTDIR/../../ancestral/data/simple-genome" + $ export DATA="$TESTDIR/../data/simple-genome" + +General running of augur translate. See the cram test general.t for `augur ancestral` +which uses many of the same files. +NOTE: The GFF reference here contains a 'source' ID because without this downstream commands +which validate the output will fail as it's missing a 'nuc' annotation. + + $ ${AUGUR} translate \ + > --tree "$ANC_DATA/tree.nwk" \ + > --ancestral-sequences "$ANC_DATA/nt_muts.ref-seq.json" \ + > --reference-sequence "$DATA/reference.source.gff" \ + > --output-node-data "aa_muts.json" > /dev/null + + $ python3 "$SCRIPTS/diff_jsons.py" \ + > "$DATA/aa_muts.json" \ + > "aa_muts.json" \ + > --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" + {} + +Same as above but using a GenBank file. This changes the 'type' of the annotations, +but this is irrelevant for Auspice's use and simply reflects the reference source. + + $ ${AUGUR} translate \ + > --tree "$ANC_DATA/tree.nwk" \ + > --ancestral-sequences "$ANC_DATA/nt_muts.ref-seq.json" \ + > --reference-sequence "$DATA/reference.gb" \ + > --output-node-data "aa_muts.genbank.json" > /dev/null + + $ python3 "$SCRIPTS/diff_jsons.py" \ + > "$DATA/aa_muts.json" \ + > "aa_muts.genbank.json" \ + > --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" "root\['annotations'\]\['.+'\]\['type'\]" + {} diff --git a/tests/functional/translate/cram/genes.t b/tests/functional/translate/cram/genes.t new file mode 100644 index 000000000..3a5b866bd --- /dev/null +++ b/tests/functional/translate/cram/genes.t @@ -0,0 +1,48 @@ +Setup + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + $ export SCRIPTS="$TESTDIR/../../../../scripts" + $ export ANC_DATA="$TESTDIR/../../ancestral/data/simple-genome" + $ export DATA="$TESTDIR/../data/simple-genome" + +Similar tests to those in `general.t` but here testing the --genes argument. +Note that the output is a little misleading, as it's counting the 'source' GFF ID +as a feature ('nuc' in this case) + + $ ${AUGUR} translate \ + > --tree "$ANC_DATA/tree.nwk" \ + > --ancestral-sequences "$ANC_DATA/nt_muts.ref-seq.json" \ + > --reference-sequence "$DATA/reference.source.gff" \ + > --genes gene2 gene3 \ + > --output-node-data "aa_muts.genes-args.json" + Couldn't find gene gene3 in GFF or GenBank file + Read in 2 features from reference sequence file + Validating schema of .+ (re) + amino acid mutations written to .+ (re) + + $ python3 "$SCRIPTS/diff_jsons.py" \ + > "$DATA/aa_muts.json" \ + > "aa_muts.genes-args.json" \ + > --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" + {'dictionary_item_removed': [root['annotations']['gene1'], root['nodes']['node_AB']['aa_muts']['gene1'], root['nodes']['node_root']['aa_sequences']['gene1'], root['nodes']['sample_A']['aa_muts']['gene1'], root['nodes']['sample_B']['aa_muts']['gene1'], root['nodes']['sample_C']['aa_muts']['gene1'], root['reference']['gene1']]} + +Using a text file rather than command line arguments + + $ echo -e "#comment\ngene2\ngene3"> "genes.txt" + + $ ${AUGUR} translate \ + > --tree "$ANC_DATA/tree.nwk" \ + > --ancestral-sequences "$ANC_DATA/nt_muts.ref-seq.json" \ + > --reference-sequence "$DATA/reference.source.gff" \ + > --genes "genes.txt" \ + > --output-node-data "aa_muts.genes-txt.json" + Read in 2 specified genes to translate. + Couldn't find gene gene3 in GFF or GenBank file + Read in 2 features from reference sequence file + Validating schema of .+ (re) + amino acid mutations written to .+ (re) + + $ python3 "$SCRIPTS/diff_jsons.py" \ + > "aa_muts.genes-args.json" \ + > "aa_muts.genes-txt.json" + {} diff --git a/tests/functional/translate/cram/translate-with-genbank.t b/tests/functional/translate/cram/translate-with-genbank.t index d4dec07e7..b68b134f9 100644 --- a/tests/functional/translate/cram/translate-with-genbank.t +++ b/tests/functional/translate/cram/translate-with-genbank.t @@ -1,18 +1,21 @@ Setup - $ pushd "$TESTDIR" > /dev/null - $ source _setup.sh + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + $ export DATA="$TESTDIR/../data" + $ export SCRIPTS="$TESTDIR/../../../../scripts" Translate amino acids for genes using a GenBank file. $ ${AUGUR} translate \ - > --tree translate/data/zika/tree.nwk \ - > --ancestral-sequences translate/data/zika/nt_muts.json \ - > --reference-sequence translate/data/zika/zika_outgroup.gb \ + > --tree "$DATA/zika/tree.nwk" \ + > --ancestral-sequences "$DATA/zika/nt_muts.json" \ + > --reference-sequence "$DATA/zika/zika_outgroup.gb" \ > --genes CA PRO \ - > --output-node-data $TMP/aa_muts.json - Validating schema of 'translate/data/zika/nt_muts.json'... + > --output-node-data aa_muts.json Read in 3 features from reference sequence file + Validating schema of '.+nt_muts.json'... (re) amino acid mutations written to .* (re) - $ python3 "../../scripts/diff_jsons.py" translate/data/zika/aa_muts_genbank.json $TMP/aa_muts.json + + $ python3 "$SCRIPTS/diff_jsons.py" $DATA/zika/aa_muts_genbank.json aa_muts.json \ + > --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" {} diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t index 8796d35c4..9cb273bc5 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t +++ b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t @@ -1,11 +1,12 @@ Setup - $ pushd "$TESTDIR" > /dev/null - $ source _setup.sh + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + $ export DATA="$TESTDIR/../data" + $ export SCRIPTS="$TESTDIR/../../../../scripts" Translate amino acids for genes using a GFF3 file where the gene names are stored in a qualifier named "gene_name". - $ cat >$TMP/genemap.gff <<~~ + $ cat >genemap.gff <<~~ > ##gff-version 3 > ##sequence-region PF13/251013_18 1 10769 > PF13/251013_18 GenBank gene 91 456 . + . gene_name="CA" @@ -13,19 +14,19 @@ Translate amino acids for genes using a GFF3 file where the gene names are store > ~~ $ ${AUGUR} translate \ - > --tree translate/data/zika/tree.nwk \ - > --ancestral-sequences translate/data/zika/nt_muts.json \ - > --reference-sequence "$TMP/genemap.gff" \ - > --output-node-data $TMP/aa_muts.json - Validating schema of 'translate/data/zika/nt_muts.json'... + > --tree "${DATA}/zika/tree.nwk" \ + > --ancestral-sequences "${DATA}/zika/nt_muts.json" \ + > --reference-sequence "genemap.gff" \ + > --output-node-data aa_muts.json Read in 2 features from reference sequence file + Validating schema of '.+/nt_muts.json'... (re) amino acid mutations written to .* (re) Other than the sequence ids which will include a temporary path, the JSONs should be identical. - $ python3 "../../scripts/diff_jsons.py" \ + $ python3 "${SCRIPTS}/diff_jsons.py" \ > --exclude-regex-paths "['seqid']" -- \ - > translate/data/zika/aa_muts_gff.json \ - > $TMP/aa_muts.json + > "${DATA}/zika/aa_muts_gff.json" \ + > aa_muts.json {} diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene.t b/tests/functional/translate/cram/translate-with-gff-and-gene.t index 703240e8c..2c7d1d016 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-gene.t +++ b/tests/functional/translate/cram/translate-with-gff-and-gene.t @@ -1,11 +1,12 @@ Setup - $ pushd "$TESTDIR" > /dev/null - $ source _setup.sh + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + $ export DATA="$TESTDIR/../data" + $ export SCRIPTS="$TESTDIR/../../../../scripts" Translate amino acids for genes using a GFF3 file where the gene names are stored in a qualifier named "gene". - $ cat >$TMP/genemap.gff <<~~ + $ cat >genemap.gff <<~~ > ##gff-version 3 > ##sequence-region PF13/251013_18 1 10769 > PF13/251013_18 GenBank gene 91 456 . + . gene="CA" @@ -13,15 +14,16 @@ Translate amino acids for genes using a GFF3 file where the gene names are store > ~~ $ ${AUGUR} translate \ - > --tree translate/data/zika/tree.nwk \ - > --ancestral-sequences translate/data/zika/nt_muts.json \ - > --reference-sequence "$TMP/genemap.gff" \ - > --output-node-data $TMP/aa_muts.json - Validating schema of 'translate/data/zika/nt_muts.json'... + > --tree "${DATA}/zika/tree.nwk" \ + > --ancestral-sequences "${DATA}/zika/nt_muts.json" \ + > --reference-sequence genemap.gff \ + > --output-node-data aa_muts.json Read in 2 features from reference sequence file + Validating schema of '.+/nt_muts.json'... (re) amino acid mutations written to .* (re) - $ python3 "../../scripts/diff_jsons.py" \ + + $ python3 "${SCRIPTS}/diff_jsons.py" \ > --exclude-regex-paths "['seqid']" -- \ - > translate/data/zika/aa_muts_gff.json \ - > $TMP/aa_muts.json + > "${DATA}/zika/aa_muts_gff.json" \ + > aa_muts.json {} diff --git a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t index 96d263266..e58ea8979 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t +++ b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t @@ -1,23 +1,26 @@ Setup - $ pushd "$TESTDIR" > /dev/null - $ source _setup.sh + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + $ export DATA="$TESTDIR/../data" + $ export SCRIPTS="$TESTDIR/../../../../scripts" Translate amino acids for genes using a GFF3 file where the gene names are stored in a qualifier named "locus_tag". $ ${AUGUR} translate \ - > --tree translate/data/tb/tree.nwk \ - > --genes translate/data/tb/genes.txt \ - > --vcf-reference translate/data/tb/ref.fasta \ - > --ancestral-sequences translate/data/tb/nt_muts.vcf \ - > --reference-sequence translate/data/tb/Mtb_H37Rv_NCBI_Annot.gff \ - > --output-node-data $TMP/aa_muts.json \ - > --alignment-output $TMP/translations.vcf \ - > --vcf-reference-output $TMP/translations_reference.fasta - Gene length of rrs_Rvnr01 is not a multiple of 3. will pad with N + > --tree "${DATA}/tb/tree.nwk" \ + > --genes "${DATA}/tb/genes.txt" \ + > --vcf-reference "${DATA}/tb/ref.fasta" \ + > --ancestral-sequences "${DATA}/tb/nt_muts.vcf" \ + > --reference-sequence "${DATA}/tb/Mtb_H37Rv_NCBI_Annot.gff" \ + > --output-node-data aa_muts.json \ + > --alignment-output translations.vcf \ + > --vcf-reference-output translations_reference.fasta + Gene length of 'rrs' is not a multiple of 3. will pad with N Read in 187 specified genes to translate. Read in 187 features from reference sequence file 162 genes had no mutations and so have been be excluded. amino acid mutations written to .* (re) - $ python3 "../../scripts/diff_jsons.py" translate/data/tb/aa_muts.json $TMP/aa_muts.json + + $ python3 "${SCRIPTS}/diff_jsons.py" "${DATA}/tb/aa_muts.json" aa_muts.json \ + > --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" {} diff --git a/tests/functional/translate/data/simple-genome/aa_muts.json b/tests/functional/translate/data/simple-genome/aa_muts.json new file mode 100644 index 000000000..aa3c82801 --- /dev/null +++ b/tests/functional/translate/data/simple-genome/aa_muts.json @@ -0,0 +1,70 @@ +{ + "annotations": { + "gene1": { + "end": 24, + "seqid": "data/reference.source.gff", + "start": 10, + "strand": "+", + "type": "gene" + }, + "gene2": { + "end": 47, + "seqid": "data/reference.source.gff", + "start": 36, + "strand": "-", + "type": "gene" + }, + "nuc": { + "end": 50, + "seqid": "data/reference.source.gff", + "start": 1, + "strand": "+", + "type": "source" + } + }, + "generated_by": { + "program": "augur", + "version": "23.1.1" + }, + "nodes": { + "node_AB": { + "aa_muts": { + "gene1": [ + "P2L" + ], + "gene2": [ + "V2D" + ] + } + }, + "node_root": { + "aa_muts": {}, + "aa_sequences": { + "gene1": "MPCG*", + "gene2": "MVK*" + } + }, + "sample_A": { + "aa_muts": { + "gene1": [], + "gene2": [] + } + }, + "sample_B": { + "aa_muts": { + "gene1": [], + "gene2": [] + } + }, + "sample_C": { + "aa_muts": { + "gene1": [], + "gene2": [] + } + } + }, + "reference": { + "gene1": "MPCG*", + "gene2": "MVK*" + } +} \ No newline at end of file diff --git a/tests/functional/translate/data/simple-genome/reference.gb b/tests/functional/translate/data/simple-genome/reference.gb new file mode 100644 index 000000000..81560bc86 --- /dev/null +++ b/tests/functional/translate/data/simple-genome/reference.gb @@ -0,0 +1,15 @@ +LOCUS Simple-genome 50 bp DNA linear VRL 05-DEC-2023 +DEFINITION Synthetic genome used for Nextstrain testing +KEYWORDS . +FEATURES Location/Qualifiers + source 1..50 + /strain="Nextstrain/Simple-genome" + CDS 10..24 + /gene="gene1" + /translation="MPCG" + CDS complement(36..47) + /gene="gene2" + /translation="MVK" +ORIGIN + 1 AAAAAAAAAA TGCCCTGCGG GTAAAAAAAA AAAAACTACT TGACCATAAA +// \ No newline at end of file diff --git a/tests/functional/translate/data/simple-genome/reference.source.gff b/tests/functional/translate/data/simple-genome/reference.source.gff new file mode 100644 index 000000000..bbe1084a7 --- /dev/null +++ b/tests/functional/translate/data/simple-genome/reference.source.gff @@ -0,0 +1,7 @@ +##gff-version 3 +##created by james hadfield for testing NextStrain (December 2023) +##sequence-region reference_name 1 50 +reference_name RefSeq region 1 50 . + . ID=reference_name +reference_name RefSeq source 1 50 . + . ID=reference_name;locus_tag="https://github.com/nextstrain/augur/issues/1349";Note1="Source isn't really a GFF ID, but is required for Nextstrain to function correctly" +reference_name RefSeq gene 10 24 . + . Name=gene1;gene=gene1 +reference_name RefSeq gene 36 47 . - . Name=gene2;gene=gene2