Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(ancestral): Add optional Nextclade GFF3 compatibility mode #1664

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions augur/ancestral.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,10 @@ def register_parser(parent_subparsers):
"Currently only supported for FASTA-input. Specify the file name via a "
"template like 'aa_sequences_%%GENE.fasta' where %%GENE will be replaced "
"by the gene name.")
amino_acid_options_group.add_argument(
'--use-nextclade-gff-parsing', action="store_true", default=False,
help="Read GFF annotations the way Nextclade does, using CDSes (including compound) and same qualifiers for gene names."
)

output_group = parser.add_argument_group(
"outputs",
Expand All @@ -345,8 +349,9 @@ def validate_arguments(args, is_vcf):
This checking shouldn't be used by downstream code to assume arguments exist, however by checking for
invalid combinations up-front we can exit quickly.
"""
aa_arguments = (args.annotation, args.genes, args.translations)
if any(aa_arguments) and not all(aa_arguments):
mandatory_aa_arguments = (args.annotation, args.genes, args.translations)
all_aa_arguments = (*mandatory_aa_arguments, args.use_nextclade_gff_parsing)
if any(all_aa_arguments) and not all(mandatory_aa_arguments):
raise AugurError("For amino acid sequence reconstruction, you must provide an annotation file, a list of genes, and a template path to amino acid sequences.")

if args.output_sequences and args.output_vcf:
Expand Down Expand Up @@ -431,7 +436,7 @@ def run(args):

from .utils import load_features
## load features; only requested features if genes given
features = load_features(args.annotation, genes)
features = load_features(args.annotation, genes, args.use_nextclade_gff_parsing)
# Ensure the already-created nuc annotation coordinates match those parsed from the reference file
if (features['nuc'].location.start+1 != anc_seqs['annotations']['nuc']['start'] or
features['nuc'].location.end != anc_seqs['annotations']['nuc']['end']):
Expand Down
204 changes: 149 additions & 55 deletions augur/utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import argparse
import Bio
import Bio.Phylo
import Bio.SeqRecord
import numpy as np
import os, json, sys
import pandas as pd
from Bio.SeqFeature import SimpleLocation, CompoundLocation, SeqFeature, FeatureLocation
from collections import defaultdict, OrderedDict
from io import RawIOBase
from textwrap import dedent
from typing import Dict
from .__version__ import __version__

from augur.data import as_file
Expand Down Expand Up @@ -166,7 +169,7 @@ def default(self, obj):
return super().default(obj)


def load_features(reference, feature_names=None):
def load_features(reference, feature_names=None, use_nextclade_gff_parsing=False):
"""
Parse a GFF/GenBank reference file. See the docstrings for _read_gff and
_read_genbank for details.
Expand All @@ -177,6 +180,8 @@ def load_features(reference, feature_names=None):
File path to GFF or GenBank (.gb) reference
feature_names : None or set or list (optional)
Restrict the genes we read to those in the set/list
use_nextclade_gff_parsing : bool (optional)
If True, parse GFF file the way Nextclade does

Returns
-------
Expand All @@ -194,6 +199,8 @@ def load_features(reference, feature_names=None):
raise AugurError(f"reference sequence file {reference!r} not found")

if '.gff' in reference.lower():
if use_nextclade_gff_parsing:
return _read_gff_like_nextclade(reference, feature_names)
return _read_gff(reference, feature_names)
else:
return _read_genbank(reference, feature_names)
Expand Down Expand Up @@ -228,7 +235,6 @@ def _read_nuc_annotation_from_gff(record, reference):
if len(sequence_regions)>1:
raise AugurError(f"Reference {reference!r} contains multiple ##sequence-region pragma lines. Augur can only handle GFF files with a single one.")
elif sequence_regions:
from Bio.SeqFeature import SeqFeature, FeatureLocation
(name, start, stop) = sequence_regions[0]
nuc['pragma'] = SeqFeature(
FeatureLocation(start, stop, strand=1),
Expand Down Expand Up @@ -259,6 +265,114 @@ def _read_nuc_annotation_from_gff(record, reference):
raise AugurError(f"Reference {reference!r} didn't define any information we can use to create the 'nuc' annotation. You can use a line with a 'record' or 'source' GFF type or a ##sequence-region pragma.")


def _load_gff_record(reference, valid_types=None) -> Bio.SeqRecord.SeqRecord:
"""
Open a GFF file and return single record.
If there are multiple records, raise AugurError.
Optionally, limit the GFF types to parse.
"""
from BCBio import GFF
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have any sort of "all imports at top of file" convention or rule?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no code style enforcement in this codebase. Historically, each subcommand has different authors with varying styles, so there is also no convention.

For this line in particular, I think it's fine to scope the import to the function if no other functions in utils.py need it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How much do top-of-file imports contribute to the painful loading times @victorlin? We're essentially importing every dependency in order to show --help aren't we?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jameshadfield good point and you're right. I wrote more in #472 (comment)


limit_info = {'gff_type': valid_types} if valid_types else None

with open_file(reference) as in_handle:
# Note that `GFF.parse` doesn't always yield GFF records in the order
# one may expect, but since we raise AugurError if there are multiple
# this doesn't matter.
# TODO: Remove warning suppression after it's addressed upstream:
# <https://github.com/chapmanb/bcbb/issues/143>
import warnings

from Bio import BiopythonDeprecationWarning
warnings.simplefilter("ignore", BiopythonDeprecationWarning)
gff_entries = list(GFF.parse(in_handle, limit_info=limit_info))
warnings.simplefilter("default", BiopythonDeprecationWarning)

if len(gff_entries) == 0:
msg = f"Reference {reference!r} contains no valid data rows."
if valid_types:
msg += f" Valid GFF types (3rd column) are {', '.join(valid_types)}."
raise AugurError(msg)
if len(gff_entries) > 1:
raise AugurError(f"Reference {reference!r} contains multiple seqids (first column). Augur can only handle GFF files with a single seqid.")
return gff_entries[0]

def _lookup_feature_name_like_nextclade(feature) -> str:
# Matching Nextclade conventions (NAME_ATTRS_CDS)
# https://github.com/nextstrain/nextclade/blob/59e757fd9c9f8d8edd16cf2063d77a859d4d3b96/packages/nextclade/src/io/gff3.rs#L35-L54
QUALIFIER_PRIORITY = [ "Name", "name", "Alias", "alias", "standard_name", "old-name", "Gene", "gene", "gene_name",
"locus_tag", "product", "gene_synonym", "gb-synonym", "acronym", "gb-acronym", "protein_id", "ID"]

for qualifier in QUALIFIER_PRIORITY:
if qualifier in feature.qualifiers:
return feature.qualifiers[qualifier][0]
raise AugurError(f"No valid feature name found for feature for feature {feature.id}")

def _read_gff_like_nextclade(reference, feature_names) -> Dict[str, SeqFeature]:
"""
Read a GFF file the way Nextclade does. That means:
- We only look at CDS features.
- CDS features with identical IDs are joined into a compound feature.
- The feature name is taken from qualifiers in the same priority order as in Nextclade.

Parameters
----------
reference : string
File path to GFF reference
feature_names : None or set or list
Restrict the genes we read to those in the set/list

Returns
-------
features : dict
keys: feature names, values: :py:class:`Bio.SeqFeature.SeqFeature`
Note that feature names may not equivalent to GenBank feature keys

Raises
------
AugurError
If the reference file contains no IDs or multiple different seqids
If a gene is found with the name 'nuc'
"""
rec = _load_gff_record(reference)
features = {'nuc': _read_nuc_annotation_from_gff(rec, reference)}

cds_features: Dict[str, SeqFeature] = {}

def _flatten(feature):
if feature.type == "CDS":
if not feature.id:
# Hack to ensure we have some IDs for all features
# Technically, all features should have an ID, but Nextclade doesn't require it
feature.id = str(feature.qualifiers)
if feature.id in cds_features:
if isinstance(cds_features[feature.id].location, CompoundLocation):
cds_features[feature.id].location.parts.append(feature.location)
else:
cds_features[feature.id].location = CompoundLocation([cds_features[feature.id].location, feature.location])
else:
cds_features[feature.id] = feature
for sub_feature in getattr(feature, "sub_features", []):
_flatten(sub_feature)

for feature in rec.features:
_flatten(feature)

for feature in cds_features.values():
feature_name = _lookup_feature_name_like_nextclade(feature)
if feature_name == 'nuc':
raise AugurError(f"Reference {reference!r} contains a gene with the name 'nuc'. This is not allowed.")
if feature_names is None or feature_name in feature_names:
features[feature_name] = feature

if feature_names is not None:
for fe in feature_names:
if fe not in features:
print(f"Couldn't find gene {fe} in GFF")

return features


def _read_gff(reference, feature_names):
"""
Read a GFF file. We only read GFF IDs 'gene' or 'source' (the latter may not technically
Expand Down Expand Up @@ -288,65 +402,47 @@ def _read_gff(reference, feature_names):
If the reference file contains no IDs or multiple different seqids
If a gene is found with the name 'nuc'
"""
from BCBio import GFF
valid_types = ['gene', 'source', 'region']
features = {}

with open_file(reference) as in_handle:
# Note that `GFF.parse` doesn't always yield GFF records in the order
# one may expect, but since we raise AugurError if there are multiple
# this doesn't matter.
# TODO: Remove warning suppression after it's addressed upstream:
# <https://github.com/chapmanb/bcbb/issues/143>
import warnings
from Bio import BiopythonDeprecationWarning
warnings.simplefilter("ignore", BiopythonDeprecationWarning)
gff_entries = list(GFF.parse(in_handle, limit_info={'gff_type': valid_types}))
warnings.simplefilter("default", BiopythonDeprecationWarning)
rec = _load_gff_record(reference, valid_types=valid_types)

if len(gff_entries) == 0:
raise AugurError(f"Reference {reference!r} contains no valid data rows. Valid GFF types (3rd column) are {', '.join(valid_types)}.")
elif len(gff_entries) > 1:
raise AugurError(f"Reference {reference!r} contains multiple seqids (first column). Augur can only handle GFF files with a single seqid.")
else:
rec = gff_entries[0]

features['nuc'] = _read_nuc_annotation_from_gff(rec, reference)
features_skipped = 0

for feat in rec.features:
if feat.type == "gene":
# Check for gene names stored in qualifiers commonly used by
# virus-specific gene maps first (e.g., 'gene',
# 'gene_name'). Then, check for qualifiers used by non-viral
# pathogens (e.g., 'locus_tag').
if "gene" in feat.qualifiers:
fname = feat.qualifiers["gene"][0]
elif "gene_name" in feat.qualifiers:
fname = feat.qualifiers["gene_name"][0]
elif "locus_tag" in feat.qualifiers:
fname = feat.qualifiers["locus_tag"][0]
else:
features_skipped+=1
fname = None
features = {'nuc': _read_nuc_annotation_from_gff(rec, reference)}

features_skipped = 0

if fname == 'nuc':
raise AugurError(f"Reference {reference!r} contains a gene with the name 'nuc'. This is not allowed.")
for feat in rec.features:
if feat.type == "gene":
# Check for gene names stored in qualifiers commonly used by
# virus-specific gene maps first (e.g., 'gene',
# 'gene_name'). Then, check for qualifiers used by non-viral
# pathogens (e.g., 'locus_tag').
if "gene" in feat.qualifiers:
fname = feat.qualifiers["gene"][0]
elif "gene_name" in feat.qualifiers:
fname = feat.qualifiers["gene_name"][0]
elif "locus_tag" in feat.qualifiers:
fname = feat.qualifiers["locus_tag"][0]
else:
features_skipped+=1
fname = None

if feature_names is not None and fname not in feature_names:
# Skip (don't store) this feature
continue
if fname == 'nuc':
raise AugurError(f"Reference {reference!r} contains a gene with the name 'nuc'. This is not allowed.")

if fname:
features[fname] = feat
if feature_names is not None and fname not in feature_names:
# Skip (don't store) this feature
continue

if feature_names is not None:
for fe in feature_names:
if fe not in features:
print("Couldn't find gene {} in GFF or GenBank file".format(fe))
if fname:
features[fname] = feat

if feature_names is not None:
for fe in feature_names:
if fe not in features:
print("Couldn't find gene {} in GFF or GenBank file".format(fe))

if features_skipped:
print(f"WARNING: {features_skipped} GFF rows of type=gene skipped as they didn't have a gene, gene_name or locus_tag attribute.")
if features_skipped:
print(f"WARNING: {features_skipped} GFF rows of type=gene skipped as they didn't have a gene, gene_name or locus_tag attribute.")

return features

Expand Down Expand Up @@ -852,8 +948,6 @@ def genome_features_to_auspice_annotation(features, ref_seq_name=None, assert_nu
See schema-annotations.json for the schema this conforms to

"""
from Bio.SeqFeature import SimpleLocation, CompoundLocation

if assert_nuc and 'nuc' not in features:
raise AugurError("Genome features must include a feature for 'nuc'")

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
Setup

$ source "$TESTDIR"/_setup.sh

Infer ancestral nucleotide and amino acid sequences using Nextclade GFF annotations.

$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/ebola/tree.nwk \
> --alignment $TESTDIR/../data/ebola/masked.fasta \
> --annotation $TESTDIR/../data/ebola/genome_annotation.gff3 \
> --genes GP L NP sGP ssGP VP24 VP30 VP35 VP40 \
> --use-nextclade-gff-parsing \
> --translations $TESTDIR/../data/ebola/translations/%GENE.fasta \
> --infer-ambiguous \
> --inference joint \
> --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
> > /dev/null

Check that output is as expected

$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
> --exclude-regex-paths "\['seqid'\]" -- \
> "$TESTDIR/../data/ebola/nt_muts.json" \
> "$CRAMTMP/$TESTFILE/ancestral_mutations.json"
{}
Loading
Loading