Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Moving karyotype to anoph #702

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions malariagen_data/af1.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def __init__(
taxon_colors=TAXON_COLORS,
virtual_contigs=None,
gene_names=None,
inversion_tag_path=None,
)

def __repr__(self):
Expand Down
94 changes: 2 additions & 92 deletions malariagen_data/ag3.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,11 @@

import dask
import pandas as pd # type: ignore
from pandas import CategoricalDtype
import numpy as np # type: ignore
import allel # type: ignore
import plotly.express as px # type: ignore

import malariagen_data
from .anopheles import AnophelesDataResource

from numpydoc_decorator import doc
from .util import check_types, _karyotype_tags_n_alt
from .anoph import base_params
from typing import Optional, Literal, Annotated, TypeAlias

# silence dask performance warnings
dask.config.set(**{"array.slicing.split_native_chunks": False}) # type: ignore
Expand All @@ -35,6 +28,7 @@
GENE_NAMES = {
"AGAP004707": "Vgsc/para",
}
INVERSION_TAG_PATH = "karyotype_tag_snps.csv"


def _setup_aim_palettes():
Expand Down Expand Up @@ -83,12 +77,6 @@ def _setup_aim_palettes():
}


inversion_param: TypeAlias = Annotated[
Literal["2La", "2Rb", "2Rc_gam", "2Rc_col", "2Rd", "2Rj"],
"Name of inversion to infer karyotype for.",
]


class Ag3(AnophelesDataResource):
"""Provides access to data from Ag3.x releases.

Expand Down Expand Up @@ -203,6 +191,7 @@ def __init__(
taxon_colors=TAXON_COLORS,
virtual_contigs=VIRTUAL_CONTIGS,
gene_names=GENE_NAMES,
inversion_tag_path=INVERSION_TAG_PATH,
)

# set up caches
Expand Down Expand Up @@ -355,82 +344,3 @@ def _results_cache_add_analysis_params(self, params):
super()._results_cache_add_analysis_params(params)
# override parent class to add AIM analysis
params["aim_analysis"] = self._aim_analysis

@check_types
@doc(
summary="Load tag SNPs for a given inversion in Ag.",
)
def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
# needs to be modified depending on where we are hosting
import importlib.resources
from . import resources

with importlib.resources.path(resources, "karyotype_tag_snps.csv") as path:
df_tag_snps = pd.read_csv(path, sep=",")
return df_tag_snps.query(f"inversion == '{inversion}'").reset_index()

@check_types
@doc(
summary="Infer karyotype from tag SNPs for a given inversion in Ag.",
)
def karyotype(
self,
inversion: inversion_param,
sample_sets: Optional[base_params.sample_sets] = None,
sample_query: Optional[base_params.sample_query] = None,
sample_query_options: Optional[base_params.sample_query_options] = None,
) -> pd.DataFrame:
# load tag snp data
df_tagsnps = self.load_inversion_tags(inversion=inversion)
inversion_pos = df_tagsnps["position"]
inversion_alts = df_tagsnps["alt_allele"]
contig = inversion[0:2]

# get snp calls for inversion region
start, end = np.min(inversion_pos), np.max(inversion_pos)
region = f"{contig}:{start}-{end}"

ds_snps = self.snp_calls(
region=region,
sample_sets=sample_sets,
sample_query=sample_query,
sample_query_options=sample_query_options,
)

with self._spinner("Inferring karyotype from tag SNPs"):
# access variables we need
geno = allel.GenotypeDaskArray(ds_snps["call_genotype"].data)
pos = allel.SortedIndex(ds_snps["variant_position"].values)
samples = ds_snps["sample_id"].values
alts = ds_snps["variant_allele"].values.astype(str)

# subset to position of inversion tags
mask = pos.locate_intersection(inversion_pos)[0]
alts = alts[mask]
geno = geno.compress(mask, axis=0).compute()

# infer karyotype
gn_alt = _karyotype_tags_n_alt(
gt=geno, alts=alts, inversion_alts=inversion_alts
)
is_called = geno.is_called()

# calculate mean genotype for each sample whilst masking missing calls
av_gts = np.mean(np.ma.MaskedArray(gn_alt, mask=~is_called), axis=0)
total_sites = np.sum(is_called, axis=0)

df = pd.DataFrame(
{
"sample_id": samples,
"inversion": inversion,
f"karyotype_{inversion}_mean": av_gts,
# round the genotypes then convert to int
f"karyotype_{inversion}": av_gts.round().astype(int),
"total_tag_snps": total_sites,
},
)
# Allow filling missing values with "<NA>" visible placeholder.
kt_dtype = CategoricalDtype(categories=[0, 1, 2, "<NA>"], ordered=True)
df[f"karyotype_{inversion}"] = df[f"karyotype_{inversion}"].astype(kt_dtype)

return df
112 changes: 112 additions & 0 deletions malariagen_data/anoph/karyotype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import pandas as pd # type: ignore
from pandas import CategoricalDtype
import numpy as np # type: ignore
import allel # type: ignore

from numpydoc_decorator import doc
from ..util import check_types, _karyotype_tags_n_alt
jonbrenas marked this conversation as resolved.
Show resolved Hide resolved
from . import base_params
from typing import Optional

from .snp_data import AnophelesSnpData
from .karyotype_params import inversion_param


class AnophelesKaryotypeData(AnophelesSnpData):
jonbrenas marked this conversation as resolved.
Show resolved Hide resolved
def __init__(
self,
inversion_tag_path: Optional[str] = None,
**kwargs,
):
# N.B., this class is designed to work cooperatively, and
# so it's important that any remaining parameters are passed
# to the superclass constructor.
super().__init__(**kwargs)

# If provided, this analysis version will override the
# default value provided in the release configuration.
jonbrenas marked this conversation as resolved.
Show resolved Hide resolved
self._inversion_tag_path = inversion_tag_path

@check_types
@doc(
summary="Load tag SNPs for a given inversion in Ag.",
jonbrenas marked this conversation as resolved.
Show resolved Hide resolved
)
def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
# needs to be modified depending on where we are hosting
import importlib.resources
from .. import resources

if not self._inversion_tag_path:
raise FileNotFoundError(
"The file containing the inversion tags is missing."
)
Comment on lines +58 to +61
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FileNotFoundError isn't quite the right exception class here.

Suggested change
if not self._inversion_tag_path:
raise FileNotFoundError(
"The file containing the inversion tags is missing."
)
if self._inversion_tag_path is None:
raise NotImplementedError(
"No inversion tags are available for this data resource."
)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have been of two minds about this one: on one hand, it is true that we have not generated the tags for Af1 and one could argue that NotImplementedError is a more appropriate error for work that is still to be done; on the other hand, the code itself would (I think) work if the file actually existed and it is thus less an issue of missing code and more an issue of a missing input. Also, I can imagine a situation where someone created tags for an inversion and would want to try to use their local file (it is not currently possible, the path that is used is hard-coded in both Ag3 and Af1) in which case the error would come from the path being incorrect (though, an error message referring to the actual path inputed would be more helpful if we want to offer this option).

else:
with importlib.resources.path(resources, self._inversion_tag_path) as path:
df_tag_snps = pd.read_csv(path, sep=",")
return df_tag_snps.query(f"inversion == '{inversion}'").reset_index()

@check_types
@doc(
summary="Infer karyotype from tag SNPs for a given inversion in Ag.",
jonbrenas marked this conversation as resolved.
Show resolved Hide resolved
)
def karyotype(
self,
inversion: inversion_param,
sample_sets: Optional[base_params.sample_sets] = None,
sample_query: Optional[base_params.sample_query] = None,
sample_query_options: Optional[base_params.sample_query_options] = None,
) -> pd.DataFrame:
# load tag snp data
df_tagsnps = self.load_inversion_tags(inversion=inversion)
inversion_pos = df_tagsnps["position"]
inversion_alts = df_tagsnps["alt_allele"]
contig = inversion[0:2]

# get snp calls for inversion region
start, end = np.min(inversion_pos), np.max(inversion_pos)
region = f"{contig}:{start}-{end}"

ds_snps = self.snp_calls(
region=region,
sample_sets=sample_sets,
sample_query=sample_query,
sample_query_options=sample_query_options,
)

with self._spinner("Inferring karyotype from tag SNPs"):
# access variables we need
geno = allel.GenotypeDaskArray(ds_snps["call_genotype"].data)
pos = allel.SortedIndex(ds_snps["variant_position"].values)
samples = ds_snps["sample_id"].values
alts = ds_snps["variant_allele"].values.astype(str)

# subset to position of inversion tags
mask = pos.locate_intersection(inversion_pos)[0]
alts = alts[mask]
geno = geno.compress(mask, axis=0).compute()

# infer karyotype
gn_alt = _karyotype_tags_n_alt(
gt=geno, alts=alts, inversion_alts=inversion_alts
)
is_called = geno.is_called()

# calculate mean genotype for each sample whilst masking missing calls
av_gts = np.mean(np.ma.MaskedArray(gn_alt, mask=~is_called), axis=0)
total_sites = np.sum(is_called, axis=0)

df = pd.DataFrame(
{
"sample_id": samples,
"inversion": inversion,
f"karyotype_{inversion}_mean": av_gts,
# round the genotypes then convert to int
f"karyotype_{inversion}": av_gts.round().astype(int),
"total_tag_snps": total_sites,
},
)
# Allow filling missing values with "<NA>" visible placeholder.
kt_dtype = CategoricalDtype(categories=[0, 1, 2, "<NA>"], ordered=True)
df[f"karyotype_{inversion}"] = df[f"karyotype_{inversion}"].astype(kt_dtype)

return df
10 changes: 10 additions & 0 deletions malariagen_data/anoph/karyotype_params.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""Parameter definitions for karyotype analysis functions."""

from typing import Literal

from typing_extensions import Annotated, TypeAlias

inversion_param: TypeAlias = Annotated[
Literal["2La", "2Rb", "2Rc_gam", "2Rc_col", "2Rd", "2Rj"],
"Name of inversion to infer karyotype for.",
]
jonbrenas marked this conversation as resolved.
Show resolved Hide resolved
4 changes: 4 additions & 0 deletions malariagen_data/anopheles.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
plotly_params,
xpehh_params,
)
from .anoph.karyotype import AnophelesKaryotypeData
from .anoph.aim_data import AnophelesAimData
from .anoph.base import AnophelesBase
from .anoph.cnv_data import AnophelesCnvData
Expand Down Expand Up @@ -94,6 +95,7 @@ class AnophelesDataResource(
AnophelesPca,
PlinkConverter,
AnophelesIgv,
AnophelesKaryotypeData,
AnophelesAimData,
AnophelesHapData,
AnophelesSnpData,
Expand Down Expand Up @@ -138,6 +140,7 @@ def __init__(
taxon_colors: Optional[Mapping[str, str]],
virtual_contigs: Optional[Mapping[str, Sequence[str]]],
gene_names: Optional[Mapping[str, str]],
inversion_tag_path: Optional[str],
):
super().__init__(
url=url,
Expand Down Expand Up @@ -171,6 +174,7 @@ def __init__(
taxon_colors=taxon_colors,
virtual_contigs=virtual_contigs,
gene_names=gene_names,
inversion_tag_path=inversion_tag_path,
)

@property
Expand Down
21 changes: 21 additions & 0 deletions tests/integration/test_af1.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,24 @@ def test_locate_region(region_raw):
assert region == Region("2RL", 48714463, 48715355)
if region_raw == "2RL:24,630,355-24,633,221":
assert region == Region("2RL", 24630355, 24633221)


@pytest.mark.parametrize(
"inversion",
["2La", "2Rb", "2Rc_col", "X_x"],
)
def test_karyotyping(inversion):
af1 = setup_af1()

if inversion == "X_x":
with pytest.raises(TypeError):
af1.karyotype(
inversion=inversion, sample_sets="AG1000G-GH", sample_query=None
)
else:
with pytest.raises(FileNotFoundError):
af1.karyotype(
inversion=inversion,
sample_sets="1229-VO-GH-DADZIE-VMF00095",
sample_query=None,
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All inversion parameter values should fail with NotImplementedError I think.

39 changes: 25 additions & 14 deletions tests/integration/test_ag3.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,19 +159,30 @@ def test_xpehh_gwss():
assert_allclose(xpehh[:, 2][100], 0.4817561326426265)


def test_karyotyping():
@pytest.mark.parametrize(
"inversion",
["2La", "2Rb", "2Rc_col", "X_x"],
)
def test_karyotyping(inversion):
ag3 = setup_ag3(cohorts_analysis="20230516")

df = ag3.karyotype(inversion="2La", sample_sets="AG1000G-GH", sample_query=None)

assert isinstance(df, pd.DataFrame)
expected_cols = [
"sample_id",
"inversion",
"karyotype_2La_mean",
"karyotype_2La",
"total_tag_snps",
]
assert set(df.columns) == set(expected_cols)
assert all(df["karyotype_2La"].isin([0, 1, 2]))
assert all(df["karyotype_2La_mean"].between(0, 2))
if inversion == "X_x":
with pytest.raises(TypeError):
jonbrenas marked this conversation as resolved.
Show resolved Hide resolved
ag3.karyotype(
inversion=inversion, sample_sets="AG1000G-GH", sample_query=None
)
else:
df = ag3.karyotype(
inversion=inversion, sample_sets="AG1000G-GH", sample_query=None
)
assert isinstance(df, pd.DataFrame)
expected_cols = [
"sample_id",
"inversion",
f"karyotype_{inversion}_mean",
f"karyotype_{inversion}",
"total_tag_snps",
]
assert set(df.columns) == set(expected_cols)
assert all(df[f"karyotype_{inversion}"].isin([0, 1, 2]))
assert all(df[f"karyotype_{inversion}_mean"].between(0, 2))
Loading