Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dataset "schema" v0.3 #276

Merged
merged 10 commits
Oct 2, 2020
Previous commit
Use Hashable instead of string for var names
ravwojdyla committed Oct 2, 2020
commit 5a459e5a6fab30d15101366ce5d664f5c13fec76
16 changes: 8 additions & 8 deletions sgkit/stats/aggregation.py
Original file line number Diff line number Diff line change
@@ -279,7 +279,7 @@ def _swap(dim: Dimension) -> Dimension:
return "samples" if dim == "variants" else "variants"


def call_rate(ds: Dataset, dim: Dimension, call_genotype_mask: str) -> Dataset:
def call_rate(ds: Dataset, dim: Dimension, call_genotype_mask: Hashable) -> Dataset:
odim = _swap(dim)[:-1]
n_called = (~ds[call_genotype_mask].any(dim="ploidy")).sum(dim=dim)
return xr.Dataset(
@@ -288,7 +288,7 @@ def call_rate(ds: Dataset, dim: Dimension, call_genotype_mask: str) -> Dataset:


def genotype_count(
ds: Dataset, dim: Dimension, call_genotype: str, call_genotype_mask: str
ds: Dataset, dim: Dimension, call_genotype: Hashable, call_genotype_mask: Hashable
) -> Dataset:
odim = _swap(dim)[:-1]
M, G = ds[call_genotype_mask].any(dim="ploidy"), ds[call_genotype]
@@ -310,9 +310,9 @@ def genotype_count(

def allele_frequency(
ds: Dataset,
call_genotype: str,
call_genotype_mask: str,
variant_allele_count: Optional[str],
call_genotype: Hashable,
call_genotype_mask: Hashable,
variant_allele_count: Optional[Hashable],
) -> Dataset:
data_vars: Dict[Hashable, Any] = {}
# only compute variant allele count if not already in dataset
@@ -339,9 +339,9 @@ def allele_frequency(
def variant_stats(
ds: Dataset,
*,
call_genotype_mask: str = variables.call_genotype_mask,
call_genotype: str = variables.call_genotype,
variant_allele_count: Optional[str] = None,
call_genotype_mask: Hashable = variables.call_genotype_mask,
call_genotype: Hashable = variables.call_genotype,
variant_allele_count: Optional[Hashable] = None,
merge: bool = True,
) -> Dataset:
"""Compute quality control variant statistics from genotype calls.
16 changes: 8 additions & 8 deletions sgkit/stats/association.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from dataclasses import dataclass
from typing import Optional, Sequence, Union
from typing import Hashable, Optional, Sequence, Union

import dask.array as da
import numpy as np
@@ -105,7 +105,7 @@ def linear_regression(


def _get_loop_covariates(
ds: Dataset, call_genotype: str, dosage: Optional[str] = None
ds: Dataset, call_genotype: Hashable, dosage: Optional[Hashable] = None
) -> Array:
if dosage is None:
# TODO: This should be (probably gwas-specific) allele
@@ -119,11 +119,11 @@ def _get_loop_covariates(
def gwas_linear_regression(
ds: Dataset,
*,
dosage: str,
covariates: Union[str, Sequence[str]],
traits: Union[str, Sequence[str]],
dosage: Hashable,
covariates: Union[Hashable, Sequence[Hashable]],
traits: Union[Hashable, Sequence[Hashable]],
add_intercept: bool = True,
call_genotype: str = variables.call_genotype,
call_genotype: Hashable = variables.call_genotype,
merge: bool = True,
) -> Dataset:
"""Run linear regression to identify continuous trait associations with genetic variants.
@@ -192,9 +192,9 @@ def gwas_linear_regression(
Nature Genetics 47 (3): 284–90.

"""
if isinstance(covariates, str):
if isinstance(covariates, Hashable):
covariates = [covariates]
if isinstance(traits, str):
if isinstance(traits, Hashable):
traits = [traits]

variables.validate(
4 changes: 2 additions & 2 deletions sgkit/stats/hwe.py
Original file line number Diff line number Diff line change
@@ -127,8 +127,8 @@ def hardy_weinberg_test(
ds: Dataset,
*,
genotype_counts: Optional[Hashable] = None,
call_genotype: str = variables.call_genotype,
call_genotype_mask: str = variables.call_genotype_mask,
call_genotype: Hashable = variables.call_genotype,
call_genotype_mask: Hashable = variables.call_genotype_mask,
merge: bool = True,
) -> Dataset:
"""Exact test for HWE as described in Wigginton et al. 2005 [1].
12 changes: 6 additions & 6 deletions sgkit/stats/pc_relate.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Tuple
from typing import Hashable, Tuple

import dask.array as da
import xarray as xr
@@ -24,8 +24,8 @@ def _impute_genotype_call_with_variant_mean(

def _collapse_ploidy(
ds: xr.Dataset,
call_genotype: str = variables.call_genotype,
call_genotype_mask: str = variables.call_genotype_mask,
call_genotype: Hashable = variables.call_genotype,
call_genotype_mask: Hashable = variables.call_genotype_mask,
) -> Tuple[xr.DataArray, xr.DataArray]:
call_g_mask = ds[call_genotype_mask].any(dim="ploidy")
call_g = xr.where(call_g_mask, -1, ds[call_genotype].sum(dim="ploidy")) # type: ignore[no-untyped-call]
@@ -36,9 +36,9 @@ def pc_relate(
ds: xr.Dataset,
*,
maf: float = 0.01,
call_genotype: str = variables.call_genotype,
call_genotype_mask: str = variables.call_genotype_mask,
sample_pcs: str = variables.sample_pcs,
call_genotype: Hashable = variables.call_genotype,
call_genotype_mask: Hashable = variables.call_genotype_mask,
sample_pcs: Hashable = variables.sample_pcs,
merge: bool = True
) -> xr.Dataset:
"""Compute PC-Relate as described in Conomos, et al. 2016 [1].
10 changes: 5 additions & 5 deletions sgkit/stats/regenie.py
Original file line number Diff line number Diff line change
@@ -729,9 +729,9 @@ def regenie(
ds: Dataset,
*,
dosage: str,
covariates: Union[str, Sequence[str]],
traits: Union[str, Sequence[str]],
variant_contig: str = variables.variant_contig,
covariates: Union[Hashable, Sequence[Hashable]],
traits: Union[Hashable, Sequence[Hashable]],
variant_contig: Hashable = variables.variant_contig,
variant_block_size: Optional[Union[int, Tuple[int, ...]]] = None,
sample_block_size: Optional[Union[int, Tuple[int, ...]]] = None,
alphas: Optional[Sequence[float]] = None,
@@ -857,9 +857,9 @@ def regenie(

[2] - https://glow.readthedocs.io/en/latest/tertiary/whole-genome-regression.html
"""
if isinstance(covariates, str):
if isinstance(covariates, Hashable):
covariates = [covariates]
if isinstance(traits, str):
if isinstance(traits, Hashable):
traits = [traits]

variables.validate(