Skip to content

Commit

Permalink
feat: allow multiple variants in pearson_corr_ld() (#258)
Browse files Browse the repository at this point in the history
  • Loading branch information
aryarm authored Nov 16, 2024
1 parent b86fea8 commit 808c31e
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 6 deletions.
32 changes: 26 additions & 6 deletions haptools/ld.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,42 @@ class Haplotype(HaplotypeBase):
)


def pearson_corr_ld(arrA: npt.NDArray, arrB: npt.NDArray) -> float:
def pearson_corr_ld(arrA: npt.NDArray, arrB: npt.NDArray) -> float | npt.NDArray:
"""
Compute the Pearson correlation coefficient between two vectors (1D numpy arrays)
Compute the Pearson correlation coefficient (LD) between two vectors (1D arrays)
If 2D array(s) are given instead, the rows will be treated as samples and the columns
as variants
Parameters
----------
arrA: npt.NDArray
The first 1D numpy array
The first 1D (or 2D) numpy array
arrB: npt.NDArray
The second 1D numpy array
The second 1D (or 2D) numpy array
Returns
-------
The LD between the genotypes in arrA and the genotypes in arrB
The signed LD between the genotypes in arrA and the genotypes in arrB
If either arrA or arrB is 2D, then the return value will be an array instead. If
both are 2D, the shape of the array will correspond with the second dimensions of
arrA and arrB, respectively. Otherwise, it will be 1D.
"""
return np.corrcoef(arrA, arrB)[1, 0]
if arrA.ndim == 1:
dim = 1
elif arrA.ndim == 2:
dim = arrA.shape[1]
else:
raise ValueError("We can only compute LD for 2D genotype arrays")
ld_mat = np.corrcoef(arrA, arrB, rowvar=False)[:dim:, dim:]
if arrA.ndim == 1:
ld_mat = ld_mat[0, :]
if arrB.ndim == 1:
ld_mat = ld_mat[0]
elif arrB.ndim == 1:
ld_mat = ld_mat[:, 0]
return ld_mat


def calc_ld(
Expand Down
60 changes: 60 additions & 0 deletions tests/test_ld.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,73 @@
import math
from pathlib import Path

import numpy as np
from click.testing import CliRunner

from haptools.data import Data
from haptools.__main__ import main
from haptools.ld import pearson_corr_ld

DATADIR = Path(__file__).parent.joinpath("data")


def test_ld(seed=42):
rng = np.random.default_rng(seed)

# Different input shapes will give different output shapes
# (25,) x (25,) --> float
arrA = rng.choice((0, 1, 2), size=(25,))
arrB = rng.choice((0, 1, 2), size=(25,))
ld = pearson_corr_ld(arrA, arrB)
assert isinstance(ld, float)
assert math.isclose(ld, -0.1148198316929615)

# (25,) x (25,1) --> (1,)
arrB = arrB[:, np.newaxis]
old_ld = ld
ld = pearson_corr_ld(arrA, arrB)
assert isinstance(ld, np.ndarray)
assert ld.shape == (1,)
assert old_ld == ld[0]

# (25,1) x (25,1) --> (1,1)
arrA = arrA[:, np.newaxis]
ld = pearson_corr_ld(arrA, arrB)
assert isinstance(ld, np.ndarray)
assert ld.shape == (1, 1)
assert old_ld == ld[0, 0]

# (25,3) x (25,) --> (3,)
arrA = np.hstack((np.random.choice((0, 1, 2), size=(25, 2)), arrA))
arrB = np.squeeze(arrB)
ld = pearson_corr_ld(arrA, arrB)
assert isinstance(ld, np.ndarray)
assert ld.shape == (3,)
assert old_ld == ld[2]

# (25,) x (25,3) --> (3,)
arrA = arrB
arrB = np.random.choice((0, 1, 2), size=(25, 3))
ld = pearson_corr_ld(arrA, arrB)
assert isinstance(ld, np.ndarray)
assert ld.shape == (3,)

# (25,1) x (25,3) --> (1,3)
arrA = arrA[:, np.newaxis]
old_ld = ld
ld = pearson_corr_ld(arrA, arrB)
assert isinstance(ld, np.ndarray)
assert ld.shape == (1, 3)
np.testing.assert_allclose(old_ld[np.newaxis, :], ld)

# (25,2) x (25,3) --> (2,3)
arrA = np.hstack((arrA, np.random.choice((0, 1, 2), size=(25, 1))))
ld = pearson_corr_ld(arrA, arrB)
assert isinstance(ld, np.ndarray)
assert ld.shape == (2, 3)
np.testing.assert_allclose(old_ld, ld[0])


def test_basic(capfd):
expected = """#\torderH\tld
#\tversion\t0.2.0
Expand Down

0 comments on commit 808c31e

Please sign in to comment.