From c4c4de03ecf5a4a36c3ef338d8d51fc045b01270 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 15 Nov 2023 15:36:06 -0800 Subject: [PATCH 1/9] use python 3.8 style unioning instead --- haptools/data/genotypes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index d0562d9b..35114dc5 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -3,7 +3,7 @@ import gc from csv import reader from pathlib import Path -from typing import Iterator, Union +from typing import Iterator from logging import getLogger, Logger from collections import namedtuple, Counter @@ -837,7 +837,7 @@ def __init__( self, vcffile: VCF, vcfiter: object, - vcftype: Union[str, trh.VcfTypes] = "auto", + vcftype: str | trh.VcfTypes = "auto", ): super().__init__(vcffile, vcftype) self.vcfiter = vcfiter From dd2976b0cd4c6402a66123ae536cce24f20a0885 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 15 Nov 2023 15:40:45 -0800 Subject: [PATCH 2/9] move reorder_samples param into read() since it is not a necessary variable for both reading and writing - just for reading --- haptools/data/genotypes.py | 91 ++++++++++++++++---------------------- haptools/transform.py | 18 ++++++-- 2 files changed, 51 insertions(+), 58 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 35114dc5..18294802 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -40,9 +40,6 @@ class Genotypes(Data): 3. POS log: Logger A logging instance for recording debug statements - reorder_samples: bool, optional - If true and the list of samples are given to read load samples in the same - order as the tuple in samples. Otherwise, load in order of read in by VCF. _prephased : bool If True, assume that the genotypes are phased. Otherwise, extract their phase when reading from the VCF. @@ -60,9 +57,7 @@ class Genotypes(Data): >>> genotypes.data """ - def __init__( - self, fname: Path | str, log: Logger = None, reorder_samples: bool = True - ): + def __init__(self, fname: Path | str, log: Logger = None): super().__init__(fname, log) self.samples = tuple() self.variants = np.array( @@ -76,7 +71,6 @@ def __init__( self._prephased = False self._samp_idx = None self._var_idx = None - self.reorder_samples = reorder_samples @classmethod def load( @@ -121,6 +115,7 @@ def read( samples: list[str] = None, variants: set[str] = None, max_variants: int = None, + reorder_samples: bool = None, ): """ Read genotypes from a VCF into a numpy matrix stored in :py:attr:`~.Genotypes.data` @@ -159,13 +154,15 @@ def read( number of expected sites within a region. Note that this value is ignored if the variants argument is provided. + reorder_samples : bool, optional + Reorder samples in the gneotype matrix to match the order specified via the + samples parameter. This parameter is ignored when the samples parameter is + not specified. Note that reordering the samples requires us to make a copy + of the genotype matrix in memory, so it is disabled by default. Instead, a + warning is issued when the samples do not match up. You can silence the + warning by passing False to this parameter. """ super().read() - if not self.reorder_samples and samples: - self.log.warning( - "Loading genotypes in order of variant file. Output samples and data" - " may not be in the same order as the given samples array." - ) records = self.__iter__(region=region, samples=samples, variants=variants) if variants is not None: max_variants = len(variants) @@ -216,8 +213,14 @@ def read( self.log.info(f"Transposing genotype matrix of size {self.data.shape}") self.data = self.data.transpose((1, 0, 2)) - if samples and self.reorder_samples: - self.subset(samples=samples, inplace=True) + if samples is not None: + samples = tuple(samples) + if reorder_samples: + self.subset(samples=samples, inplace=True) + elif reorder_samples is None and self.samples != samples: + self.log.warning( + "Samples in genotypes file do not match requested order" + ) def _variant_arr(self, record: Variant): """ @@ -732,15 +735,10 @@ class GenotypesVCF(Genotypes): 4. [REF, ALT1, ALT2, ...] log: Logger See documentation for :py:attr:`~.Genotypes.log` - reorder_samples: bool, optional - If true and the list of samples are given to read load samples in the same - order as the tuple in samples. Otherwise, load in order of read in by VCF. """ - def __init__( - self, fname: Path | str, log: Logger = None, reorder_samples: bool = True - ): - super().__init__(fname, log, reorder_samples) + def __init__(self, fname: Path | str, log: Logger = None): + super().__init__(fname, log) dtype = {k: v[0] for k, v in self.variants.dtype.fields.items()} self.variants = np.array([], dtype=list(dtype.items()) + [("alleles", object)]) @@ -871,9 +869,6 @@ class GenotypesTR(Genotypes): vcftype: str TR vcf type currently being read. Options are {'auto', 'gangstr', 'advntr', 'hipstr', 'eh', 'popstr'} - reorder_samples: bool, optional - If true and the list of samples are given to read load samples in the same - order as the tuple in samples. Otherwise, load in order of read in by VCF. """ def __init__( @@ -881,9 +876,8 @@ def __init__( fname: Path | str, log: Logger = None, vcftype: str = "auto", - reorder_samples: bool = True, ): - super().__init__(fname, log, reorder_samples) + super().__init__(fname, log) self.vcftype = vcftype @classmethod @@ -1009,9 +1003,6 @@ class GenotypesPLINK(GenotypesVCF): If this value is provided, variants from the PGEN file will be loaded in chunks so as to use less memory - reorder_samples: bool, optional - If true and the list of samples are given to read load samples in the same - order as the tuple in samples. Otherwise, load in order of read in by VCF. Examples -------- @@ -1023,9 +1014,8 @@ def __init__( fname: Path | str, log: Logger = None, chunk_size: int = None, - reorder_samples: bool = True, ): - super().__init__(fname, log, reorder_samples) + super().__init__(fname, log) self.chunk_size = chunk_size def read_samples(self, samples: list[str] = None): @@ -1281,6 +1271,7 @@ def read( samples: list[str] = None, variants: set[str] = None, max_variants: int = None, + reorder_samples: bool = None, ): """ Read genotypes from a PGEN file into a numpy matrix stored in @@ -1296,15 +1287,11 @@ def read( See documentation for :py:attr:`~.GenotypesVCF.read` max_variants : int, optional See documentation for :py:attr:`~.GenotypesVCF.read` + reorder_samples : bool, optional + See documentation for :py:attr:`~.GenotypesVCF.read` """ super(Genotypes, self).read() - if not self.reorder_samples and samples: - self.log.warning( - "Loading genotypes in order of variant file. Output samples and data" - " may not be in the same order as the given samples array." - ) - sample_idxs = self.read_samples(samples) pv = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8")) @@ -1382,8 +1369,14 @@ def read( del data gc.collect() - if samples and self.reorder_samples: - self.subset(samples=samples, inplace=True) + if samples is not None: + samples = tuple(samples) + if reorder_samples: + self.subset(samples=samples, inplace=True) + elif reorder_samples is None and self.samples != samples: + self.log.warning( + "Samples in genotypes file do not match requested order" + ) def _iterate( self, @@ -1639,9 +1632,6 @@ class GenotypesPLINKTR(GenotypesPLINK): See documentation for :py:attr:`~.GenotypesPLINK.chunk_size` vcftype: str, optional See documentation for :py:attr:`~.GenotypesTR.vcftype` - reorder_samples: bool, optional - If true and the list of samples are given to read load samples in the same - order as the tuple in samples. Otherwise, load in order of read in by VCF. Examples -------- >>> genotypes = GenotypesPLINK.load('tests/data/simple.pgen') @@ -1653,9 +1643,8 @@ def __init__( log: Logger = None, chunk_size: int = None, vcftype: str = "auto", - reorder_samples: bool = True, ): - super().__init__(fname, log, chunk_size, reorder_samples) + super().__init__(fname, log, chunk_size) self.vcftype = vcftype @classmethod @@ -1729,6 +1718,7 @@ def read( samples: list[str] = None, variants: set[str] = None, max_variants: int = None, + reorder_samples: bool = None, ): """ Read genotypes from a PGEN file into a numpy matrix stored in @@ -1744,14 +1734,10 @@ def read( See documentation for :py:attr:`~.GenotypesVCF.read` max_variants : int, optional See documentation for :py:attr:`~.GenotypesVCF.read` + reorder_samples : int, optional + See documentation for :py:attr:`~.GenotypesVCF.read` """ - super().read(region, samples, variants, max_variants) - - if not self.reorder_samples and samples: - self.log.warning( - "Loading genotypes in order of variant file. Output samples and data" - " may not be in the same order as the given samples array." - ) + super().read(region, samples, variants, max_variants, reorder_samples) num_variants = len(self.variants) # initialize a jagged array of allele lengths @@ -1781,9 +1767,6 @@ def read( del allele_lens gc.collect() - if samples and self.reorder_samples: - self.subset(samples=samples, inplace=True) - def _iterate( self, pgen: pgenlib.PgenReader, diff --git a/haptools/transform.py b/haptools/transform.py index df70f6cb..35ddf45e 100644 --- a/haptools/transform.py +++ b/haptools/transform.py @@ -5,8 +5,8 @@ from dataclasses import dataclass, field import numpy as np +from cyvcf2 import VCF import numpy.typing as npt -from cyvcf2 import VCF, Variant from pysam import VariantFile from . import data @@ -80,7 +80,7 @@ def __init__( fname: Path | str, haplotype: type[HaplotypeAncestry] = HaplotypeAncestry, variant: type[data.Variant] = data.Variant, - log: Logger = None, + log: logging.Logger = None, ): """ Contrasting with the base Haplotypes class: this class uses HaplotypeAncestry @@ -171,11 +171,11 @@ class GenotypesAncestry(data.GenotypesVCF): ancestry : np.array The ancestral population of each allele in each sample of :py:attr:`~.GenotypesAncestry.data` - log: Logger + log: logging.Logger See documentation for :py:attr:`~.Genotypes.log` """ - def __init__(self, fname: Path | str, log: Logger = None): + def __init__(self, fname: Path | str, log: logging.Logger = None): super().__init__(fname, log) self.ancestry = None self.valid_labels = None @@ -230,6 +230,7 @@ def read( samples: list[str] = None, variants: set[str] = None, max_variants: int = None, + reorder_samples: bool = None, ): """ See documentation for :py:meth:`~.Genotypes.read` @@ -299,6 +300,15 @@ def read( self.data = self.data.transpose((1, 0, 2)) self.ancestry = self.ancestry.transpose((1, 0, 2)) + if samples is not None: + samples = tuple(samples) + if reorder_samples: + self.subset(samples=samples, inplace=True) + elif reorder_samples is None and self.samples != samples: + self.log.warning( + "Samples in genotypes file do not match requested order" + ) + def subset( self, samples: tuple[str] = None, From 4cdcc43fe27329c650997c15cae56770c399d4d3 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 15 Nov 2023 15:42:21 -0800 Subject: [PATCH 3/9] use different sample ordering in tests and check data since HG00096 == HG00097 in simple.vcf --- tests/test_data.py | 106 ++++++++++++++++++++++++++++++++------------- 1 file changed, 75 insertions(+), 31 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index 367d243c..2080bebd 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -310,19 +310,29 @@ def test_merge_variants(self): assert gts.data.shape[1] == (gts1.data.shape[1] + gts2.data.shape[1]) def test_reorder_samples(self): - # can we load the data from the VCF? - gts = Genotypes(DATADIR / "simple.vcf.gz", reorder_samples=True) - gts.read(region="1:10115-10117", samples=["HG00097", "HG00096", "HG00099"]) - assert gts.samples == ("HG00097", "HG00096", "HG00099") + expected = self._get_expected_genotypes()[[0, 1, 2]] + samples = ["HG00099", "HG00096", "HG00097"] - gts = Genotypes(DATADIR / "simple.vcf.gz", reorder_samples=False) - gts.read(region="1:10115-10117", samples=["HG00097", "HG00096", "HG00099"]) + # can we load the data from the VCF? + gts = Genotypes(DATADIR / "simple.vcf.gz") + gts.read(samples=samples, reorder_samples=False) assert gts.samples == ("HG00096", "HG00097", "HG00099") + np.testing.assert_allclose(gts.data, expected) + + # now, let's reorder + expected = expected[[2, 0, 1]] + + gts = Genotypes(DATADIR / "simple.vcf.gz") + gts.read(samples=samples, reorder_samples=True) + assert gts.samples == tuple(samples) + np.testing.assert_allclose(gts.data, expected) class TestGenotypesPLINK: - def _get_fake_genotypes_plink(self): - gts_ref_alt = TestGenotypesVCF()._get_fake_genotypes_refalt() + def _get_fake_genotypes_plink(self, with_phase: bool = False): + gts_ref_alt = TestGenotypesVCF()._get_fake_genotypes_refalt( + with_phase=with_phase + ) gts = GenotypesPLINK(gts_ref_alt.fname) gts.data = gts_ref_alt.data gts.samples = gts_ref_alt.samples @@ -700,14 +710,22 @@ def test_write_multiallelic_tr(self): fname.unlink() def test_reorder_samples(self): - # can we load the data from the VCF? - gts = GenotypesPLINK(DATADIR / "simple.pgen", reorder_samples=True) - gts.read(region="1:10115-10117", samples=["HG00097", "HG00096", "HG00099"]) - assert gts.samples == ("HG00097", "HG00096", "HG00099") + expected = self._get_fake_genotypes_plink(with_phase=True).data[[0, 1, 2]] + samples = ["HG00099", "HG00096", "HG00097"] - gts = GenotypesPLINK(DATADIR / "simple.pgen", reorder_samples=False) - gts.read(region="1:10115-10117", samples=["HG00097", "HG00096", "HG00099"]) + # can we load the data from the VCF? + gts = GenotypesPLINK(DATADIR / "simple.pgen") + gts.read(samples=samples, reorder_samples=False) assert gts.samples == ("HG00096", "HG00097", "HG00099") + np.testing.assert_allclose(gts.data, expected) + + # now, let's reorder + expected = expected[[2, 0, 1]] + + gts = GenotypesPLINK(DATADIR / "simple.pgen") + gts.read(samples=samples, reorder_samples=True) + assert gts.samples == tuple(samples) + np.testing.assert_allclose(gts.data, expected) class TestGenotypesPLINKTR: @@ -770,14 +788,22 @@ def test_read(self): np.testing.assert_allclose(expected_alleles, gts.data) def test_reorder_samples(self): - # can we load the data from the VCF? - gts = GenotypesPLINKTR(DATADIR / "simple-tr.pgen", reorder_samples=True) - gts.read(samples=["HG00097", "HG00096", "HG00099"]) - assert gts.samples == ("HG00097", "HG00096", "HG00099") + expected = self._get_fake_genotypes_multiallelic().data[[0, 1, 2]] + samples = ["HG00099", "HG00096", "HG00097"] - gts = GenotypesPLINKTR(DATADIR / "simple-tr.pgen", reorder_samples=False) - gts.read(samples=["HG00097", "HG00096", "HG00099"]) + # can we load the data from the VCF? + gts = GenotypesPLINKTR(DATADIR / "simple-tr.pgen") + gts.read(samples=samples, reorder_samples=False) assert gts.samples == ("HG00096", "HG00097", "HG00099") + np.testing.assert_allclose(gts.data, expected) + + # now, let's reorder + expected = expected[[2, 0, 1]] + + gts = GenotypesPLINKTR(DATADIR / "simple-tr.pgen") + gts.read(samples=samples, reorder_samples=True) + assert gts.samples == tuple(samples) + np.testing.assert_allclose(gts.data, expected) class TestPhenotypes: @@ -1701,7 +1727,7 @@ def _get_fake_genotypes_refalt(self, with_phase=False): ) return gts - def _get_fake_genotypes_multiallelic(self, with_phase=False): + def _get_fake_genotypes_multiallelic(self, with_phase: bool = False): gts = self._get_fake_genotypes_refalt(with_phase=with_phase) # replace the necessary properties gts.variants["alleles"] = np.array( @@ -1878,13 +1904,22 @@ def test_merge_variants_vcf(self): assert gts.data.shape[1] == (gts1.data.shape[1] + gts2.data.shape[1]) def test_reorder_samples(self): - gts = GenotypesVCF(DATADIR / "simple.vcf.gz", reorder_samples=True) - gts.read(region="1:10113-10115", samples=["HG00097", "HG00096", "HG00099"]) - assert gts.samples == ("HG00097", "HG00096", "HG00099") + expected = self._get_fake_genotypes_refalt(with_phase=True).data[[0, 1, 2]] + samples = ["HG00099", "HG00096", "HG00097"] - gts = GenotypesVCF(DATADIR / "simple.vcf.gz", reorder_samples=False) - gts.read(region="1:10113-10115", samples=["HG00097", "HG00096", "HG00099"]) + # can we load the data from the VCF? + gts = GenotypesVCF(DATADIR / "simple.vcf") + gts.read(samples=samples, reorder_samples=False) assert gts.samples == ("HG00096", "HG00097", "HG00099") + np.testing.assert_allclose(gts.data, expected) + + # now, let's reorder + expected = expected[[2, 0, 1]] + + gts = GenotypesVCF(DATADIR / "simple.vcf") + gts.read(samples=samples, reorder_samples=True) + assert gts.samples == tuple(samples) + np.testing.assert_allclose(gts.data, expected) class TestGenotypesTR: @@ -1919,13 +1954,22 @@ def test_iter(self): np.testing.assert_allclose(line.data[:, :3], expected[:, idx]) def test_reorder_samples(self): - gts = GenotypesTR(DATADIR / "simple_tr.vcf", reorder_samples=True) - gts.read(samples=["HG00097", "HG00096", "HG00099"]) - assert gts.samples == ("HG00097", "HG00096", "HG00099") + expected = self._get_fake_tr_alleles()[[0, 1, 2]] + samples = ["HG00099", "HG00096", "HG00097"] - gts = GenotypesTR(DATADIR / "simple_tr.vcf", reorder_samples=False) - gts.read(samples=["HG00097", "HG00096", "HG00099"]) + # can we load the data from the VCF? + gts = GenotypesTR(DATADIR / "simple_tr.vcf") + gts.read(samples=samples, reorder_samples=False) assert gts.samples == ("HG00096", "HG00097", "HG00099") + np.testing.assert_allclose(gts.data, expected) + + # now, let's reorder + expected = expected[[2, 0, 1]] + + gts = GenotypesTR(DATADIR / "simple_tr.vcf") + gts.read(samples=samples, reorder_samples=True) + assert gts.samples == tuple(samples) + np.testing.assert_allclose(gts.data, expected) class TestBreakpoints: From 8725b75ac08c2b500d7e13d041a40c8e7b13e244 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 15 Nov 2023 16:08:46 -0800 Subject: [PATCH 4/9] add reorder_samples testing for GenotypesAncestry class too --- tests/test_transform.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/test_transform.py b/tests/test_transform.py index 989f068d..cb6327a5 100644 --- a/tests/test_transform.py +++ b/tests/test_transform.py @@ -36,8 +36,8 @@ def _get_expected_ancestry(self): expected[-1, -1, :] = 2 return expected - def _get_fake_genotypes(self): - base_gts = TestGenotypesVCF()._get_fake_genotypes_refalt(with_phase=True) + def _get_fake_genotypes(self, with_phase: bool = True): + base_gts = TestGenotypesVCF()._get_fake_genotypes_refalt(with_phase=with_phase) # copy all of the fields gts = GenotypesAncestry(fname=None) gts.data = base_gts.data @@ -126,6 +126,24 @@ def test_subset_genotypes(self): np.testing.assert_allclose(gts_sub.ancestry, expected_ancestry) assert np.array_equal(gts_sub.variants, expected_variants) + def test_reorder_samples(self): + expected = self._get_fake_genotypes().data[[0, 1, 2]] + samples = ["HG00099", "HG00096", "HG00097"] + + # can we load the data from the VCF? + gts = GenotypesAncestry(DATADIR / "simple-ancestry.vcf") + gts.read(samples=samples, reorder_samples=False) + assert gts.samples == ("HG00096", "HG00097", "HG00099") + np.testing.assert_allclose(gts.data, expected) + + # now, let's reorder + expected = expected[[2, 0, 1]] + + gts = GenotypesAncestry(DATADIR / "simple-ancestry.vcf") + gts.read(samples=samples, reorder_samples=True) + assert gts.samples == tuple(samples) + np.testing.assert_allclose(gts.data, expected) + @pytest.mark.xfail(reason="not implemented yet") def test_write_genotypes(self): assert False From 7b9cd2078f2fcde06e8823e5a747dd3db9883bb8 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 16 Nov 2023 16:35:28 -0800 Subject: [PATCH 5/9] also test reorder_samples in __iter__() --- tests/test_data.py | 74 +++++++++++++++++++++++++++++++++++++++-- tests/test_transform.py | 16 ++++++++- 2 files changed, 87 insertions(+), 3 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index 2080bebd..19a60e13 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -314,19 +314,33 @@ def test_reorder_samples(self): samples = ["HG00099", "HG00096", "HG00097"] # can we load the data from the VCF? - gts = Genotypes(DATADIR / "simple.vcf.gz") + gts = Genotypes(DATADIR / "simple.vcf") gts.read(samples=samples, reorder_samples=False) assert gts.samples == ("HG00096", "HG00097", "HG00099") np.testing.assert_allclose(gts.data, expected) + # let's also test __iter__ + gts = Genotypes(DATADIR / "simple.vcf") + gts_iter = gts.__iter__(samples=samples, reorder_samples=False) + for idx, line in enumerate(gts_iter): + np.testing.assert_allclose(line.data, expected[:, idx]) + assert gts.samples == ("HG00096", "HG00097", "HG00099") + # now, let's reorder expected = expected[[2, 0, 1]] - gts = Genotypes(DATADIR / "simple.vcf.gz") + gts = Genotypes(DATADIR / "simple.vcf") gts.read(samples=samples, reorder_samples=True) assert gts.samples == tuple(samples) np.testing.assert_allclose(gts.data, expected) + # let's also test __iter__ + gts = Genotypes(DATADIR / "simple.vcf") + gts_iter = gts.__iter__(samples=samples, reorder_samples=True) + for idx, line in enumerate(gts_iter): + np.testing.assert_allclose(line.data, expected[:, idx]) + assert gts.samples == tuple(samples) + class TestGenotypesPLINK: def _get_fake_genotypes_plink(self, with_phase: bool = False): @@ -719,6 +733,13 @@ def test_reorder_samples(self): assert gts.samples == ("HG00096", "HG00097", "HG00099") np.testing.assert_allclose(gts.data, expected) + # let's also test __iter__ + gts = GenotypesPLINK(DATADIR / "simple.pgen") + gts_iter = gts.__iter__(samples=samples, reorder_samples=False) + for idx, line in enumerate(gts_iter): + np.testing.assert_allclose(line.data, expected[:, idx]) + assert gts.samples == ("HG00096", "HG00097", "HG00099") + # now, let's reorder expected = expected[[2, 0, 1]] @@ -727,6 +748,13 @@ def test_reorder_samples(self): assert gts.samples == tuple(samples) np.testing.assert_allclose(gts.data, expected) + # let's also test __iter__ + gts = GenotypesPLINK(DATADIR / "simple.pgen") + gts_iter = gts.__iter__(samples=samples, reorder_samples=True) + for idx, line in enumerate(gts_iter): + np.testing.assert_allclose(line.data, expected[:, idx]) + assert gts.samples == tuple(samples) + class TestGenotypesPLINKTR: def _get_fake_genotypes_multiallelic(self): @@ -797,6 +825,13 @@ def test_reorder_samples(self): assert gts.samples == ("HG00096", "HG00097", "HG00099") np.testing.assert_allclose(gts.data, expected) + # let's also test __iter__ + gts = GenotypesPLINKTR(DATADIR / "simple-tr.pgen") + gts_iter = gts.__iter__(samples=samples, reorder_samples=False) + for idx, line in enumerate(gts_iter): + np.testing.assert_allclose(line.data, expected[:, idx]) + assert gts.samples == ("HG00096", "HG00097", "HG00099") + # now, let's reorder expected = expected[[2, 0, 1]] @@ -805,6 +840,13 @@ def test_reorder_samples(self): assert gts.samples == tuple(samples) np.testing.assert_allclose(gts.data, expected) + # let's also test __iter__ + gts = GenotypesPLINKTR(DATADIR / "simple-tr.pgen") + gts_iter = gts.__iter__(samples=samples, reorder_samples=True) + for idx, line in enumerate(gts_iter): + np.testing.assert_allclose(line.data, expected[:, idx]) + assert gts.samples == tuple(samples) + class TestPhenotypes: def _get_expected_phenotypes(self): @@ -1913,6 +1955,13 @@ def test_reorder_samples(self): assert gts.samples == ("HG00096", "HG00097", "HG00099") np.testing.assert_allclose(gts.data, expected) + # let's also test __iter__ + gts = GenotypesVCF(DATADIR / "simple.vcf") + gts_iter = gts.__iter__(samples=samples, reorder_samples=False) + for idx, line in enumerate(gts_iter): + np.testing.assert_allclose(line.data, expected[:, idx]) + assert gts.samples == ("HG00096", "HG00097", "HG00099") + # now, let's reorder expected = expected[[2, 0, 1]] @@ -1921,6 +1970,13 @@ def test_reorder_samples(self): assert gts.samples == tuple(samples) np.testing.assert_allclose(gts.data, expected) + # let's also test __iter__ + gts = GenotypesVCF(DATADIR / "simple.vcf") + gts_iter = gts.__iter__(samples=samples, reorder_samples=True) + for idx, line in enumerate(gts_iter): + np.testing.assert_allclose(line.data, expected[:, idx]) + assert gts.samples == tuple(samples) + class TestGenotypesTR: def _get_fake_tr_alleles(self): @@ -1963,6 +2019,13 @@ def test_reorder_samples(self): assert gts.samples == ("HG00096", "HG00097", "HG00099") np.testing.assert_allclose(gts.data, expected) + # let's also test __iter__ + gts = GenotypesTR(DATADIR / "simple_tr.vcf") + gts_iter = gts.__iter__(samples=samples, reorder_samples=False) + for idx, line in enumerate(gts_iter): + np.testing.assert_allclose(line.data, expected[:, idx]) + assert gts.samples == ("HG00096", "HG00097", "HG00099") + # now, let's reorder expected = expected[[2, 0, 1]] @@ -1971,6 +2034,13 @@ def test_reorder_samples(self): assert gts.samples == tuple(samples) np.testing.assert_allclose(gts.data, expected) + # let's also test __iter__ + gts = GenotypesTR(DATADIR / "simple_tr.vcf") + gts_iter = gts.__iter__(samples=samples, reorder_samples=True) + for idx, line in enumerate(gts_iter): + np.testing.assert_allclose(line.data, expected[:, idx]) + assert gts.samples == tuple(samples) + class TestBreakpoints: def _get_expected_breakpoints(self): diff --git a/tests/test_transform.py b/tests/test_transform.py index cb6327a5..969469b4 100644 --- a/tests/test_transform.py +++ b/tests/test_transform.py @@ -8,8 +8,8 @@ from haptools.transform import ( HaplotypeAncestry, - HaplotypesAncestry, GenotypesAncestry, + HaplotypesAncestry, ) from haptools.__main__ import main from .test_data import TestGenotypesVCF @@ -136,6 +136,13 @@ def test_reorder_samples(self): assert gts.samples == ("HG00096", "HG00097", "HG00099") np.testing.assert_allclose(gts.data, expected) + # let's also test __iter__ + gts = GenotypesAncestry(DATADIR / "simple-ancestry.vcf") + gts_iter = gts.__iter__(samples=samples, reorder_samples=False) + for idx, line in enumerate(gts_iter): + np.testing.assert_allclose(line.data, expected[:, idx]) + assert gts.samples == ("HG00096", "HG00097", "HG00099") + # now, let's reorder expected = expected[[2, 0, 1]] @@ -144,6 +151,13 @@ def test_reorder_samples(self): assert gts.samples == tuple(samples) np.testing.assert_allclose(gts.data, expected) + # let's also test __iter__ + gts = GenotypesAncestry(DATADIR / "simple-ancestry.vcf") + gts_iter = gts.__iter__(samples=samples, reorder_samples=True) + for idx, line in enumerate(gts_iter): + np.testing.assert_allclose(line.data, expected[:, idx]) + assert gts.samples == tuple(samples) + @pytest.mark.xfail(reason="not implemented yet") def test_write_genotypes(self): assert False From ef43c9eb2893deba272d359122fdb86d7e8f55c8 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 16 Nov 2023 22:25:01 -0800 Subject: [PATCH 6/9] also test ancestry samples reordering for GenotypesAncestry --- tests/test_transform.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/test_transform.py b/tests/test_transform.py index 969469b4..11772672 100644 --- a/tests/test_transform.py +++ b/tests/test_transform.py @@ -127,7 +127,9 @@ def test_subset_genotypes(self): assert np.array_equal(gts_sub.variants, expected_variants) def test_reorder_samples(self): - expected = self._get_fake_genotypes().data[[0, 1, 2]] + expected = self._get_fake_genotypes() + expected_ancestry = expected.ancestry[[0, 1, 2]] + expected = expected.data[[0, 1, 2]] samples = ["HG00099", "HG00096", "HG00097"] # can we load the data from the VCF? @@ -135,27 +137,32 @@ def test_reorder_samples(self): gts.read(samples=samples, reorder_samples=False) assert gts.samples == ("HG00096", "HG00097", "HG00099") np.testing.assert_allclose(gts.data, expected) + np.testing.assert_allclose(gts.ancestry, expected_ancestry) # let's also test __iter__ gts = GenotypesAncestry(DATADIR / "simple-ancestry.vcf") gts_iter = gts.__iter__(samples=samples, reorder_samples=False) for idx, line in enumerate(gts_iter): np.testing.assert_allclose(line.data, expected[:, idx]) + np.testing.assert_allclose(line.ancestry, expected_ancestry[:, idx]) assert gts.samples == ("HG00096", "HG00097", "HG00099") # now, let's reorder expected = expected[[2, 0, 1]] + expected_ancestry = expected_ancestry[[2, 0, 1]] gts = GenotypesAncestry(DATADIR / "simple-ancestry.vcf") gts.read(samples=samples, reorder_samples=True) assert gts.samples == tuple(samples) np.testing.assert_allclose(gts.data, expected) + np.testing.assert_allclose(gts.ancestry, expected_ancestry) # let's also test __iter__ gts = GenotypesAncestry(DATADIR / "simple-ancestry.vcf") gts_iter = gts.__iter__(samples=samples, reorder_samples=True) for idx, line in enumerate(gts_iter): np.testing.assert_allclose(line.data, expected[:, idx]) + np.testing.assert_allclose(line.ancestry, expected_ancestry[:, idx]) assert gts.samples == tuple(samples) @pytest.mark.xfail(reason="not implemented yet") From aed4d232f232797596521be9bddb4ac20ebb097c Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 16 Nov 2023 22:25:27 -0800 Subject: [PATCH 7/9] handle sample reordering in __iter__ methods --- haptools/data/genotypes.py | 77 +++++++++++++++++++++++++++++++++----- haptools/transform.py | 19 ++++++++-- 2 files changed, 83 insertions(+), 13 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 18294802..6700bc2b 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -163,7 +163,9 @@ def read( warning by passing False to this parameter. """ super().read() - records = self.__iter__(region=region, samples=samples, variants=variants) + records = self.__iter__( + region=region, samples=samples, variants=variants, reorder_samples=False, + ) if variants is not None: max_variants = len(variants) # check whether we can preallocate memory instead of making copies @@ -278,7 +280,13 @@ def _return_data(self, variant: Variant): """ return variant.genotype.array().astype(np.uint8) - def _iterate(self, vcf: VCF, region: str = None, variants: set[str] = None): + def _iterate( + self, + vcf: VCF, + region: str = None, + variants: set[str] = None, + samples_order: list[int] = None, + ): """ A generator over the lines of a VCF @@ -292,6 +300,9 @@ def _iterate(self, vcf: VCF, region: str = None, variants: set[str] = None): See documentation for :py:meth:`~.Genotypes.read` variants : set[str], optional See documentation for :py:meth:`~.Genotypes.read` + samples_order : list[int], optional + The indices of the samples in the order in which they should be + returned Yields ------ @@ -318,13 +329,18 @@ def _iterate(self, vcf: VCF, region: str = None, variants: set[str] = None): # 2) presence of REF in strand two # 3) whether the genotype is phased (if self._prephased is False) data = self._return_data(variant) - data = data[:, : (2 + (not self._prephased))] + # reorder samples if needed, as well + data = data[samples_order or Ellipsis, : (2 + (not self._prephased))] yield Record(data, variant_arr) num_seen += 1 vcf.close() def __iter__( - self, region: str = None, samples: list[str] = None, variants: set[str] = None + self, + region: str = None, + samples: list[str] = None, + variants: set[str] = None, + reorder_samples: bool = None, ) -> Iterator[namedtuple]: """ Read genotypes from a VCF line by line without storing anything @@ -337,6 +353,8 @@ def __iter__( See documentation for :py:meth:`~.Genotypes.read` variants : set[str], optional See documentation for :py:meth:`~.Genotypes.read` + reorder_samples : bool, optional + See documentation for :py:meth:`~.Genotypes.read` Returns ------- @@ -344,10 +362,23 @@ def __iter__( See documentation for :py:meth:`~.Genotypes._iterate` """ vcf = VCF(str(self.fname), samples=samples, lazy=True) + self.samples = tuple(vcf.samples) + samples_order = None + if samples is not None: + samples = tuple(samples) + if reorder_samples: + self.index(samples=True) + samples_order = [self._samp_idx[samp] for samp in samples] + self.samples = tuple(samples) + elif reorder_samples is None and self.samples != samples: + self.log.warning( + "Samples in genotypes file do not match requested order" + ) + # call another function to force the lines above to be run immediately # see https://stackoverflow.com/a/36726497 - return self._iterate(vcf, region, variants) + return self._iterate(vcf, region, variants, samples_order) def index(self, samples: bool = True, variants: bool = True): """ @@ -1383,6 +1414,7 @@ def _iterate( pgen: pgenlib.PgenReader, region: str = None, variants: set[str] = None, + samples_order: list[int] = None, ): """ A generator over the lines of a PGEN-PVAR file pair @@ -1397,6 +1429,9 @@ def _iterate( See documentation for :py:meth:`~.Genotypes.read` variants : set[str], optional See documentation for :py:meth:`~.Genotypes.read` + samples_order : list[int], optional + The indices of the samples in the order in which they should be + returned Yields ------ @@ -1427,7 +1462,9 @@ def _iterate( # concatenate phasing info into the data if not self._prephased: data = np.concatenate((data, phasing[:, np.newaxis]), axis=1) - # we extracted the genotypes to a matrix of size p x 3 + # reorder samples if needed + data = data[samples_order or Ellipsis] + # we extracted the genotypes to a matrix of size n x 3 # the last dimension has three items: # 1) presence of REF in strand one # 2) presence of REF in strand two @@ -1436,7 +1473,11 @@ def _iterate( pgen.close() def __iter__( - self, region: str = None, samples: list[str] = None, variants: set[str] = None + self, + region: str = None, + samples: list[str] = None, + variants: set[str] = None, + reorder_samples: bool = None, ) -> Iterator[namedtuple]: """ Read genotypes from a PGEN line by line without storing anything @@ -1449,6 +1490,8 @@ def __iter__( See documentation for :py:meth:`~.Genotypes.read` variants : set[str], optional See documentation for :py:meth:`~.Genotypes.read` + reorder_samples : bool, optional + See documentation for :py:meth:`~.Genotypes.read` Returns ------- @@ -1460,12 +1503,24 @@ def __iter__( pv = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8")) sample_idxs = self.read_samples(samples) + samples_order = None + if samples is not None: + samples = tuple(samples) + if reorder_samples: + self.index(samples=True) + samples_order = [self._samp_idx[samp] for samp in samples] + self.samples = tuple(samples) + elif reorder_samples is None and self.samples != samples: + self.log.warning( + "Samples in genotypes file do not match requested order" + ) + pgen = pgenlib.PgenReader( bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar=pv ) # call another function to force the lines above to be run immediately # see https://stackoverflow.com/a/36726497 - return self._iterate(pgen, region, variants) + return self._iterate(pgen, region, variants, samples_order) def write_samples(self): """ @@ -1772,6 +1827,7 @@ def _iterate( pgen: pgenlib.PgenReader, region: str = None, variants: set[str] = None, + samples_order: list[int] = None, ): """ A generator over the lines of a PGEN-PVAR file pair @@ -1786,6 +1842,9 @@ def _iterate( See documentation for :py:meth:`~.Genotypes.read` variants : set[str], optional See documentation for :py:meth:`~.Genotypes.read` + samples_order : list[int], optional + The indices of the samples in the order in which they should be + returned Yields ------ @@ -1794,7 +1853,7 @@ def _iterate( namedtuple containing each of the class properties """ tr_records = self._iter_TRRecords(region, variants) - variants = super()._iterate(pgen, region, variants) + variants = super()._iterate(pgen, region, variants, samples_order) for variant, record in zip(variants, tr_records): # extract the REF and ALT allele lengths allele_lens = np.array( diff --git a/haptools/transform.py b/haptools/transform.py index 35ddf45e..9673922c 100644 --- a/haptools/transform.py +++ b/haptools/transform.py @@ -28,7 +28,7 @@ class HaplotypeAncestry(data.Haplotype): default=(data.Extra("ancestry", "s", "Local ancestry"),), ) - def transform(self, genotypes: data.GenotypesVCF) -> npt.NDArray[bool]: + def transform(self, genotypes: data.GenotypesVCF) -> npt.NDArray: """ Transform a genotypes matrix via the current haplotype and its ancestral population @@ -184,7 +184,13 @@ def __init__(self, fname: Path | str, log: logging.Logger = None): # goes from encoding number to population code self.popnum_ancestry = {} - def _iterate(self, vcf: VCF, region: str = None, variants: set[str] = None): + def _iterate( + self, + vcf: VCF, + region: str = None, + variants: set[str] = None, + samples_order: list[int] = None, + ): """ See documentation for :py:meth:`~.Genotypes._iterate` """ @@ -208,7 +214,8 @@ def _iterate(self, vcf: VCF, region: str = None, variants: set[str] = None): # 2) presence of REF in strand two # 3) whether the genotype is phased (if self._prephased is False) data = np.array(variant.genotypes, dtype=np.uint8) - data = data[:, : (2 + (not self._prephased))] + # reorder samples if needed, as well + data = data[samples_order or Ellipsis, : (2 + (not self._prephased))] # also extract the ancestral population of each variant in each individual ancestry = np.empty((data.shape[0], 2), dtype=np.uint8) for i, sample in enumerate(variant.format("POP")): @@ -219,6 +226,8 @@ def _iterate(self, vcf: VCF, region: str = None, variants: set[str] = None): self.popnum_ancestry[pop_count] = pop pop_count += 1 ancestry[i] = tuple(map(self.ancestry_labels.get, pops)) + # reorder samples in ancestry too, if needed + ancestry = ancestry[samples_order or Ellipsis] # finally, output everything yield Record(data, ancestry, variant_arr) num_seen += 1 @@ -236,7 +245,9 @@ def read( See documentation for :py:meth:`~.Genotypes.read` """ super(data.Genotypes, self).read() - records = self.__iter__(region=region, samples=samples, variants=variants) + records = self.__iter__( + region=region, samples=samples, variants=variants, reorder_samples=False, + ) if variants is not None: max_variants = len(variants) # check whether we can preallocate memory instead of making copies From bfca6973fda2cb202cf40750070610ccd8c47dc9 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 17 Nov 2023 09:54:17 -0800 Subject: [PATCH 8/9] use VCF instead of cyvcf2.VCF --- haptools/data/genotypes.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 6700bc2b..8501ed3a 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -246,20 +246,20 @@ def _variant_arr(self, record: Variant): dtype=self.variants.dtype, ) - def _vcf_iter(self, vcf: cyvcf2.VCF, region: str): + def _vcf_iter(self, vcf: VCF, region: str): """ Yield all variants within a region in the VCF file. Parameters ---------- vcf: VCF - The cyvcf2.VCF object from which to fetch variant records + The VCF object from which to fetch variant records region : str, optional See documentation for :py:meth:`~.Genotypes.read` Returns ------- - vcffile : cyvcf2.VCF + vcffile : VCF Iterable cyvcf2 instance. """ return vcf(region) @@ -295,7 +295,7 @@ def _iterate( Parameters ---------- vcf: VCF - The cyvcf2.VCF object from which to fetch variant records + The VCF object from which to fetch variant records region : str, optional See documentation for :py:meth:`~.Genotypes.read` variants : set[str], optional @@ -946,14 +946,14 @@ def load( genotypes.check_phase() return genotypes - def _vcf_iter(self, vcf: cyvcf2.VCF, region: str = None): + def _vcf_iter(self, vcf: VCF, region: str = None): """ Collect GTs (trh.TRRecord objects) to iterate over Parameters ---------- vcf: VCF - The cyvcf2.VCF object from which to fetch variant records + The VCF object from which to fetch variant records region : str, optional See documentation for :py:meth:`~.Genotypes.read` From 0be098e0c979c1929abed57e303544e0d7c12c68 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 17 Nov 2023 09:55:10 -0800 Subject: [PATCH 9/9] refmt with black --- haptools/data/genotypes.py | 5 ++++- haptools/transform.py | 17 ++++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 8501ed3a..53c98a25 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -164,7 +164,10 @@ def read( """ super().read() records = self.__iter__( - region=region, samples=samples, variants=variants, reorder_samples=False, + region=region, + samples=samples, + variants=variants, + reorder_samples=False, ) if variants is not None: max_variants = len(variants) diff --git a/haptools/transform.py b/haptools/transform.py index 9673922c..73eef806 100644 --- a/haptools/transform.py +++ b/haptools/transform.py @@ -185,12 +185,12 @@ def __init__(self, fname: Path | str, log: logging.Logger = None): self.popnum_ancestry = {} def _iterate( - self, - vcf: VCF, - region: str = None, - variants: set[str] = None, - samples_order: list[int] = None, - ): + self, + vcf: VCF, + region: str = None, + variants: set[str] = None, + samples_order: list[int] = None, + ): """ See documentation for :py:meth:`~.Genotypes._iterate` """ @@ -246,7 +246,10 @@ def read( """ super(data.Genotypes, self).read() records = self.__iter__( - region=region, samples=samples, variants=variants, reorder_samples=False, + region=region, + samples=samples, + variants=variants, + reorder_samples=False, ) if variants is not None: max_variants = len(variants)