From c4c4de03ecf5a4a36c3ef338d8d51fc045b01270 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 15 Nov 2023 15:36:06 -0800
Subject: [PATCH 1/9] use python 3.8 style unioning instead

---
 haptools/data/genotypes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index d0562d9b..35114dc5 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -3,7 +3,7 @@
 import gc
 from csv import reader
 from pathlib import Path
-from typing import Iterator, Union
+from typing import Iterator
 from logging import getLogger, Logger
 from collections import namedtuple, Counter
 
@@ -837,7 +837,7 @@ def __init__(
         self,
         vcffile: VCF,
         vcfiter: object,
-        vcftype: Union[str, trh.VcfTypes] = "auto",
+        vcftype: str | trh.VcfTypes = "auto",
     ):
         super().__init__(vcffile, vcftype)
         self.vcfiter = vcfiter

From dd2976b0cd4c6402a66123ae536cce24f20a0885 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 15 Nov 2023 15:40:45 -0800
Subject: [PATCH 2/9] move reorder_samples param into read() since it is not a
 necessary variable for both reading and writing - just for reading

---
 haptools/data/genotypes.py | 91 ++++++++++++++++----------------------
 haptools/transform.py      | 18 ++++++--
 2 files changed, 51 insertions(+), 58 deletions(-)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 35114dc5..18294802 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -40,9 +40,6 @@ class Genotypes(Data):
             3. POS
     log: Logger
         A logging instance for recording debug statements
-    reorder_samples: bool, optional
-        If true and the list of samples are given to read load samples in the same
-        order as the tuple in samples. Otherwise, load in order of read in by VCF.
     _prephased : bool
         If True, assume that the genotypes are phased. Otherwise, extract their phase
         when reading from the VCF.
@@ -60,9 +57,7 @@ class Genotypes(Data):
     >>> genotypes.data
     """
 
-    def __init__(
-        self, fname: Path | str, log: Logger = None, reorder_samples: bool = True
-    ):
+    def __init__(self, fname: Path | str, log: Logger = None):
         super().__init__(fname, log)
         self.samples = tuple()
         self.variants = np.array(
@@ -76,7 +71,6 @@ def __init__(
         self._prephased = False
         self._samp_idx = None
         self._var_idx = None
-        self.reorder_samples = reorder_samples
 
     @classmethod
     def load(
@@ -121,6 +115,7 @@ def read(
         samples: list[str] = None,
         variants: set[str] = None,
         max_variants: int = None,
+        reorder_samples: bool = None,
     ):
         """
         Read genotypes from a VCF into a numpy matrix stored in :py:attr:`~.Genotypes.data`
@@ -159,13 +154,15 @@ def read(
             number of expected sites within a region.
 
             Note that this value is ignored if the variants argument is provided.
+        reorder_samples : bool, optional
+            Reorder samples in the gneotype matrix to match the order specified via the
+            samples parameter. This parameter is ignored when the samples parameter is
+            not specified. Note that reordering the samples requires us to make a copy
+            of the genotype matrix in memory, so it is disabled by default. Instead, a
+            warning is issued when the samples do not match up. You can silence the
+            warning by passing False to this parameter.
         """
         super().read()
-        if not self.reorder_samples and samples:
-            self.log.warning(
-                "Loading genotypes in order of variant file. Output samples and data"
-                " may not be in the same order as the given samples array."
-            )
         records = self.__iter__(region=region, samples=samples, variants=variants)
         if variants is not None:
             max_variants = len(variants)
@@ -216,8 +213,14 @@ def read(
         self.log.info(f"Transposing genotype matrix of size {self.data.shape}")
         self.data = self.data.transpose((1, 0, 2))
 
-        if samples and self.reorder_samples:
-            self.subset(samples=samples, inplace=True)
+        if samples is not None:
+            samples = tuple(samples)
+            if reorder_samples:
+                self.subset(samples=samples, inplace=True)
+            elif reorder_samples is None and self.samples != samples:
+                self.log.warning(
+                    "Samples in genotypes file do not match requested order"
+                )
 
     def _variant_arr(self, record: Variant):
         """
@@ -732,15 +735,10 @@ class GenotypesVCF(Genotypes):
             4. [REF, ALT1, ALT2, ...]
     log: Logger
         See documentation for :py:attr:`~.Genotypes.log`
-    reorder_samples: bool, optional
-        If true and the list of samples are given to read load samples in the same
-        order as the tuple in samples. Otherwise, load in order of read in by VCF.
     """
 
-    def __init__(
-        self, fname: Path | str, log: Logger = None, reorder_samples: bool = True
-    ):
-        super().__init__(fname, log, reorder_samples)
+    def __init__(self, fname: Path | str, log: Logger = None):
+        super().__init__(fname, log)
         dtype = {k: v[0] for k, v in self.variants.dtype.fields.items()}
         self.variants = np.array([], dtype=list(dtype.items()) + [("alleles", object)])
 
@@ -871,9 +869,6 @@ class GenotypesTR(Genotypes):
     vcftype: str
         TR vcf type currently being read. Options are
         {'auto', 'gangstr', 'advntr', 'hipstr', 'eh', 'popstr'}
-    reorder_samples: bool, optional
-        If true and the list of samples are given to read load samples in the same
-        order as the tuple in samples. Otherwise, load in order of read in by VCF.
     """
 
     def __init__(
@@ -881,9 +876,8 @@ def __init__(
         fname: Path | str,
         log: Logger = None,
         vcftype: str = "auto",
-        reorder_samples: bool = True,
     ):
-        super().__init__(fname, log, reorder_samples)
+        super().__init__(fname, log)
         self.vcftype = vcftype
 
     @classmethod
@@ -1009,9 +1003,6 @@ class GenotypesPLINK(GenotypesVCF):
 
         If this value is provided, variants from the PGEN file will be loaded in
         chunks so as to use less memory
-    reorder_samples: bool, optional
-        If true and the list of samples are given to read load samples in the same
-        order as the tuple in samples. Otherwise, load in order of read in by VCF.
 
     Examples
     --------
@@ -1023,9 +1014,8 @@ def __init__(
         fname: Path | str,
         log: Logger = None,
         chunk_size: int = None,
-        reorder_samples: bool = True,
     ):
-        super().__init__(fname, log, reorder_samples)
+        super().__init__(fname, log)
         self.chunk_size = chunk_size
 
     def read_samples(self, samples: list[str] = None):
@@ -1281,6 +1271,7 @@ def read(
         samples: list[str] = None,
         variants: set[str] = None,
         max_variants: int = None,
+        reorder_samples: bool = None,
     ):
         """
         Read genotypes from a PGEN file into a numpy matrix stored in
@@ -1296,15 +1287,11 @@ def read(
             See documentation for :py:attr:`~.GenotypesVCF.read`
         max_variants : int, optional
             See documentation for :py:attr:`~.GenotypesVCF.read`
+        reorder_samples : bool, optional
+            See documentation for :py:attr:`~.GenotypesVCF.read`
         """
         super(Genotypes, self).read()
 
-        if not self.reorder_samples and samples:
-            self.log.warning(
-                "Loading genotypes in order of variant file. Output samples and data"
-                " may not be in the same order as the given samples array."
-            )
-
         sample_idxs = self.read_samples(samples)
         pv = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8"))
 
@@ -1382,8 +1369,14 @@ def read(
                 del data
                 gc.collect()
 
-        if samples and self.reorder_samples:
-            self.subset(samples=samples, inplace=True)
+        if samples is not None:
+            samples = tuple(samples)
+            if reorder_samples:
+                self.subset(samples=samples, inplace=True)
+            elif reorder_samples is None and self.samples != samples:
+                self.log.warning(
+                    "Samples in genotypes file do not match requested order"
+                )
 
     def _iterate(
         self,
@@ -1639,9 +1632,6 @@ class GenotypesPLINKTR(GenotypesPLINK):
         See documentation for :py:attr:`~.GenotypesPLINK.chunk_size`
     vcftype: str, optional
         See documentation for :py:attr:`~.GenotypesTR.vcftype`
-    reorder_samples: bool, optional
-        If true and the list of samples are given to read load samples in the same
-        order as the tuple in samples. Otherwise, load in order of read in by VCF.
     Examples
     --------
     >>> genotypes = GenotypesPLINK.load('tests/data/simple.pgen')
@@ -1653,9 +1643,8 @@ def __init__(
         log: Logger = None,
         chunk_size: int = None,
         vcftype: str = "auto",
-        reorder_samples: bool = True,
     ):
-        super().__init__(fname, log, chunk_size, reorder_samples)
+        super().__init__(fname, log, chunk_size)
         self.vcftype = vcftype
 
     @classmethod
@@ -1729,6 +1718,7 @@ def read(
         samples: list[str] = None,
         variants: set[str] = None,
         max_variants: int = None,
+        reorder_samples: bool = None,
     ):
         """
         Read genotypes from a PGEN file into a numpy matrix stored in
@@ -1744,14 +1734,10 @@ def read(
             See documentation for :py:attr:`~.GenotypesVCF.read`
         max_variants : int, optional
             See documentation for :py:attr:`~.GenotypesVCF.read`
+        reorder_samples : int, optional
+            See documentation for :py:attr:`~.GenotypesVCF.read`
         """
-        super().read(region, samples, variants, max_variants)
-
-        if not self.reorder_samples and samples:
-            self.log.warning(
-                "Loading genotypes in order of variant file. Output samples and data"
-                " may not be in the same order as the given samples array."
-            )
+        super().read(region, samples, variants, max_variants, reorder_samples)
 
         num_variants = len(self.variants)
         # initialize a jagged array of allele lengths
@@ -1781,9 +1767,6 @@ def read(
         del allele_lens
         gc.collect()
 
-        if samples and self.reorder_samples:
-            self.subset(samples=samples, inplace=True)
-
     def _iterate(
         self,
         pgen: pgenlib.PgenReader,
diff --git a/haptools/transform.py b/haptools/transform.py
index df70f6cb..35ddf45e 100644
--- a/haptools/transform.py
+++ b/haptools/transform.py
@@ -5,8 +5,8 @@
 from dataclasses import dataclass, field
 
 import numpy as np
+from cyvcf2 import VCF
 import numpy.typing as npt
-from cyvcf2 import VCF, Variant
 from pysam import VariantFile
 
 from . import data
@@ -80,7 +80,7 @@ def __init__(
         fname: Path | str,
         haplotype: type[HaplotypeAncestry] = HaplotypeAncestry,
         variant: type[data.Variant] = data.Variant,
-        log: Logger = None,
+        log: logging.Logger = None,
     ):
         """
         Contrasting with the base Haplotypes class: this class uses HaplotypeAncestry
@@ -171,11 +171,11 @@ class GenotypesAncestry(data.GenotypesVCF):
     ancestry : np.array
         The ancestral population of each allele in each sample of
         :py:attr:`~.GenotypesAncestry.data`
-    log: Logger
+    log: logging.Logger
         See documentation for :py:attr:`~.Genotypes.log`
     """
 
-    def __init__(self, fname: Path | str, log: Logger = None):
+    def __init__(self, fname: Path | str, log: logging.Logger = None):
         super().__init__(fname, log)
         self.ancestry = None
         self.valid_labels = None
@@ -230,6 +230,7 @@ def read(
         samples: list[str] = None,
         variants: set[str] = None,
         max_variants: int = None,
+        reorder_samples: bool = None,
     ):
         """
         See documentation for :py:meth:`~.Genotypes.read`
@@ -299,6 +300,15 @@ def read(
         self.data = self.data.transpose((1, 0, 2))
         self.ancestry = self.ancestry.transpose((1, 0, 2))
 
+        if samples is not None:
+            samples = tuple(samples)
+            if reorder_samples:
+                self.subset(samples=samples, inplace=True)
+            elif reorder_samples is None and self.samples != samples:
+                self.log.warning(
+                    "Samples in genotypes file do not match requested order"
+                )
+
     def subset(
         self,
         samples: tuple[str] = None,

From 4cdcc43fe27329c650997c15cae56770c399d4d3 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 15 Nov 2023 15:42:21 -0800
Subject: [PATCH 3/9] use different sample ordering in tests and check data
 since HG00096 == HG00097 in simple.vcf

---
 tests/test_data.py | 106 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 75 insertions(+), 31 deletions(-)

diff --git a/tests/test_data.py b/tests/test_data.py
index 367d243c..2080bebd 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -310,19 +310,29 @@ def test_merge_variants(self):
         assert gts.data.shape[1] == (gts1.data.shape[1] + gts2.data.shape[1])
 
     def test_reorder_samples(self):
-        # can we load the data from the VCF?
-        gts = Genotypes(DATADIR / "simple.vcf.gz", reorder_samples=True)
-        gts.read(region="1:10115-10117", samples=["HG00097", "HG00096", "HG00099"])
-        assert gts.samples == ("HG00097", "HG00096", "HG00099")
+        expected = self._get_expected_genotypes()[[0, 1, 2]]
+        samples = ["HG00099", "HG00096", "HG00097"]
 
-        gts = Genotypes(DATADIR / "simple.vcf.gz", reorder_samples=False)
-        gts.read(region="1:10115-10117", samples=["HG00097", "HG00096", "HG00099"])
+        # can we load the data from the VCF?
+        gts = Genotypes(DATADIR / "simple.vcf.gz")
+        gts.read(samples=samples, reorder_samples=False)
         assert gts.samples == ("HG00096", "HG00097", "HG00099")
+        np.testing.assert_allclose(gts.data, expected)
+
+        # now, let's reorder
+        expected = expected[[2, 0, 1]]
+
+        gts = Genotypes(DATADIR / "simple.vcf.gz")
+        gts.read(samples=samples, reorder_samples=True)
+        assert gts.samples == tuple(samples)
+        np.testing.assert_allclose(gts.data, expected)
 
 
 class TestGenotypesPLINK:
-    def _get_fake_genotypes_plink(self):
-        gts_ref_alt = TestGenotypesVCF()._get_fake_genotypes_refalt()
+    def _get_fake_genotypes_plink(self, with_phase: bool = False):
+        gts_ref_alt = TestGenotypesVCF()._get_fake_genotypes_refalt(
+            with_phase=with_phase
+        )
         gts = GenotypesPLINK(gts_ref_alt.fname)
         gts.data = gts_ref_alt.data
         gts.samples = gts_ref_alt.samples
@@ -700,14 +710,22 @@ def test_write_multiallelic_tr(self):
         fname.unlink()
 
     def test_reorder_samples(self):
-        # can we load the data from the VCF?
-        gts = GenotypesPLINK(DATADIR / "simple.pgen", reorder_samples=True)
-        gts.read(region="1:10115-10117", samples=["HG00097", "HG00096", "HG00099"])
-        assert gts.samples == ("HG00097", "HG00096", "HG00099")
+        expected = self._get_fake_genotypes_plink(with_phase=True).data[[0, 1, 2]]
+        samples = ["HG00099", "HG00096", "HG00097"]
 
-        gts = GenotypesPLINK(DATADIR / "simple.pgen", reorder_samples=False)
-        gts.read(region="1:10115-10117", samples=["HG00097", "HG00096", "HG00099"])
+        # can we load the data from the VCF?
+        gts = GenotypesPLINK(DATADIR / "simple.pgen")
+        gts.read(samples=samples, reorder_samples=False)
         assert gts.samples == ("HG00096", "HG00097", "HG00099")
+        np.testing.assert_allclose(gts.data, expected)
+
+        # now, let's reorder
+        expected = expected[[2, 0, 1]]
+
+        gts = GenotypesPLINK(DATADIR / "simple.pgen")
+        gts.read(samples=samples, reorder_samples=True)
+        assert gts.samples == tuple(samples)
+        np.testing.assert_allclose(gts.data, expected)
 
 
 class TestGenotypesPLINKTR:
@@ -770,14 +788,22 @@ def test_read(self):
         np.testing.assert_allclose(expected_alleles, gts.data)
 
     def test_reorder_samples(self):
-        # can we load the data from the VCF?
-        gts = GenotypesPLINKTR(DATADIR / "simple-tr.pgen", reorder_samples=True)
-        gts.read(samples=["HG00097", "HG00096", "HG00099"])
-        assert gts.samples == ("HG00097", "HG00096", "HG00099")
+        expected = self._get_fake_genotypes_multiallelic().data[[0, 1, 2]]
+        samples = ["HG00099", "HG00096", "HG00097"]
 
-        gts = GenotypesPLINKTR(DATADIR / "simple-tr.pgen", reorder_samples=False)
-        gts.read(samples=["HG00097", "HG00096", "HG00099"])
+        # can we load the data from the VCF?
+        gts = GenotypesPLINKTR(DATADIR / "simple-tr.pgen")
+        gts.read(samples=samples, reorder_samples=False)
         assert gts.samples == ("HG00096", "HG00097", "HG00099")
+        np.testing.assert_allclose(gts.data, expected)
+
+        # now, let's reorder
+        expected = expected[[2, 0, 1]]
+
+        gts = GenotypesPLINKTR(DATADIR / "simple-tr.pgen")
+        gts.read(samples=samples, reorder_samples=True)
+        assert gts.samples == tuple(samples)
+        np.testing.assert_allclose(gts.data, expected)
 
 
 class TestPhenotypes:
@@ -1701,7 +1727,7 @@ def _get_fake_genotypes_refalt(self, with_phase=False):
         )
         return gts
 
-    def _get_fake_genotypes_multiallelic(self, with_phase=False):
+    def _get_fake_genotypes_multiallelic(self, with_phase: bool = False):
         gts = self._get_fake_genotypes_refalt(with_phase=with_phase)
         # replace the necessary properties
         gts.variants["alleles"] = np.array(
@@ -1878,13 +1904,22 @@ def test_merge_variants_vcf(self):
         assert gts.data.shape[1] == (gts1.data.shape[1] + gts2.data.shape[1])
 
     def test_reorder_samples(self):
-        gts = GenotypesVCF(DATADIR / "simple.vcf.gz", reorder_samples=True)
-        gts.read(region="1:10113-10115", samples=["HG00097", "HG00096", "HG00099"])
-        assert gts.samples == ("HG00097", "HG00096", "HG00099")
+        expected = self._get_fake_genotypes_refalt(with_phase=True).data[[0, 1, 2]]
+        samples = ["HG00099", "HG00096", "HG00097"]
 
-        gts = GenotypesVCF(DATADIR / "simple.vcf.gz", reorder_samples=False)
-        gts.read(region="1:10113-10115", samples=["HG00097", "HG00096", "HG00099"])
+        # can we load the data from the VCF?
+        gts = GenotypesVCF(DATADIR / "simple.vcf")
+        gts.read(samples=samples, reorder_samples=False)
         assert gts.samples == ("HG00096", "HG00097", "HG00099")
+        np.testing.assert_allclose(gts.data, expected)
+
+        # now, let's reorder
+        expected = expected[[2, 0, 1]]
+
+        gts = GenotypesVCF(DATADIR / "simple.vcf")
+        gts.read(samples=samples, reorder_samples=True)
+        assert gts.samples == tuple(samples)
+        np.testing.assert_allclose(gts.data, expected)
 
 
 class TestGenotypesTR:
@@ -1919,13 +1954,22 @@ def test_iter(self):
             np.testing.assert_allclose(line.data[:, :3], expected[:, idx])
 
     def test_reorder_samples(self):
-        gts = GenotypesTR(DATADIR / "simple_tr.vcf", reorder_samples=True)
-        gts.read(samples=["HG00097", "HG00096", "HG00099"])
-        assert gts.samples == ("HG00097", "HG00096", "HG00099")
+        expected = self._get_fake_tr_alleles()[[0, 1, 2]]
+        samples = ["HG00099", "HG00096", "HG00097"]
 
-        gts = GenotypesTR(DATADIR / "simple_tr.vcf", reorder_samples=False)
-        gts.read(samples=["HG00097", "HG00096", "HG00099"])
+        # can we load the data from the VCF?
+        gts = GenotypesTR(DATADIR / "simple_tr.vcf")
+        gts.read(samples=samples, reorder_samples=False)
         assert gts.samples == ("HG00096", "HG00097", "HG00099")
+        np.testing.assert_allclose(gts.data, expected)
+
+        # now, let's reorder
+        expected = expected[[2, 0, 1]]
+
+        gts = GenotypesTR(DATADIR / "simple_tr.vcf")
+        gts.read(samples=samples, reorder_samples=True)
+        assert gts.samples == tuple(samples)
+        np.testing.assert_allclose(gts.data, expected)
 
 
 class TestBreakpoints:

From 8725b75ac08c2b500d7e13d041a40c8e7b13e244 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 15 Nov 2023 16:08:46 -0800
Subject: [PATCH 4/9] add reorder_samples testing for GenotypesAncestry class
 too

---
 tests/test_transform.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/tests/test_transform.py b/tests/test_transform.py
index 989f068d..cb6327a5 100644
--- a/tests/test_transform.py
+++ b/tests/test_transform.py
@@ -36,8 +36,8 @@ def _get_expected_ancestry(self):
         expected[-1, -1, :] = 2
         return expected
 
-    def _get_fake_genotypes(self):
-        base_gts = TestGenotypesVCF()._get_fake_genotypes_refalt(with_phase=True)
+    def _get_fake_genotypes(self, with_phase: bool = True):
+        base_gts = TestGenotypesVCF()._get_fake_genotypes_refalt(with_phase=with_phase)
         # copy all of the fields
         gts = GenotypesAncestry(fname=None)
         gts.data = base_gts.data
@@ -126,6 +126,24 @@ def test_subset_genotypes(self):
         np.testing.assert_allclose(gts_sub.ancestry, expected_ancestry)
         assert np.array_equal(gts_sub.variants, expected_variants)
 
+    def test_reorder_samples(self):
+        expected = self._get_fake_genotypes().data[[0, 1, 2]]
+        samples = ["HG00099", "HG00096", "HG00097"]
+
+        # can we load the data from the VCF?
+        gts = GenotypesAncestry(DATADIR / "simple-ancestry.vcf")
+        gts.read(samples=samples, reorder_samples=False)
+        assert gts.samples == ("HG00096", "HG00097", "HG00099")
+        np.testing.assert_allclose(gts.data, expected)
+
+        # now, let's reorder
+        expected = expected[[2, 0, 1]]
+
+        gts = GenotypesAncestry(DATADIR / "simple-ancestry.vcf")
+        gts.read(samples=samples, reorder_samples=True)
+        assert gts.samples == tuple(samples)
+        np.testing.assert_allclose(gts.data, expected)
+
     @pytest.mark.xfail(reason="not implemented yet")
     def test_write_genotypes(self):
         assert False

From 7b9cd2078f2fcde06e8823e5a747dd3db9883bb8 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Thu, 16 Nov 2023 16:35:28 -0800
Subject: [PATCH 5/9] also test reorder_samples in __iter__()

---
 tests/test_data.py      | 74 +++++++++++++++++++++++++++++++++++++++--
 tests/test_transform.py | 16 ++++++++-
 2 files changed, 87 insertions(+), 3 deletions(-)

diff --git a/tests/test_data.py b/tests/test_data.py
index 2080bebd..19a60e13 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -314,19 +314,33 @@ def test_reorder_samples(self):
         samples = ["HG00099", "HG00096", "HG00097"]
 
         # can we load the data from the VCF?
-        gts = Genotypes(DATADIR / "simple.vcf.gz")
+        gts = Genotypes(DATADIR / "simple.vcf")
         gts.read(samples=samples, reorder_samples=False)
         assert gts.samples == ("HG00096", "HG00097", "HG00099")
         np.testing.assert_allclose(gts.data, expected)
 
+        # let's also test __iter__
+        gts = Genotypes(DATADIR / "simple.vcf")
+        gts_iter = gts.__iter__(samples=samples, reorder_samples=False)
+        for idx, line in enumerate(gts_iter):
+            np.testing.assert_allclose(line.data, expected[:, idx])
+        assert gts.samples == ("HG00096", "HG00097", "HG00099")
+
         # now, let's reorder
         expected = expected[[2, 0, 1]]
 
-        gts = Genotypes(DATADIR / "simple.vcf.gz")
+        gts = Genotypes(DATADIR / "simple.vcf")
         gts.read(samples=samples, reorder_samples=True)
         assert gts.samples == tuple(samples)
         np.testing.assert_allclose(gts.data, expected)
 
+        # let's also test __iter__
+        gts = Genotypes(DATADIR / "simple.vcf")
+        gts_iter = gts.__iter__(samples=samples, reorder_samples=True)
+        for idx, line in enumerate(gts_iter):
+            np.testing.assert_allclose(line.data, expected[:, idx])
+        assert gts.samples == tuple(samples)
+
 
 class TestGenotypesPLINK:
     def _get_fake_genotypes_plink(self, with_phase: bool = False):
@@ -719,6 +733,13 @@ def test_reorder_samples(self):
         assert gts.samples == ("HG00096", "HG00097", "HG00099")
         np.testing.assert_allclose(gts.data, expected)
 
+        # let's also test __iter__
+        gts = GenotypesPLINK(DATADIR / "simple.pgen")
+        gts_iter = gts.__iter__(samples=samples, reorder_samples=False)
+        for idx, line in enumerate(gts_iter):
+            np.testing.assert_allclose(line.data, expected[:, idx])
+        assert gts.samples == ("HG00096", "HG00097", "HG00099")
+
         # now, let's reorder
         expected = expected[[2, 0, 1]]
 
@@ -727,6 +748,13 @@ def test_reorder_samples(self):
         assert gts.samples == tuple(samples)
         np.testing.assert_allclose(gts.data, expected)
 
+        # let's also test __iter__
+        gts = GenotypesPLINK(DATADIR / "simple.pgen")
+        gts_iter = gts.__iter__(samples=samples, reorder_samples=True)
+        for idx, line in enumerate(gts_iter):
+            np.testing.assert_allclose(line.data, expected[:, idx])
+        assert gts.samples == tuple(samples)
+
 
 class TestGenotypesPLINKTR:
     def _get_fake_genotypes_multiallelic(self):
@@ -797,6 +825,13 @@ def test_reorder_samples(self):
         assert gts.samples == ("HG00096", "HG00097", "HG00099")
         np.testing.assert_allclose(gts.data, expected)
 
+        # let's also test __iter__
+        gts = GenotypesPLINKTR(DATADIR / "simple-tr.pgen")
+        gts_iter = gts.__iter__(samples=samples, reorder_samples=False)
+        for idx, line in enumerate(gts_iter):
+            np.testing.assert_allclose(line.data, expected[:, idx])
+        assert gts.samples == ("HG00096", "HG00097", "HG00099")
+
         # now, let's reorder
         expected = expected[[2, 0, 1]]
 
@@ -805,6 +840,13 @@ def test_reorder_samples(self):
         assert gts.samples == tuple(samples)
         np.testing.assert_allclose(gts.data, expected)
 
+        # let's also test __iter__
+        gts = GenotypesPLINKTR(DATADIR / "simple-tr.pgen")
+        gts_iter = gts.__iter__(samples=samples, reorder_samples=True)
+        for idx, line in enumerate(gts_iter):
+            np.testing.assert_allclose(line.data, expected[:, idx])
+        assert gts.samples == tuple(samples)
+
 
 class TestPhenotypes:
     def _get_expected_phenotypes(self):
@@ -1913,6 +1955,13 @@ def test_reorder_samples(self):
         assert gts.samples == ("HG00096", "HG00097", "HG00099")
         np.testing.assert_allclose(gts.data, expected)
 
+        # let's also test __iter__
+        gts = GenotypesVCF(DATADIR / "simple.vcf")
+        gts_iter = gts.__iter__(samples=samples, reorder_samples=False)
+        for idx, line in enumerate(gts_iter):
+            np.testing.assert_allclose(line.data, expected[:, idx])
+        assert gts.samples == ("HG00096", "HG00097", "HG00099")
+
         # now, let's reorder
         expected = expected[[2, 0, 1]]
 
@@ -1921,6 +1970,13 @@ def test_reorder_samples(self):
         assert gts.samples == tuple(samples)
         np.testing.assert_allclose(gts.data, expected)
 
+        # let's also test __iter__
+        gts = GenotypesVCF(DATADIR / "simple.vcf")
+        gts_iter = gts.__iter__(samples=samples, reorder_samples=True)
+        for idx, line in enumerate(gts_iter):
+            np.testing.assert_allclose(line.data, expected[:, idx])
+        assert gts.samples == tuple(samples)
+
 
 class TestGenotypesTR:
     def _get_fake_tr_alleles(self):
@@ -1963,6 +2019,13 @@ def test_reorder_samples(self):
         assert gts.samples == ("HG00096", "HG00097", "HG00099")
         np.testing.assert_allclose(gts.data, expected)
 
+        # let's also test __iter__
+        gts = GenotypesTR(DATADIR / "simple_tr.vcf")
+        gts_iter = gts.__iter__(samples=samples, reorder_samples=False)
+        for idx, line in enumerate(gts_iter):
+            np.testing.assert_allclose(line.data, expected[:, idx])
+        assert gts.samples == ("HG00096", "HG00097", "HG00099")
+
         # now, let's reorder
         expected = expected[[2, 0, 1]]
 
@@ -1971,6 +2034,13 @@ def test_reorder_samples(self):
         assert gts.samples == tuple(samples)
         np.testing.assert_allclose(gts.data, expected)
 
+        # let's also test __iter__
+        gts = GenotypesTR(DATADIR / "simple_tr.vcf")
+        gts_iter = gts.__iter__(samples=samples, reorder_samples=True)
+        for idx, line in enumerate(gts_iter):
+            np.testing.assert_allclose(line.data, expected[:, idx])
+        assert gts.samples == tuple(samples)
+
 
 class TestBreakpoints:
     def _get_expected_breakpoints(self):
diff --git a/tests/test_transform.py b/tests/test_transform.py
index cb6327a5..969469b4 100644
--- a/tests/test_transform.py
+++ b/tests/test_transform.py
@@ -8,8 +8,8 @@
 
 from haptools.transform import (
     HaplotypeAncestry,
-    HaplotypesAncestry,
     GenotypesAncestry,
+    HaplotypesAncestry,
 )
 from haptools.__main__ import main
 from .test_data import TestGenotypesVCF
@@ -136,6 +136,13 @@ def test_reorder_samples(self):
         assert gts.samples == ("HG00096", "HG00097", "HG00099")
         np.testing.assert_allclose(gts.data, expected)
 
+        # let's also test __iter__
+        gts = GenotypesAncestry(DATADIR / "simple-ancestry.vcf")
+        gts_iter = gts.__iter__(samples=samples, reorder_samples=False)
+        for idx, line in enumerate(gts_iter):
+            np.testing.assert_allclose(line.data, expected[:, idx])
+        assert gts.samples == ("HG00096", "HG00097", "HG00099")
+
         # now, let's reorder
         expected = expected[[2, 0, 1]]
 
@@ -144,6 +151,13 @@ def test_reorder_samples(self):
         assert gts.samples == tuple(samples)
         np.testing.assert_allclose(gts.data, expected)
 
+        # let's also test __iter__
+        gts = GenotypesAncestry(DATADIR / "simple-ancestry.vcf")
+        gts_iter = gts.__iter__(samples=samples, reorder_samples=True)
+        for idx, line in enumerate(gts_iter):
+            np.testing.assert_allclose(line.data, expected[:, idx])
+        assert gts.samples == tuple(samples)
+
     @pytest.mark.xfail(reason="not implemented yet")
     def test_write_genotypes(self):
         assert False

From ef43c9eb2893deba272d359122fdb86d7e8f55c8 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Thu, 16 Nov 2023 22:25:01 -0800
Subject: [PATCH 6/9] also test ancestry samples reordering for
 GenotypesAncestry

---
 tests/test_transform.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tests/test_transform.py b/tests/test_transform.py
index 969469b4..11772672 100644
--- a/tests/test_transform.py
+++ b/tests/test_transform.py
@@ -127,7 +127,9 @@ def test_subset_genotypes(self):
         assert np.array_equal(gts_sub.variants, expected_variants)
 
     def test_reorder_samples(self):
-        expected = self._get_fake_genotypes().data[[0, 1, 2]]
+        expected = self._get_fake_genotypes()
+        expected_ancestry = expected.ancestry[[0, 1, 2]]
+        expected = expected.data[[0, 1, 2]]
         samples = ["HG00099", "HG00096", "HG00097"]
 
         # can we load the data from the VCF?
@@ -135,27 +137,32 @@ def test_reorder_samples(self):
         gts.read(samples=samples, reorder_samples=False)
         assert gts.samples == ("HG00096", "HG00097", "HG00099")
         np.testing.assert_allclose(gts.data, expected)
+        np.testing.assert_allclose(gts.ancestry, expected_ancestry)
 
         # let's also test __iter__
         gts = GenotypesAncestry(DATADIR / "simple-ancestry.vcf")
         gts_iter = gts.__iter__(samples=samples, reorder_samples=False)
         for idx, line in enumerate(gts_iter):
             np.testing.assert_allclose(line.data, expected[:, idx])
+            np.testing.assert_allclose(line.ancestry, expected_ancestry[:, idx])
         assert gts.samples == ("HG00096", "HG00097", "HG00099")
 
         # now, let's reorder
         expected = expected[[2, 0, 1]]
+        expected_ancestry = expected_ancestry[[2, 0, 1]]
 
         gts = GenotypesAncestry(DATADIR / "simple-ancestry.vcf")
         gts.read(samples=samples, reorder_samples=True)
         assert gts.samples == tuple(samples)
         np.testing.assert_allclose(gts.data, expected)
+        np.testing.assert_allclose(gts.ancestry, expected_ancestry)
 
         # let's also test __iter__
         gts = GenotypesAncestry(DATADIR / "simple-ancestry.vcf")
         gts_iter = gts.__iter__(samples=samples, reorder_samples=True)
         for idx, line in enumerate(gts_iter):
             np.testing.assert_allclose(line.data, expected[:, idx])
+            np.testing.assert_allclose(line.ancestry, expected_ancestry[:, idx])
         assert gts.samples == tuple(samples)
 
     @pytest.mark.xfail(reason="not implemented yet")

From aed4d232f232797596521be9bddb4ac20ebb097c Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Thu, 16 Nov 2023 22:25:27 -0800
Subject: [PATCH 7/9] handle sample reordering in __iter__ methods

---
 haptools/data/genotypes.py | 77 +++++++++++++++++++++++++++++++++-----
 haptools/transform.py      | 19 ++++++++--
 2 files changed, 83 insertions(+), 13 deletions(-)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 18294802..6700bc2b 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -163,7 +163,9 @@ def read(
             warning by passing False to this parameter.
         """
         super().read()
-        records = self.__iter__(region=region, samples=samples, variants=variants)
+        records = self.__iter__(
+            region=region, samples=samples, variants=variants, reorder_samples=False,
+        )
         if variants is not None:
             max_variants = len(variants)
         # check whether we can preallocate memory instead of making copies
@@ -278,7 +280,13 @@ def _return_data(self, variant: Variant):
         """
         return variant.genotype.array().astype(np.uint8)
 
-    def _iterate(self, vcf: VCF, region: str = None, variants: set[str] = None):
+    def _iterate(
+        self,
+        vcf: VCF,
+        region: str = None,
+        variants: set[str] = None,
+        samples_order: list[int] = None,
+    ):
         """
         A generator over the lines of a VCF
 
@@ -292,6 +300,9 @@ def _iterate(self, vcf: VCF, region: str = None, variants: set[str] = None):
             See documentation for :py:meth:`~.Genotypes.read`
         variants : set[str], optional
             See documentation for :py:meth:`~.Genotypes.read`
+        samples_order : list[int], optional
+            The indices of the samples in the order in which they should be
+            returned
 
         Yields
         ------
@@ -318,13 +329,18 @@ def _iterate(self, vcf: VCF, region: str = None, variants: set[str] = None):
             # 2) presence of REF in strand two
             # 3) whether the genotype is phased (if self._prephased is False)
             data = self._return_data(variant)
-            data = data[:, : (2 + (not self._prephased))]
+            # reorder samples if needed, as well
+            data = data[samples_order or Ellipsis, : (2 + (not self._prephased))]
             yield Record(data, variant_arr)
             num_seen += 1
         vcf.close()
 
     def __iter__(
-        self, region: str = None, samples: list[str] = None, variants: set[str] = None
+        self,
+        region: str = None,
+        samples: list[str] = None,
+        variants: set[str] = None,
+        reorder_samples: bool = None,
     ) -> Iterator[namedtuple]:
         """
         Read genotypes from a VCF line by line without storing anything
@@ -337,6 +353,8 @@ def __iter__(
             See documentation for :py:meth:`~.Genotypes.read`
         variants : set[str], optional
             See documentation for :py:meth:`~.Genotypes.read`
+        reorder_samples : bool, optional
+            See documentation for :py:meth:`~.Genotypes.read`
 
         Returns
         -------
@@ -344,10 +362,23 @@ def __iter__(
             See documentation for :py:meth:`~.Genotypes._iterate`
         """
         vcf = VCF(str(self.fname), samples=samples, lazy=True)
+
         self.samples = tuple(vcf.samples)
+        samples_order = None
+        if samples is not None:
+            samples = tuple(samples)
+            if reorder_samples:
+                self.index(samples=True)
+                samples_order = [self._samp_idx[samp] for samp in samples]
+                self.samples = tuple(samples)
+            elif reorder_samples is None and self.samples != samples:
+                self.log.warning(
+                    "Samples in genotypes file do not match requested order"
+                )
+
         # call another function to force the lines above to be run immediately
         # see https://stackoverflow.com/a/36726497
-        return self._iterate(vcf, region, variants)
+        return self._iterate(vcf, region, variants, samples_order)
 
     def index(self, samples: bool = True, variants: bool = True):
         """
@@ -1383,6 +1414,7 @@ def _iterate(
         pgen: pgenlib.PgenReader,
         region: str = None,
         variants: set[str] = None,
+        samples_order: list[int] = None,
     ):
         """
         A generator over the lines of a PGEN-PVAR file pair
@@ -1397,6 +1429,9 @@ def _iterate(
             See documentation for :py:meth:`~.Genotypes.read`
         variants : set[str], optional
             See documentation for :py:meth:`~.Genotypes.read`
+        samples_order : list[int], optional
+            The indices of the samples in the order in which they should be
+            returned
 
         Yields
         ------
@@ -1427,7 +1462,9 @@ def _iterate(
             # concatenate phasing info into the data
             if not self._prephased:
                 data = np.concatenate((data, phasing[:, np.newaxis]), axis=1)
-            # we extracted the genotypes to a matrix of size p x 3
+            # reorder samples if needed
+            data = data[samples_order or Ellipsis]
+            # we extracted the genotypes to a matrix of size n x 3
             # the last dimension has three items:
             # 1) presence of REF in strand one
             # 2) presence of REF in strand two
@@ -1436,7 +1473,11 @@ def _iterate(
         pgen.close()
 
     def __iter__(
-        self, region: str = None, samples: list[str] = None, variants: set[str] = None
+        self,
+        region: str = None,
+        samples: list[str] = None,
+        variants: set[str] = None,
+        reorder_samples: bool = None,
     ) -> Iterator[namedtuple]:
         """
         Read genotypes from a PGEN line by line without storing anything
@@ -1449,6 +1490,8 @@ def __iter__(
             See documentation for :py:meth:`~.Genotypes.read`
         variants : set[str], optional
             See documentation for :py:meth:`~.Genotypes.read`
+        reorder_samples : bool, optional
+            See documentation for :py:meth:`~.Genotypes.read`
 
         Returns
         -------
@@ -1460,12 +1503,24 @@ def __iter__(
         pv = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8"))
 
         sample_idxs = self.read_samples(samples)
+        samples_order = None
+        if samples is not None:
+            samples = tuple(samples)
+            if reorder_samples:
+                self.index(samples=True)
+                samples_order = [self._samp_idx[samp] for samp in samples]
+                self.samples = tuple(samples)
+            elif reorder_samples is None and self.samples != samples:
+                self.log.warning(
+                    "Samples in genotypes file do not match requested order"
+                )
+
         pgen = pgenlib.PgenReader(
             bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar=pv
         )
         # call another function to force the lines above to be run immediately
         # see https://stackoverflow.com/a/36726497
-        return self._iterate(pgen, region, variants)
+        return self._iterate(pgen, region, variants, samples_order)
 
     def write_samples(self):
         """
@@ -1772,6 +1827,7 @@ def _iterate(
         pgen: pgenlib.PgenReader,
         region: str = None,
         variants: set[str] = None,
+        samples_order: list[int] = None,
     ):
         """
         A generator over the lines of a PGEN-PVAR file pair
@@ -1786,6 +1842,9 @@ def _iterate(
             See documentation for :py:meth:`~.Genotypes.read`
         variants : set[str], optional
             See documentation for :py:meth:`~.Genotypes.read`
+        samples_order : list[int], optional
+            The indices of the samples in the order in which they should be
+            returned
 
         Yields
         ------
@@ -1794,7 +1853,7 @@ def _iterate(
             namedtuple containing each of the class properties
         """
         tr_records = self._iter_TRRecords(region, variants)
-        variants = super()._iterate(pgen, region, variants)
+        variants = super()._iterate(pgen, region, variants, samples_order)
         for variant, record in zip(variants, tr_records):
             # extract the REF and ALT allele lengths
             allele_lens = np.array(
diff --git a/haptools/transform.py b/haptools/transform.py
index 35ddf45e..9673922c 100644
--- a/haptools/transform.py
+++ b/haptools/transform.py
@@ -28,7 +28,7 @@ class HaplotypeAncestry(data.Haplotype):
         default=(data.Extra("ancestry", "s", "Local ancestry"),),
     )
 
-    def transform(self, genotypes: data.GenotypesVCF) -> npt.NDArray[bool]:
+    def transform(self, genotypes: data.GenotypesVCF) -> npt.NDArray:
         """
         Transform a genotypes matrix via the current haplotype and its ancestral
         population
@@ -184,7 +184,13 @@ def __init__(self, fname: Path | str, log: logging.Logger = None):
         # goes from encoding number to population code
         self.popnum_ancestry = {}
 
-    def _iterate(self, vcf: VCF, region: str = None, variants: set[str] = None):
+    def _iterate(
+            self,
+            vcf: VCF,
+            region: str = None,
+            variants: set[str] = None,
+            samples_order: list[int] = None,
+        ):
         """
         See documentation for :py:meth:`~.Genotypes._iterate`
         """
@@ -208,7 +214,8 @@ def _iterate(self, vcf: VCF, region: str = None, variants: set[str] = None):
             # 2) presence of REF in strand two
             # 3) whether the genotype is phased (if self._prephased is False)
             data = np.array(variant.genotypes, dtype=np.uint8)
-            data = data[:, : (2 + (not self._prephased))]
+            # reorder samples if needed, as well
+            data = data[samples_order or Ellipsis, : (2 + (not self._prephased))]
             # also extract the ancestral population of each variant in each individual
             ancestry = np.empty((data.shape[0], 2), dtype=np.uint8)
             for i, sample in enumerate(variant.format("POP")):
@@ -219,6 +226,8 @@ def _iterate(self, vcf: VCF, region: str = None, variants: set[str] = None):
                         self.popnum_ancestry[pop_count] = pop
                         pop_count += 1
                 ancestry[i] = tuple(map(self.ancestry_labels.get, pops))
+            # reorder samples in ancestry too, if needed
+            ancestry = ancestry[samples_order or Ellipsis]
             # finally, output everything
             yield Record(data, ancestry, variant_arr)
             num_seen += 1
@@ -236,7 +245,9 @@ def read(
         See documentation for :py:meth:`~.Genotypes.read`
         """
         super(data.Genotypes, self).read()
-        records = self.__iter__(region=region, samples=samples, variants=variants)
+        records = self.__iter__(
+            region=region, samples=samples, variants=variants, reorder_samples=False,
+        )
         if variants is not None:
             max_variants = len(variants)
         # check whether we can preallocate memory instead of making copies

From bfca6973fda2cb202cf40750070610ccd8c47dc9 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Fri, 17 Nov 2023 09:54:17 -0800
Subject: [PATCH 8/9] use VCF instead of cyvcf2.VCF

---
 haptools/data/genotypes.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 6700bc2b..8501ed3a 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -246,20 +246,20 @@ def _variant_arr(self, record: Variant):
             dtype=self.variants.dtype,
         )
 
-    def _vcf_iter(self, vcf: cyvcf2.VCF, region: str):
+    def _vcf_iter(self, vcf: VCF, region: str):
         """
         Yield all variants within a region in the VCF file.
 
         Parameters
         ----------
         vcf: VCF
-            The cyvcf2.VCF object from which to fetch variant records
+            The VCF object from which to fetch variant records
         region : str, optional
             See documentation for :py:meth:`~.Genotypes.read`
 
         Returns
         -------
-        vcffile : cyvcf2.VCF
+        vcffile : VCF
             Iterable cyvcf2 instance.
         """
         return vcf(region)
@@ -295,7 +295,7 @@ def _iterate(
         Parameters
         ----------
         vcf: VCF
-            The cyvcf2.VCF object from which to fetch variant records
+            The VCF object from which to fetch variant records
         region : str, optional
             See documentation for :py:meth:`~.Genotypes.read`
         variants : set[str], optional
@@ -946,14 +946,14 @@ def load(
         genotypes.check_phase()
         return genotypes
 
-    def _vcf_iter(self, vcf: cyvcf2.VCF, region: str = None):
+    def _vcf_iter(self, vcf: VCF, region: str = None):
         """
         Collect GTs (trh.TRRecord objects) to iterate over
 
         Parameters
         ----------
         vcf: VCF
-            The cyvcf2.VCF object from which to fetch variant records
+            The VCF object from which to fetch variant records
         region : str, optional
             See documentation for :py:meth:`~.Genotypes.read`
 

From 0be098e0c979c1929abed57e303544e0d7c12c68 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Fri, 17 Nov 2023 09:55:10 -0800
Subject: [PATCH 9/9] refmt with black

---
 haptools/data/genotypes.py |  5 ++++-
 haptools/transform.py      | 17 ++++++++++-------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 8501ed3a..53c98a25 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -164,7 +164,10 @@ def read(
         """
         super().read()
         records = self.__iter__(
-            region=region, samples=samples, variants=variants, reorder_samples=False,
+            region=region,
+            samples=samples,
+            variants=variants,
+            reorder_samples=False,
         )
         if variants is not None:
             max_variants = len(variants)
diff --git a/haptools/transform.py b/haptools/transform.py
index 9673922c..73eef806 100644
--- a/haptools/transform.py
+++ b/haptools/transform.py
@@ -185,12 +185,12 @@ def __init__(self, fname: Path | str, log: logging.Logger = None):
         self.popnum_ancestry = {}
 
     def _iterate(
-            self,
-            vcf: VCF,
-            region: str = None,
-            variants: set[str] = None,
-            samples_order: list[int] = None,
-        ):
+        self,
+        vcf: VCF,
+        region: str = None,
+        variants: set[str] = None,
+        samples_order: list[int] = None,
+    ):
         """
         See documentation for :py:meth:`~.Genotypes._iterate`
         """
@@ -246,7 +246,10 @@ def read(
         """
         super(data.Genotypes, self).read()
         records = self.__iter__(
-            region=region, samples=samples, variants=variants, reorder_samples=False,
+            region=region,
+            samples=samples,
+            variants=variants,
+            reorder_samples=False,
         )
         if variants is not None:
             max_variants = len(variants)