CLI tests (#33)

* removing contribution comment, since GH tracks such things * tidying * cli test module, and test data * avoid problematic gt_types attribute in cyvcf2 * more cli tests * lint * update badges * Update snps.missing_gts.vcf Add a test for a double missing variant in a sample. * Update test_cli.py Updating the truth based on the new line in the test vcf for for a double missing variant in a sample. * move `iterate_with_ambiguity_warning` inside spectra function for safety Co-authored-by: Mitchell Robert Vollger <[email protected]>
harrispopgen · Mar 14, 2022 · 74deb32 · 74deb32
1 parent 4d2ad77
commit 74deb32
Show file tree

Hide file tree

Showing 8 changed files with 226 additions and 65 deletions.
diff --git a/README.md b/README.md
@@ -2,8 +2,9 @@
 
 A Python package and command line utility for annotating the local ancestral sequence context of biallelic SNPs
 
-![build and test](https://github.com/harrispopgen/mutyper/workflows/build%20and%20test/badge.svg)
-![Python package](https://github.com/harrispopgen/mutyper/workflows/Python%20package/badge.svg)
+[![build and test](https://github.com/harrispopgen/mutyper/actions/workflows/build-and-test.yml/badge.svg)](https://github.com/harrispopgen/mutyper/actions/workflows/build-and-test.yml)
+[![Docs build and deploy](https://github.com/harrispopgen/mutyper/actions/workflows/docs-build-and-deploy.yml/badge.svg)](https://github.com/harrispopgen/mutyper/actions/workflows/docs-build-and-deploy.yml)
+[![Python package](https://github.com/harrispopgen/mutyper/actions/workflows/python-publish.yml/badge.svg)](https://github.com/harrispopgen/mutyper/actions/workflows/python-publish.yml)
 
 #### See [documentation](https://harrispopgen.github.io/mutyper) for install and usage information.
 

diff --git a/mutyper/cli.py b/mutyper/cli.py
@@ -9,7 +9,6 @@
 import numpy as np
 from shutil import copyfile, copyfileobj
 import pyfaidx
-from random import choice
 from pyliftover import LiftOver
 from Bio.Seq import reverse_complement
 import logging
@@ -31,7 +30,6 @@ def setup_ancestor(args):
     )
 
 
-# mrv addition, whole function
 def is_compressed(file):
     """returns true if file is compressed."""
     f = open(file, "rb")
@@ -154,7 +152,7 @@ def variants(args):
     vcf.add_info_to_header(
         {
             "ID": "mutation_type",
-            "Description": f"ancestral {args.k}-mer mutation " "type",
+            "Description": f"ancestral {args.k}-mer mutation type",
             "Type": "Character",
             "Number": "1",
         }
@@ -232,13 +230,29 @@ def targets(args):
 
 def spectra(args):
     """subroutine for spectra subcommand."""
-    vcf = cyvcf2.VCF(args.vcf, gts012=True)
+
+    # NOTE: vcf must be instantiated with gts012=False (the default) due to cyvcf2
+    # num_unknown property bug https://github.com/brentp/cyvcf2/issues/236
+    vcf = cyvcf2.VCF(args.vcf, strict_gt=True)
+
+    def iterate_with_ambiguity_warning():
+        """In several places we want to check for genotype ambiguity as we
+        iterate over vcf variants, so we define this generator wrapper."""
+        for variant in vcf:
+            yield variant
+            if variant.num_unknown:
+                logging.warning(
+                    "Ambiguous genotypes found! Continuing by assuming reference genotypes for these variants."
+                )
+                break
+        yield from vcf
 
     if args.population:
         spectra_data = Counter()
 
-        for variant in vcf:
-            spectra_data[variant.INFO["mutation_type"]] += 1
+        for variant in iterate_with_ambiguity_warning():
+            if variant.aaf:
+                spectra_data[variant.INFO["mutation_type"]] += 1
 
         spectra = pd.DataFrame(spectra_data, ["population"]).reindex(
             sorted(spectra_data), axis="columns"
@@ -250,30 +264,16 @@ def spectra(args):
 
     else:
         spectra_data = defaultdict(lambda: np.zeros_like(vcf.samples, dtype=int))
-        seen_ambiguous = False
         if args.randomize:
-            for variant in vcf:
-                random_haplotype = choice(
-                    [x for x, y in enumerate(variant.gt_types) for _ in range(y)]
-                )
+            for variant in iterate_with_ambiguity_warning():
+                counts = np.array([gt.count(1) for gt in variant.genotypes])
+                rng = np.random.default_rng()
+                random_haplotype = rng.choice(len(counts), p=counts / counts.sum())
                 spectra_data[variant.INFO["mutation_type"]][random_haplotype] += 1.0
         else:
-            for variant in vcf:
-                if variant.ploidy == 1:
-                    # haploid ALT are coded as 2 (homozygous ALT)
-                    variant.gt_types[variant.gt_types == 2] = 1
-                # from the cyvcf2 docs:
-                #   gt_types is array of 0,1,2,3==HOM_REF, HET, UNKNOWN, HOM_ALT
-                #   gts012 (bool) – if True, then gt_types will be 0=HOM_REF, 1=HET, 2=HOM_ALT, 3=UNKNOWN. If False, 3, 2 are flipped.
-                # but for our case unknown should be 0, not 3.
-                ambiguous_genotypes = variant.gt_types == 3
-                if not seen_ambiguous and any(ambiguous_genotypes):
-                    logging.warning(
-                        "Ambiguous genotypes found! Continuing by assuming reference genotypes for these variants."
-                    )
-                    seen_ambiguous = True
-                variant.gt_types[ambiguous_genotypes] = 0
-                spectra_data[variant.INFO["mutation_type"]] += variant.gt_types
+            for variant in iterate_with_ambiguity_warning():
+                counts = np.array([gt.count(1) for gt in variant.genotypes])
+                spectra_data[variant.INFO["mutation_type"]] += counts
 
         spectra = pd.DataFrame(spectra_data, vcf.samples).reindex(
             sorted(spectra_data), axis="columns"
@@ -334,7 +334,7 @@ def get_parser():
         "states, and stream to stdout",
     )
     parser_targets = subparsers.add_parser(
-        "targets", description="compute 𝑘-mer target sizes and stream to " "stdout"
+        "targets", description="compute 𝑘-mer target sizes and stream to stdout"
     )
     parser_spectra = subparsers.add_parser(
         "spectra",
@@ -372,19 +372,19 @@ def get_parser():
             "--target",
             type=int,
             default=None,
-            help="0-based mutation target position in kmer" " (default middle)",
+            help="0-based mutation target position in kmer (default middle)",
         )
         sub_parser.add_argument(
             "--sep",
             type=str,
             default=":",
-            help="field delimiter in FASTA headers " '(default ":")',
+            help='field delimiter in FASTA headers (default ":")',
         )
         sub_parser.add_argument(
             "--chrom_pos",
             type=int,
             default=0,
-            help="0-based chromosome field position in " "FASTA headers (default 0)",
+            help="0-based chromosome field position in FASTA headers (default 0)",
         )
         sub_parser.add_argument(
             "--strand_file",
@@ -422,20 +422,20 @@ def get_parser():
 
     # arguments specific to ancestor subcommand
     parser_ancestor.add_argument(
-        "reference", type=str, help="path to reference FASTA for one " "chromosome"
+        "reference", type=str, help="path to reference FASTA for one chromosome"
     )
     parser_ancestor.add_argument(
         "outgroup", type=str, help="path to outgroup genome FASTA"
     )
     parser_ancestor.add_argument(
         "chain",
         type=str,
-        help="path to alignment chain file " "(reference to outgroup)",
+        help="path to alignment chain file (reference to outgroup)",
     )
     parser_ancestor.add_argument(
         "output",
         type=str,
-        help="path for output ancestral FASTA for " "this chromosome",
+        help="path for output ancestral FASTA for this chromosome",
     )
     parser_ancestor.set_defaults(func=ancestral_fasta)
 
@@ -449,12 +449,12 @@ def get_parser():
     parser_spectra.add_argument(
         "--population",
         action="store_true",
-        help="population-wise spectrum, instead of " "individual",
+        help="population-wise spectrum, instead of individual",
     )
     parser_spectra.add_argument(
         "--randomize",
         action="store_true",
-        help="randomly assign mutation to a single " "haplotype",
+        help="randomly assign mutation to a single haplotype",
     )
     parser_spectra.set_defaults(func=spectra)
 

diff --git a/setup.py b/setup.py
@@ -16,7 +16,7 @@
     long_description_content_type='text/markdown',
     url='https://github.com/harrispopgen/mutyper',
     entry_points={'console_scripts': ['mutyper=mutyper.cli:main']},
-    # packages=setuptools.find_packages(exclude=['tests', 'docs', 'docs']),
+    # packages=setuptools.find_packages(exclude=['tests', 'docs']),
     packages=['mutyper'],
     classifiers=[
         'Programming Language :: Python :: 3',

diff --git a/tests/test_ancestor.py b/tests/test_ancestor.py
@@ -6,47 +6,72 @@
 
 class TestAncestor(unittest.TestCase):
     def setUp(self):
-        self.anc = Ancestor('tests/test_data/ancestor.fa')
-        self.anc_stranded = Ancestor('tests/test_data/ancestor.fa',
-                                     strand_file='tests/test_data/strandedness.bed')
+        self.anc = Ancestor("tests/test_data/ancestor.fa")
+        self.anc_stranded = Ancestor(
+            "tests/test_data/ancestor.fa",
+            strand_file="tests/test_data/strandedness.bed",
+        )
 
     def test_seq(self):
-        anc_seq = self.anc['foo'][:].seq
-        anc_seq_true = 'AAACCCgggTTT'
+        anc_seq = self.anc["foo"][:].seq
+        anc_seq_true = "AAACCCgggTTT"
         self.assertEqual(anc_seq, anc_seq_true)
 
     def test_context(self):
-        self.assertEqual(list(self.anc.region_contexts('foo')),
-                         [None, 'AAA', 'AAC', 'ACC', 'CCC', None, None, None,
-                          None, None, 'AAA', None])
+        self.assertEqual(
+            list(self.anc.region_contexts("foo")),
+            [
+                None,
+                "AAA",
+                "AAC",
+                "ACC",
+                "CCC",
+                None,
+                None,
+                None,
+                None,
+                None,
+                "AAA",
+                None,
+            ],
+        )
 
         # same as above but using strand polarized ancestor
-        self.assertEqual(list(self.anc_stranded.region_contexts('foo')),
-                         [None, 'AAA', 'AAC', 'ACC', 'GGG', None, None, None,
-                          None, None, 'AAA', None])
+        self.assertEqual(
+            list(self.anc_stranded.region_contexts("foo")),
+            [
+                None,
+                "AAA",
+                "AAC",
+                "ACC",
+                "GGG",
+                None,
+                None,
+                None,
+                None,
+                None,
+                "AAA",
+                None,
+            ],
+        )
 
-        self.assertEqual(list(self.anc.region_contexts('foo', 1, 3)),
-                         ['AAA', 'AAC'])
+        self.assertEqual(list(self.anc.region_contexts("foo", 1, 3)), ["AAA", "AAC"])
 
-        self.assertEqual(list(self.anc.region_contexts('foo', 9, 11)),
-                         [None, 'AAA'])
+        self.assertEqual(list(self.anc.region_contexts("foo", 9, 11)), [None, "AAA"])
 
     def test_mutation_type(self):
-        self.assertEqual(self.anc.mutation_type('foo', 1, 'A', 'T'),
-                         ('AAA', 'ATA'))
+        self.assertEqual(self.anc.mutation_type("foo", 1, "A", "T"), ("AAA", "ATA"))
         # infinite sites violation
-        self.assertEqual(self.anc.mutation_type('foo', 1, 'C', 'T'),
-                         (None, None))
+        self.assertEqual(self.anc.mutation_type("foo", 1, "C", "T"), (None, None))
         # low confidence (lower case) ancestral state
-        self.assertEqual(self.anc.mutation_type('foo', 7, 'G', 'T'),
-                         (None, None))
+        self.assertEqual(self.anc.mutation_type("foo", 7, "G", "T"), (None, None))
         # reverse complement
-        self.assertEqual(self.anc.mutation_type('foo', 10, 'G', 'T'),
-                         ('AAA', 'ACA'))
+        self.assertEqual(self.anc.mutation_type("foo", 10, "G", "T"), ("AAA", "ACA"))
         # same as above but using strand polarized ancestor
-        self.assertEqual(self.anc_stranded.mutation_type('foo', 10, 'G', 'T'),
-                         ('AAA', 'ACA'))
+        self.assertEqual(
+            self.anc_stranded.mutation_type("foo", 10, "G", "T"), ("AAA", "ACA")
+        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -0,0 +1,98 @@
+#! /usr/bin/env python
+
+import pytest
+from mutyper import cli
+import argparse
+import pandas as pd
+import io
+
+
+def test_spectra(capsys):
+    args = argparse.Namespace(
+        vcf="tests/test_data/snps.vcf", population=False, randomize=False
+    )
+    cli.spectra(args)
+    captured = capsys.readouterr()
+    df = pd.read_csv(io.StringIO(captured.out), sep="\t", index_col=0)
+    df_target = pd.DataFrame(
+        {"ACA>ATA": [0, 1], "ACC>ATC": [1, 0], "ACG>ATG": [1, 1], "ACT>ATT": [2, 0]},
+        index=pd.Index(["sample1", "sample2"], name="sample"),
+    )
+    pd.testing.assert_frame_equal(df, df_target)
+
+
+def test_spectra_randomize(capsys):
+    args = argparse.Namespace(
+        vcf="tests/test_data/snps.vcf", population=False, randomize=True
+    )
+    cli.spectra(args)
+    captured = capsys.readouterr()
+    df = pd.read_csv(io.StringIO(captured.out), sep="\t", index_col=0)
+    # there are two possibilities due to haplotype randomization
+    df_target1 = pd.DataFrame(
+        {"ACA>ATA": [0, 1], "ACC>ATC": [1, 0], "ACG>ATG": [1, 0], "ACT>ATT": [1, 0]},
+        index=pd.Index(["sample1", "sample2"], name="sample"),
+    )
+    df_target2 = pd.DataFrame(
+        {"ACA>ATA": [0, 1], "ACC>ATC": [1, 0], "ACG>ATG": [0, 1], "ACT>ATT": [1, 0]},
+        index=pd.Index(["sample1", "sample2"], name="sample"),
+    )
+
+    try:
+        pd.testing.assert_frame_equal(df, df_target1)
+    except AssertionError:
+        pd.testing.assert_frame_equal(df, df_target2)
+
+
+def test_spectra_haploid(capsys):
+    args = argparse.Namespace(
+        vcf="tests/test_data/snps.haploid.vcf", population=False, randomize=False
+    )
+    cli.spectra(args)
+    captured = capsys.readouterr()
+    df = pd.read_csv(io.StringIO(captured.out), sep="\t", index_col=0)
+    df_target = pd.DataFrame(
+        {"ACA>ATA": [0, 1], "ACC>ATC": [0, 0], "ACG>ATG": [1, 1], "ACT>ATT": [1, 0]},
+        index=pd.Index(["sample1", "sample2"], name="sample"),
+    )
+    pd.testing.assert_frame_equal(df, df_target)
+
+
+def test_spectra_missing_gts(capsys, caplog):
+    args = argparse.Namespace(
+        vcf="tests/test_data/snps.missing_gts.vcf", population=False, randomize=False
+    )
+    cli.spectra(args)
+    captured = capsys.readouterr()
+    df = pd.read_csv(io.StringIO(captured.out), sep="\t", index_col=0)
+    df_target = pd.DataFrame(
+        {"ACA>ATA": [0, 1], "ACC>ATC": [1, 0], "ACG>ATG": [1, 1], "ACT>ATT": [1, 1]},
+        index=pd.Index(["sample1", "sample2"], name="sample"),
+    )
+    pd.testing.assert_frame_equal(df, df_target)
+    assert "Ambiguous genotypes found" in caplog.text
+
+
+def test_ksfs(capsys):
+    args = argparse.Namespace(vcf="tests/test_data/snps.vcf", k=3)
+    cli.ksfs(args)
+    captured = capsys.readouterr()
+    df = pd.read_csv(io.StringIO(captured.out), sep="\t", index_col=0)
+    df_target = pd.DataFrame(
+        {
+            "ACA>ATA": [1, 0, 0],
+            "ACC>ATC": [1, 0, 0],
+            "ACG>ATG": [0, 1, 0],
+            "ACT>ATT": [0, 1, 0],
+        },
+        index=pd.Index([1, 2, 3], name="sample_frequency"),
+    )
+    pd.testing.assert_frame_equal(df, df_target)
+
+
+def test_ksfs_missing_gts():
+    args = argparse.Namespace(vcf="tests/test_data/snps.missing_gts.vcf", k=3)
+    with pytest.raises(
+        ValueError, match=r"different AN [0-9]* and [0-9]* indicates missing genotypes"
+    ):
+        cli.ksfs(args)
diff --git a/tests/test_data/snps.haploid.vcf b/tests/test_data/snps.haploid.vcf
@@ -0,0 +1,12 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##contig=<ID=chr1,length=248387328>
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">
+##INFO=<ID=mutation_type,Number=1,Type=Character,Description="ancestral 3-mer mutation type">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample1	sample2
+chr1	10	.	C	T	30	PASS	AN=4;AC=1;mutation_type=ACA>ATA	GT	0	1
+chr1	20	.	C	T	30	PASS	AN=4;AC=1;mutation_type=ACC>ATC	GT	0	0
+chr1	30	.	C	T	30	PASS	AN=4;AC=1;mutation_type=ACG>ATG	GT	1	1
+chr1	40	.	C	T	30	PASS	AN=4;AC=2;mutation_type=ACT>ATT	GT	1	0