Skip to content

Commit

Permalink
CLI tests (#33)
Browse files Browse the repository at this point in the history
* removing contribution comment, since GH tracks such things

* tidying

* cli test module, and test data

* avoid problematic gt_types attribute in cyvcf2

* more cli tests

* lint

* update badges

* Update snps.missing_gts.vcf

Add a test for a double missing variant in a sample.

* Update test_cli.py

Updating the truth based on the new line in the test vcf for for a double missing variant in a sample.

* move `iterate_with_ambiguity_warning` inside spectra function for safety

Co-authored-by: Mitchell Robert Vollger <[email protected]>
  • Loading branch information
wsdewitt and mrvollger authored Mar 14, 2022
1 parent 4d2ad77 commit 74deb32
Show file tree
Hide file tree
Showing 8 changed files with 226 additions and 65 deletions.
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

A Python package and command line utility for annotating the local ancestral sequence context of biallelic SNPs

![build and test](https://github.com/harrispopgen/mutyper/workflows/build%20and%20test/badge.svg)
![Python package](https://github.com/harrispopgen/mutyper/workflows/Python%20package/badge.svg)
[![build and test](https://github.com/harrispopgen/mutyper/actions/workflows/build-and-test.yml/badge.svg)](https://github.com/harrispopgen/mutyper/actions/workflows/build-and-test.yml)
[![Docs build and deploy](https://github.com/harrispopgen/mutyper/actions/workflows/docs-build-and-deploy.yml/badge.svg)](https://github.com/harrispopgen/mutyper/actions/workflows/docs-build-and-deploy.yml)
[![Python package](https://github.com/harrispopgen/mutyper/actions/workflows/python-publish.yml/badge.svg)](https://github.com/harrispopgen/mutyper/actions/workflows/python-publish.yml)

#### See [documentation](https://harrispopgen.github.io/mutyper) for install and usage information.

Expand Down
72 changes: 36 additions & 36 deletions mutyper/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import numpy as np
from shutil import copyfile, copyfileobj
import pyfaidx
from random import choice
from pyliftover import LiftOver
from Bio.Seq import reverse_complement
import logging
Expand All @@ -31,7 +30,6 @@ def setup_ancestor(args):
)


# mrv addition, whole function
def is_compressed(file):
"""returns true if file is compressed."""
f = open(file, "rb")
Expand Down Expand Up @@ -154,7 +152,7 @@ def variants(args):
vcf.add_info_to_header(
{
"ID": "mutation_type",
"Description": f"ancestral {args.k}-mer mutation " "type",
"Description": f"ancestral {args.k}-mer mutation type",
"Type": "Character",
"Number": "1",
}
Expand Down Expand Up @@ -232,13 +230,29 @@ def targets(args):

def spectra(args):
"""subroutine for spectra subcommand."""
vcf = cyvcf2.VCF(args.vcf, gts012=True)

# NOTE: vcf must be instantiated with gts012=False (the default) due to cyvcf2
# num_unknown property bug https://github.com/brentp/cyvcf2/issues/236
vcf = cyvcf2.VCF(args.vcf, strict_gt=True)

def iterate_with_ambiguity_warning():
"""In several places we want to check for genotype ambiguity as we
iterate over vcf variants, so we define this generator wrapper."""
for variant in vcf:
yield variant
if variant.num_unknown:
logging.warning(
"Ambiguous genotypes found! Continuing by assuming reference genotypes for these variants."
)
break
yield from vcf

if args.population:
spectra_data = Counter()

for variant in vcf:
spectra_data[variant.INFO["mutation_type"]] += 1
for variant in iterate_with_ambiguity_warning():
if variant.aaf:
spectra_data[variant.INFO["mutation_type"]] += 1

spectra = pd.DataFrame(spectra_data, ["population"]).reindex(
sorted(spectra_data), axis="columns"
Expand All @@ -250,30 +264,16 @@ def spectra(args):

else:
spectra_data = defaultdict(lambda: np.zeros_like(vcf.samples, dtype=int))
seen_ambiguous = False
if args.randomize:
for variant in vcf:
random_haplotype = choice(
[x for x, y in enumerate(variant.gt_types) for _ in range(y)]
)
for variant in iterate_with_ambiguity_warning():
counts = np.array([gt.count(1) for gt in variant.genotypes])
rng = np.random.default_rng()
random_haplotype = rng.choice(len(counts), p=counts / counts.sum())
spectra_data[variant.INFO["mutation_type"]][random_haplotype] += 1.0
else:
for variant in vcf:
if variant.ploidy == 1:
# haploid ALT are coded as 2 (homozygous ALT)
variant.gt_types[variant.gt_types == 2] = 1
# from the cyvcf2 docs:
# gt_types is array of 0,1,2,3==HOM_REF, HET, UNKNOWN, HOM_ALT
# gts012 (bool) – if True, then gt_types will be 0=HOM_REF, 1=HET, 2=HOM_ALT, 3=UNKNOWN. If False, 3, 2 are flipped.
# but for our case unknown should be 0, not 3.
ambiguous_genotypes = variant.gt_types == 3
if not seen_ambiguous and any(ambiguous_genotypes):
logging.warning(
"Ambiguous genotypes found! Continuing by assuming reference genotypes for these variants."
)
seen_ambiguous = True
variant.gt_types[ambiguous_genotypes] = 0
spectra_data[variant.INFO["mutation_type"]] += variant.gt_types
for variant in iterate_with_ambiguity_warning():
counts = np.array([gt.count(1) for gt in variant.genotypes])
spectra_data[variant.INFO["mutation_type"]] += counts

spectra = pd.DataFrame(spectra_data, vcf.samples).reindex(
sorted(spectra_data), axis="columns"
Expand Down Expand Up @@ -334,7 +334,7 @@ def get_parser():
"states, and stream to stdout",
)
parser_targets = subparsers.add_parser(
"targets", description="compute 𝑘-mer target sizes and stream to " "stdout"
"targets", description="compute 𝑘-mer target sizes and stream to stdout"
)
parser_spectra = subparsers.add_parser(
"spectra",
Expand Down Expand Up @@ -372,19 +372,19 @@ def get_parser():
"--target",
type=int,
default=None,
help="0-based mutation target position in kmer" " (default middle)",
help="0-based mutation target position in kmer (default middle)",
)
sub_parser.add_argument(
"--sep",
type=str,
default=":",
help="field delimiter in FASTA headers " '(default ":")',
help='field delimiter in FASTA headers (default ":")',
)
sub_parser.add_argument(
"--chrom_pos",
type=int,
default=0,
help="0-based chromosome field position in " "FASTA headers (default 0)",
help="0-based chromosome field position in FASTA headers (default 0)",
)
sub_parser.add_argument(
"--strand_file",
Expand Down Expand Up @@ -422,20 +422,20 @@ def get_parser():

# arguments specific to ancestor subcommand
parser_ancestor.add_argument(
"reference", type=str, help="path to reference FASTA for one " "chromosome"
"reference", type=str, help="path to reference FASTA for one chromosome"
)
parser_ancestor.add_argument(
"outgroup", type=str, help="path to outgroup genome FASTA"
)
parser_ancestor.add_argument(
"chain",
type=str,
help="path to alignment chain file " "(reference to outgroup)",
help="path to alignment chain file (reference to outgroup)",
)
parser_ancestor.add_argument(
"output",
type=str,
help="path for output ancestral FASTA for " "this chromosome",
help="path for output ancestral FASTA for this chromosome",
)
parser_ancestor.set_defaults(func=ancestral_fasta)

Expand All @@ -449,12 +449,12 @@ def get_parser():
parser_spectra.add_argument(
"--population",
action="store_true",
help="population-wise spectrum, instead of " "individual",
help="population-wise spectrum, instead of individual",
)
parser_spectra.add_argument(
"--randomize",
action="store_true",
help="randomly assign mutation to a single " "haplotype",
help="randomly assign mutation to a single haplotype",
)
parser_spectra.set_defaults(func=spectra)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
long_description_content_type='text/markdown',
url='https://github.com/harrispopgen/mutyper',
entry_points={'console_scripts': ['mutyper=mutyper.cli:main']},
# packages=setuptools.find_packages(exclude=['tests', 'docs', 'docs']),
# packages=setuptools.find_packages(exclude=['tests', 'docs']),
packages=['mutyper'],
classifiers=[
'Programming Language :: Python :: 3',
Expand Down
77 changes: 51 additions & 26 deletions tests/test_ancestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,47 +6,72 @@

class TestAncestor(unittest.TestCase):
def setUp(self):
self.anc = Ancestor('tests/test_data/ancestor.fa')
self.anc_stranded = Ancestor('tests/test_data/ancestor.fa',
strand_file='tests/test_data/strandedness.bed')
self.anc = Ancestor("tests/test_data/ancestor.fa")
self.anc_stranded = Ancestor(
"tests/test_data/ancestor.fa",
strand_file="tests/test_data/strandedness.bed",
)

def test_seq(self):
anc_seq = self.anc['foo'][:].seq
anc_seq_true = 'AAACCCgggTTT'
anc_seq = self.anc["foo"][:].seq
anc_seq_true = "AAACCCgggTTT"
self.assertEqual(anc_seq, anc_seq_true)

def test_context(self):
self.assertEqual(list(self.anc.region_contexts('foo')),
[None, 'AAA', 'AAC', 'ACC', 'CCC', None, None, None,
None, None, 'AAA', None])
self.assertEqual(
list(self.anc.region_contexts("foo")),
[
None,
"AAA",
"AAC",
"ACC",
"CCC",
None,
None,
None,
None,
None,
"AAA",
None,
],
)

# same as above but using strand polarized ancestor
self.assertEqual(list(self.anc_stranded.region_contexts('foo')),
[None, 'AAA', 'AAC', 'ACC', 'GGG', None, None, None,
None, None, 'AAA', None])
self.assertEqual(
list(self.anc_stranded.region_contexts("foo")),
[
None,
"AAA",
"AAC",
"ACC",
"GGG",
None,
None,
None,
None,
None,
"AAA",
None,
],
)

self.assertEqual(list(self.anc.region_contexts('foo', 1, 3)),
['AAA', 'AAC'])
self.assertEqual(list(self.anc.region_contexts("foo", 1, 3)), ["AAA", "AAC"])

self.assertEqual(list(self.anc.region_contexts('foo', 9, 11)),
[None, 'AAA'])
self.assertEqual(list(self.anc.region_contexts("foo", 9, 11)), [None, "AAA"])

def test_mutation_type(self):
self.assertEqual(self.anc.mutation_type('foo', 1, 'A', 'T'),
('AAA', 'ATA'))
self.assertEqual(self.anc.mutation_type("foo", 1, "A", "T"), ("AAA", "ATA"))
# infinite sites violation
self.assertEqual(self.anc.mutation_type('foo', 1, 'C', 'T'),
(None, None))
self.assertEqual(self.anc.mutation_type("foo", 1, "C", "T"), (None, None))
# low confidence (lower case) ancestral state
self.assertEqual(self.anc.mutation_type('foo', 7, 'G', 'T'),
(None, None))
self.assertEqual(self.anc.mutation_type("foo", 7, "G", "T"), (None, None))
# reverse complement
self.assertEqual(self.anc.mutation_type('foo', 10, 'G', 'T'),
('AAA', 'ACA'))
self.assertEqual(self.anc.mutation_type("foo", 10, "G", "T"), ("AAA", "ACA"))
# same as above but using strand polarized ancestor
self.assertEqual(self.anc_stranded.mutation_type('foo', 10, 'G', 'T'),
('AAA', 'ACA'))
self.assertEqual(
self.anc_stranded.mutation_type("foo", 10, "G", "T"), ("AAA", "ACA")
)


if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
98 changes: 98 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#! /usr/bin/env python

import pytest
from mutyper import cli
import argparse
import pandas as pd
import io


def test_spectra(capsys):
args = argparse.Namespace(
vcf="tests/test_data/snps.vcf", population=False, randomize=False
)
cli.spectra(args)
captured = capsys.readouterr()
df = pd.read_csv(io.StringIO(captured.out), sep="\t", index_col=0)
df_target = pd.DataFrame(
{"ACA>ATA": [0, 1], "ACC>ATC": [1, 0], "ACG>ATG": [1, 1], "ACT>ATT": [2, 0]},
index=pd.Index(["sample1", "sample2"], name="sample"),
)
pd.testing.assert_frame_equal(df, df_target)


def test_spectra_randomize(capsys):
args = argparse.Namespace(
vcf="tests/test_data/snps.vcf", population=False, randomize=True
)
cli.spectra(args)
captured = capsys.readouterr()
df = pd.read_csv(io.StringIO(captured.out), sep="\t", index_col=0)
# there are two possibilities due to haplotype randomization
df_target1 = pd.DataFrame(
{"ACA>ATA": [0, 1], "ACC>ATC": [1, 0], "ACG>ATG": [1, 0], "ACT>ATT": [1, 0]},
index=pd.Index(["sample1", "sample2"], name="sample"),
)
df_target2 = pd.DataFrame(
{"ACA>ATA": [0, 1], "ACC>ATC": [1, 0], "ACG>ATG": [0, 1], "ACT>ATT": [1, 0]},
index=pd.Index(["sample1", "sample2"], name="sample"),
)

try:
pd.testing.assert_frame_equal(df, df_target1)
except AssertionError:
pd.testing.assert_frame_equal(df, df_target2)


def test_spectra_haploid(capsys):
args = argparse.Namespace(
vcf="tests/test_data/snps.haploid.vcf", population=False, randomize=False
)
cli.spectra(args)
captured = capsys.readouterr()
df = pd.read_csv(io.StringIO(captured.out), sep="\t", index_col=0)
df_target = pd.DataFrame(
{"ACA>ATA": [0, 1], "ACC>ATC": [0, 0], "ACG>ATG": [1, 1], "ACT>ATT": [1, 0]},
index=pd.Index(["sample1", "sample2"], name="sample"),
)
pd.testing.assert_frame_equal(df, df_target)


def test_spectra_missing_gts(capsys, caplog):
args = argparse.Namespace(
vcf="tests/test_data/snps.missing_gts.vcf", population=False, randomize=False
)
cli.spectra(args)
captured = capsys.readouterr()
df = pd.read_csv(io.StringIO(captured.out), sep="\t", index_col=0)
df_target = pd.DataFrame(
{"ACA>ATA": [0, 1], "ACC>ATC": [1, 0], "ACG>ATG": [1, 1], "ACT>ATT": [1, 1]},
index=pd.Index(["sample1", "sample2"], name="sample"),
)
pd.testing.assert_frame_equal(df, df_target)
assert "Ambiguous genotypes found" in caplog.text


def test_ksfs(capsys):
args = argparse.Namespace(vcf="tests/test_data/snps.vcf", k=3)
cli.ksfs(args)
captured = capsys.readouterr()
df = pd.read_csv(io.StringIO(captured.out), sep="\t", index_col=0)
df_target = pd.DataFrame(
{
"ACA>ATA": [1, 0, 0],
"ACC>ATC": [1, 0, 0],
"ACG>ATG": [0, 1, 0],
"ACT>ATT": [0, 1, 0],
},
index=pd.Index([1, 2, 3], name="sample_frequency"),
)
pd.testing.assert_frame_equal(df, df_target)


def test_ksfs_missing_gts():
args = argparse.Namespace(vcf="tests/test_data/snps.missing_gts.vcf", k=3)
with pytest.raises(
ValueError, match=r"different AN [0-9]* and [0-9]* indicates missing genotypes"
):
cli.ksfs(args)
12 changes: 12 additions & 0 deletions tests/test_data/snps.haploid.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=chr1,length=248387328>
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">
##INFO=<ID=mutation_type,Number=1,Type=Character,Description="ancestral 3-mer mutation type">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2
chr1 10 . C T 30 PASS AN=4;AC=1;mutation_type=ACA>ATA GT 0 1
chr1 20 . C T 30 PASS AN=4;AC=1;mutation_type=ACC>ATC GT 0 0
chr1 30 . C T 30 PASS AN=4;AC=1;mutation_type=ACG>ATG GT 1 1
chr1 40 . C T 30 PASS AN=4;AC=2;mutation_type=ACT>ATT GT 1 0
Loading

0 comments on commit 74deb32

Please sign in to comment.