diff --git a/README.md b/README.md index 6f2e67d..5c9a4b5 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,9 @@ A Python package and command line utility for annotating the local ancestral sequence context of biallelic SNPs -![build and test](https://github.com/harrispopgen/mutyper/workflows/build%20and%20test/badge.svg) -![Python package](https://github.com/harrispopgen/mutyper/workflows/Python%20package/badge.svg) +[![build and test](https://github.com/harrispopgen/mutyper/actions/workflows/build-and-test.yml/badge.svg)](https://github.com/harrispopgen/mutyper/actions/workflows/build-and-test.yml) +[![Docs build and deploy](https://github.com/harrispopgen/mutyper/actions/workflows/docs-build-and-deploy.yml/badge.svg)](https://github.com/harrispopgen/mutyper/actions/workflows/docs-build-and-deploy.yml) +[![Python package](https://github.com/harrispopgen/mutyper/actions/workflows/python-publish.yml/badge.svg)](https://github.com/harrispopgen/mutyper/actions/workflows/python-publish.yml) #### See [documentation](https://harrispopgen.github.io/mutyper) for install and usage information. diff --git a/mutyper/cli.py b/mutyper/cli.py index ad387ff..5b75e09 100644 --- a/mutyper/cli.py +++ b/mutyper/cli.py @@ -9,7 +9,6 @@ import numpy as np from shutil import copyfile, copyfileobj import pyfaidx -from random import choice from pyliftover import LiftOver from Bio.Seq import reverse_complement import logging @@ -31,7 +30,6 @@ def setup_ancestor(args): ) -# mrv addition, whole function def is_compressed(file): """returns true if file is compressed.""" f = open(file, "rb") @@ -154,7 +152,7 @@ def variants(args): vcf.add_info_to_header( { "ID": "mutation_type", - "Description": f"ancestral {args.k}-mer mutation " "type", + "Description": f"ancestral {args.k}-mer mutation type", "Type": "Character", "Number": "1", } @@ -232,13 +230,29 @@ def targets(args): def spectra(args): """subroutine for spectra subcommand.""" - vcf = cyvcf2.VCF(args.vcf, gts012=True) + + # NOTE: vcf must be instantiated with gts012=False (the default) due to cyvcf2 + # num_unknown property bug https://github.com/brentp/cyvcf2/issues/236 + vcf = cyvcf2.VCF(args.vcf, strict_gt=True) + + def iterate_with_ambiguity_warning(): + """In several places we want to check for genotype ambiguity as we + iterate over vcf variants, so we define this generator wrapper.""" + for variant in vcf: + yield variant + if variant.num_unknown: + logging.warning( + "Ambiguous genotypes found! Continuing by assuming reference genotypes for these variants." + ) + break + yield from vcf if args.population: spectra_data = Counter() - for variant in vcf: - spectra_data[variant.INFO["mutation_type"]] += 1 + for variant in iterate_with_ambiguity_warning(): + if variant.aaf: + spectra_data[variant.INFO["mutation_type"]] += 1 spectra = pd.DataFrame(spectra_data, ["population"]).reindex( sorted(spectra_data), axis="columns" @@ -250,30 +264,16 @@ def spectra(args): else: spectra_data = defaultdict(lambda: np.zeros_like(vcf.samples, dtype=int)) - seen_ambiguous = False if args.randomize: - for variant in vcf: - random_haplotype = choice( - [x for x, y in enumerate(variant.gt_types) for _ in range(y)] - ) + for variant in iterate_with_ambiguity_warning(): + counts = np.array([gt.count(1) for gt in variant.genotypes]) + rng = np.random.default_rng() + random_haplotype = rng.choice(len(counts), p=counts / counts.sum()) spectra_data[variant.INFO["mutation_type"]][random_haplotype] += 1.0 else: - for variant in vcf: - if variant.ploidy == 1: - # haploid ALT are coded as 2 (homozygous ALT) - variant.gt_types[variant.gt_types == 2] = 1 - # from the cyvcf2 docs: - # gt_types is array of 0,1,2,3==HOM_REF, HET, UNKNOWN, HOM_ALT - # gts012 (bool) – if True, then gt_types will be 0=HOM_REF, 1=HET, 2=HOM_ALT, 3=UNKNOWN. If False, 3, 2 are flipped. - # but for our case unknown should be 0, not 3. - ambiguous_genotypes = variant.gt_types == 3 - if not seen_ambiguous and any(ambiguous_genotypes): - logging.warning( - "Ambiguous genotypes found! Continuing by assuming reference genotypes for these variants." - ) - seen_ambiguous = True - variant.gt_types[ambiguous_genotypes] = 0 - spectra_data[variant.INFO["mutation_type"]] += variant.gt_types + for variant in iterate_with_ambiguity_warning(): + counts = np.array([gt.count(1) for gt in variant.genotypes]) + spectra_data[variant.INFO["mutation_type"]] += counts spectra = pd.DataFrame(spectra_data, vcf.samples).reindex( sorted(spectra_data), axis="columns" @@ -334,7 +334,7 @@ def get_parser(): "states, and stream to stdout", ) parser_targets = subparsers.add_parser( - "targets", description="compute 𝑘-mer target sizes and stream to " "stdout" + "targets", description="compute 𝑘-mer target sizes and stream to stdout" ) parser_spectra = subparsers.add_parser( "spectra", @@ -372,19 +372,19 @@ def get_parser(): "--target", type=int, default=None, - help="0-based mutation target position in kmer" " (default middle)", + help="0-based mutation target position in kmer (default middle)", ) sub_parser.add_argument( "--sep", type=str, default=":", - help="field delimiter in FASTA headers " '(default ":")', + help='field delimiter in FASTA headers (default ":")', ) sub_parser.add_argument( "--chrom_pos", type=int, default=0, - help="0-based chromosome field position in " "FASTA headers (default 0)", + help="0-based chromosome field position in FASTA headers (default 0)", ) sub_parser.add_argument( "--strand_file", @@ -422,7 +422,7 @@ def get_parser(): # arguments specific to ancestor subcommand parser_ancestor.add_argument( - "reference", type=str, help="path to reference FASTA for one " "chromosome" + "reference", type=str, help="path to reference FASTA for one chromosome" ) parser_ancestor.add_argument( "outgroup", type=str, help="path to outgroup genome FASTA" @@ -430,12 +430,12 @@ def get_parser(): parser_ancestor.add_argument( "chain", type=str, - help="path to alignment chain file " "(reference to outgroup)", + help="path to alignment chain file (reference to outgroup)", ) parser_ancestor.add_argument( "output", type=str, - help="path for output ancestral FASTA for " "this chromosome", + help="path for output ancestral FASTA for this chromosome", ) parser_ancestor.set_defaults(func=ancestral_fasta) @@ -449,12 +449,12 @@ def get_parser(): parser_spectra.add_argument( "--population", action="store_true", - help="population-wise spectrum, instead of " "individual", + help="population-wise spectrum, instead of individual", ) parser_spectra.add_argument( "--randomize", action="store_true", - help="randomly assign mutation to a single " "haplotype", + help="randomly assign mutation to a single haplotype", ) parser_spectra.set_defaults(func=spectra) diff --git a/setup.py b/setup.py index c53ea39..883b28e 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ long_description_content_type='text/markdown', url='https://github.com/harrispopgen/mutyper', entry_points={'console_scripts': ['mutyper=mutyper.cli:main']}, - # packages=setuptools.find_packages(exclude=['tests', 'docs', 'docs']), + # packages=setuptools.find_packages(exclude=['tests', 'docs']), packages=['mutyper'], classifiers=[ 'Programming Language :: Python :: 3', diff --git a/tests/test_ancestor.py b/tests/test_ancestor.py index 5a85626..03a0b0c 100644 --- a/tests/test_ancestor.py +++ b/tests/test_ancestor.py @@ -6,47 +6,72 @@ class TestAncestor(unittest.TestCase): def setUp(self): - self.anc = Ancestor('tests/test_data/ancestor.fa') - self.anc_stranded = Ancestor('tests/test_data/ancestor.fa', - strand_file='tests/test_data/strandedness.bed') + self.anc = Ancestor("tests/test_data/ancestor.fa") + self.anc_stranded = Ancestor( + "tests/test_data/ancestor.fa", + strand_file="tests/test_data/strandedness.bed", + ) def test_seq(self): - anc_seq = self.anc['foo'][:].seq - anc_seq_true = 'AAACCCgggTTT' + anc_seq = self.anc["foo"][:].seq + anc_seq_true = "AAACCCgggTTT" self.assertEqual(anc_seq, anc_seq_true) def test_context(self): - self.assertEqual(list(self.anc.region_contexts('foo')), - [None, 'AAA', 'AAC', 'ACC', 'CCC', None, None, None, - None, None, 'AAA', None]) + self.assertEqual( + list(self.anc.region_contexts("foo")), + [ + None, + "AAA", + "AAC", + "ACC", + "CCC", + None, + None, + None, + None, + None, + "AAA", + None, + ], + ) # same as above but using strand polarized ancestor - self.assertEqual(list(self.anc_stranded.region_contexts('foo')), - [None, 'AAA', 'AAC', 'ACC', 'GGG', None, None, None, - None, None, 'AAA', None]) + self.assertEqual( + list(self.anc_stranded.region_contexts("foo")), + [ + None, + "AAA", + "AAC", + "ACC", + "GGG", + None, + None, + None, + None, + None, + "AAA", + None, + ], + ) - self.assertEqual(list(self.anc.region_contexts('foo', 1, 3)), - ['AAA', 'AAC']) + self.assertEqual(list(self.anc.region_contexts("foo", 1, 3)), ["AAA", "AAC"]) - self.assertEqual(list(self.anc.region_contexts('foo', 9, 11)), - [None, 'AAA']) + self.assertEqual(list(self.anc.region_contexts("foo", 9, 11)), [None, "AAA"]) def test_mutation_type(self): - self.assertEqual(self.anc.mutation_type('foo', 1, 'A', 'T'), - ('AAA', 'ATA')) + self.assertEqual(self.anc.mutation_type("foo", 1, "A", "T"), ("AAA", "ATA")) # infinite sites violation - self.assertEqual(self.anc.mutation_type('foo', 1, 'C', 'T'), - (None, None)) + self.assertEqual(self.anc.mutation_type("foo", 1, "C", "T"), (None, None)) # low confidence (lower case) ancestral state - self.assertEqual(self.anc.mutation_type('foo', 7, 'G', 'T'), - (None, None)) + self.assertEqual(self.anc.mutation_type("foo", 7, "G", "T"), (None, None)) # reverse complement - self.assertEqual(self.anc.mutation_type('foo', 10, 'G', 'T'), - ('AAA', 'ACA')) + self.assertEqual(self.anc.mutation_type("foo", 10, "G", "T"), ("AAA", "ACA")) # same as above but using strand polarized ancestor - self.assertEqual(self.anc_stranded.mutation_type('foo', 10, 'G', 'T'), - ('AAA', 'ACA')) + self.assertEqual( + self.anc_stranded.mutation_type("foo", 10, "G", "T"), ("AAA", "ACA") + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..8e691e3 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,98 @@ +#! /usr/bin/env python + +import pytest +from mutyper import cli +import argparse +import pandas as pd +import io + + +def test_spectra(capsys): + args = argparse.Namespace( + vcf="tests/test_data/snps.vcf", population=False, randomize=False + ) + cli.spectra(args) + captured = capsys.readouterr() + df = pd.read_csv(io.StringIO(captured.out), sep="\t", index_col=0) + df_target = pd.DataFrame( + {"ACA>ATA": [0, 1], "ACC>ATC": [1, 0], "ACG>ATG": [1, 1], "ACT>ATT": [2, 0]}, + index=pd.Index(["sample1", "sample2"], name="sample"), + ) + pd.testing.assert_frame_equal(df, df_target) + + +def test_spectra_randomize(capsys): + args = argparse.Namespace( + vcf="tests/test_data/snps.vcf", population=False, randomize=True + ) + cli.spectra(args) + captured = capsys.readouterr() + df = pd.read_csv(io.StringIO(captured.out), sep="\t", index_col=0) + # there are two possibilities due to haplotype randomization + df_target1 = pd.DataFrame( + {"ACA>ATA": [0, 1], "ACC>ATC": [1, 0], "ACG>ATG": [1, 0], "ACT>ATT": [1, 0]}, + index=pd.Index(["sample1", "sample2"], name="sample"), + ) + df_target2 = pd.DataFrame( + {"ACA>ATA": [0, 1], "ACC>ATC": [1, 0], "ACG>ATG": [0, 1], "ACT>ATT": [1, 0]}, + index=pd.Index(["sample1", "sample2"], name="sample"), + ) + + try: + pd.testing.assert_frame_equal(df, df_target1) + except AssertionError: + pd.testing.assert_frame_equal(df, df_target2) + + +def test_spectra_haploid(capsys): + args = argparse.Namespace( + vcf="tests/test_data/snps.haploid.vcf", population=False, randomize=False + ) + cli.spectra(args) + captured = capsys.readouterr() + df = pd.read_csv(io.StringIO(captured.out), sep="\t", index_col=0) + df_target = pd.DataFrame( + {"ACA>ATA": [0, 1], "ACC>ATC": [0, 0], "ACG>ATG": [1, 1], "ACT>ATT": [1, 0]}, + index=pd.Index(["sample1", "sample2"], name="sample"), + ) + pd.testing.assert_frame_equal(df, df_target) + + +def test_spectra_missing_gts(capsys, caplog): + args = argparse.Namespace( + vcf="tests/test_data/snps.missing_gts.vcf", population=False, randomize=False + ) + cli.spectra(args) + captured = capsys.readouterr() + df = pd.read_csv(io.StringIO(captured.out), sep="\t", index_col=0) + df_target = pd.DataFrame( + {"ACA>ATA": [0, 1], "ACC>ATC": [1, 0], "ACG>ATG": [1, 1], "ACT>ATT": [1, 1]}, + index=pd.Index(["sample1", "sample2"], name="sample"), + ) + pd.testing.assert_frame_equal(df, df_target) + assert "Ambiguous genotypes found" in caplog.text + + +def test_ksfs(capsys): + args = argparse.Namespace(vcf="tests/test_data/snps.vcf", k=3) + cli.ksfs(args) + captured = capsys.readouterr() + df = pd.read_csv(io.StringIO(captured.out), sep="\t", index_col=0) + df_target = pd.DataFrame( + { + "ACA>ATA": [1, 0, 0], + "ACC>ATC": [1, 0, 0], + "ACG>ATG": [0, 1, 0], + "ACT>ATT": [0, 1, 0], + }, + index=pd.Index([1, 2, 3], name="sample_frequency"), + ) + pd.testing.assert_frame_equal(df, df_target) + + +def test_ksfs_missing_gts(): + args = argparse.Namespace(vcf="tests/test_data/snps.missing_gts.vcf", k=3) + with pytest.raises( + ValueError, match=r"different AN [0-9]* and [0-9]* indicates missing genotypes" + ): + cli.ksfs(args) diff --git a/tests/test_data/snps.haploid.vcf b/tests/test_data/snps.haploid.vcf new file mode 100644 index 0000000..3729728 --- /dev/null +++ b/tests/test_data/snps.haploid.vcf @@ -0,0 +1,12 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##FORMAT= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2 +chr1 10 . C T 30 PASS AN=4;AC=1;mutation_type=ACA>ATA GT 0 1 +chr1 20 . C T 30 PASS AN=4;AC=1;mutation_type=ACC>ATC GT 0 0 +chr1 30 . C T 30 PASS AN=4;AC=1;mutation_type=ACG>ATG GT 1 1 +chr1 40 . C T 30 PASS AN=4;AC=2;mutation_type=ACT>ATT GT 1 0 diff --git a/tests/test_data/snps.missing_gts.vcf b/tests/test_data/snps.missing_gts.vcf new file mode 100644 index 0000000..6435699 --- /dev/null +++ b/tests/test_data/snps.missing_gts.vcf @@ -0,0 +1,13 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##FORMAT= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2 +chr1 10 . C T 30 PASS AN=4;AC=1;mutation_type=ACA>ATA GT 0/0 1/0 +chr1 20 . C T 30 PASS AN=4;AC=1;mutation_type=ACC>ATC GT 0/1 0/0 +chr1 30 . C T 30 PASS AN=3;AC=2;mutation_type=ACG>ATG GT 1/. 1/0 +chr1 40 . C T 30 PASS AN=3;AC=1;mutation_type=ACT>ATT GT ./1 0/0 +chr1 50 . C T 30 PASS AN=2;AC=1;mutation_type=ACT>ATT GT ./. 1/0 diff --git a/tests/test_data/snps.vcf b/tests/test_data/snps.vcf new file mode 100644 index 0000000..738342b --- /dev/null +++ b/tests/test_data/snps.vcf @@ -0,0 +1,12 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##FORMAT= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2 +chr1 10 . C T 30 PASS AN=4;AC=1;mutation_type=ACA>ATA GT 0/0 1/0 +chr1 20 . C T 30 PASS AN=4;AC=1;mutation_type=ACC>ATC GT 0/1 0/0 +chr1 30 . C T 30 PASS AN=4;AC=2;mutation_type=ACG>ATG GT 1/0 1/0 +chr1 40 . C T 30 PASS AN=4;AC=2;mutation_type=ACT>ATT GT 1/1 0/0