diff --git a/sourmash/_minhash.pxd b/sourmash/_minhash.pxd index 1d6fe3a2dd..5b0aaf9e3f 100644 --- a/sourmash/_minhash.pxd +++ b/sourmash/_minhash.pxd @@ -25,15 +25,18 @@ cdef extern from "kmer_min_hash.hh": const unsigned int num; const unsigned int ksize; const bool is_protein; + const bool dayhoff; const HashIntoType max_hash; CMinHashType mins; - KmerMinHash(unsigned int, unsigned int, bool, uint32_t, HashIntoType) + KmerMinHash(unsigned int, unsigned int, bool, bool, uint32_t, HashIntoType) void add_hash(HashIntoType) except +ValueError void remove_hash(HashIntoType) except +ValueError void add_word(string word) except +ValueError void add_sequence(const char *, bool) except +ValueError void merge(const KmerMinHash&) except +ValueError + string aa_to_dayhoff(string aa) except +ValueError + string translate_codon(string codon) except +ValueError unsigned int count_common(const KmerMinHash&) except +ValueError unsigned long size() @@ -41,13 +44,15 @@ cdef extern from "kmer_min_hash.hh": cdef cppclass KmerMinAbundance(KmerMinHash): CMinHashType abunds; - KmerMinAbundance(unsigned int, unsigned int, bool, uint32_t, HashIntoType) + KmerMinAbundance(unsigned int, unsigned int, bool, bool, uint32_t, HashIntoType) void add_hash(HashIntoType) except +ValueError void remove_hash(HashIntoType) except +ValueError void add_word(string word) except +ValueError void add_sequence(const char *, bool) except +ValueError void merge(const KmerMinAbundance&) except +ValueError void merge(const KmerMinHash&) except +ValueError + string aa_to_dayhoff(string aa) except +ValueError + string translate_codon(string codon) except +ValueError unsigned int count_common(const KmerMinAbundance&) except +ValueError unsigned long size() diff --git a/sourmash/_minhash.pyx b/sourmash/_minhash.pyx index af9471c305..beaff29f1c 100644 --- a/sourmash/_minhash.pyx +++ b/sourmash/_minhash.pyx @@ -45,11 +45,15 @@ def get_scaled_for_max_hash(max_hash): cdef bytes to_bytes(s): - if not isinstance(s, (basestring, bytes)): + # Allow for strings, bytes or int + # Single item of byte string = int + if not isinstance(s, (basestring, bytes, int)): raise TypeError("Requires a string-like sequence") if isinstance(s, unicode): s = s.encode('utf-8') + if isinstance(s, int): + s = bytes([s]) return s @@ -88,6 +92,7 @@ cdef class MinHash(object): def __init__(self, unsigned int n, unsigned int ksize, bool is_protein=False, + bool dayhoff=False, bool track_abundance=False, uint32_t seed=MINHASH_DEFAULT_SEED, HashIntoType max_hash=0, @@ -107,9 +112,9 @@ cdef class MinHash(object): cdef KmerMinHash *mh = NULL if track_abundance: - mh = new KmerMinAbundance(n, ksize, is_protein, seed, max_hash) + mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, seed, max_hash) else: - mh = new KmerMinHash(n, ksize, is_protein, seed, max_hash) + mh = new KmerMinHash(n, ksize, is_protein, dayhoff, seed, max_hash) self._this.reset(mh) @@ -122,7 +127,8 @@ cdef class MinHash(object): def __copy__(self): a = MinHash(deref(self._this).num, deref(self._this).ksize, - deref(self._this).is_protein, self.track_abundance, + deref(self._this).is_protein, deref(self._this).dayhoff, + self.track_abundance, deref(self._this).seed, deref(self._this).max_hash) a.merge(self) return a @@ -135,23 +141,24 @@ cdef class MinHash(object): return (deref(self._this).num, deref(self._this).ksize, deref(self._this).is_protein, + deref(self._this).dayhoff, self.get_mins(with_abundance=with_abundance), None, self.track_abundance, deref(self._this).max_hash, deref(self._this).seed) def __setstate__(self, tup): - (n, ksize, is_protein, mins, _, track_abundance, max_hash, seed) =\ + (n, ksize, is_protein, dayhoff, mins, _, track_abundance, max_hash, seed) =\ tup self.track_abundance = track_abundance cdef KmerMinHash *mh = NULL if track_abundance: - mh = new KmerMinAbundance(n, ksize, is_protein, seed, max_hash) + mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, seed, max_hash) self._this.reset(mh) self.set_abundances(mins) else: - mh = new KmerMinHash(n, ksize, is_protein, seed, max_hash) + mh = new KmerMinHash(n, ksize, is_protein, dayhoff, seed, max_hash) self._this.reset(mh) self.add_many(mins) @@ -160,6 +167,7 @@ cdef class MinHash(object): (deref(self._this).num, deref(self._this).ksize, deref(self._this).is_protein, + deref(self._this).dayhoff, self.track_abundance, deref(self._this).seed, deref(self._this).max_hash, @@ -173,7 +181,8 @@ cdef class MinHash(object): def copy_and_clear(self): a = MinHash(deref(self._this).num, deref(self._this).ksize, - deref(self._this).is_protein, self.track_abundance, + deref(self._this).is_protein, deref(self._this).dayhoff, + self.track_abundance, deref(self._this).seed, deref(self._this).max_hash) return a @@ -234,6 +243,10 @@ cdef class MinHash(object): def is_protein(self): return deref(self._this).is_protein + @property + def dayhoff(self): + return deref(self._this).dayhoff + @property def ksize(self): return deref(self._this).ksize @@ -247,6 +260,9 @@ cdef class MinHash(object): def add_hash(self, uint64_t h): deref(self._this).add_hash(h) + def translate_codon(self, codon): + return deref(self._this).translate_codon(to_bytes(codon)) + def count_common(self, MinHash other): return deref(self._this).count_common(deref(other._this)) @@ -255,7 +271,8 @@ cdef class MinHash(object): raise ValueError('new sample n is higher than current sample n') a = MinHash(new_num, deref(self._this).ksize, - deref(self._this).is_protein, self.track_abundance, + deref(self._this).is_protein, deref(self._this).dayhoff, + self.track_abundance, deref(self._this).seed, 0) if self.track_abundance: a.set_abundances(self.get_mins(with_abundance=True)) @@ -286,7 +303,8 @@ cdef class MinHash(object): new_max_hash = get_max_hash_for_scaled(new_num) a = MinHash(0, deref(self._this).ksize, - deref(self._this).is_protein, self.track_abundance, + deref(self._this).is_protein, deref(self._this).dayhoff, + self.track_abundance, deref(self._this).seed, new_max_hash) if self.track_abundance: a.set_abundances(self.get_mins(with_abundance=True)) @@ -307,6 +325,7 @@ cdef class MinHash(object): combined_mh = new KmerMinAbundance(num, deref(self._this).ksize, deref(self._this).is_protein, + deref(self._this).dayhoff, deref(self._this).seed, deref(self._this).max_hash) @@ -314,6 +333,7 @@ cdef class MinHash(object): combined_mh = new KmerMinHash(num, deref(self._this).ksize, deref(self._this).is_protein, + deref(self._this).dayhoff, deref(self._this).seed, deref(self._this).max_hash) @@ -424,12 +444,27 @@ cdef class MinHash(object): if not deref(self._this).is_protein: raise ValueError("cannot add amino acid sequence to DNA MinHash!") - for i in range(0, len(sequence) - ksize + 1): - deref(self._this).add_word(to_bytes(sequence[i:i + ksize])) + aa_kmers = (sequence[i:i + ksize] for i in range(0, len(sequence) - ksize + 1)) + if not self.dayhoff: + for aa_kmer in aa_kmers: + deref(self._this).add_word(to_bytes(aa_kmer)) + else: + for aa_kmer in aa_kmers: + dayhoff_kmer = '' + for aa in aa_kmer: + dayhoff_letter = deref(self._this).aa_to_dayhoff(to_bytes(aa)) + dayhoff_kmer += dayhoff_letter + # dayhoff_kmer = ''.join( for aa in aa_kmer) + deref(self._this).add_word(to_bytes(dayhoff_kmer)) def is_molecule_type(self, molecule): if molecule.upper() == 'DNA' and not self.is_protein: return True - if molecule == 'protein' and self.is_protein: - return True + if self.is_protein: + if self.dayhoff: + if molecule == 'dayhoff': + return True + else: + if molecule == 'protein': + return True return False diff --git a/sourmash/commands.py b/sourmash/commands.py index 84a206d59e..32810f02da 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -149,14 +149,22 @@ def compute(args): if args.dna and args.protein: notify('Computing both nucleotide and protein signatures.') num_sigs = 2*len(ksizes) + elif args.dna and args.dayhoff: + notify('Computing both nucleotide and Dayhoff-encoded protein ' + 'signatures.') + num_sigs = 2*len(ksizes) elif args.dna: notify('Computing only nucleotide (and not protein) signatures.') num_sigs = len(ksizes) elif args.protein: notify('Computing only protein (and not nucleotide) signatures.') num_sigs = len(ksizes) + elif args.dayhoff: + notify('Computing only Dayhoff-encoded protein (and not nucleotide) ' + 'signatures.') + num_sigs = len(ksizes) - if args.protein and not args.input_is_protein: + if (args.protein or args.dayhoff) and not args.input_is_protein: bad_ksizes = [ str(k) for k in ksizes if k % 3 != 0 ] if bad_ksizes: error('protein ksizes must be divisible by 3, sorry!') @@ -182,6 +190,15 @@ def make_minhashes(): if args.protein: E = MinHash(ksize=k, n=args.num_hashes, is_protein=True, + dayhoff=False, + track_abundance=args.track_abundance, + scaled=args.scaled, + seed=seed) + Elist.append(E) + if args.dayhoff: + E = MinHash(ksize=k, n=args.num_hashes, + is_protein=True, + dayhoff=True, track_abundance=args.track_abundance, scaled=args.scaled, seed=seed) @@ -189,6 +206,7 @@ def make_minhashes(): if args.dna: E = MinHash(ksize=k, n=args.num_hashes, is_protein=False, + dayhoff=False, track_abundance=args.track_abundance, scaled=args.scaled, seed=seed) @@ -1319,9 +1337,15 @@ def watch(args): if args.dna: moltype = 'DNA' is_protein = False - else: + dayhoff = False + elif args.protein: moltype = 'protein' is_protein = True + dayhoff = False + else: + moltype = 'dayhoff' + is_protein = True + dayhoff = True tree = load_sbt_index(args.sbt_name) @@ -1332,7 +1356,7 @@ def watch(args): tree_mh = leaf.data.minhash ksize = tree_mh.ksize - E = MinHash(ksize=ksize, n=args.num_hashes, is_protein=is_protein) + E = MinHash(ksize=ksize, n=args.num_hashes, is_protein=is_protein, dayhoff=dayhoff) streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name) notify('Computing signature for k={}, {} from stdin', ksize, moltype) diff --git a/sourmash/kmer_min_hash.hh b/sourmash/kmer_min_hash.hh index 09ab787356..a85b5c0c20 100644 --- a/sourmash/kmer_min_hash.hh +++ b/sourmash/kmer_min_hash.hh @@ -2,6 +2,7 @@ #define KMER_MIN_HASH_HH #include +#include #include #include #include @@ -59,13 +60,14 @@ public: const unsigned int num; const unsigned int ksize; const bool is_protein; + const bool dayhoff; const uint32_t seed; const HashIntoType max_hash; CMinHashType mins; - KmerMinHash(unsigned int n, unsigned int k, bool prot, uint32_t s, + KmerMinHash(unsigned int n, unsigned int k, bool prot, bool dyhoff, uint32_t s, HashIntoType mx) - : num(n), ksize(k), is_protein(prot), seed(s), max_hash(mx) { + : num(n), ksize(k), is_protein(prot), dayhoff(dyhoff), seed(s), max_hash(mx) { if (n > 0) { mins.reserve(num + 1); } @@ -82,6 +84,9 @@ public: if (is_protein != other.is_protein) { throw minhash_exception("DNA/prot minhashes cannot be compared"); } + if (dayhoff != other.dayhoff) { + throw minhash_exception("DNA/prot minhashes cannot be compared"); + } if (max_hash != other.max_hash) { throw minhash_exception("mismatch in max_hash; comparison fail"); } @@ -177,19 +182,54 @@ public: } } - std::string _dna_to_aa(const std::string& dna) { - std::string aa; - unsigned int dna_size = (dna.size() / 3) * 3; // floor it - for (unsigned int j = 0; j < dna_size; j += 3) { - std::string codon = dna.substr(j, 3); + std::string translate_codon(std::string& codon) { + std::string residue; + + if (codon.length() >= 2 && codon.length() <= 3){ + // If codon is length 2, pad with an N for ambiguous codon amino acids + if (codon.length() == 2) { + codon += "N"; + } auto translated = _codon_table.find(codon); + if (translated != _codon_table.end()) { // "second" is the element mapped to by the codon - aa += translated -> second; + // Because .find returns an iterator + residue = translated -> second; } else { // Otherwise, assign the "X" or "unknown" amino acid - aa += "X"; + residue = "X"; } + } else if (codon.length() == 1){ + // Then we only have one nucleotides and the amino acid is unknown + residue = "X"; + } else { + std::string msg = "Codon is invalid length: "; + msg += codon; + throw minhash_exception(msg); + } + return residue; + } + + std::string _dna_to_aa(const std::string& dna) { + std::string aa; + std::string codon; + std::string residue; + unsigned int dna_size = (dna.size() / 3) * 3; // floor it + for (unsigned int j = 0; j < dna_size; j += 3) { + + codon = dna.substr(j, 3); + + residue = translate_codon(codon); + + // Use dayhoff encoding of amino acids + if (dayhoff) { + std::string new_letter = aa_to_dayhoff(residue); + aa += new_letter; + } else { + aa += residue; + } + } return aa; } @@ -226,6 +266,22 @@ public: return out; } + std::string aa_to_dayhoff(const std::string& aa) const { + // Convert an amino acid letter to dayhoff encoding + std::string new_letter; + + auto dayhoff_encoded = _dayhoff_table.find(aa); + if (dayhoff_encoded != _dayhoff_table.end()) { + // "second" is the element mapped to by the codon + // Because .find returns an iterator + new_letter = dayhoff_encoded -> second; + } else { + // Otherwise, assign the "X" or "unknown" amino acid + new_letter = "X"; + } + return new_letter; + } + virtual void merge(const KmerMinHash& other) { check_compatible(other); @@ -263,7 +319,7 @@ private: {"TTT", "F"}, {"TTC", "F"}, {"TTA", "L"}, {"TTG", "L"}, - {"TCT", "S"}, {"TCC", "S"}, {"TCA", "S"}, {"TCG", "S"}, + {"TCT", "S"}, {"TCC", "S"}, {"TCA", "S"}, {"TCG", "S"}, {"TCN", "S"}, {"TAT", "Y"}, {"TAC", "Y"}, {"TAA", "*"}, {"TAG", "*"}, @@ -272,19 +328,19 @@ private: {"TGA", "*"}, {"TGG", "W"}, - {"CTT", "L"}, {"CTC", "L"}, {"CTA", "L"}, {"CTG", "L"}, + {"CTT", "L"}, {"CTC", "L"}, {"CTA", "L"}, {"CTG", "L"}, {"CTN", "L"}, - {"CCT", "P"}, {"CCC", "P"}, {"CCA", "P"}, {"CCG", "P"}, + {"CCT", "P"}, {"CCC", "P"}, {"CCA", "P"}, {"CCG", "P"}, {"CCN", "P"}, {"CAT", "H"}, {"CAC", "H"}, {"CAA", "Q"}, {"CAG", "Q"}, - {"CGT", "R"}, {"CGC", "R"}, {"CGA", "R"}, {"CGG", "R"}, + {"CGT", "R"}, {"CGC", "R"}, {"CGA", "R"}, {"CGG", "R"}, {"CGN", "R"}, {"ATT", "I"}, {"ATC", "I"}, {"ATA", "I"}, {"ATG", "M"}, - {"ACT", "T"}, {"ACC", "T"}, {"ACA", "T"}, {"ACG", "T"}, + {"ACT", "T"}, {"ACC", "T"}, {"ACA", "T"}, {"ACG", "T"}, {"ACN", "T"}, {"AAT", "N"}, {"AAC", "N"}, {"AAA", "K"}, {"AAG", "K"}, @@ -292,24 +348,62 @@ private: {"AGT", "S"}, {"AGC", "S"}, {"AGA", "R"}, {"AGG", "R"}, - {"GTT", "V"}, {"GTC", "V"}, {"GTA", "V"}, {"GTG", "V"}, + {"GTT", "V"}, {"GTC", "V"}, {"GTA", "V"}, {"GTG", "V"}, {"GTN", "V"}, - {"GCT", "A"}, {"GCC", "A"}, {"GCA", "A"}, {"GCG", "A"}, + {"GCT", "A"}, {"GCC", "A"}, {"GCA", "A"}, {"GCG", "A"}, {"GCN", "A"}, {"GAT", "D"}, {"GAC", "D"}, {"GAA", "E"}, {"GAG", "E"}, - {"GGT", "G"}, {"GGC", "G"}, {"GGA", "G"}, {"GGG", "G"} + {"GGT", "G"}, {"GGC", "G"}, {"GGA", "G"}, {"GGG", "G"}, {"GGN", "G"} + }; + + +// Dayhoff table from +// Peris, P., López, D., & Campos, M. (2008). +// IgTM: An algorithm to predict transmembrane domains and topology in +// proteins. BMC Bioinformatics, 9(1), 1029–11. +// http://doi.org/10.1186/1471-2105-9-367 +// +// Original source: +// Dayhoff M. O., Schwartz R. M., Orcutt B. C. (1978). +// A model of evolutionary change in proteins, +// in Atlas of Protein Sequence and Structure, +// ed Dayhoff M. O., editor. +// (Washington, DC: National Biomedical Research Foundation; ), 345–352. +// +// | Amino acid | Property | Dayhoff | +// |---------------|-----------------------|---------| +// | C | Sulfur polymerization | a | +// | A, G, P, S, T | Small | b | +// | D, E, N, Q | Acid and amide | c | +// | H, K, R | Basic | d | +// | I, L, M, V | Hydrophobic | e | +// | F, W, Y | Aromatic | f | + std::map _dayhoff_table = { + {"C", "a"}, + + {"A", "b"}, {"G", "b"}, {"P", "b"}, {"S", "b"}, {"T", "b"}, + + {"D", "c"}, {"E", "c"}, {"N", "c"}, {"Q", "c"}, + + {"H", "d"}, {"K", "d"}, {"R", "d"}, + + {"I", "e"}, {"L", "e"}, {"M", "e"}, {"V", "e"}, + + {"F", "f"}, {"W", "f"}, {"Y", "f"} + }; + }; class KmerMinAbundance: public KmerMinHash { public: CMinHashType abunds; - KmerMinAbundance(unsigned int n, unsigned int k, bool prot, uint32_t seed, - HashIntoType mx) : - KmerMinHash(n, k, prot, seed, mx) { }; + KmerMinAbundance(unsigned int n, unsigned int k, bool prot, bool dayhoff, + uint32_t seed, HashIntoType mx) : + KmerMinHash(n, k, prot, dayhoff, seed, mx) { }; virtual void add_hash(HashIntoType h) { if ((max_hash and h <= max_hash) or not max_hash) { diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py index e08761387b..295dcc3477 100644 --- a/sourmash/sig/__main__.py +++ b/sourmash/sig/__main__.py @@ -46,7 +46,8 @@ def _check_abundance_compatibility(sig1, sig2): def _flatten(mh): "turn off track_abundance on a MinHash object" mh_params = list(mh.__getstate__()) - mh_params[5] = False + # Abundance is 6th parameter + mh_params[6] = False mh.__setstate__(mh_params) assert not mh.track_abundance @@ -54,8 +55,10 @@ def _flatten(mh): def _set_num_scaled(mh, num, scaled): "set num and scaled values on a MinHash object" mh_params = list(mh.__getstate__()) + # Number of hashes is 0th parameter mh_params[0] = num - mh_params[6] = get_max_hash_for_scaled(scaled) + # Scale is 7th parameter + mh_params[7] = get_max_hash_for_scaled(scaled) mh.__setstate__(mh_params) assert mh.num == num assert mh.scaled == scaled @@ -111,7 +114,10 @@ def describe(args): ksize = mh.ksize moltype = 'DNA' if mh.is_protein: - moltype = 'protein' + if mh.dayhoff: + moltype = 'dayhoff' + else: + moltype = 'protein' scaled = mh.scaled num = mh.num seed = mh.seed diff --git a/sourmash/signature.py b/sourmash/signature.py index d4fd4ab3fe..53d23d402b 100644 --- a/sourmash/signature.py +++ b/sourmash/signature.py @@ -99,8 +99,10 @@ def _save(self): sketch['mins'] = list(map(int, minhash.get_mins())) sketch['md5sum'] = self.md5sum() - if minhash.is_protein: + if minhash.is_protein and not minhash.dayhoff: sketch['molecule'] = 'protein' + elif minhash.dayhoff: + sketch['molecule'] = 'dayhoff' else: sketch['molecule'] = 'DNA' diff --git a/sourmash/signature_json.py b/sourmash/signature_json.py index b7c53915fe..851e64e139 100644 --- a/sourmash/signature_json.py +++ b/sourmash/signature_json.py @@ -82,16 +82,23 @@ def _json_next_signature(iterable, molecule = d.get('molecule', 'DNA') if molecule == 'protein': is_protein = True + dayhoff = False + elif molecule == "dayhoff": + is_protein = True + dayhoff = True elif molecule.upper() == 'DNA': is_protein = False + dayhoff = False else: raise Exception("unknown molecule type: {}".format(molecule)) + track_abundance = False if 'abundances' in d: track_abundance = True e = MinHash(ksize=ksize, n=n, is_protein=is_protein, + dayhoff=dayhoff, track_abundance=track_abundance, max_hash=max_hash, seed=seed) diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py index 214f6086f7..93562002ce 100644 --- a/sourmash/sourmash_args.py +++ b/sourmash/sourmash_args.py @@ -53,6 +53,13 @@ def add_moltype_args(parser): help='do not choose a protein signature') parser.set_defaults(protein=False) + parser.add_argument('--dayhoff', dest='dayhoff', action='store_true', + help='build Dayhoff-encoded amino acid signatures (default: False)') + parser.add_argument('--no-dayhoff', dest='dayhoff', + action='store_false', + help='do not build Dayhoff-encoded amino acid signatures') + parser.set_defaults(dayhoff=False) + parser.add_argument('--dna', '--rna', dest='dna', default=None, action='store_true', help='choose a nucleotide signature (default: True)') @@ -70,6 +77,13 @@ def add_construct_moltype_args(parser): help='do not build protein signatures') parser.set_defaults(protein=False) + parser.add_argument('--dayhoff', dest='dayhoff', action='store_true', + help='build Dayhoff-encoded amino acid signatures (default: False)') + parser.add_argument('--no-dayhoff', dest='dayhoff', + action='store_false', + help='do not build Dayhoff-encoded amino acid signatures') + parser.set_defaults(dayhoff=False) + parser.add_argument('--dna', '--rna', dest='dna', default=None, action='store_true', help='build nucleotide signatures (default: True)') @@ -89,6 +103,8 @@ def get_moltype(sig, require=False): moltype = 'DNA' elif sig.minhash.is_molecule_type('protein'): moltype = 'protein' + elif sig.minhash.is_molecule_type('dayhoff'): + moltype = 'dayhoff' else: raise ValueError('unknown molecule type for sig {}'.format(sig.name())) @@ -107,6 +123,8 @@ def calculate_moltype(args, default=None): moltype = 'protein' elif args.dna: moltype = 'DNA' + elif args.dayhoff: + moltype = 'dayhoff' return moltype diff --git a/tests/conftest.py b/tests/conftest.py index 37ac98a848..28d42ac5c5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,6 +6,35 @@ def track_abundance(request): return request.param +@pytest.fixture(params=[True, False]) +def dayhoff(request): + return request.param + + @pytest.fixture(params=[2, 5, 10]) def n_children(request): return request.param + + +# --- BEGIN - Only run tests using a particular fixture --- # +# Cribbed from: http://pythontesting.net/framework/pytest/pytest-run-tests-using-particular-fixture/ +def pytest_collection_modifyitems(items, config): + fixture_name = config.option.usesfixture + if fixture_name is not None: + selected_items = [] + deselected_items = [] + + for item in items: + if fixture_name in getattr(item, 'fixturenames', ()): + selected_items.append(item) + else: + deselected_items.append(item) + config.hook.pytest_deselected(items=deselected_items) + items[:] = selected_items + +def pytest_addoption(parser): + parser.addoption("--usesfixture", + action="store", + default=None, + help="just run tests that use a particular fixture") +# --- END - Only run tests using a particular fixture --- # diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 2f102fe05e..4508f9440b 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -107,9 +107,9 @@ def test_bytes_dna(track_abundance): assert len(b) == 1 -def test_bytes_protein(track_abundance): +def test_bytes_protein(track_abundance, dayhoff): # verify that we can hash protein/aa sequences - mh = MinHash(10, 6, True, track_abundance=track_abundance) + mh = MinHash(10, 6, True, dayhoff=dayhoff, track_abundance=track_abundance) mh.add_protein('AGYYG') mh.add_protein(u'AGYYG') mh.add_protein(b'AGYYG') @@ -117,14 +117,41 @@ def test_bytes_protein(track_abundance): assert len(mh.get_mins()) == 4 -def test_protein(track_abundance): +def test_protein(track_abundance, dayhoff): # verify that we can hash protein/aa sequences - mh = MinHash(10, 6, True, track_abundance=track_abundance) + mh = MinHash(10, 6, True, dayhoff=dayhoff, track_abundance=track_abundance) mh.add_protein('AGYYG') assert len(mh.get_mins()) == 4 +def test_translate_codon(track_abundance): + # Ensure that translation occurs properly + mh = MinHash(10, 6, is_protein=True) + assert "S" == mh.translate_codon('TCT') + assert "S" == mh.translate_codon('TC') + assert "X" == mh.translate_codon("T") + + with pytest.raises(ValueError): + mh.translate_codon("") + mh.translate_codon("TCTA") + + +def test_dayhoff(track_abundance): + # verify that we can hash to dayhoff-encoded protein/aa sequences + mh_dayhoff = MinHash(10, 6, is_protein=True, + dayhoff=True, track_abundance=track_abundance) + mh_dayhoff.add_sequence('ACTGAC') + + assert len(mh_dayhoff.get_mins()) == 2 + # verify that dayhoff-encoded hashes are different from protein/aa hashes + mh_protein = MinHash(10, 6, is_protein=True, track_abundance=track_abundance) + mh_protein.add_sequence('ACTGAC') + + assert len(mh_protein.get_mins()) == 2 + assert mh_protein.get_mins() != mh_dayhoff.get_mins() + + def test_protein_short(track_abundance): # verify that we can hash protein/aa sequences mh = MinHash(10, 9, True, track_abundance=track_abundance) diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 57b9adad4f..16f3a8ca30 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -4,6 +4,7 @@ from __future__ import print_function, unicode_literals import csv import shutil +import os import pytest @@ -623,6 +624,85 @@ def test_sig_describe_1(c): for line in expected_output: assert line.strip() in out +@utils.in_tempdir +def test_sig_describe_1_dayhoff(c): + # get basic info on a signature + testdata = utils.get_test_data('short.fa') + c.run_sourmash('compute', '-k', '21,30', + '--dayhoff', '--protein', + '--dna', + testdata) + # stdout should be new signature + computed_sig = os.path.join(c.location, 'short.fa.sig') + c.run_sourmash('sig', 'describe', computed_sig) + + out = c.last_result.out + print(c.last_result) + + # Add final trailing slash for this OS + testdata_dirname = os.path.dirname(testdata) + os.sep + location = c.location + os.sep + + expected_output = """\ +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: e45a080101751e044d6df861d3d0f3fd +k=21 molecule=protein num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 + +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: ef4fa1f3a90f3873187370f1eacc0d9a +k=21 molecule=dayhoff num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 + +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: 1136a8a68420bd93683e45cdaf109b80 +k=21 molecule=DNA num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 + +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: 4244d1612598af044e799587132f007e +k=30 molecule=protein num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 + +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: 5647819f2eac913e04af51c8d548ad56 +k=30 molecule=dayhoff num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 + +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: 71f7c111c01785e5f38efad45b00a0e1 +k=30 molecule=DNA num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 +""".splitlines() + for line in out.splitlines(): + cleaned_line = line.strip().replace( + testdata_dirname, '').replace(location, '') + assert cleaned_line in expected_output + @utils.in_tempdir def test_sig_describe_1_multisig(c): diff --git a/tests/test_signature_json.py b/tests/test_signature_json.py index 7112d0963f..8bb81a2707 100644 --- a/tests/test_signature_json.py +++ b/tests/test_signature_json.py @@ -111,6 +111,40 @@ def test_load_signaturesset_json_iter(): assert len(sig_entries) == 2 +# integration test more than a unit test +def test_load_signaturesset_json_iter_molecules(): + + t = list() + molecules = 'DNA', 'protein', 'dayhoff' + names = "Foo", 'Bar', "Biz" + filenames = '/tmp/foo', '/tmp/bar', '/tmp/biz' + + for molecule, name, filename in zip(molecules, names, filenames): + minhash = (2,3,4,5,6) + t.append(OrderedDict(( + ('class', 'sourmash_signature'), + ('name', name), + ('filename', filename), + ('signatures', + ( + OrderedDict((('ksize', 21), + ('num', len(minhash)), + #('md5sum', ), + ('molecule', molecule), + ('cardinality', 123456), + ('mins', minhash))), + ))))) + + s = json.dumps(t) + if sys.version_info[0] < 3: + s = unicode(s) + # no MD5SUM + sig_entries = tuple(load_signatureset_json_iter(io.StringIO(s), + ignore_md5sum=True, + ijson=ijson)) + # Ensure all molecule types were read properly + assert len(sig_entries) == 3 + def test_save_load_multisig_json(): e1 = sourmash.MinHash(n=1, ksize=20) sig1 = SourmashSignature(e1) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 2a91cfde2c..756e63c6e9 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -301,6 +301,74 @@ def test_do_sourmash_compute_multik_with_protein(): assert 30 in ksizes +def test_do_sourmash_compute_multik_with_dayhoff(): + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('short.fa') + status, out, err = utils.runscript('sourmash', + ['compute', '-k', '21,30', + '--dayhoff', '--no-dna', + testdata1], + in_directory=location) + assert 'Computing only Dayhoff-encoded protein (and not nucleotide) ' \ + 'signatures.' in err + outfile = os.path.join(location, 'short.fa.sig') + assert os.path.exists(outfile) + + with open(outfile, 'rt') as fp: + sigdata = fp.read() + siglist = list(signature.load_signatures(sigdata)) + assert len(siglist) == 2 + ksizes = set([ x.minhash.ksize for x in siglist ]) + assert 21 in ksizes + assert 30 in ksizes + assert all(x.minhash.dayhoff for x in siglist) + + +def test_do_sourmash_compute_multik_with_dayhoff_and_dna(): + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('short.fa') + status, out, err = utils.runscript('sourmash', + ['compute', '-k', '21,30', + '--dayhoff', + testdata1], + in_directory=location) + outfile = os.path.join(location, 'short.fa.sig') + assert os.path.exists(outfile) + + with open(outfile, 'rt') as fp: + sigdata = fp.read() + siglist = list(signature.load_signatures(sigdata)) + assert len(siglist) == 4 + ksizes = set([ x.minhash.ksize for x in siglist ]) + assert 21 in ksizes + assert 30 in ksizes + assert sum(x.minhash.is_molecule_type('DNA') for x in siglist) == 2 + assert sum(x.minhash.is_molecule_type('dayhoff') for x in siglist) == 2 + + +def test_do_sourmash_compute_multik_with_dayhoff_dna_protein(): + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('short.fa') + status, out, err = utils.runscript('sourmash', + ['compute', '-k', '21,30', + '--dayhoff', '--protein', + testdata1], + in_directory=location) + outfile = os.path.join(location, 'short.fa.sig') + assert os.path.exists(outfile) + + with open(outfile, 'rt') as fp: + sigdata = fp.read() + siglist = list(signature.load_signatures(sigdata)) + assert len(siglist) == 6 + ksizes = set([ x.minhash.ksize for x in siglist ]) + assert 21 in ksizes + assert 30 in ksizes + assert sum(x.minhash.is_molecule_type('DNA') for x in siglist) == 2 + assert sum(x.minhash.is_molecule_type('dayhoff') for x in siglist) == 2 + assert sum(x.minhash.is_molecule_type('protein') for x in siglist) == 2 + + def test_do_sourmash_compute_multik_with_nothing(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa') @@ -866,6 +934,39 @@ def test_do_compare_output_multiple_moltype(): assert 'multiple molecule types loaded;' in err + +def test_do_compare_dayhoff(): + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('short.fa') + testdata2 = utils.get_test_data('short2.fa') + status, out, err = utils.runscript('sourmash', + ['compute', '-k', '21', '--dayhoff', + '--no-dna', testdata1], + in_directory=location) + assert status == 0 + + status, out, err = utils.runscript('sourmash', + ['compute', '-k', '21', '--dayhoff', + '--no-dna', testdata2], + in_directory=location) + assert status == 0 + + status, out, err = utils.runscript('sourmash', + ['compare', 'short.fa.sig', + 'short2.fa.sig', '--dayhoff', + '--csv', 'xxx'], + in_directory=location) + true_out = '''[1. 0.94] +[0.94 1. ] +min similarity in matrix: 0.940'''.splitlines() + for line in out: + cleaned_line = line.split('...')[-1].strip() + cleaned_line in true_out + assert status == 0 + + + + def test_do_plot_comparison(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa')