diff --git a/sourmash_lib/commands.py b/sourmash_lib/commands.py
index 9b1925eb9d..9071c2400d 100644
--- a/sourmash_lib/commands.py
+++ b/sourmash_lib/commands.py
@@ -1,4 +1,4 @@
-from __future__ import print_function
+from __future__ import print_function, division

 import argparse
 import csv
@@ -580,7 +580,8 @@ def sbt_combine(args):


 def index(args):
-    import sourmash_lib.sbt
+    import sourmash_lib.sbtmh
+
     parser = argparse.ArgumentParser()
     parser.add_argument('sbt_name', help='name to save SBT into')
     parser.add_argument('signatures', nargs='+',
@@ -649,12 +650,14 @@ def index(args):


 def search(args):
-    from sourmash_lib.sbtmh import search_minhashes, SearchMinHashesFindBest
+    from .search import search_databases

     parser = argparse.ArgumentParser()
     parser.add_argument('query', help='query signature')
     parser.add_argument('databases', help='signatures/SBTs to search',
                         nargs='+')
+    parser.add_argument('--traverse-directory', action='store_true',
+                        help='search all signatures underneath directories.')
     parser.add_argument('-q', '--quiet', action='store_true',
                         help='suppress non-error output')
     parser.add_argument('--threshold', default=0.08, type=float,
@@ -699,66 +702,24 @@ def search(args):
                query.minhash.scaled, int(args.scaled))
        query.minhash = query.minhash.downsample_scaled(args.scaled)

-    # set up the search function(s)
-    search_fn = search_minhashes
-
-    # similarity vs containment
-    query_similarity = lambda x: query.similarity(x, downsample=True)
-    if args.containment:
-        query_similarity = lambda x: query.contained_by(x, downsample=True)
-
     # set up the search databases
     databases = sourmash_args.load_sbts_and_sigs(args.databases,
-                                                 query_ksize, query_moltype)
+                                                 query_ksize, query_moltype,
+                                                 args.traverse_directory)

     if not len(databases):
         error('Nothing found to search!')
         sys.exit(-1)

-    # collect results across all the trees
-    SearchResult = namedtuple('SearchResult',
-                              'similarity, match_sig, md5, filename, name')
-    results = []
-    found_md5 = set()
-    for (sbt_or_siglist, filename, is_sbt) in databases:
-        if args.best_only:
-            search_fn = sourmash_lib.sbtmh.SearchMinHashesFindBest().search
-
-        if is_sbt:
-            tree = sbt_or_siglist
-            notify('Searching SBT {}', filename)
-            for leaf in tree.find(search_fn, query, args.threshold):
-                similarity = query_similarity(leaf.data)
-                if similarity >= args.threshold and \
-                       leaf.data.md5sum() not in found_md5:
-                    sr = SearchResult(similarity=similarity,
-                                      match_sig=leaf.data,
-                                      md5=leaf.data.md5sum(),
-                                      filename=filename,
-                                      name=leaf.data.name())
-                    found_md5.add(sr.md5)
-                    results.append(sr)
-
-        else: # list of signatures
-            for ss in sbt_or_siglist:
-                similarity = query_similarity(ss)
-                if similarity >= args.threshold and \
-                       ss.md5sum() not in found_md5:
-                    sr = SearchResult(similarity=similarity,
-                                      match_sig=ss,
-                                      md5=ss.md5sum(),
-                                      filename=filename,
-                                      name=ss.name())
-                    found_md5.add(sr.md5)
-                    results.append(sr)
-
-    # sort results on similarity (reverse)
-    results.sort(key=lambda x: -x.similarity)
+    # do the actual search
+    results = search_databases(query, databases,
+                               args.threshold, args.containment,
+                               args.best_only)

+    n_matches = len(results)
     if args.best_only:
-        notify("(truncated search because of --best-only; only trust top result")
+        args.num_results = 1

-    n_matches = len(results)
     if n_matches <= args.num_results:
         print_results('{} matches:'.format(len(results)))
     else:
@@ -774,6 +735,9 @@ def search(args):
         name = sr.match_sig._display_name(60)
         print_results('{:>6}  {}', pct, name)

+    if args.best_only:
+        notify("** reporting only one match because --best-only was set")
+
--best-only was set") + if args.output: fieldnames = ['similarity', 'name', 'filename', 'md5'] w = csv.DictWriter(args.output, fieldnames=fieldnames) @@ -867,10 +831,14 @@ def categorize(args): def gather(args): + from .search import gather_databases + parser = argparse.ArgumentParser() parser.add_argument('query', help='query signature') parser.add_argument('databases', help='signatures/SBTs to search', nargs='+') + parser.add_argument('--traverse-directory', action='store_true', + help='search all signatures underneath directories.') parser.add_argument('-o', '--output', type=argparse.FileType('wt'), help='output CSV containing matches to this file') parser.add_argument('--save-matches', type=argparse.FileType('wt'), @@ -919,54 +887,14 @@ def gather(args): # set up the search databases databases = sourmash_args.load_sbts_and_sigs(args.databases, - query_ksize, query_moltype) + query_ksize, query_moltype, + args.traverse_directory) if not len(databases): error('Nothing found to search!') sys.exit(-1) - orig_query = query - orig_mins = orig_query.minhash.get_hashes() - - # calculate the band size/resolution R for the genome - R_metagenome = orig_query.minhash.scaled - - # define a function to do a 'best' search and get only top match. - def find_best(dblist, query): - results = [] - for (sbt_or_siglist, filename, is_sbt) in dblist: - search_fn = sourmash_lib.sbtmh.SearchMinHashesFindBestIgnoreMaxHash().search - - if is_sbt: - tree = sbt_or_siglist - - for leaf in tree.find(search_fn, query, 0.0): - leaf_e = leaf.data.minhash - similarity = query.minhash.similarity_ignore_maxhash(leaf_e) - if similarity > 0.0: - results.append((similarity, leaf.data)) - else: - for ss in sbt_or_siglist: - similarity = query.minhash.similarity_ignore_maxhash(ss.minhash) - if similarity > 0.0: - results.append((similarity, ss)) - - if not results: - return None, None, None - - # take the best result - results.sort(key=lambda x: -x[0]) # reverse sort on similarity - best_similarity, best_leaf = results[0] - return best_similarity, best_leaf, filename - - - # define a function to build new signature object from set of mins - def build_new_signature(mins, template_sig): - e = template_sig.minhash.copy_and_clear() - e.add_many(mins) - return sig.SourmashSignature(e) - - # xxx + # pretty-printing code. def format_bp(bp): bp = float(bp) if bp < 500: @@ -979,76 +907,21 @@ def format_bp(bp): return '{:.1f} Gbp'.format(round(bp / 1e9, 1)) return '???' - # construct a new query that doesn't have the max_hash attribute set. - new_mins = query.minhash.get_hashes() - query = build_new_signature(new_mins, orig_query) - - sum_found = 0. found = [] - GatherResult = namedtuple('GatherResult', - 'intersect_bp, f_orig_query, f_match, f_unique_to_query, filename, name, md5, leaf') - while 1: - best_similarity, best_leaf, filename = find_best(databases, query) - if not best_leaf: # no matches at all! 
-            break
-
-        # subtract found hashes from search hashes, construct new search
-        query_mins = set(query.minhash.get_hashes())
-        found_mins = best_leaf.minhash.get_hashes()
-
-        # figure out what the resolution of the banding on the genome is,
-        # based either on an explicit --scaled parameter, or on genome
-        # cardinality (deprecated)
-        if not best_leaf.minhash.max_hash:
-            error('Best hash match in sbt_gather has no max_hash')
-            error('Please prepare database of sequences with --scaled')
-            sys.exit(-1)
-
-        R_genome = best_leaf.minhash.scaled
-
-        # pick the highest R / lowest resolution
-        R_comparison = max(R_metagenome, R_genome)
-
-        # CTB: these could probably be replaced by minhash.downsample_scaled.
-        new_max_hash = sourmash_lib.MAX_HASH / float(R_comparison)
-        query_mins = set([ i for i in query_mins if i < new_max_hash ])
-        found_mins = set([ i for i in found_mins if i < new_max_hash ])
-        orig_mins = set([ i for i in orig_mins if i < new_max_hash ])
-
-        # calculate intersection:
-        intersect_mins = query_mins.intersection(found_mins)
-        intersect_orig_mins = orig_mins.intersection(found_mins)
-        intersect_bp = R_comparison * len(intersect_orig_mins)
-        sum_found += len(intersect_mins)
-
-        if intersect_bp < args.threshold_bp:   # hard cutoff for now
-            notify('found less than {} in common. => exiting',
-                   format_bp(intersect_bp))
-            break
-
-        # calculate fractions wrt first denominator - genome size
-        genome_n_mins = len(found_mins)
-        f_match = len(intersect_mins) / float(genome_n_mins)
-        f_orig_query = len(intersect_orig_mins) / float(len(orig_mins))
+    sum_found = 0
+    for result, n_intersect_mins, new_max_hash, next_query in gather_databases(query, databases,
+                                                                               args.threshold_bp):
+        # print interim result & save in a list for later use
+        pct_query = '{:.1f}%'.format(result.f_orig_query*100)
+        pct_genome = '{:.1f}%'.format(result.f_match*100)

-        # calculate fractions wrt second denominator - metagenome size
-        query_n_mins = len(orig_query.minhash.get_hashes())
-        f_unique_to_query = len(intersect_mins) / float(query_n_mins)
+        name = result.leaf._display_name(40)

         if not len(found):                # first result? print header.
             print_results("")
             print_results("overlap     p_query p_match ")
             print_results("---------   ------- --------")

-        result = GatherResult(intersect_bp=intersect_bp,
-                              f_orig_query=f_orig_query,
-                              f_match=f_match,
-                              f_unique_to_query=f_unique_to_query,
-                              filename=filename,
-                              md5=best_leaf.md5sum(),
-                              name=best_leaf.name(),
-                              leaf=best_leaf)
-
-        # print interim result & save in a list for later use
         pct_query = '{:.1f}%'.format(result.f_orig_query*100)
         pct_genome = '{:.1f}%'.format(result.f_match*100)
@@ -1058,16 +931,14 @@ def format_bp(bp):
         print_results('{:9}   {:>6}  {:>6}      {}',
                       format_bp(result.intersect_bp), pct_query, pct_genome,
                       name)
+        sum_found += n_intersect_mins
         found.append(result)

-        # construct a new query, minus the previous one.
-        query_mins -= set(found_mins)
-        query = build_new_signature(query_mins, orig_query)

     # basic reporting
     print_results('\nfound {} matches total;', len(found))

-    sum_found /= len(orig_query.minhash.get_hashes())
+    sum_found /= len(query.minhash.get_hashes())
     print_results('the recovered matches hit {:.1f}% of the query',
                   sum_found * 100)
     print_results('')
diff --git a/sourmash_lib/sbt.py b/sourmash_lib/sbt.py
index 65bc31fdb6..163b7b61d0 100644
--- a/sourmash_lib/sbt.py
+++ b/sourmash_lib/sbt.py
@@ -2,10 +2,6 @@
 """
 A trial implementation of sequence bloom trees, Solomon & Kingsford, 2015.

-This is a simple in-memory version where all of the graphs are in
-memory at once; to move it onto disk, the graphs would need to be
-dynamically loaded for each query.
-
 To try it out, do::

     factory = GraphFactory(ksize, tablesizes, n_tables)
@@ -188,8 +184,7 @@ def save(self, tag):
                                     '.'.join([basetag, basename, 'sbt'])),
                 'name': node.name
             }
-            if isinstance(node, Leaf):
-                data['metadata'] = node.metadata
+            data['metadata'] = node.metadata

             node.save(os.path.join(dirprefix, data['filename']))
             structure[i] = data
@@ -381,6 +376,7 @@ def __init__(self, factory, name=None, fullpath=None):
         self._factory = factory
         self._data = None
         self._filename = fullpath
+        self.metadata = dict()

     def __str__(self):
         return '*Node:{name} [occupied: {nb}, fpr: {fpr:.2}]'.format(
@@ -407,10 +403,14 @@ def data(self, new_data):
     def load(info, dirname):
         filename = os.path.join(dirname, info['filename'])
         new_node = Node(info['factory'], name=info['name'], fullpath=filename)
+        new_node.metadata = info.get('metadata', {})
         return new_node

     def update(self, parent):
         parent.data.update(self.data)
+        max_n_below = max(parent.metadata.get('max_n_below', 0),
+                          self.metadata.get('max_n_below', 0))
+        parent.metadata['max_n_below'] = max_n_below


 class Leaf(object):
diff --git a/sourmash_lib/sbtmh.py b/sourmash_lib/sbtmh.py
index d5f5fcb8c3..2b3d6b3ab6 100644
--- a/sourmash_lib/sbtmh.py
+++ b/sourmash_lib/sbtmh.py
@@ -51,13 +51,17 @@ def save(self, filename):
     def update(self, parent):
         for v in self.data.minhash.get_mins():
             parent.data.count(v)
+        max_n_below = parent.metadata.get('max_n_below', 0)
+        max_n_below = max(len(self.data.minhash.get_mins()),
+                          max_n_below)
+        parent.metadata['max_n_below'] = max_n_below
+

     @property
     def data(self):
         if self._data is None:
             from sourmash_lib import signature
-            it = signature.load_signatures(self._filename)
-            self._data, = list(it) # should only be one signature
+            self._data = signature.load_one_signature(self._filename)
         return self._data

     @data.setter
@@ -98,39 +102,68 @@ def __init__(self, downsample=True):

     def search(self, node, sig, threshold, results=None):
         mins = sig.minhash.get_mins()
+        score = 0

         if isinstance(node, SigLeaf):
             try:
-                matches = node.data.minhash.count_common(sig.minhash)
+                score = node.data.minhash.similarity(sig.minhash)
             except Exception as e:
                 if 'mismatch in max_hash' in str(e) and self.downsample:
                     xx = sig.minhash.downsample_max_hash(node.data.minhash)
                     yy = node.data.minhash.downsample_max_hash(sig.minhash)

-                    matches = yy.count_common(xx)
+                    score = yy.similarity(xx)
                 else:
                     raise

-        else: # Node or Leaf, Nodegraph by minhash comparison
-            matches = sum(1 for value in mins if node.data.get(value))
-
-        score = 0
-        if len(mins):
-            score = float(matches) / len(mins)
+        else: # internal object, not leaf.
+            if len(mins):
+                matches = sum(1 for value in mins if node.data.get(value))
+                max_mins = node.metadata.get('max_n_below', -1)
+                if max_mins == -1:
+                    raise Exception('cannot do similarity search on this SBT; need to rebuild.')
+                score = float(matches) / max_mins

         if results is not None:
             results[node.name] = score

         if score >= threshold:
             # have we done better than this? if yes, truncate.
-            if float(matches) / len(mins) > self.best_match:
+            if score > self.best_match:
                 # update best if it's a leaf node...
                 if isinstance(node, SigLeaf):
-                    self.best_match = float(matches) / len(mins)
+                    self.best_match = score
                 return 1

         return 0


+def search_minhashes_containment(node, sig, threshold,
+                                 results=None, downsample=True):
+    mins = sig.minhash.get_mins()
+
+    if isinstance(node, SigLeaf):
+        try:
+            matches = node.data.minhash.count_common(sig.minhash)
+        except Exception as e:
+            if 'mismatch in max_hash' in str(e) and downsample:
+                xx = sig.minhash.downsample_max_hash(node.data.minhash)
+                yy = node.data.minhash.downsample_max_hash(sig.minhash)
+
+                matches = yy.count_common(xx)
+            else:
+                raise
+
+    else: # Node or Leaf, Nodegraph by minhash comparison
+        matches = sum(1 for value in mins if node.data.get(value))
+
+    if results is not None:
+        results[node.name] = float(matches) / len(mins)
+
+    if len(mins) and float(matches) / len(mins) >= threshold:
+        return 1
+    return 0
+
+
 class SearchMinHashesFindBestIgnoreMaxHash(object):
     def __init__(self):
         self.best_match = 0.
diff --git a/sourmash_lib/search.py b/sourmash_lib/search.py
new file mode 100644
index 0000000000..6dc410ecb3
--- /dev/null
+++ b/sourmash_lib/search.py
@@ -0,0 +1,172 @@
+import sys
+from collections import namedtuple
+
+import sourmash_lib
+from .logging import notify, error
+from .signature import SourmashSignature
+from .sbtmh import (search_minhashes,
+                    search_minhashes_containment)
+from .sbtmh import SearchMinHashesFindBest
+
+# generic SearchResult across individual signatures + SBTs.
+SearchResult = namedtuple('SearchResult',
+                          'similarity, match_sig, md5, filename, name')
+
+
+# pretty-print a number of basepairs (duplicated from commands.py so that
+# gather_databases() below is self-contained).
+def format_bp(bp):
+    bp = float(bp)
+    if bp < 500:
+        return '{:.0f} bp '.format(bp)
+    elif bp <= 500e3:
+        return '{:.1f} kbp'.format(round(bp / 1e3, 1))
+    elif bp < 500e6:
+        return '{:.1f} Mbp'.format(round(bp / 1e6, 1))
+    elif bp < 500e9:
+        return '{:.1f} Gbp'.format(round(bp / 1e9, 1))
+    return '???'
+
+
+def search_databases(query, databases, threshold, do_containment, best_only):
+    # set up the search & score function(s) - similarity vs containment
+    search_fn = search_minhashes
+    query_match = lambda x: query.similarity(x, downsample=True)
+    if do_containment:
+        search_fn = search_minhashes_containment
+        query_match = lambda x: query.contained_by(x, downsample=True)
+
+    results = []
+    found_md5 = set()
+    for (sbt_or_siglist, filename, is_sbt) in databases:
+        if is_sbt:
+            if best_only:            # this needs to be reset for each SBT
+                search_fn = SearchMinHashesFindBest().search
+
+            tree = sbt_or_siglist
+            for leaf in tree.find(search_fn, query, threshold):
+                similarity = query_match(leaf.data)
+                if similarity >= threshold and \
+                       leaf.data.md5sum() not in found_md5:
+                    sr = SearchResult(similarity=similarity,
+                                      match_sig=leaf.data,
+                                      md5=leaf.data.md5sum(),
+                                      filename=filename,
+                                      name=leaf.data.name())
+                    found_md5.add(sr.md5)
+                    results.append(sr)
+
+        else: # list of signatures
+            for ss in sbt_or_siglist:
+                similarity = query_match(ss)
+                if similarity >= threshold and \
+                       ss.md5sum() not in found_md5:
+                    sr = SearchResult(similarity=similarity,
+                                      match_sig=ss,
+                                      md5=ss.md5sum(),
+                                      filename=filename,
+                                      name=ss.name())
+                    found_md5.add(sr.md5)
+                    results.append(sr)
+
+
+    # sort results on similarity (reverse)
+    results.sort(key=lambda x: -x.similarity)
+
+    return results
+
+
+def gather_databases(query, databases, threshold_bp):
+    from sourmash_lib.sbtmh import SearchMinHashesFindBestIgnoreMaxHash
+
+    orig_query = query
+    orig_mins = orig_query.minhash.get_hashes()
+
+    # calculate the band size/resolution R for the genome
+    R_metagenome = orig_query.minhash.scaled
+
+    # define a function to do a 'best' search and get only top match.
+    def find_best(dblist, query):
+        results = []
+        for (sbt_or_siglist, filename, is_sbt) in dblist:
+            search_fn = SearchMinHashesFindBestIgnoreMaxHash().search
+
+            if is_sbt:
+                tree = sbt_or_siglist
+
+                for leaf in tree.find(search_fn, query, 0.0):
+                    leaf_e = leaf.data.minhash
+                    similarity = query.minhash.similarity_ignore_maxhash(leaf_e)
+                    if similarity > 0.0:
+                        results.append((similarity, leaf.data))
+            else:
+                for ss in sbt_or_siglist:
+                    similarity = query.minhash.similarity_ignore_maxhash(ss.minhash)
+                    if similarity > 0.0:
+                        results.append((similarity, ss))
+
+        if not results:
+            return None, None, None
+
+        # take the best result
+        results.sort(key=lambda x: -x[0])   # reverse sort on similarity
+        best_similarity, best_leaf = results[0]
+        return best_similarity, best_leaf, filename
+
+
+    # define a function to build new signature object from set of mins
+    def build_new_signature(mins, template_sig):
+        e = template_sig.minhash.copy_and_clear()
+        e.add_many(mins)
+        return SourmashSignature(e)
+
+    # construct a new query that doesn't have the max_hash attribute set.
+    new_mins = query.minhash.get_hashes()
+    query = build_new_signature(new_mins, orig_query)
+
+    sum_found = 0.
+    GatherResult = namedtuple('GatherResult',
+                              'intersect_bp, f_orig_query, f_match, f_unique_to_query, filename, name, md5, leaf')
+    while 1:
+        best_similarity, best_leaf, filename = find_best(databases, query)
+        if not best_leaf:          # no matches at all!
+            break
+
+        # subtract found hashes from search hashes, construct new search
+        query_mins = set(query.minhash.get_hashes())
+        found_mins = best_leaf.minhash.get_hashes()
+
+        # figure out what the resolution of the banding on the genome is,
+        # based either on an explicit --scaled parameter, or on genome
+        # cardinality (deprecated)
+        if not best_leaf.minhash.max_hash:
+            error('Best hash match in sbt_gather has no max_hash')
+            error('Please prepare database of sequences with --scaled')
+            sys.exit(-1)
+
+        R_genome = best_leaf.minhash.scaled
+
+        # pick the highest R / lowest resolution
+        R_comparison = max(R_metagenome, R_genome)
+
+        # CTB: these could probably be replaced by minhash.downsample_scaled.
+        new_max_hash = sourmash_lib.MAX_HASH / float(R_comparison)
+        query_mins = set([ i for i in query_mins if i < new_max_hash ])
+        found_mins = set([ i for i in found_mins if i < new_max_hash ])
+        orig_mins = set([ i for i in orig_mins if i < new_max_hash ])
+
+        # calculate intersection:
+        intersect_mins = query_mins.intersection(found_mins)
+        intersect_orig_mins = orig_mins.intersection(found_mins)
+        intersect_bp = R_comparison * len(intersect_orig_mins)
+
+        if intersect_bp < threshold_bp:   # hard cutoff for now
+            notify('found less than {} in common. => exiting',
+                   format_bp(intersect_bp))
+            break
+
+        # calculate fractions wrt first denominator - genome size
+        genome_n_mins = len(found_mins)
+        f_match = len(intersect_mins) / float(genome_n_mins)
+        f_orig_query = len(intersect_orig_mins) / float(len(orig_mins))
+
+        # calculate fractions wrt second denominator - metagenome size
+        query_n_mins = len(orig_query.minhash.get_hashes())
+        f_unique_to_query = len(intersect_mins) / float(query_n_mins)
+
+        result = GatherResult(intersect_bp=intersect_bp,
+                              f_orig_query=f_orig_query,
+                              f_match=f_match,
+                              f_unique_to_query=f_unique_to_query,
+                              filename=filename,
+                              md5=best_leaf.md5sum(),
+                              name=best_leaf.name(),
+                              leaf=best_leaf)
+
+        # construct a new query, minus the previous one.
+        query_mins -= set(found_mins)
+        query = build_new_signature(query_mins, orig_query)
+
+        yield result, len(intersect_mins), new_max_hash, query
diff --git a/sourmash_lib/sourmash_args.py b/sourmash_lib/sourmash_args.py
index 94ff4246e8..7a09f66512 100644
--- a/sourmash_lib/sourmash_args.py
+++ b/sourmash_lib/sourmash_args.py
@@ -160,11 +160,24 @@ def get_ksize(tree):
     return node.data.minhash.ksize


-def load_sbts_and_sigs(filenames, query_ksize, query_moltype):
+def load_sbts_and_sigs(filenames, query_ksize, query_moltype, traverse=False):
     n_signatures = 0
     n_databases = 0
     databases = []
     for sbt_or_sigfile in filenames:
+        if traverse and os.path.isdir(sbt_or_sigfile):
+            for sigfile in traverse_find_sigs([sbt_or_sigfile]):
+                try:
+                    siglist = sig.load_signatures(sigfile,
+                                                  ksize=query_ksize,
+                                                  select_moltype=query_moltype)
+                    siglist = list(siglist)
+                    databases.append((list(siglist), sbt_or_sigfile, False))
+                    notify('loaded {} signatures from {}', len(siglist),
+                           sigfile, end='\r')
+                except:          # ignore errors with traverse
+                    continue
+            continue
         try:
             tree = SBT.load(sbt_or_sigfile, leaf_loader=SigLeaf.load)
             ksize = get_ksize(tree)
@@ -189,10 +202,13 @@ def load_sbts_and_sigs(filenames, query_ksize, query_moltype):
                    sbt_or_sigfile, end='\r')
             n_signatures += len(siglist)
         except EnvironmentError:
-            error("file '{}' does not exist", sbt_or_sigfile)
+            error("\nfile '{}' does not exist", sbt_or_sigfile)
             sys.exit(-1)

     notify(' '*79, end='\r')
     notify('loaded {} signatures and {} databases total.'.format(n_signatures,
                                                                  n_databases))
+    if databases:
+        print('')
+
     return databases
diff --git a/tests/test-data/genome-s10+s11.fa.gz b/tests/test-data/genome-s10+s11.fa.gz
new file mode 100644
index 0000000000..53bd0b99b7
Binary files /dev/null and b/tests/test-data/genome-s10+s11.fa.gz differ
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index a161e1380b..86d5dc188e 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -1278,6 +1278,24 @@ def test_search_metagenome():
         assert '12 matches; showing first 3:' in out


+def test_search_metagenome_traverse():
+    with utils.TempDirectory() as location:
+        testdata_dir = utils.get_test_data('gather')
+
+        query_sig = utils.get_test_data('gather/combined.sig')
+
+        cmd = 'search {} {} -k 21 --traverse-directory'
+        cmd = cmd.format(query_sig, testdata_dir)
+        status, out, err = utils.runscript('sourmash', cmd.split(' '),
+                                           in_directory=location)
+
+        print(out)
+        print(err)
+
+        assert ' 33.2%  NC_003198.1 Salmonella enterica subsp. enterica serovar T...' in out
+        assert '13 matches; showing first 3:' in out
+
+
 def test_search_metagenome_downsample():
     with utils.TempDirectory() as location:
         testdata_glob = utils.get_test_data('gather/GCF*.sig')
@@ -1747,6 +1765,60 @@ def test_do_sourmash_sbt_search_bestonly():
     assert 'short.fa' in out


+def test_sbt_search_order_dependence():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('genome-s10.fa.gz')
+        testdata2 = utils.get_test_data('genome-s11.fa.gz')
+        testdata3 = utils.get_test_data('genome-s12.fa.gz')
+        testdata4 = utils.get_test_data('genome-s10+s11.fa.gz')
+
+        cmd = 'compute --scaled 10000 -k 21,31 {} {} {} {}'
+        cmd = cmd.format(testdata1, testdata2, testdata3, testdata4)
+
+        status, out, err = utils.runscript('sourmash', cmd.split(' '),
+                                           in_directory=location)
+
+        cmd = 'index -k 21 134 genome-s10+s11.fa.gz.sig genome-s11.fa.gz.sig genome-s12.fa.gz.sig'
+        status, out, err = utils.runscript('sourmash', cmd.split(' '),
+                                           in_directory=location)
+
+        cmd = 'search -k 21 genome-s11.fa.gz.sig 134 --best-only -k 21 --dna'
+        status, out, err = utils.runscript('sourmash', cmd.split(' '),
+                                           in_directory=location)
+
+        print(out)
+        print(err)
+        assert '100.0%' in out
+
+
+def test_sbt_search_order_dependence_2():
+    # *should* return the same result as test_sbt_search_order_dependence,
+    # but does not due to a bug.
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('genome-s10.fa.gz')
+        testdata2 = utils.get_test_data('genome-s11.fa.gz')
+        testdata3 = utils.get_test_data('genome-s12.fa.gz')
+        testdata4 = utils.get_test_data('genome-s10+s11.fa.gz')
+
+        cmd = 'compute --scaled 10000 -k 21,31 {} {} {} {}'
+        cmd = cmd.format(testdata1, testdata2, testdata3, testdata4)
+
+        status, out, err = utils.runscript('sourmash', cmd.split(' '),
+                                           in_directory=location)
+
+        cmd = 'index -k 21 314 genome-s11.fa.gz.sig genome-s10+s11.fa.gz.sig genome-s12.fa.gz.sig'
+        status, out, err = utils.runscript('sourmash', cmd.split(' '),
+                                           in_directory=location)
+
+        cmd = 'search -k 21 genome-s11.fa.gz.sig 314 --best-only --dna'
+        status, out, err = utils.runscript('sourmash', cmd.split(' '),
+                                           in_directory=location)
+
+        print(out)
+        print(err)
+        assert '100.0%' in out
+
+
 def test_compare_with_abundance_1():
     with utils.TempDirectory() as location:
         # create two signatures
@@ -2056,6 +2128,34 @@ def test_gather_metagenome():
         assert '4.7 Mbp      32.1%    1.5%      NC_011294.1 Salmonella enterica subsp...' in out


+def test_gather_metagenome_traverse():
+    with utils.TempDirectory() as location:
+        # set up a directory $location/gather that contains
+        # everything in the 'tests/test-data/gather' directory
+        # *except* the query sequence, which is 'combined.sig'.
+        testdata_dir = utils.get_test_data('gather')
+        copy_testdata = os.path.join(location, 'somesigs')
+        shutil.copytree(testdata_dir, copy_testdata)
+        os.unlink(os.path.join(copy_testdata, 'combined.sig'))
+
+        query_sig = utils.get_test_data('gather/combined.sig')
+
+        # now, feed in the new directory --
+        cmd = 'gather {} {} -k 21 --traverse-directory'
+        cmd = cmd.format(query_sig, copy_testdata)
+        status, out, err = utils.runscript('sourmash', cmd.split(' '),
+                                           in_directory=location)
+
+        print(cmd)
+        print(out)
+        print(err)
+
+        assert 'found 12 matches total' in out
+        assert 'the recovered matches hit 100.0% of the query' in out
+        assert '4.9 Mbp      33.2%  100.0%      NC_003198.1 Salmonella enterica subsp...' in out
+        assert '4.7 Mbp      32.1%    1.5%      NC_011294.1 Salmonella enterica subsp...' in out
+
+
 def test_gather_metagenome_output_unassigned():
     with utils.TempDirectory() as location:
         testdata_glob = utils.get_test_data('gather/GCF_000195995*g')
@@ -2085,7 +2185,7 @@ def test_gather_metagenome_output_unassigned():
         print(out)
         print(err)

-        assert '1.3 Mbp      13.6%   28.2%      NC_011294.1 Salmonella enterica subsp...' in out
+        assert '4.7 Mbp      32.1%  100.0%      NC_011294.1 Salmonella enterica subsp...' in out


 def test_gather_metagenome_downsample():
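
Note (not part of the patch): the new --traverse-directory flag added above is exercised by the tests in invocations of this shape, where the paths are placeholders and any directory containing .sig files will do::

    sourmash search query.sig /path/to/sigs/ -k 21 --traverse-directory
    sourmash gather query.sig /path/to/sigs/ -k 21 --traverse-directory

Without the flag, a directory argument would be rejected when signature loading fails; with it, load_sbts_and_sigs() walks the directory via traverse_find_sigs() and treats each .sig file found as a list-of-signatures database.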
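
With search_databases() factored out of the CLI, the same search can be driven as a library call. A minimal sketch under stated assumptions — the file names query.sig and db.sbt.json and the 'DNA' moltype string are placeholders, not part of this diff, and the call signatures are the ones introduced above::

    # hypothetical driver for the refactored search path; file names are
    # placeholders.  load_sbts_and_sigs() does the (sbt_or_siglist,
    # filename, is_sbt) packaging that search_databases() expects.
    import sourmash_lib.signature as sig
    import sourmash_lib.sourmash_args as sourmash_args
    from sourmash_lib.search import search_databases

    query = sig.load_one_signature('query.sig', ksize=21)
    databases = sourmash_args.load_sbts_and_sigs(['db.sbt.json'],
                                                 query_ksize=21,
                                                 query_moltype='DNA')

    # SearchResult is the namedtuple defined in sourmash_lib/search.py.
    results = search_databases(query, databases, threshold=0.08,
                               do_containment=False, best_only=False)
    for sr in results:
        print('{:.3f}  {}'.format(sr.similarity, sr.name))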
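
For reference, the internal-node scoring change in SearchMinHashesFindBest.search reduces to the arithmetic below (toy numbers, not taken from this diff). Instead of dividing bloom-filter matches by the query size, the score now divides by max_n_below, the size of the largest signature stored beneath the node; the two order-dependence tests above exercise this behavior::

    # toy numbers: a query with 100 hashes, 60 of which are present in an
    # internal node's Nodegraph; the largest signature below that node
    # has 500 hashes, recorded during indexing as metadata['max_n_below'].
    matches = 60                            # query hashes found in node.data
    max_n_below = 500                       # node.metadata['max_n_below']
    score = float(matches) / max_n_below    # 0.12, compared to the threshold
    # subtrees whose score falls below the search threshold are not descended.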