Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Fix problem where tree search is truncated incorrectly. #244

Merged
merged 35 commits into from
Oct 25, 2017
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
4a39670
add (failing) test for --best-only and similarity calculation
ctb May 20, 2017
cb13a76
some simple Python API improvements + tests
ctb Sep 17, 2017
54a4776
support MinHash(..., scaled=val)
ctb Sep 17, 2017
afc5473
pass tests with new scaled convenience functions
ctb Sep 17, 2017
e0f5eb4
reorder scaled args
ctb Sep 17, 2017
149b251
Merge branch 'master' into api/improvements
ctb Sep 17, 2017
cf663bd
Merge branch 'master' into api/improvements
ctb Sep 18, 2017
1140573
Merge remote-tracking branch 'origin/master' into bug/sbt_similarity
ctb Sep 19, 2017
44731ba
update tests after merge
ctb Sep 19, 2017
5b16c2a
remove unnecessary imports, refactor search a bit
ctb Sep 20, 2017
4953487
fix SBT internal nodes to have metadata; added max_n_below to metadata
ctb Sep 20, 2017
7e13c99
move search mechanics into sourmash_lib.search
ctb Sep 20, 2017
74dfa93
move gather code into sourmash_lib.search
ctb Sep 20, 2017
32aab99
ugly fix to get --output-unassigned working!
ctb Sep 20, 2017
e984749
rm whitespace
ctb Sep 20, 2017
af9702d
add in --traverse-directory to search and gather, along with tests
ctb Sep 20, 2017
9a49b8a
add -U to pip install
ctb Sep 20, 2017
f4dfef6
Merge branch 'master' into api/improvements
ctb Sep 20, 2017
1f21ec4
add --randomize to sourmash compute
ctb Sep 20, 2017
652bca2
add a nicer sbt API
ctb Sep 20, 2017
b9340a9
Fixed a bug when loading minhashes with track_abundance, and a bunch …
luizirber Sep 20, 2017
fb39d72
Merge branch 'master' of github.com:dib-lab/sourmash into api/improve…
ctb Sep 20, 2017
f2cf8cd
Merge branch 'api/improvements' of github.com:dib-lab/sourmash into a…
ctb Sep 20, 2017
12269a0
clean up a test failure from merge
ctb Sep 20, 2017
28e0587
Merge branch 'master' of github.com:dib-lab/sourmash into api/improve…
ctb Sep 20, 2017
6748c6a
Merge branch 'master' of github.com:dib-lab/sourmash into api/improve…
ctb Sep 20, 2017
c0887ee
Merge branch 'master' of github.com:dib-lab/sourmash into bug/sbt_sim…
ctb Sep 20, 2017
3f0bfab
Merge branch 'api/improvements' of github.com:dib-lab/sourmash into b…
ctb Sep 20, 2017
aa2f612
update tests
ctb Sep 20, 2017
29e0d23
Merge branch 'master' of github.com:dib-lab/sourmash into bug/sbt_sim…
ctb Sep 21, 2017
1abcb06
Merge branch 'master' into bug/sbt_similarity
ctb Oct 1, 2017
208f714
Merge branch 'master' of github.com:dib-lab/sourmash into bug/sbt_sim…
ctb Oct 25, 2017
d655ec3
fix tests after merge
ctb Oct 25, 2017
bff34c3
add missing test file
ctb Oct 25, 2017
4322140
Fix division problem
luizirber Oct 25, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 33 additions & 162 deletions sourmash_lib/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -581,7 +581,8 @@ def sbt_combine(args):


def index(args):
import sourmash_lib.sbt
import sourmash_lib.sbtmh

parser = argparse.ArgumentParser()
parser.add_argument('sbt_name', help='name to save SBT into')
parser.add_argument('signatures', nargs='+',
Expand Down Expand Up @@ -650,12 +651,14 @@ def index(args):


def search(args):
from sourmash_lib.sbtmh import search_minhashes, SearchMinHashesFindBest
from .search import search_databases

parser = argparse.ArgumentParser()
parser.add_argument('query', help='query signature')
parser.add_argument('databases', help='signatures/SBTs to search',
nargs='+')
parser.add_argument('--traverse-directory', action='store_true',
help='search all signatures underneath directories.')
parser.add_argument('-q', '--quiet', action='store_true',
help='suppress non-error output')
parser.add_argument('--threshold', default=0.08, type=float,
Expand Down Expand Up @@ -700,66 +703,24 @@ def search(args):
query.minhash.scaled, int(args.scaled))
query.minhash = query.minhash.downsample_scaled(args.scaled)

# set up the search function(s)
search_fn = search_minhashes

# similarity vs containment
query_similarity = lambda x: query.similarity(x, downsample=True)
if args.containment:
query_similarity = lambda x: query.contained_by(x, downsample=True)

# set up the search databases
databases = sourmash_args.load_sbts_and_sigs(args.databases,
query_ksize, query_moltype)
query_ksize, query_moltype,
args.traverse_directory)

if not len(databases):
error('Nothing found to search!')
sys.exit(-1)

# collect results across all the trees
SearchResult = namedtuple('SearchResult',
'similarity, match_sig, md5, filename, name')
results = []
found_md5 = set()
for (sbt_or_siglist, filename, is_sbt) in databases:
if args.best_only:
search_fn = sourmash_lib.sbtmh.SearchMinHashesFindBest().search

if is_sbt:
tree = sbt_or_siglist
notify('Searching SBT {}', filename)
for leaf in tree.find(search_fn, query, args.threshold):
similarity = query_similarity(leaf.data)
if similarity >= args.threshold and \
leaf.data.md5sum() not in found_md5:
sr = SearchResult(similarity=similarity,
match_sig=leaf.data,
md5=leaf.data.md5sum(),
filename=filename,
name=leaf.data.name())
found_md5.add(sr.md5)
results.append(sr)

else: # list of signatures
for ss in sbt_or_siglist:
similarity = query_similarity(ss)
if similarity >= args.threshold and \
ss.md5sum() not in found_md5:
sr = SearchResult(similarity=similarity,
match_sig=ss,
md5=ss.md5sum(),
filename=filename,
name=ss.name())
found_md5.add(sr.md5)
results.append(sr)

# sort results on similarity (reverse)
results.sort(key=lambda x: -x.similarity)
# do the actual search
results = search_databases(query, databases,
args.threshold, args.containment,
args.best_only)

n_matches = len(results)
if args.best_only:
notify("(truncated search because of --best-only; only trust top result")
args.num_results = 1

n_matches = len(results)
if n_matches <= args.num_results:
print_results('{} matches:'.format(len(results)))
else:
Expand All @@ -775,6 +736,9 @@ def search(args):
name = sr.match_sig._display_name(60)
print_results('{:>6} {}', pct, name)

if args.best_only:
notify("** reporting only one match because --best-only was set")

if args.output:
fieldnames = ['similarity', 'name', 'filename', 'md5']
w = csv.DictWriter(args.output, fieldnames=fieldnames)
Expand Down Expand Up @@ -868,10 +832,14 @@ def categorize(args):


def gather(args):
from .search import gather_databases

parser = argparse.ArgumentParser()
parser.add_argument('query', help='query signature')
parser.add_argument('databases', help='signatures/SBTs to search',
nargs='+')
parser.add_argument('--traverse-directory', action='store_true',
help='search all signatures underneath directories.')
parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
help='output CSV containing matches to this file')
parser.add_argument('--save-matches', type=argparse.FileType('wt'),
Expand Down Expand Up @@ -920,54 +888,14 @@ def gather(args):

# set up the search databases
databases = sourmash_args.load_sbts_and_sigs(args.databases,
query_ksize, query_moltype)
query_ksize, query_moltype,
args.traverse_directory)

if not len(databases):
error('Nothing found to search!')
sys.exit(-1)

orig_query = query
orig_mins = orig_query.minhash.get_hashes()

# calculate the band size/resolution R for the genome
R_metagenome = orig_query.minhash.scaled

# define a function to do a 'best' search and get only top match.
def find_best(dblist, query):
results = []
for (sbt_or_siglist, filename, is_sbt) in dblist:
search_fn = sourmash_lib.sbtmh.SearchMinHashesFindBestIgnoreMaxHash().search

if is_sbt:
tree = sbt_or_siglist

for leaf in tree.find(search_fn, query, 0.0):
leaf_e = leaf.data.minhash
similarity = query.minhash.similarity_ignore_maxhash(leaf_e)
if similarity > 0.0:
results.append((similarity, leaf.data))
else:
for ss in sbt_or_siglist:
similarity = query.minhash.similarity_ignore_maxhash(ss.minhash)
if similarity > 0.0:
results.append((similarity, ss))

if not results:
return None, None, None

# take the best result
results.sort(key=lambda x: -x[0]) # reverse sort on similarity
best_similarity, best_leaf = results[0]
return best_similarity, best_leaf, filename


# define a function to build new signature object from set of mins
def build_new_signature(mins, template_sig):
e = template_sig.minhash.copy_and_clear()
e.add_many(mins)
return sig.SourmashSignature('', e)

# xxx
# pretty-printing code.
def format_bp(bp):
bp = float(bp)
if bp < 500:
Expand All @@ -980,76 +908,21 @@ def format_bp(bp):
return '{:.1f} Gbp'.format(round(bp / 1e9, 1))
return '???'

# construct a new query that doesn't have the max_hash attribute set.
new_mins = query.minhash.get_hashes()
query = build_new_signature(new_mins, orig_query)

sum_found = 0.
found = []
GatherResult = namedtuple('GatherResult',
'intersect_bp, f_orig_query, f_match, f_unique_to_query, filename, name, md5, leaf')
while 1:
best_similarity, best_leaf, filename = find_best(databases, query)
if not best_leaf: # no matches at all!
break

# subtract found hashes from search hashes, construct new search
query_mins = set(query.minhash.get_hashes())
found_mins = best_leaf.minhash.get_hashes()

# figure out what the resolution of the banding on the genome is,
# based either on an explicit --scaled parameter, or on genome
# cardinality (deprecated)
if not best_leaf.minhash.max_hash:
error('Best hash match in sbt_gather has no max_hash')
error('Please prepare database of sequences with --scaled')
sys.exit(-1)

R_genome = best_leaf.minhash.scaled

# pick the highest R / lowest resolution
R_comparison = max(R_metagenome, R_genome)

# CTB: these could probably be replaced by minhash.downsample_scaled.
new_max_hash = sourmash_lib.MAX_HASH / float(R_comparison)
query_mins = set([ i for i in query_mins if i < new_max_hash ])
found_mins = set([ i for i in found_mins if i < new_max_hash ])
orig_mins = set([ i for i in orig_mins if i < new_max_hash ])

# calculate intersection:
intersect_mins = query_mins.intersection(found_mins)
intersect_orig_mins = orig_mins.intersection(found_mins)
intersect_bp = R_comparison * len(intersect_orig_mins)
sum_found += len(intersect_mins)

if intersect_bp < args.threshold_bp: # hard cutoff for now
notify('found less than {} in common. => exiting',
format_bp(intersect_bp))
break

# calculate fractions wrt first denominator - genome size
genome_n_mins = len(found_mins)
f_match = len(intersect_mins) / float(genome_n_mins)
f_orig_query = len(intersect_orig_mins) / float(len(orig_mins))
sum_found = 0
for result, n_intersect_mins, new_max_hash, next_query in gather_databases(query, databases,
args.threshold_bp):
# print interim result & save in a list for later use
pct_query = '{:.1f}%'.format(result.f_orig_query*100)
pct_genome = '{:.1f}%'.format(result.f_match*100)

# calculate fractions wrt second denominator - metagenome size
query_n_mins = len(orig_query.minhash.get_hashes())
f_unique_to_query = len(intersect_mins) / float(query_n_mins)
name = result.leaf._display_name(40)

if not len(found): # first result? print header.
print_results("")
print_results("overlap p_query p_match ")
print_results("--------- ------- --------")

result = GatherResult(intersect_bp=intersect_bp,
f_orig_query=f_orig_query,
f_match=f_match,
f_unique_to_query=f_unique_to_query,
filename=filename,
md5=best_leaf.md5sum(),
name=best_leaf.name(),
leaf=best_leaf)

# print interim result & save in a list for later use
pct_query = '{:.1f}%'.format(result.f_orig_query*100)
pct_genome = '{:.1f}%'.format(result.f_match*100)
Expand All @@ -1059,16 +932,14 @@ def format_bp(bp):
print_results('{:9} {:>6} {:>6} {}',
format_bp(result.intersect_bp), pct_query, pct_genome,
name)
sum_found += n_intersect_mins
found.append(result)

# construct a new query, minus the previous one.
query_mins -= set(found_mins)
query = build_new_signature(query_mins, orig_query)

# basic reporting
print_results('\nfound {} matches total;', len(found))

sum_found /= len(orig_query.minhash.get_hashes())
sum_found /= len(query.minhash.get_hashes())
print_results('the recovered matches hit {:.1f}% of the query',
sum_found * 100)
print_results('')
Expand Down Expand Up @@ -1102,7 +973,7 @@ def format_bp(bp):

e = sourmash_lib.MinHash(ksize=query_ksize, n=0,
max_hash=new_max_hash)
e.add_many(query.minhash.get_mins())
e.add_many(next_query.minhash.get_mins())
sig.save_signatures([ sig.SourmashSignature('', e) ],
args.output_unassigned)

Expand Down
12 changes: 6 additions & 6 deletions sourmash_lib/sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@
"""
A trial implementation of sequence bloom trees, Solomon & Kingsford, 2015.

This is a simple in-memory version where all of the graphs are in
memory at once; to move it onto disk, the graphs would need to be
dynamically loaded for each query.

To try it out, do::

factory = GraphFactory(ksize, tablesizes, n_tables)
Expand Down Expand Up @@ -188,8 +184,7 @@ def save(self, tag):
'.'.join([basetag, basename, 'sbt'])),
'name': node.name
}
if isinstance(node, Leaf):
data['metadata'] = node.metadata
data['metadata'] = node.metadata

node.save(os.path.join(dirprefix, data['filename']))
structure[i] = data
Expand Down Expand Up @@ -381,6 +376,7 @@ def __init__(self, factory, name=None, fullpath=None):
self._factory = factory
self._data = None
self._filename = fullpath
self.metadata = dict()

def __str__(self):
return '*Node:{name} [occupied: {nb}, fpr: {fpr:.2}]'.format(
Expand All @@ -407,10 +403,14 @@ def data(self, new_data):
def load(info, dirname):
filename = os.path.join(dirname, info['filename'])
new_node = Node(info['factory'], name=info['name'], fullpath=filename)
new_node.metadata = info.get('metadata', {})
return new_node

def update(self, parent):
parent.data.update(self.data)
max_n_below = max(parent.metadata.get('max_n_below', 0),
self.metadata.get('max_n_below'))
parent.metadata['max_n_below'] = max_n_below


class Leaf(object):
Expand Down
Loading