Skip to content

Commit

Permalink
MRG: add tests for RocksDB/RevIndex, standalone manifests, and flexible pathlists (#436)
Browse files Browse the repository at this point in the history

* test using rocksdb as source of sketches

* test file lists of zips

* cargo fmt

* hackity hack hack a picklist

* ok that makes more sense

* it works

* comments around future par_iter

* support loading from a .sig.gz for index

* test pairwise loading from rocksdb

* add test for queries from Rocksdb

* decide not to implement lists of manifests :)
  • Loading branch information
ctb authored Aug 24, 2024
1 parent b3e5b81 commit 97db857
Show file tree
Hide file tree
Showing 14 changed files with 414 additions and 38 deletions.
13 changes: 13 additions & 0 deletions src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,18 @@ pub fn index<P: AsRef<Path>>(

let collection = match siglist {
x if x.ends_with(".zip") => Collection::from_zipfile(x)?,
x if x.ends_with(".sig") || x.ends_with(".sig.gz") => {
let signatures = Signature::from_path(&x)
.with_context(|| format!("Failed to load signatures from: '{}'", x))?;

let coll = Collection::from_sigs(signatures).with_context(|| {
format!(
"Loaded signatures but failed to load as collection: '{}'",
x
)
})?;
coll
}
_ => {
let file = File::open(siglist.clone())
.with_context(|| format!("Failed to open pathlist file: '{}'", siglist))?;
Expand Down Expand Up @@ -59,6 +71,7 @@ pub fn index<P: AsRef<Path>>(
if collection.is_empty() {
Err(anyhow::anyhow!("Signatures failed to load. Exiting.").into())
} else {
eprintln!("Indexing {} sketches.", collection.len());
let mut index = RevIndex::create(output.as_ref(), collection, colors)?;

if use_internal_storage {
Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,8 @@ fn do_multisearch(
estimate_ani: bool,
output_path: Option<String>,
) -> anyhow::Result<u8> {
let _ = env_logger::try_init();

let selection = build_selection(ksize, scaled, &moltype);
let allow_failed_sigpaths = true;

Expand Down
8 changes: 8 additions & 0 deletions src/python/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,11 @@ def zip_against(request):
@pytest.fixture(params=[True, False])
def indexed(request):
    """Parametrized boolean fixture: yields ``True`` and then ``False``.

    A test that requests this fixture is run once per value; the value is
    meant to be used as an "also exercise the indexed variant" switch.
    """
    return request.param

@pytest.fixture(params=[True, False])
def indexed_query(request):
    """Parametrized boolean fixture: yields ``True`` and then ``False``.

    A test that requests this fixture is run once per value; the value is
    meant to tell the test whether to build an index from the query
    before running the command under test.
    """
    return request.param

@pytest.fixture(params=[True, False])
def indexed_against(request):
    """Parametrized boolean fixture: yields ``True`` and then ``False``.

    A test that requests this fixture is run once per value; the value is
    meant to tell the test whether to build an index from the "against"
    collection before running the command under test.
    """
    return request.param
9 changes: 9 additions & 0 deletions src/python/tests/sourmash_tst_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,15 @@ def zip_siglist(runtmp, siglist, db):
return db


def index_siglist(runtmp, siglist, db, *, ksize=31, scaled=1000, moltype='DNA',
                  toggle_internal_storage='--internal-storage'):
    """Build an index from `siglist` by running ``sourmash scripts index``.

    Parameters:
        runtmp: test harness object; only its ``sourmash(*args)`` method is used.
        siglist: input passed to the ``index`` command as its positional argument.
        db: output location, passed via ``-o``.
        ksize: k-mer size, passed via ``-k`` (stringified).
        scaled: scaled value, passed via ``--scaled`` (stringified).
        moltype: molecule type, passed via ``--moltype``.
        toggle_internal_storage: flag appended verbatim as the last
            command-line argument.

    Returns:
        `db`, unchanged, so the call can be used inline where the index
        location is needed.
    """
    # build index
    runtmp.sourmash('scripts', 'index', siglist,
                    '-o', db, '-k', str(ksize), '--scaled', str(scaled),
                    '--moltype', moltype, toggle_internal_storage)
    return db


def scriptpath(scriptname='sourmash'):
"""Return the path to the scripts, in both dev and install situations."""
# note - it doesn't matter what the scriptname is here, as long as
Expand Down
Binary file added src/python/tests/test-data/2.sig.zip
Binary file not shown.
Binary file added src/python/tests/test-data/47.sig.zip
Binary file not shown.
Binary file added src/python/tests/test-data/63.sig.zip
Binary file not shown.
59 changes: 56 additions & 3 deletions src/python/tests/test_fastgather.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

import sourmash
from . import sourmash_tst_utils as utils
from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist)
from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist,
index_siglist)


def test_installed(runtmp):
Expand All @@ -14,7 +15,7 @@ def test_installed(runtmp):
assert 'usage: fastgather' in runtmp.last_result.err


def test_simple(runtmp, zip_against):
def test_simple(runtmp, capfd, indexed_query, indexed_against, zip_against, toggle_internal_storage):
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')
Expand All @@ -25,23 +26,40 @@ def test_simple(runtmp, zip_against):

make_file_list(against_list, [sig2, sig47, sig63])

if indexed_query:
query = index_siglist(runtmp, query, runtmp.output('query'),
scaled=100000)

if zip_against:
against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))

if indexed_against:
against_list = index_siglist(runtmp, against_list, runtmp.output('db'),
toggle_internal_storage=toggle_internal_storage)

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')

runtmp.sourmash('scripts', 'fastgather', query, against_list,
'-o', g_output, '-s', '100000')
assert os.path.exists(g_output)

captured = capfd.readouterr()
print(captured.err)

df = pandas.read_csv(g_output)
assert len(df) == 3
keys = set(df.keys())
assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys)

# CTB note: we do not need to worry about this warning for query from a
# RocksDB, since there is only one.
if indexed_against:
print('indexed against:', indexed_against)
assert "WARNING: loading all sketches from a RocksDB into memory!" in captured.err

def test_simple_with_prefetch(runtmp, zip_against):

def test_simple_with_prefetch(runtmp, zip_against, indexed, toggle_internal_storage):
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')
Expand All @@ -55,6 +73,41 @@ def test_simple_with_prefetch(runtmp, zip_against):
if zip_against:
against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))

if indexed:
against_list = index_siglist(runtmp, against_list, runtmp.output('db'),
toggle_internal_storage=toggle_internal_storage)

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')

runtmp.sourmash('scripts', 'fastgather', query, against_list,
'-o', g_output, '--output-prefetch', p_output,
'-s', '100000')
assert os.path.exists(g_output)
assert os.path.exists(p_output)

df = pandas.read_csv(g_output)
assert len(df) == 3
keys = set(df.keys())
assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys)

df = pandas.read_csv(p_output)
assert len(df) == 3
keys = set(df.keys())
assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'}


def test_simple_with_prefetch_list_of_zips(runtmp):
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')

sig2 = get_test_data('2.sig.zip')
sig47 = get_test_data('47.sig.zip')
sig63 = get_test_data('63.sig.zip')

make_file_list(against_list, [sig2, sig47, sig63])

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')

Expand Down
53 changes: 43 additions & 10 deletions src/python/tests/test_fastmultigather.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,8 @@

import sourmash
from . import sourmash_tst_utils as utils
from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist)


def index_siglist(runtmp, siglist, db, *, ksize=31, scaled=1000, moltype='DNA',
                  toggle_internal_storage='--internal-storage'):
    """Build an index from `siglist` by running ``sourmash scripts index``.

    Only ``runtmp.sourmash(*args)`` is used. `siglist` is the positional
    input, `db` goes to ``-o``, `ksize` to ``-k``, `scaled` to ``--scaled``,
    `moltype` to ``--moltype``, and `toggle_internal_storage` is appended
    verbatim as the last argument. Returns `db` unchanged.
    """
    # build index
    runtmp.sourmash('scripts', 'index', siglist,
                    '-o', db, '-k', str(ksize), '--scaled', str(scaled),
                    '--moltype', moltype, toggle_internal_storage)
    return db
from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist,
index_siglist)


def test_installed(runtmp):
Expand Down Expand Up @@ -71,6 +63,47 @@ def test_simple(runtmp, zip_against):
assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp', 'gather_result_rank'}.issubset(keys)


def test_simple_list_of_zips(runtmp):
    """fastmultigather runs when the 'against' pathlist names .sig.zip files."""
    # test basic execution!
    query = get_test_data('SRR606249.sig.gz')
    sig2 = get_test_data('2.sig.zip')
    sig47 = get_test_data('47.sig.zip')
    sig63 = get_test_data('63.sig.zip')

    query_list = runtmp.output('query.txt')
    against_list = runtmp.output('against.txt')

    make_file_list(query_list, [query])
    make_file_list(against_list, [sig2, sig47, sig63])

    # Run from inside the temp directory and always restore the previous
    # working directory, even if the command fails. The per-query CSVs are
    # looked up under runtmp.output('') below, so presumably the command
    # writes them into the current directory.
    cwd = os.getcwd()
    try:
        os.chdir(runtmp.output(''))
        runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list,
                        '-s', '100000', '-t', '0')
    finally:
        os.chdir(cwd)

    print(os.listdir(runtmp.output('')))

    g_output = runtmp.output('SRR606249.gather.csv')
    p_output = runtmp.output('SRR606249.prefetch.csv')
    assert os.path.exists(p_output)

    # check prefetch output (only non-indexed gather)
    df = pandas.read_csv(p_output)
    assert len(df) == 3
    keys = set(df.keys())
    assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'}

    # gather output: the listed columns must be present; extras are allowed
    assert os.path.exists(g_output)
    df = pandas.read_csv(g_output)
    print(df)
    assert len(df) == 3
    keys = set(df.keys())
    assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp', 'gather_result_rank'}.issubset(keys)


def test_simple_space_in_signame(runtmp):
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
Expand Down
71 changes: 60 additions & 11 deletions src/python/tests/test_manysearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import sourmash

from . import sourmash_tst_utils as utils
from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist)
from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist,
index_siglist)


def test_installed(runtmp):
Expand All @@ -14,13 +15,6 @@ def test_installed(runtmp):
assert 'usage: manysearch' in runtmp.last_result.err


def index_siglist(runtmp, siglist, db, ksize=31, scaled=1000, moltype='DNA'):
    """Build an index from `siglist` by running ``sourmash scripts index``.

    Only ``runtmp.sourmash(*args)`` is used. `siglist` is the positional
    input, `db` goes to ``-o``, `ksize` to ``-k``, `scaled` to ``--scaled``
    and `moltype` to ``--moltype``. Returns `db` unchanged.
    """
    # build index
    runtmp.sourmash('scripts', 'index', siglist,
                    '-o', db, '-k', str(ksize), '--scaled', str(scaled),
                    '--moltype', moltype)
    return db

def test_simple(runtmp, zip_query, zip_against):
# test basic execution!
query_list = runtmp.output('query.txt')
Expand Down Expand Up @@ -176,7 +170,7 @@ def test_simple_abund(runtmp):
assert total_weighted_hashes == 73489


def test_simple_indexed(runtmp, zip_query):
def test_simple_indexed(runtmp, zip_query, indexed_query):
# test basic execution!
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand All @@ -188,12 +182,67 @@ def test_simple_indexed(runtmp, zip_query):
make_file_list(query_list, [sig2, sig47, sig63])
make_file_list(against_list, [sig2, sig47, sig63])

if zip_query:
query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))

if indexed_query:
query_list = index_siglist(runtmp, query_list, runtmp.output('query_db'))

output = runtmp.output('out.csv')

against_list = index_siglist(runtmp, against_list, runtmp.output('db'))

if zip_query:
query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))
print('query_list is:', query_list)
runtmp.sourmash('scripts', 'manysearch', query_list, against_list,
'-o', output, '-t', '0.01')
assert os.path.exists(output)

df = pandas.read_csv(output)
assert len(df) == 5

dd = df.to_dict(orient='index')
print(dd)

for idx, row in dd.items():
# identical?
if row['match_name'] == row['query_name']:
assert float(row['containment'] == 1.0)
assert float(row['query_containment_ani'] == 1.0)
else:
# confirm hand-checked numbers
q = row['query_name'].split()[0]
m = row['match_name'].split()[0]
cont = float(row['containment'])
intersect_hashes = int(row['intersect_hashes'])
query_ani = float(row['query_containment_ani'])
cont = round(cont, 4)
query_ani = round(query_ani, 4)
print(q, m, f"{cont:.04}", f"{query_ani:.04}")

if q == 'NC_011665.1' and m == 'NC_009661.1':
assert cont == 0.4828
assert intersect_hashes == 2529
assert query_ani == 0.9768

if q == 'NC_009661.1' and m == 'NC_011665.1':
assert cont == 0.4885
assert intersect_hashes == 2529
assert query_ani == 0.9772


def test_simple_list_of_zips(runtmp):
# test basic execution!
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')

sig2 = get_test_data('2.sig.zip')
sig47 = get_test_data('47.sig.zip')
sig63 = get_test_data('63.sig.zip')

make_file_list(query_list, [sig2, sig47, sig63])
make_file_list(against_list, [sig2, sig47, sig63])

output = runtmp.output('out.csv')

runtmp.sourmash('scripts', 'manysearch', query_list, against_list,
'-o', output, '-t', '0.01')
Expand Down
Loading

0 comments on commit 97db857

Please sign in to comment.