Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MRG: add tests for RocksDB/RevIndex, standalone manifests, and flexible pathlists #436

Merged
merged 12 commits into from
Aug 24, 2024
13 changes: 13 additions & 0 deletions src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,18 @@ pub fn index<P: AsRef<Path>>(

let collection = match siglist {
x if x.ends_with(".zip") => Collection::from_zipfile(x)?,
x if x.ends_with(".sig") || x.ends_with(".sig.gz") => {
let signatures = Signature::from_path(&x)
.with_context(|| format!("Failed to load signatures from: '{}'", x))?;

let coll = Collection::from_sigs(signatures).with_context(|| {
format!(
"Loaded signatures but failed to load as collection: '{}'",
x
)
})?;
coll
}
_ => {
let file = File::open(siglist.clone())
.with_context(|| format!("Failed to open pathlist file: '{}'", siglist))?;
Expand Down Expand Up @@ -59,6 +71,7 @@ pub fn index<P: AsRef<Path>>(
if collection.is_empty() {
Err(anyhow::anyhow!("Signatures failed to load. Exiting.").into())
} else {
eprintln!("Indexing {} sketches.", collection.len());
let mut index = RevIndex::create(output.as_ref(), collection, colors)?;

if use_internal_storage {
Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,8 @@ fn do_multisearch(
estimate_ani: bool,
output_path: Option<String>,
) -> anyhow::Result<u8> {
let _ = env_logger::try_init();

let selection = build_selection(ksize, scaled, &moltype);
let allow_failed_sigpaths = true;

Expand Down
8 changes: 8 additions & 0 deletions src/python/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,11 @@ def zip_against(request):
@pytest.fixture(params=[True, False])
def indexed(request):
return request.param

# Parametrized boolean fixture: a test that requests `indexed_query` is run
# once with True and once with False (the value comes from request.param).
# Tests in this changeset branch on it to decide whether to build an index
# from the query input before running the command under test.
@pytest.fixture(params=[True, False])
def indexed_query(request):
return request.param

# Parametrized boolean fixture: a test that requests `indexed_against` is run
# once with True and once with False (the value comes from request.param).
# Tests in this changeset branch on it to decide whether to build an index
# from the "against" collection before running the command under test.
@pytest.fixture(params=[True, False])
def indexed_against(request):
return request.param
9 changes: 9 additions & 0 deletions src/python/tests/sourmash_tst_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,15 @@ def zip_siglist(runtmp, siglist, db):
return db


# Build an index from `siglist` by invoking
# `runtmp.sourmash('scripts', 'index', ...)`, writing to `db`.
#
# Parameters:
#   runtmp  -- test harness object; only its `.sourmash(...)` method is used.
#   siglist -- input passed straight through as the positional argument.
#   db      -- output location, passed after '-o'.
#   ksize, scaled -- keyword-only; converted with str() and passed after
#                    '-k' and '--scaled' respectively.
#   moltype -- keyword-only; passed as given after '--moltype'.
#   toggle_internal_storage -- keyword-only; appended verbatim as the final
#                    command-line argument (default '--internal-storage').
#
# Returns `db` unchanged, so callers can substitute it for the original input.
def index_siglist(runtmp, siglist, db, *, ksize=31, scaled=1000, moltype='DNA',
toggle_internal_storage='--internal-storage'):
# build index
runtmp.sourmash('scripts', 'index', siglist,
'-o', db, '-k', str(ksize), '--scaled', str(scaled),
'--moltype', moltype, toggle_internal_storage)
return db


def scriptpath(scriptname='sourmash'):
"""Return the path to the scripts, in both dev and install situations."""
# note - it doesn't matter what the scriptname is here, as long as
Expand Down
Binary file added src/python/tests/test-data/2.sig.zip
Binary file not shown.
Binary file added src/python/tests/test-data/47.sig.zip
Binary file not shown.
Binary file added src/python/tests/test-data/63.sig.zip
Binary file not shown.
59 changes: 56 additions & 3 deletions src/python/tests/test_fastgather.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

import sourmash
from . import sourmash_tst_utils as utils
from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist)
from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist,
index_siglist)


def test_installed(runtmp):
Expand All @@ -14,7 +15,7 @@ def test_installed(runtmp):
assert 'usage: fastgather' in runtmp.last_result.err


def test_simple(runtmp, zip_against):
def test_simple(runtmp, capfd, indexed_query, indexed_against, zip_against, toggle_internal_storage):
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')
Expand All @@ -25,23 +26,40 @@ def test_simple(runtmp, zip_against):

make_file_list(against_list, [sig2, sig47, sig63])

if indexed_query:
query = index_siglist(runtmp, query, runtmp.output('query'),
scaled=100000)

if zip_against:
against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))

if indexed_against:
against_list = index_siglist(runtmp, against_list, runtmp.output('db'),
toggle_internal_storage=toggle_internal_storage)

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')

runtmp.sourmash('scripts', 'fastgather', query, against_list,
'-o', g_output, '-s', '100000')
assert os.path.exists(g_output)

captured = capfd.readouterr()
print(captured.err)

df = pandas.read_csv(g_output)
assert len(df) == 3
keys = set(df.keys())
assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys)

# CTB note: we do not need to worry about this warning for query from a
# RocksDB, since there is only one.
if indexed_against:
print('indexed against:', indexed_against)
assert "WARNING: loading all sketches from a RocksDB into memory!" in captured.err

def test_simple_with_prefetch(runtmp, zip_against):

def test_simple_with_prefetch(runtmp, zip_against, indexed, toggle_internal_storage):
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')
Expand All @@ -55,6 +73,41 @@ def test_simple_with_prefetch(runtmp, zip_against):
if zip_against:
against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))

if indexed:
against_list = index_siglist(runtmp, against_list, runtmp.output('db'),
toggle_internal_storage=toggle_internal_storage)

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')

runtmp.sourmash('scripts', 'fastgather', query, against_list,
'-o', g_output, '--output-prefetch', p_output,
'-s', '100000')
assert os.path.exists(g_output)
assert os.path.exists(p_output)

df = pandas.read_csv(g_output)
assert len(df) == 3
keys = set(df.keys())
assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys)

df = pandas.read_csv(p_output)
assert len(df) == 3
keys = set(df.keys())
assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'}


def test_simple_with_prefetch_list_of_zips(runtmp):
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')

sig2 = get_test_data('2.sig.zip')
sig47 = get_test_data('47.sig.zip')
sig63 = get_test_data('63.sig.zip')

make_file_list(against_list, [sig2, sig47, sig63])

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')

Expand Down
53 changes: 43 additions & 10 deletions src/python/tests/test_fastmultigather.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,8 @@

import sourmash
from . import sourmash_tst_utils as utils
from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist)


# NOTE(review): the hunk header above (-8,16 +8,8) indicates these lines are
# the removed side of the diff; the same helper is now imported from
# sourmash_tst_utils instead of being defined locally.
#
# Runs `sourmash scripts index` on `siglist` via runtmp.sourmash, writing to
# `db`, and returns `db`. ksize/scaled are stringified; moltype and
# toggle_internal_storage are passed as given (all four are keyword-only).
def index_siglist(runtmp, siglist, db, *, ksize=31, scaled=1000, moltype='DNA',
toggle_internal_storage='--internal-storage'):
# build index
runtmp.sourmash('scripts', 'index', siglist,
'-o', db, '-k', str(ksize), '--scaled', str(scaled),
'--moltype', moltype, toggle_internal_storage)
return db
from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist,
index_siglist)


def test_installed(runtmp):
Expand Down Expand Up @@ -71,6 +63,47 @@ def test_simple(runtmp, zip_against):
assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp', 'gather_result_rank'}.issubset(keys)


# Runs `fastmultigather` with a pathlist query and a pathlist of three
# .sig.zip files as the "against" collection, then checks that both the
# prefetch and gather CSVs for the query exist, have 3 rows each, and carry
# the expected columns.
def test_simple_list_of_zips(runtmp):
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
sig2 = get_test_data('2.sig.zip')
sig47 = get_test_data('47.sig.zip')
sig63 = get_test_data('63.sig.zip')

# both inputs are given as text pathlists written into the temp location
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')

make_file_list(query_list, [query])
make_file_list(against_list, [sig2, sig47, sig63])

# run from inside the runtmp output directory and always restore the
# original working directory afterwards, even if the command fails.
# NOTE(review): presumably fastmultigather writes its per-query CSVs to the
# current directory, which is why the outputs are looked up there below --
# confirm against the command's implementation.
cwd = os.getcwd()
try:
os.chdir(runtmp.output(''))
runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list,
'-s', '100000', '-t', '0')
finally:
os.chdir(cwd)

# debugging aid: show what was written to the output directory
print(os.listdir(runtmp.output('')))

g_output = runtmp.output('SRR606249.gather.csv')
p_output = runtmp.output('SRR606249.prefetch.csv')
assert os.path.exists(p_output)

# check prefetch output (only non-indexed gather)
# the prefetch CSV must have exactly this column set
df = pandas.read_csv(p_output)
assert len(df) == 3
keys = set(df.keys())
assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'}

# the gather CSV may have extra columns; only require these to be present
assert os.path.exists(g_output)
df = pandas.read_csv(g_output)
print(df)
assert len(df) == 3
keys = set(df.keys())
assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp', 'gather_result_rank'}.issubset(keys)


def test_simple_space_in_signame(runtmp):
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
Expand Down
71 changes: 60 additions & 11 deletions src/python/tests/test_manysearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import sourmash

from . import sourmash_tst_utils as utils
from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist)
from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist,
index_siglist)


def test_installed(runtmp):
Expand All @@ -14,13 +15,6 @@ def test_installed(runtmp):
assert 'usage: manysearch' in runtmp.last_result.err


# NOTE(review): the hunk header above (-14,13 +15,6) indicates these lines are
# the removed side of the diff; the helper is now imported from
# sourmash_tst_utils instead of being defined locally.
#
# Runs `sourmash scripts index` on `siglist` via runtmp.sourmash, writing to
# `db`, and returns `db`. ksize/scaled are stringified; moltype is passed as
# given. Unlike the shared helper, this variant takes its options
# positionally-or-by-keyword and passes no internal-storage flag.
def index_siglist(runtmp, siglist, db, ksize=31, scaled=1000, moltype='DNA'):
# build index
runtmp.sourmash('scripts', 'index', siglist,
'-o', db, '-k', str(ksize), '--scaled', str(scaled),
'--moltype', moltype)
return db

def test_simple(runtmp, zip_query, zip_against):
# test basic execution!
query_list = runtmp.output('query.txt')
Expand Down Expand Up @@ -176,7 +170,7 @@ def test_simple_abund(runtmp):
assert total_weighted_hashes == 73489


def test_simple_indexed(runtmp, zip_query):
def test_simple_indexed(runtmp, zip_query, indexed_query):
# test basic execution!
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand All @@ -188,12 +182,67 @@ def test_simple_indexed(runtmp, zip_query):
make_file_list(query_list, [sig2, sig47, sig63])
make_file_list(against_list, [sig2, sig47, sig63])

if zip_query:
query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))

if indexed_query:
query_list = index_siglist(runtmp, query_list, runtmp.output('query_db'))

output = runtmp.output('out.csv')

against_list = index_siglist(runtmp, against_list, runtmp.output('db'))

if zip_query:
query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))
print('query_list is:', query_list)
runtmp.sourmash('scripts', 'manysearch', query_list, against_list,
'-o', output, '-t', '0.01')
assert os.path.exists(output)

df = pandas.read_csv(output)
assert len(df) == 5

dd = df.to_dict(orient='index')
print(dd)

for idx, row in dd.items():
# identical?
if row['match_name'] == row['query_name']:
assert float(row['containment'] == 1.0)
assert float(row['query_containment_ani'] == 1.0)
else:
# confirm hand-checked numbers
q = row['query_name'].split()[0]
m = row['match_name'].split()[0]
cont = float(row['containment'])
intersect_hashes = int(row['intersect_hashes'])
query_ani = float(row['query_containment_ani'])
cont = round(cont, 4)
query_ani = round(query_ani, 4)
print(q, m, f"{cont:.04}", f"{query_ani:.04}")

if q == 'NC_011665.1' and m == 'NC_009661.1':
assert cont == 0.4828
assert intersect_hashes == 2529
assert query_ani == 0.9768

if q == 'NC_009661.1' and m == 'NC_011665.1':
assert cont == 0.4885
assert intersect_hashes == 2529
assert query_ani == 0.9772


def test_simple_list_of_zips(runtmp):
# test basic execution!
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')

sig2 = get_test_data('2.sig.zip')
sig47 = get_test_data('47.sig.zip')
sig63 = get_test_data('63.sig.zip')

make_file_list(query_list, [sig2, sig47, sig63])
make_file_list(against_list, [sig2, sig47, sig63])

output = runtmp.output('out.csv')

runtmp.sourmash('scripts', 'manysearch', query_list, against_list,
'-o', output, '-t', '0.01')
Expand Down
Loading