MRG: add tests for RocksDB/RevIndex, standalone manifests, and flexible pathlists (#436)

* test using rocksdb as source of sketches
* test file lists of zips
* cargo fmt
* hackity hack hack a picklist
* ok that makes more sense
* it works
* comments around future par_iter
* support loading from a .sig.gz for index
* test pairwise loading from rocksdb
* add test for queries from Rocksdb
* decide not to implement lists of manifests :)
sourmash-bio · Aug 24, 2024 · 97db857 · 97db857
1 parent b3e5b81
commit 97db857
Show file tree

Hide file tree

Showing 14 changed files with 414 additions and 38 deletions.
diff --git a/src/index.rs b/src/index.rs
@@ -21,6 +21,18 @@ pub fn index<P: AsRef<Path>>(
 
     let collection = match siglist {
         x if x.ends_with(".zip") => Collection::from_zipfile(x)?,
+        x if x.ends_with(".sig") || x.ends_with(".sig.gz") => {
+            let signatures = Signature::from_path(&x)
+                .with_context(|| format!("Failed to load signatures from: '{}'", x))?;
+
+            let coll = Collection::from_sigs(signatures).with_context(|| {
+                format!(
+                    "Loaded signatures but failed to load as collection: '{}'",
+                    x
+                )
+            })?;
+            coll
+        }
         _ => {
             let file = File::open(siglist.clone())
                 .with_context(|| format!("Failed to open pathlist file: '{}'", siglist))?;
@@ -59,6 +71,7 @@ pub fn index<P: AsRef<Path>>(
     if collection.is_empty() {
         Err(anyhow::anyhow!("Signatures failed to load. Exiting.").into())
     } else {
+        eprintln!("Indexing {} sketches.", collection.len());
         let mut index = RevIndex::create(output.as_ref(), collection, colors)?;
 
         if use_internal_storage {

diff --git a/src/lib.rs b/src/lib.rs
@@ -234,6 +234,8 @@ fn do_multisearch(
     estimate_ani: bool,
     output_path: Option<String>,
 ) -> anyhow::Result<u8> {
+    let _ = env_logger::try_init();
+
     let selection = build_selection(ksize, scaled, &moltype);
     let allow_failed_sigpaths = true;
 

diff --git a/src/python/tests/conftest.py b/src/python/tests/conftest.py
@@ -27,3 +27,11 @@ def zip_against(request):
 @pytest.fixture(params=[True, False])
 def indexed(request):
     return request.param
+
+@pytest.fixture(params=[True, False])
+def indexed_query(request):
+    return request.param
+
+@pytest.fixture(params=[True, False])
+def indexed_against(request):
+    return request.param
diff --git a/src/python/tests/sourmash_tst_utils.py b/src/python/tests/sourmash_tst_utils.py
@@ -31,6 +31,15 @@ def zip_siglist(runtmp, siglist, db):
     return db
 
 
+def index_siglist(runtmp, siglist, db, *, ksize=31, scaled=1000, moltype='DNA',
+                  toggle_internal_storage='--internal-storage'):
+    # build index
+    runtmp.sourmash('scripts', 'index', siglist,
+                    '-o', db, '-k', str(ksize), '--scaled', str(scaled),
+                    '--moltype', moltype, toggle_internal_storage)
+    return db
+
+
 def scriptpath(scriptname='sourmash'):
     """Return the path to the scripts, in both dev and install situations."""
     # note - it doesn't matter what the scriptname is here, as long as

diff --git a/src/python/tests/test-data/2.sig.zip b/src/python/tests/test-data/2.sig.zip
diff --git a/src/python/tests/test-data/47.sig.zip b/src/python/tests/test-data/47.sig.zip
diff --git a/src/python/tests/test-data/63.sig.zip b/src/python/tests/test-data/63.sig.zip
diff --git a/src/python/tests/test_fastgather.py b/src/python/tests/test_fastgather.py
@@ -4,7 +4,8 @@
 
 import sourmash
 from . import sourmash_tst_utils as utils
-from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist)
+from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist,
+                                 index_siglist)
 
 
 def test_installed(runtmp):
@@ -14,7 +15,7 @@ def test_installed(runtmp):
     assert 'usage:  fastgather' in runtmp.last_result.err
 
 
-def test_simple(runtmp, zip_against):
+def test_simple(runtmp, capfd, indexed_query, indexed_against, zip_against, toggle_internal_storage):
     # test basic execution!
     query = get_test_data('SRR606249.sig.gz')
     against_list = runtmp.output('against.txt')
@@ -25,23 +26,40 @@ def test_simple(runtmp, zip_against):
 
     make_file_list(against_list, [sig2, sig47, sig63])
 
+    if indexed_query:
+        query = index_siglist(runtmp, query, runtmp.output('query'),
+                              scaled=100000)
+
     if zip_against:
         against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))
 
+    if indexed_against:
+        against_list = index_siglist(runtmp, against_list, runtmp.output('db'),
+                                     toggle_internal_storage=toggle_internal_storage)
+
     g_output = runtmp.output('gather.csv')
     p_output = runtmp.output('prefetch.csv')
 
     runtmp.sourmash('scripts', 'fastgather', query, against_list,
                     '-o', g_output, '-s', '100000')
     assert os.path.exists(g_output)
 
+    captured = capfd.readouterr()
+    print(captured.err)
+
     df = pandas.read_csv(g_output)
     assert len(df) == 3
     keys = set(df.keys())
     assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys)
 
+    # CTB note: we do not need to worry about this warning for query from a
+    # RocksDB, since there is only one.
+    if indexed_against:
+        print('indexed against:', indexed_against)
+        assert "WARNING: loading all sketches from a RocksDB into memory!" in captured.err
 
-def test_simple_with_prefetch(runtmp, zip_against):
+
+def test_simple_with_prefetch(runtmp, zip_against, indexed, toggle_internal_storage):
     # test basic execution!
     query = get_test_data('SRR606249.sig.gz')
     against_list = runtmp.output('against.txt')
@@ -55,6 +73,41 @@ def test_simple_with_prefetch(runtmp, zip_against):
     if zip_against:
         against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))
 
+    if indexed:
+        against_list = index_siglist(runtmp, against_list, runtmp.output('db'),
+                                     toggle_internal_storage=toggle_internal_storage)
+
+    g_output = runtmp.output('gather.csv')
+    p_output = runtmp.output('prefetch.csv')
+
+    runtmp.sourmash('scripts', 'fastgather', query, against_list,
+                    '-o', g_output, '--output-prefetch', p_output,
+                    '-s', '100000')
+    assert os.path.exists(g_output)
+    assert os.path.exists(p_output)
+
+    df = pandas.read_csv(g_output)
+    assert len(df) == 3
+    keys = set(df.keys())
+    assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'gather_result_rank', 'intersect_bp'}.issubset(keys)
+
+    df = pandas.read_csv(p_output)
+    assert len(df) == 3
+    keys = set(df.keys())
+    assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'}
+
+
+def test_simple_with_prefetch_list_of_zips(runtmp):
+    # test basic execution!
+    query = get_test_data('SRR606249.sig.gz')
+    against_list = runtmp.output('against.txt')
+
+    sig2 = get_test_data('2.sig.zip')
+    sig47 = get_test_data('47.sig.zip')
+    sig63 = get_test_data('63.sig.zip')
+
+    make_file_list(against_list, [sig2, sig47, sig63])
+
     g_output = runtmp.output('gather.csv')
     p_output = runtmp.output('prefetch.csv')
 

diff --git a/src/python/tests/test_fastmultigather.py b/src/python/tests/test_fastmultigather.py
@@ -8,16 +8,8 @@
 
 import sourmash
 from . import sourmash_tst_utils as utils
-from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist)
-
-
-def index_siglist(runtmp, siglist, db, *, ksize=31, scaled=1000, moltype='DNA',
-                  toggle_internal_storage='--internal-storage'):
-    # build index
-    runtmp.sourmash('scripts', 'index', siglist,
-                    '-o', db, '-k', str(ksize), '--scaled', str(scaled),
-                    '--moltype', moltype, toggle_internal_storage)
-    return db
+from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist,
+                                 index_siglist)
 
 
 def test_installed(runtmp):
@@ -71,6 +63,47 @@ def test_simple(runtmp, zip_against):
     assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp', 'gather_result_rank'}.issubset(keys)
 
 
+def test_simple_list_of_zips(runtmp):
+    # test basic execution!
+    query = get_test_data('SRR606249.sig.gz')
+    sig2 = get_test_data('2.sig.zip')
+    sig47 = get_test_data('47.sig.zip')
+    sig63 = get_test_data('63.sig.zip')
+
+    query_list = runtmp.output('query.txt')
+    against_list = runtmp.output('against.txt')
+
+    make_file_list(query_list, [query])
+    make_file_list(against_list, [sig2, sig47, sig63])
+
+    cwd = os.getcwd()
+    try:
+        os.chdir(runtmp.output(''))
+        runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list,
+                        '-s', '100000', '-t', '0')
+    finally:
+        os.chdir(cwd)
+
+    print(os.listdir(runtmp.output('')))
+
+    g_output = runtmp.output('SRR606249.gather.csv')
+    p_output = runtmp.output('SRR606249.prefetch.csv')
+    assert os.path.exists(p_output)
+
+    # check prefetch output (only non-indexed gather)
+    df = pandas.read_csv(p_output)
+    assert len(df) == 3
+    keys = set(df.keys())
+    assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'}
+
+    assert os.path.exists(g_output)
+    df = pandas.read_csv(g_output)
+    print(df)
+    assert len(df) == 3
+    keys = set(df.keys())
+    assert {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp', 'gather_result_rank'}.issubset(keys)
+
+
 def test_simple_space_in_signame(runtmp):
     # test basic execution!
     query = get_test_data('SRR606249.sig.gz')

diff --git a/src/python/tests/test_manysearch.py b/src/python/tests/test_manysearch.py
@@ -4,7 +4,8 @@
 import sourmash
 
 from . import sourmash_tst_utils as utils
-from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist)
+from .sourmash_tst_utils import (get_test_data, make_file_list, zip_siglist,
+                                 index_siglist)
 
 
 def test_installed(runtmp):
@@ -14,13 +15,6 @@ def test_installed(runtmp):
     assert 'usage:  manysearch' in runtmp.last_result.err
 
 
-def index_siglist(runtmp, siglist, db, ksize=31, scaled=1000, moltype='DNA'):
-    # build index
-    runtmp.sourmash('scripts', 'index', siglist,
-                    '-o', db, '-k', str(ksize), '--scaled', str(scaled),
-                    '--moltype', moltype)
-    return db
-
 def test_simple(runtmp, zip_query, zip_against):
     # test basic execution!
     query_list = runtmp.output('query.txt')
@@ -176,7 +170,7 @@ def test_simple_abund(runtmp):
     assert total_weighted_hashes == 73489
 
 
-def test_simple_indexed(runtmp, zip_query):
+def test_simple_indexed(runtmp, zip_query, indexed_query):
     # test basic execution!
     query_list = runtmp.output('query.txt')
     against_list = runtmp.output('against.txt')
@@ -188,12 +182,67 @@ def test_simple_indexed(runtmp, zip_query):
     make_file_list(query_list, [sig2, sig47, sig63])
     make_file_list(against_list, [sig2, sig47, sig63])
 
+    if zip_query:
+        query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))
+
+    if indexed_query:
+        query_list = index_siglist(runtmp, query_list, runtmp.output('query_db'))
+
     output = runtmp.output('out.csv')
 
     against_list = index_siglist(runtmp, against_list, runtmp.output('db'))
 
-    if zip_query:
-        query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))
+    print('query_list is:', query_list)
+    runtmp.sourmash('scripts', 'manysearch', query_list, against_list,
+                    '-o', output, '-t', '0.01')
+    assert os.path.exists(output)
+
+    df = pandas.read_csv(output)
+    assert len(df) == 5
+
+    dd = df.to_dict(orient='index')
+    print(dd)
+
+    for idx, row in dd.items():
+        # identical?
+        if row['match_name'] == row['query_name']:
+            assert float(row['containment'] == 1.0)
+            assert float(row['query_containment_ani'] == 1.0)
+        else:
+            # confirm hand-checked numbers
+            q = row['query_name'].split()[0]
+            m = row['match_name'].split()[0]
+            cont = float(row['containment'])
+            intersect_hashes = int(row['intersect_hashes'])
+            query_ani = float(row['query_containment_ani'])
+            cont = round(cont, 4)
+            query_ani = round(query_ani, 4)
+            print(q, m, f"{cont:.04}", f"{query_ani:.04}")
+
+            if q == 'NC_011665.1' and m == 'NC_009661.1':
+                assert cont == 0.4828
+                assert intersect_hashes == 2529
+                assert query_ani == 0.9768
+
+            if q == 'NC_009661.1' and m == 'NC_011665.1':
+                assert cont == 0.4885
+                assert intersect_hashes == 2529
+                assert query_ani == 0.9772
+
+
+def test_simple_list_of_zips(runtmp):
+    # test basic execution!
+    query_list = runtmp.output('query.txt')
+    against_list = runtmp.output('against.txt')
+
+    sig2 = get_test_data('2.sig.zip')
+    sig47 = get_test_data('47.sig.zip')
+    sig63 = get_test_data('63.sig.zip')
+
+    make_file_list(query_list, [sig2, sig47, sig63])
+    make_file_list(against_list, [sig2, sig47, sig63])
+
+    output = runtmp.output('out.csv')
 
     runtmp.sourmash('scripts', 'manysearch', query_list, against_list,
                     '-o', output, '-t', '0.01')