From 7826fbc083ce37f46d285a3f846f9db4d1b945b5 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 12 May 2022 19:52:22 -0700 Subject: [PATCH] [MRG] add `sig collect` command (#2036) * add 'sig collect' command * todo * support not-in-memory * some cleanup * more cleanup * start adding tests * Add test of multiple * add 'create_or_open', and some more tests * add rewrite location * use 'get_manifest' * implement sig collect --no-require * doc me * more doc * add abspath, fix a few different future problems :) * simplify and rationalize CLI parameters for collect * more tests * test merge combinations * remove @CTB * do final tests * fix doc * add some more tests, fix __add__ * fix notify * swizzle 'sig check' to rewrite internal location, too. * Update src/sourmash/sig/__main__.py Co-authored-by: Tessa Pierce Ward * Update tests/test_cmd_signature_collect.py Co-authored-by: Tessa Pierce Ward Co-authored-by: Tessa Pierce Ward --- doc/command-line.md | 76 +++--- doc/databases-advanced.md | 11 +- src/sourmash/cli/sig/__init__.py | 1 + src/sourmash/cli/sig/collect.py | 62 +++++ src/sourmash/index/sqlite_index.py | 32 ++- src/sourmash/manifest.py | 11 +- src/sourmash/sig/__main__.py | 121 +++++++++- tests/conftest.py | 5 + tests/test_cmd_signature.py | 51 ++++ tests/test_cmd_signature_collect.py | 346 ++++++++++++++++++++++++++++ tests/test_manifest_protocol.py | 32 +++ tests/test_sqlite_index.py | 14 +- 12 files changed, 702 insertions(+), 60 deletions(-) create mode 100644 src/sourmash/cli/sig/collect.py create mode 100644 tests/test_cmd_signature_collect.py diff --git a/doc/command-line.md b/doc/command-line.md index ea0a91bd07..d625810674 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -1436,6 +1436,24 @@ sourmash database. `sourmash sig check` is particularly useful when working with large collections of signatures and identifiers. 
+### `sourmash signature collect` - collect manifests across databases + +Collect manifests from across (many) files and merge into a single +standalone manifest. + +For example, +``` +sourmash sig collect tests/test-data/gather/GCF*.sig -o mf.sqlmf +``` +will load all of the `GCF` signatures and build a manifest file `mf.sqlmf` +that contains references to all of the signatures, but not the signatures +themselves. +This manifest file can be loaded directly from the command line by sourmash. + +`sourmash sig collect` defaults to outputting SQLite manifests. It is +particularly useful when working with large collections of signatures and +identifiers, and has command line options for merging and updating manifests. + ## Advanced command-line usage ### Loading signatures and databases @@ -1689,7 +1707,7 @@ signatures that were just created. ### Using manifests to explicitly refer to collections of files -(sourmash v4.4.0 and later) +(sourmash v4.4 and later) Manifests are metadata catalogs of signatures that are used for signature selection and loading. They are used extensively by sourmash @@ -1698,52 +1716,28 @@ pattern matching. Manifests can _also_ be used externally (via the command-line), and may be useful for organizing large collections of signatures. They can -be generated with `sourmash sig manifest` as well as `sourmash sig check`. +be generated with the `sig collect`, `sig manifest`, and `sig check` +subcommands. -Suppose you have a large collection of signature (`.sig` or `.sig.gz` -files) under a directory. You can create a manifest file for them like so: +Suppose you have a large collection of signatures (`.sig` or `.sig.gz` +files) in a location (e.g., under a directory, or in a zip file). 
You +can create a manifest file for them like so: ``` -sourmash sig manifest <dir> -o <dir>/manifest.csv +sourmash sig collect <location> -o manifest.sqlmf ``` -and then use the manifest directly for sourmash operations: +and then use the manifest directly for sourmash operations, for example: ``` -sourmash sig fileinfo <dir>/manifest.csv +sourmash sig fileinfo manifest.sqlmf ``` -This manifest can be used as a database target for most sourmash -operations - search, gather, etc. Note that manifests for directories -must be placed within (and loaded from) the directory from which the -manifest was generated; the specific manifest filename does not -matter. - -A more advanced and slightly tricky way to use explicit manifest files -is with lists of files. If you create a file with a path list -containing the locations of loadable sourmash collections, you can run -`sourmash sig manifest pathlist.txt -o mf.csv` to generate a manifest -of all of the files. The resulting manifest in `mf.csv` can then be -loaded directly. This is very handy when you have many sourmash -signatures, or large signature files. The tricky part in doing this -is that the manifest will store the same paths listed in the pathlist -file - whether they are relative or absolute paths - and these paths -must be resolvable by sourmash from the current working directory. -This makes explicit manifests built from pathlist files less portable -within or across systems than the other sourmash collections, which -are all relocatable. +This manifest contains _references_ to the signatures (but not the +signatures themselves) and can then be used as a database target for most +sourmash operations - search, gather, etc.
-For example, if you create a pathlist file `paths.txt` containing the -following: -``` -/path/to/zipfile.zip -local_directory/some_signature.sig.gz -local_dir2/ -``` -and then run: -``` -sourmash sig manifest paths.txt -o mf.csv -``` -you will be able to use `mf.csv` as a database for `sourmash search` -and `sourmash gather` commands. But, because it contains two relative paths, -you will only be able to use it _from the directory that contains those -two relative paths_. +Note that `sig collect` will generate manifests containing the +pathnames given to it - so if you use relative paths, the references +will be relative to the working directory in which `sig collect` was +run. You can use `sig collect --abspath` to rewrite the paths +into absolute paths. **Our advice:** We suggest using zip file collections for most situations; we primarily recommend using explicit manifests for diff --git a/doc/databases-advanced.md b/doc/databases-advanced.md index 816f22f0a9..9e4d1c25d7 100644 --- a/doc/databases-advanced.md +++ b/doc/databases-advanced.md @@ -2,11 +2,11 @@ sourmash uses a variety of different mechanisms and formats for storing, organizing, and searching signatures. Some of these mechanisms, "collections", just store the signatures; others ("indexed" databases) provide indices on the signatures for fast content-based search. _Most_ of the mechanisms now use manifests that permit fast selection and loading of signatures based on metadata. Below we refer to "databases" generically as any on-disk storage mechanism for sourmash signatures. -Which database type is best to use depends on what you're doing - which is what this document is about! In general, however, sourmash should be fast enough that database choice will only impacts performance when searching 1000s of signatures, or doing many 1000s of searches. +Which database type is best to use depends on what you're doing - which is what this document is about! 
In general, however, sourmash should be fast enough that database choice will only impact performance when searching thousands of signatures, or doing thousands of searches. The recommended file extensions below are conventions used to signal the output format when using `-o` with `sourmash sketch` and the `sourmash sig` subcommands; so, for example, `sourmash sketch dna *.fa -o xyz.zip` will output signatures in the .zip format. -sourmash will automatically detect and load the database, based on the database _content_ in most cases. +sourmash will automatically detect and load the database, based on the database _content_ and not the database extension, in most cases. Unless noted otherwise, the below database formats are supported in all release since sourmash v3.5. @@ -56,15 +56,16 @@ We recommend SBT and LCA databases for use only in specific situations - e.g. SB ### Manifests -Manifests are catalogs of signature metadata - name, molecule type, k-mer size, and other information - that can be used to select specific signatures for searching or processing. Typically when using manifests the actual signatures themselves are not loaded until they are needed. +Manifests are catalogs of signature metadata - name, molecule type, k-mer size, and other information - that can be used to select specific signatures for searching or processing. Typically when using manifests the actual signatures themselves are not loaded until they are needed, although the efficiency of this depends on the signature storage mechanism; for example, JSON-format containers (`.sig` and `.lca.json` files) must be entirely loaded before any signature in them can be used, unlike zip containers. As of sourmash 4.4 manifests can be *directly* loaded from the command line as standalone collections. This lets manifests serve as a catalog of signatures stored in many different locations.
Standalone manifests are preferable to both directory storage and pathlists (below), because they support fast selection and direct lazy loading. They are the most effective solution for managing custom collections of thousands to millions of signatures. -Manifests can be created with `sourmash sig manifest` and `sourmash sig check`. For complex situations, we recommend using custom Python scripts to manage them - for example, see [sigs-to-manifest.py in database-examples](https://github.com/sourmash-bio/database-examples/blob/main/sigs-to-manifest.py). +Standalone manifests can be created with `sourmash sig collect` +(sourmash v4.4 and later). -Sourmash supports two manifest file formats - CSV and SQLite. SQLite manifests are much faster than CSV manifests in exchange for extra disk space. +Sourmash supports two manifest file formats - CSV and SQLite. SQLite manifests are much faster and lower-memory than CSV manifests in exchange for consuming some extra disk space. ### Directories diff --git a/src/sourmash/cli/sig/__init__.py b/src/sourmash/cli/sig/__init__.py index 4bb956e7b3..f256a7473d 100644 --- a/src/sourmash/cli/sig/__init__.py +++ b/src/sourmash/cli/sig/__init__.py @@ -16,6 +16,7 @@ from . import grep from . import kmers from . import check +from . import collect from . import intersect from . import inflate from . import manifest diff --git a/src/sourmash/cli/sig/collect.py b/src/sourmash/cli/sig/collect.py new file mode 100644 index 0000000000..d44aaa9a39 --- /dev/null +++ b/src/sourmash/cli/sig/collect.py @@ -0,0 +1,62 @@ +"""collect manifest information across many files""" + +usage=""" + + sourmash sig collect -o all.sqlmf + +This will collect manifests from across many files and save the information +into a standalone manifest database. + +By default, 'sig collect' requires a pre-existing manifest for collections; +this prevents potentially slow manifest rebuilding. You +can turn this check off with '--no-require-manifest'. 
+ +""" + +from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, + add_picklist_args, add_pattern_args) + + +def subparser(subparsers): + subparser = subparsers.add_parser('collect', usage=usage) + subparser.add_argument('locations', nargs='*', + help='locations of input signatures') + subparser.add_argument('-o', '--output', help='manifest output file', + required=True) + subparser.add_argument( + '-q', '--quiet', action='store_true', + help='suppress non-error output' + ) + subparser.add_argument( + '-d', '--debug', action='store_true', + help='provide debugging output' + ) + subparser.add_argument( + '--from-file', + help='a text file containing a list of files to load signatures from' + ) + subparser.add_argument( + '--no-require-manifest', + help='do not require a manifest; generate dynamically if needed', + action='store_true' + ) + subparser.add_argument( + '-F', '--manifest-format', + help="format of manifest output file (default: 'sql')", + default='sql', + choices=['csv', 'sql'], + ) + + subparser.add_argument('--merge-previous', action='store_true', + help='merge new manifests into existing') + subparser.add_argument('--abspath', + help="convert all locations to absolute paths", + action='store_true') + + add_ksize_arg(subparser, 31) + add_moltype_args(subparser) + + +def main(args): + import sourmash + return sourmash.sig.__main__.collect(args) diff --git a/src/sourmash/index/sqlite_index.py b/src/sourmash/index/sqlite_index.py index 9fc84ecb0f..26b26e25ae 100644 --- a/src/sourmash/index/sqlite_index.py +++ b/src/sourmash/index/sqlite_index.py @@ -141,7 +141,7 @@ def load_sqlite_index(filename, *, request_manifest=False): is_lca_db = True debug_literal("load_sqlite_index: it's got a lineage table!") - if internal_d['SqliteManifest']: + if 'SqliteManifest' in internal_d: v = internal_d['SqliteManifest'] if v != '1.0': raise IndexNotSupported @@ -598,6 +598,17 @@ def create(cls, filename): cls._create_tables(cursor) return cls(conn) + 
@classmethod + def create_or_open(cls, filename): + "Connect to 'filename' and create tables if not exist." + conn = sqlite3.connect(filename) + cursor = conn.cursor() + try: + cls._create_tables(cursor) + except sqlite3.OperationalError: + pass + return cls(conn) + @classmethod def load_from_manifest(cls, manifest, *, dbfile=":memory:", append=False): "Create a new sqlite manifest from an existing manifest object." @@ -646,6 +657,10 @@ def _create_tables(cls, cursor): ) """) + def add_row(self, row): + c = self.conn.cursor() + self._insert_row(c, row) + def _insert_row(self, cursor, row, *, call_is_from_index=False): "Insert a new manifest row." # check - is this manifest managed by SqliteIndex? If so, prevent @@ -699,6 +714,21 @@ def __len__(self): self._num_rows = sum(1 for _ in self.rows) return self._num_rows + def __iadd__(self, other): + c = self.conn.cursor() + for row in other.rows: + self._insert_row(c, row) + return self + + def __add__(self, other): + new_mf = self.create(":memory:") + new_mf += self + new_mf += other + return new_mf + + def close(self): + self.conn.commit() + def _make_select(self): """Build a set of SQL SELECT conditions and matching value tuple that can be used to select the right sketches from the diff --git a/src/sourmash/manifest.py b/src/sourmash/manifest.py index 44a1163dae..e2431dde36 100644 --- a/src/sourmash/manifest.py +++ b/src/sourmash/manifest.py @@ -78,15 +78,17 @@ def load_from_csv(cls, fp): row['signature'] = None manifest_list.append(row) - return cls(manifest_list) + return CollectionManifest(manifest_list) @classmethod def load_from_sql(cls, filename): from sourmash.index.sqlite_index import load_sqlite_index db = load_sqlite_index(filename, request_manifest=True) - if db: + if db is not None: return db.manifest + return None + def write_to_filename(self, filename, *, database_format='csv', ok_if_exists=False): if database_format == 'csv': @@ -207,7 +209,7 @@ class CollectionManifest(BaseCollectionManifest): """ An 
in-memory manifest that simply stores the rows in a list. """ - def __init__(self, rows): + def __init__(self, rows=[]): "Initialize from an iterable of metadata dictionaries." self.rows = [] self._md5_set = set() @@ -219,6 +221,9 @@ def load_from_manifest(cls, manifest, **kwargs): "Load this manifest from another manifest object." return cls(manifest.rows) + def add_row(self, row): + self._add_rows([row]) + def _add_rows(self, rows): self.rows.extend(rows) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 3dd495b6d3..8bec700db5 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -40,6 +40,8 @@ import [ ... ] - import a mash or other signature export - export a signature, e.g. to mash overlap - see detailed comparison of sigs +check --picklist ... - check picklist against (many) sigs +collect -o manifest.sqlmf - collect sigs metadata into manifest ** Use '-h' to get subcommand-specific help, e.g. @@ -52,12 +54,15 @@ def _check_abundance_compatibility(sig1, sig2): raise ValueError("incompatible signatures: track_abundance is {} in first sig, {} in second".format(sig1.minhash.track_abundance, sig2.minhash.track_abundance)) -def _extend_signatures_with_from_file(args): +def _extend_signatures_with_from_file(args, *, target_attr='signatures'): # extend input signatures with --from-file if args.from_file: more_files = sourmash_args.load_pathlist_from_file(args.from_file) - args.signatures = list(args.signatures) - args.signatures.extend(more_files) + + sigs = list(getattr(args, target_attr)) + sigs.extend(more_files) + setattr(args, target_attr, sigs) + def _set_num_scaled(mh, num, scaled): "set num and scaled values on a MinHash object" @@ -1336,11 +1341,18 @@ def check(args): sys.exit(-1) # has manifest, or ok to build (require_manifest=False) - continue! 
- manifest = sourmash_args.get_manifest(idx, require=True) - manifest_rows = manifest.select_to_manifest(picklist=picklist) - total_rows_examined += len(manifest) - total_manifest_rows += manifest_rows - debug_literal(f"examined {len(manifest)} new rows, found {len(manifest_rows)} matching rows") + new_manifest = sourmash_args.get_manifest(idx, require=True) + sub_manifest = new_manifest.select_to_manifest(picklist=picklist) + total_rows_examined += len(new_manifest) + + # rewrite locations so that each signature can be found by filename + # of its container; this follows `sig collect` logic. + rows = [] + for row in sub_manifest.rows: + row['internal_location'] = filename + total_manifest_rows.add_row(row) + + debug_literal(f"examined {len(new_manifest)} new rows, found {len(sub_manifest)} matching rows") notify(f"loaded {total_rows_examined} signatures.") @@ -1384,6 +1396,99 @@ def check(args): sys.exit(-1) +def collect(args): + "Collect signature metadata across many locations, save to manifest" + # TODO: + # test what happens with directories :) + set_quiet(False, args.debug) + + if os.path.exists(args.output): + if args.merge_previous: + pass + else: + error(f"ERROR: '{args.output}' already exists!") + error(f"ERROR: please remove it, or use --merge-previous to merge") + sys.exit(-1) + elif args.merge_previous: + notify(f"WARNING: --merge-previous specified, but output file '{args.output}' does not already exist?") + + # load previous manifest for --merge-previous. This gets tricky with + # mismatched manifest types, which we forbid. 
+ try: + if args.manifest_format == 'sql': + # create on-disk manifest + from sourmash.index.sqlite_index import SqliteCollectionManifest + + if args.merge_previous: + collected_mf = SqliteCollectionManifest.create_or_open(args.output) + else: + collected_mf = SqliteCollectionManifest.create(args.output) + else: + # create in-memory manifest that will be saved as CSV + assert args.manifest_format == 'csv' + + if args.merge_previous and os.path.exists(args.output): + collected_mf = CollectionManifest.load_from_filename(args.output) + else: + collected_mf = CollectionManifest() + + if not isinstance(collected_mf, CollectionManifest): + raise Exception + except Exception: + error(f"ERROR loading '{args.output}' with --merge-previous. Is it of type {args.manifest_format}?") + sys.exit(-1) + + if args.merge_previous: + notify(f"merging new locations with {len(collected_mf)} previous rows.") + + # require manifests? yes by default, since generating can be slow. + require_manifest = True + if args.no_require_manifest: + require_manifest = False + debug("sig collect: manifest will not be required") + else: + debug("sig collect: manifest required") + + n_files = 0 + + # load from_file + _extend_signatures_with_from_file(args, target_attr='locations') + + # convert to abspath + if args.abspath: + args.locations = [ os.path.abspath(iloc) for iloc in args.locations ] + + # iterate through, loading all the manifests from all the locations. + for n_files, loc in enumerate(args.locations): + notify(f"Loading signature information from {loc}.") + + if n_files % 100 == 0: + notify(f'... 
loaded {len(collected_mf)} sigs from {n_files} files') + idx = sourmash.load_file_as_index(loc) + if idx.manifest is None and require_manifest: + error(f"ERROR on location '{loc}'") + error(f"sig collect requires a manifest by default, but no manifest present.") + error("specify --no-require-manifest to dynamically generate one.") + sys.exit(-1) + + mf = sourmash_args.get_manifest(idx) + + rows = [] + for row in mf.rows: + row['internal_location'] = loc + collected_mf.add_row(row) + + if args.manifest_format == 'csv': + collected_mf.write_to_filename(args.output, database_format='csv', + ok_if_exists=args.merge_previous) + else: + collected_mf.close() + + notify(f"saved {len(collected_mf)} manifest rows to '{args.output}'") + + return 0 + + def main(arglist=None): args = sourmash.cli.get_parser().parse_args(arglist) submod = getattr(sourmash.cli.sig, args.subcmd) diff --git a/tests/conftest.py b/tests/conftest.py index 51cdd81a12..ad249537ca 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -70,6 +70,11 @@ def lca_db_format(request): return request.param +@pytest.fixture(params=['csv', 'sql']) +def manifest_db_format(request): + return request.param + + # --- BEGIN - Only run tests using a particular fixture --- # # Cribbed from: http://pythontesting.net/framework/pytest/pytest-run-tests-using-particular-fixture/ def pytest_collection_modifyitems(items, config): diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 7340f38dae..42f24593da 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -4475,6 +4475,57 @@ def test_sig_check_1_diff_col_name(runtmp): mf = CollectionManifest.load_from_csv(fp) assert len(mf) == 24 + # internal locations should match + sigfile_set = set(sigfiles) + for row in mf.rows: + assert row['internal_location'] in sigfile_set + + idx = sourmash.load_file_as_index(out_mf) + siglist = list(idx.signatures()) + assert len(siglist) == 24 + ksizes = set([ ss.minhash.ksize for ss in siglist ]) 
+ assert len(ksizes) == 3 + assert 11 in ksizes + assert 21 in ksizes + assert 31 in ksizes + + # should be one non-matching picklist row + with open(missing_csv, newline='') as fp: + rows = list(csv.reader(fp)) + assert len(rows) == 2 # header row + data row + assert rows[1][0] == 'NOT THERE' + + +def test_sig_check_1_diff_col_name_zip(runtmp): + # 'sig check' with 'name2' column instead of default name, on a zip file + sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) + picklist = utils.get_test_data('gather/salmonella-picklist-diffcolumn.csv') + + # first create a zip db + runtmp.sourmash('sig', 'cat', *sigfiles, '-o', 'gcf.zip') + + # now run against this zip + runtmp.sourmash('sig', 'check', 'gcf.zip', + "--picklist", f"{picklist}:name2:name", + "-o", "missing.csv", + '-m', 'mf.csv') + + out_mf = runtmp.output('mf.csv') + assert os.path.exists(out_mf) + + missing_csv = runtmp.output('missing.csv') + assert os.path.exists(missing_csv) + + # should be 24 matching manifest rows + with open(out_mf, newline='') as fp: + mf = CollectionManifest.load_from_csv(fp) + assert len(mf) == 24 + + # internal locations should all point to zip + ilocs = set(( row['internal_location'] for row in mf.rows )) + assert len(ilocs) == 1 + + # can we get 'em? 
idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 24 diff --git a/tests/test_cmd_signature_collect.py b/tests/test_cmd_signature_collect.py new file mode 100644 index 0000000000..1f346eb464 --- /dev/null +++ b/tests/test_cmd_signature_collect.py @@ -0,0 +1,346 @@ +""" +Tests for 'sourmash sig collect' +""" +import pytest +import shutil +import os.path + +import sourmash +from sourmash.manifest import BaseCollectionManifest + +import sourmash_tst_utils as utils +from sourmash_tst_utils import SourmashCommandFailed + + +def test_sig_collect_0_nothing(runtmp, manifest_db_format): + # run with just output + ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + if manifest_db_format != 'sql': return + + runtmp.sourmash('sig', 'collect', '-o', f'mf.{ext}', + '-F', manifest_db_format) + + manifest_fn = runtmp.output(f'mf.{ext}') + manifest = BaseCollectionManifest.load_from_filename(manifest_fn) + + assert len(manifest) == 0 + + +def test_sig_collect_1_zipfile(runtmp, manifest_db_format): + # collect a manifest from a .zip file + protzip = utils.get_test_data('prot/protein.zip') + + ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + + runtmp.sourmash('sig', 'collect', protzip, '-o', f'mf.{ext}', + '-F', manifest_db_format) + + manifest_fn = runtmp.output(f'mf.{ext}') + manifest = BaseCollectionManifest.load_from_filename(manifest_fn) + + assert len(manifest) == 2 + md5_list = [ row['md5'] for row in manifest.rows ] + assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list + assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + + +def test_sig_collect_2_exists_fail(runtmp, manifest_db_format): + # collect a manifest from two .zip files + protzip = utils.get_test_data('prot/protein.zip') + allzip = utils.get_test_data('prot/protein.zip') + + ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + + runtmp.sourmash('sig', 'collect', protzip, '-o', f'mf.{ext}', + '-F', manifest_db_format) + + manifest_fn = 
runtmp.output(f'mf.{ext}') + manifest = BaseCollectionManifest.load_from_filename(manifest_fn) + + assert len(manifest) == 2 + md5_list = [ row['md5'] for row in manifest.rows ] + assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list + assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + + # now run with same filename - should fail + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash('sig', 'collect', allzip, '-o', manifest_fn, + '-F', manifest_db_format) + + +def test_sig_collect_2_exists_merge(runtmp, manifest_db_format): + # collect a manifest from two .zip files + protzip = utils.get_test_data('prot/protein.zip') + allzip = utils.get_test_data('prot/all.zip') + + ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + + runtmp.sourmash('sig', 'collect', protzip, '-o', f'mf.{ext}', + '-F', manifest_db_format) + + manifest_fn = runtmp.output(f'mf.{ext}') + manifest = BaseCollectionManifest.load_from_filename(manifest_fn) + + assert len(manifest) == 2 + md5_list = [ row['md5'] for row in manifest.rows ] + assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list + assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + + # now run with same filename - should merge + runtmp.sourmash('sig', 'collect', allzip, '-o', manifest_fn, + '-F', manifest_db_format, '--merge') + + manifest = BaseCollectionManifest.load_from_filename(manifest_fn) + assert len(manifest) == 10 + + +def test_sig_collect_2_exists_sql_merge_csv(runtmp, manifest_db_format): + # try to merge csv into sql + protzip = utils.get_test_data('prot/protein.zip') + allzip = utils.get_test_data('prot/all.zip') + + ext = 'sqlmf' + + # save as sql... 
+ runtmp.sourmash('sig', 'collect', protzip, '-o', f'mf.{ext}', + '-F', 'sql') + + manifest_fn = runtmp.output(f'mf.{ext}') + manifest = BaseCollectionManifest.load_from_filename(manifest_fn) + + assert len(manifest) == 2 + md5_list = [ row['md5'] for row in manifest.rows ] + assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list + assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash('sig', 'collect', allzip, '-o', manifest_fn, + '-F', 'csv', '--merge') + + assert "ERROR loading" in runtmp.last_result.err + + +def test_sig_collect_2_exists_csv_merge_sql(runtmp): + # try to merge sql into csv + protzip = utils.get_test_data('prot/protein.zip') + allzip = utils.get_test_data('prot/all.zip') + + ext = 'csv' + + # save as sql... + runtmp.sourmash('sig', 'collect', protzip, '-o', f'mf.{ext}', + '-F', 'csv') + + manifest_fn = runtmp.output(f'mf.{ext}') + manifest = BaseCollectionManifest.load_from_filename(manifest_fn) + + assert len(manifest) == 2 + md5_list = [ row['md5'] for row in manifest.rows ] + assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list + assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash('sig', 'collect', allzip, '-o', manifest_fn, + '-F', 'sql', '--merge') + + assert "ERROR loading" in runtmp.last_result.err + + +def test_sig_collect_2_no_exists_merge(runtmp, manifest_db_format): + # test 'merge' when args.output doesn't already exist => warning + protzip = utils.get_test_data('prot/protein.zip') + allzip = utils.get_test_data('prot/all.zip') + + ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + manifest_fn = runtmp.output(f'mf.{ext}') + + # run with --merge but no previous: + runtmp.sourmash('sig', 'collect', allzip, '-o', manifest_fn, + '-F', manifest_db_format, '--merge') + + manifest = BaseCollectionManifest.load_from_filename(manifest_fn) + assert len(manifest) == 8 + + err = runtmp.last_result.err + 
print(err) + assert "WARNING: --merge-previous specified, but output file" in err + + +def test_sig_collect_3_multiple(runtmp, manifest_db_format): + # collect a manifest from two .zip files + protzip = utils.get_test_data('prot/protein.zip') + hpzip = utils.get_test_data('prot/hp.zip') + dayzip = utils.get_test_data('prot/dayhoff.zip') + + ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + + runtmp.sourmash('sig', 'collect', protzip, hpzip, dayzip, + '-o', f'mf.{ext}', '-F', manifest_db_format) + + manifest_fn = runtmp.output(f'mf.{ext}') + manifest = BaseCollectionManifest.load_from_filename(manifest_fn) + + assert len(manifest) == 6 + md5_list = [ row['md5'] for row in manifest.rows ] + assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list + assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + assert 'ea2a1ad233c2908529d124a330bcb672' in md5_list + assert 'bb0e6d90df01b7bd5d0956a5f9e3ed12' in md5_list + assert 'fbca5e5211e4d58427997fd5c8343e9a' in md5_list + assert '1cbd888bf910f83ad8f1715509183223' in md5_list + + locations = set([ row['internal_location'] for row in manifest.rows ]) + assert protzip in locations + assert hpzip in locations + assert dayzip in locations + assert len(locations) == 3, locations + + +def test_sig_collect_3_multiple_use_fromfile(runtmp, manifest_db_format): + # collect a manifest from two .zip files using --from-file + protzip = utils.get_test_data('prot/protein.zip') + hpzip = utils.get_test_data('prot/hp.zip') + dayzip = utils.get_test_data('prot/dayhoff.zip') + + ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + + fromfile = runtmp.output('fromfile.txt') + with open(fromfile, 'wt') as fp: + print(protzip, file=fp) + print(hpzip, file=fp) + print(dayzip, file=fp) + + runtmp.sourmash('sig', 'collect', '--from-file', 'fromfile.txt', + '-o', f'mf.{ext}', '-F', manifest_db_format) + + manifest_fn = runtmp.output(f'mf.{ext}') + manifest = BaseCollectionManifest.load_from_filename(manifest_fn) + + assert len(manifest) 
== 6 + md5_list = [ row['md5'] for row in manifest.rows ] + assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list + assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + assert 'ea2a1ad233c2908529d124a330bcb672' in md5_list + assert 'bb0e6d90df01b7bd5d0956a5f9e3ed12' in md5_list + assert 'fbca5e5211e4d58427997fd5c8343e9a' in md5_list + assert '1cbd888bf910f83ad8f1715509183223' in md5_list + + locations = set([ row['internal_location'] for row in manifest.rows ]) + assert protzip in locations + assert hpzip in locations + assert dayzip in locations + assert len(locations) == 3, locations + + +def test_sig_collect_4_multiple_from_sig(runtmp, manifest_db_format): + # collect a manifest from sig files + sig43 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + + runtmp.sourmash('sig', 'collect', sig43, sig63, + '-o', f'mf.{ext}', '-F', manifest_db_format) + + manifest_fn = runtmp.output(f'mf.{ext}') + manifest = BaseCollectionManifest.load_from_filename(manifest_fn) + + assert len(manifest) == 2 + md5_list = [ row['md5'] for row in manifest.rows ] + assert '09a08691ce52952152f0e866a59f6261' in md5_list + assert '38729c6374925585db28916b82a6f513' in md5_list + + locations = set([ row['internal_location'] for row in manifest.rows ]) + assert sig43 in locations + assert sig63 in locations + assert len(locations) == 2, locations + + +def test_sig_collect_4_multiple_from_sig_abspath(runtmp, manifest_db_format): + # collect a manifest from sig files, forcing abspath + sig43 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + shutil.copyfile(sig43, runtmp.output('47.fa.sig')) + shutil.copyfile(sig63, runtmp.output('63.fa.sig')) + + ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + + runtmp.sourmash('sig', 'collect', '47.fa.sig', '63.fa.sig', '--abspath', + '-o', f'mf.{ext}', '-F', manifest_db_format) + + print(runtmp.last_result.out) + 
print(runtmp.last_result.err) + + manifest_fn = runtmp.output(f'mf.{ext}') + manifest = BaseCollectionManifest.load_from_filename(manifest_fn) + + assert len(manifest) == 2 + md5_list = [ row['md5'] for row in manifest.rows ] + assert '09a08691ce52952152f0e866a59f6261' in md5_list + assert '38729c6374925585db28916b82a6f513' in md5_list + + locations = set([ row['internal_location'] for row in manifest.rows ]) + print(locations) + assert len(locations) == 2, locations + + for xx in locations: + assert xx.startswith('/') + + +def test_sig_collect_4_multiple_no_abspath(runtmp, manifest_db_format): + # collect a manifest from sig files, no abspath + sig43 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + # copy files to tmp, where they will not have full paths + shutil.copyfile(sig43, runtmp.output('47.fa.sig')) + shutil.copyfile(sig63, runtmp.output('63.fa.sig')) + + ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + + runtmp.sourmash('sig', 'collect', '47.fa.sig', '63.fa.sig', + '-o', f'mf.{ext}', '-F', manifest_db_format) + + manifest_fn = runtmp.output(f'mf.{ext}') + manifest = BaseCollectionManifest.load_from_filename(manifest_fn) + + assert len(manifest) == 2 + md5_list = [ row['md5'] for row in manifest.rows ] + assert '09a08691ce52952152f0e866a59f6261' in md5_list + assert '38729c6374925585db28916b82a6f513' in md5_list + + locations = set([ row['internal_location'] for row in manifest.rows ]) + print(locations) + assert len(locations) == 2, locations + assert '47.fa.sig' in locations + assert '63.fa.sig' in locations + + +def test_sig_collect_5_no_manifest_sbt_fail(runtmp, manifest_db_format): + # collect a manifest from files that don't have one + sbt_zip = utils.get_test_data('v6.sbt.zip') + + ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash('sig', 'collect', sbt_zip, + '-o', f'mf.{ext}', '-F', manifest_db_format) + + +def 
test_sig_collect_5_no_manifest_sbt_succeed(runtmp, manifest_db_format): + # generate a manifest from files that don't have one when --no-require-manifest is given + sbt_zip = utils.get_test_data('v6.sbt.zip') + + ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + + runtmp.sourmash('sig', 'collect', sbt_zip, '--no-require-manifest', + '-o', f'mf.{ext}', '-F', manifest_db_format) + + manifest_fn = runtmp.output(f'mf.{ext}') + manifest = BaseCollectionManifest.load_from_filename(manifest_fn) + + assert len(manifest) == 7 + locations = set([ row['internal_location'] for row in manifest.rows ]) + assert len(locations) == 1, locations + assert sbt_zip in locations diff --git a/tests/test_manifest_protocol.py b/tests/test_manifest_protocol.py index 3f8abeeb65..5b9ea003d5 100644 --- a/tests/test_manifest_protocol.py +++ b/tests/test_manifest_protocol.py @@ -177,3 +177,35 @@ def test_manifest_filter_cols(manifest_obj): assert len(mf) == 1 row = list(mf.rows)[0] assert row['name'] == 'NC_011663.1 Shewanella baltica OS223, complete genome' + + +def test_manifest_iadd(manifest_obj): + # test in-place addition ('__iadd__') onto a manifest built with 'create_manifest' + sig47 = utils.get_test_data('47.fa.sig') + ss = sourmash.load_one_signature(sig47) + + def yield_sigs(): + yield ss, 'fiz' + + new_mf = manifest_obj.create_manifest(yield_sigs(), + include_signature=False) + assert len(new_mf) == 1 + + new_mf += manifest_obj + assert len(new_mf) == len(manifest_obj) + 1 + + +def test_manifest_add(manifest_obj): + # test addition ('__add__') of a manifest built with 'create_manifest' and another manifest + sig47 = utils.get_test_data('47.fa.sig') + ss = sourmash.load_one_signature(sig47) + + def yield_sigs(): + yield ss, 'fiz' + + new_mf = manifest_obj.create_manifest(yield_sigs(), + include_signature=False) + assert len(new_mf) == 1 + + new_mf2 = new_mf + manifest_obj + assert len(new_mf2) == len(manifest_obj) + len(new_mf) diff --git a/tests/test_sqlite_index.py b/tests/test_sqlite_index.py index ea64137aae..d524478cf9 100644 --- a/tests/test_sqlite_index.py +++ b/tests/test_sqlite_index.py 
@@ -775,6 +775,16 @@ def test_sqlite_manifest_load_existing_index_insert_fail(): assert "must use SqliteIndex.insert to add to this manifest" in str(exc) +def test_sqlite_manifest_create_load_empty(runtmp): + # try creating an empty manifest, then loading + mfname = runtmp.output("some.sqlmf") + mf = SqliteCollectionManifest.create(mfname) + mf.close() + + mf2 = load_sqlite_index(mfname) + assert len(mf2) == 0 + + def test_sqlite_lca_db_load_existing(): # try loading an existing sqlite index filename = utils.get_test_data('sqlite/lca.sqldb') @@ -819,8 +829,8 @@ def test_sqlite_lca_db_load_empty(runtmp): runtmp.sourmash('tax', 'prepare', '-F', 'sql', '-t', empty_tax, '-o', dbname) - with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'describe', dbname) + runtmp.sourmash('sig', 'describe', dbname) + assert 'loaded 0 signatures' in runtmp.last_result.err def test_sqlite_lca_db_try_load_sqlite_index():