From a90ab012cfc234158602ee2b10357f9d7a671110 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 8 Mar 2024 15:34:18 -0800 Subject: [PATCH] MRG: support proper manifest creation with `--relpath` for `sig check` and `sig collect` (#3054) This PR updates `sig collect` and `sig check` so that they can produce standalone manifests that work properly with default sourmash loading behavior. The default behavior produces broken manifests in some situations and is not changed, but will be deprecated in v5. ## Details Currently, `sig collect` and `sig check` default to producing standalone manifests with internal path locations relative to the current working directory. This conflicts with the default `StandaloneManifest` behavior implemented in `save_load.py` that loads path locations relative to the manifest location. As a result, whenever the manifest was in a subdirectory, the standalone manifests output by `sig check` and `sig collect` were broken. The only way to make good manifests in this situation was to use `sig collect --abspath`, but `sig check` didn't support `--abspath`, and using absolute paths is brittle in situations where you want to distribute manifests. This PR adds `--relpath` to both `sig check` and `sig collect`, and adds `--abspath` to `sig check`. It also demonstrates the bad behavior in tests and annotates the tests appropriately. See https://github.com/sourmash-bio/sourmash/issues/3008#issuecomment-1975174211 for more detailed discussion of why I think `--relpath` is the right behavior for the future. - [x] adds `--abspath` and `--relpath` to `sig check`, to properly support relative paths; - [x] adds `--relpath` to `sig collect`, to properly support relative paths; - [x] documents this behavior properly for creating standalone manifests; - [ ] create issue to change default `sig check` and `sig collect` behavior for v4, and disable cwd behavior. Techie TODO: - [x] explicitly test `relpath` and `abspath` behavior in `sig check`; - [x] explicitly test `relpath` behavior in `sig collect` - [x] write some tests for `sig check` and `sig collect` to explore the relative path loading issue, with all three combinations of relpath: mf in cwd, sigs in subdir; mf in subdir, sigs in cwd; mf in subdir, sigs in subdir. Related issues: * Addresses #3008 * Addresses issues in https://github.com/sourmash-bio/sourmash/issues/3048 by updating `sig check` to support `--relpath`; * Fixes https://github.com/sourmash-bio/sourmash/issues/3053 - `--relpath` again --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/command-line.md | 21 +- doc/sourmash-internals.md | 6 +- src/sourmash/cli/sig/check.py | 24 ++ src/sourmash/cli/sig/collect.py | 23 +- src/sourmash/index/__init__.py | 18 +- src/sourmash/sig/__main__.py | 63 ++++- tests/conftest.py | 15 ++ tests/test_cmd_signature.py | 363 ++++++++++++++++++++++++++-- tests/test_cmd_signature_collect.py | 281 ++++++++++++++++++++- 9 files changed, 771 insertions(+), 43 deletions(-) diff --git a/doc/command-line.md b/doc/command-line.md index 3279921a36..71173792cf 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -551,7 +551,7 @@ sourmash multigather --query --db ``` Note that multigather is single threaded, so it offers no substantial -efficiency gains over just running gather multiple times! Nontheless, it +efficiency gains over just running gather multiple times! Nonetheless, it is useful for situations where you have many sketches organized in a combined file, e.g. sketches built with `sourmash sketch ... --singleton`). @@ -1959,6 +1959,16 @@ sourmash database. `sourmash sig check` is particularly useful when working with large collections of signatures and identifiers. +With `-m/--save-manifest-matching`, `sig check` creates a standalone +manifest. In these manifests, sourmash v4 will by default write paths +to the matched elements that are relative to the current working +directory. In some cases - when the output manifest is in different +directory - this will create manifests that do not work properly +with sourmash. The `--relpath` argument will rewrite the paths to be +relative to the manifest, while the `--abspath` argument will rewrite +paths to be absolute. The `--relpath` behavior will be the default in +sourmash v5. + ### `sourmash signature collect` - collect manifests across databases Collect manifests from across (many) files and merge into a single @@ -1977,6 +1987,15 @@ This manifest file can be loaded directly from the command line by sourmash. particularly useful when working with large collections of signatures and identifiers, and has command line options for merging and updating manifests. +As with `sig check`, the standalone manifests created by `sig collect` +in sourmash v4 will by default write paths to the matched elements +relative to the current working directory. When the output manifest +is in a different directory, this will create manifests that do not work +properly with sourmash. The `--relpath` argument will rewrite the +paths to be relative to the manifest, while the `--abspath` argument +will rewrite paths to be absolute. The `--relpath` behavior will be +the default in sourmash v5. + ## Advanced command-line usage ### Loading signatures and databases diff --git a/doc/sourmash-internals.md b/doc/sourmash-internals.md index a615844290..ebf66ae219 100644 --- a/doc/sourmash-internals.md +++ b/doc/sourmash-internals.md @@ -368,7 +368,11 @@ Thus, while standalone manifests can point at any kind of container, including JSON files or LCA databases, they are most efficient when `internal_location` points at a file with either a single sketch in it, or a manifest that supports direct loading of sketches. Therefore, -we suggest using standalone manifest indices. +we suggest using standalone manifest indices. Note that sourmash +interprets paths to locations in standalone manifests relative to the +manifest filename; see the `--relpath` behavior in `sig check` and +`sig collect` to output manifests that deal with relative filenames +properly. Note that searching a standalone manifest is currently done through a linear iteration, and does not use any features of indexed containers diff --git a/src/sourmash/cli/sig/check.py b/src/sourmash/cli/sig/check.py index a4c940eecb..93335d4308 100644 --- a/src/sourmash/cli/sig/check.py +++ b/src/sourmash/cli/sig/check.py @@ -67,6 +67,30 @@ def subparser(subparsers): default="csv", choices=["csv", "sql"], ) + subparser.add_argument( + "--abspath", + "--use-absolute-paths", + help="convert all locations to absolute paths", + action="store_true", + ) + subparser.add_argument( + "--no-abspath", + help="do not convert all locations to absolute paths", + action="store_false", + dest="abspath", + ) + subparser.add_argument( + "--relpath", + "--use-relative-paths", + help="convert all locations to paths relative to the output manifest", + action="store_true", + ) + subparser.add_argument( + "--no-relpath", + help="do not convert all locations to paths relative to the output manifest", + action="store_false", + dest="relpath", + ) add_ksize_arg(subparser) add_moltype_args(subparser) diff --git a/src/sourmash/cli/sig/collect.py b/src/sourmash/cli/sig/collect.py index 1e5d8ded2f..73077fe9fb 100644 --- a/src/sourmash/cli/sig/collect.py +++ b/src/sourmash/cli/sig/collect.py @@ -54,7 +54,28 @@ def subparser(subparsers): help="merge new manifests into existing", ) subparser.add_argument( - "--abspath", help="convert all locations to absolute paths", action="store_true" + "--abspath", + "--use-absolute-paths", + help="convert all locations to absolute paths", + action="store_true", + ) + subparser.add_argument( + "--no-abspath", + help="do not convert all locations to absolute paths", + action="store_false", + dest="abspath", + ) + subparser.add_argument( + "--relpath", + "--use-relative-paths", + help="convert all locations to paths relative to the output manifest", + action="store_true", + ) + subparser.add_argument( + "--no-relpath", + help="do not convert all locations to paths relative to the output manifest", + action="store_false", + dest="relpath", ) add_ksize_arg(subparser) diff --git a/src/sourmash/index/__init__.py b/src/sourmash/index/__init__.py index 154f37c126..ffd698b423 100644 --- a/src/sourmash/index/__init__.py +++ b/src/sourmash/index/__init__.py @@ -1113,16 +1113,18 @@ class StandaloneManifestIndex(Index): with many signature collections underneath it, and you don't want to load every collection each time you run sourmash. - Instead, you can run 'sourmash sig manifest -o mf.csv' to - output a manifest and then use this class to load 'mf.csv' directly. + Instead, you can run 'sourmash sig collect -o ' to + output a manifest and then use this class to load directly. Sketch type selection, picklists, and pattern matching will all work directly on the manifest and will load signatures only upon demand. - One feature of this class is that absolute paths to sketches in - the 'internal_location' field of the manifests will be loaded properly. - This permits manifests to be constructed for various collections of - signatures that reside elsewhere, and not just below a single directory - prefix. + One feature of this class is that external paths to sketches in + the 'internal_location' field of the manifests will be loaded + properly. This permits manifests to be constructed for various + collections of signatures that reside elsewhere, and not just + below a single directory prefix. By default paths are interpreted + relative to the location of the manifest, unless an absolute path + is provided in the 'internal_location' field. StandaloneManifestIndex does _not_ store signatures in memory. @@ -1130,6 +1132,7 @@ class StandaloneManifestIndex(Index): MultiIndex.load_from_pathlist is used to load other Index objects. However, this class does not store any signatures in memory, unlike MultiIndex. + """ is_database = True @@ -1155,6 +1158,7 @@ def load(cls, location, *, prefix=None): m = CollectionManifest.load_from_filename(location) if prefix is None: + # by default, calculate paths relative to manifest location. prefix = os.path.dirname(location) return cls(m, location, prefix=prefix) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 1a89d6239f..94e1928175 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -1435,11 +1435,42 @@ def check(args): else: debug("sig check: manifest required") + # abspath/relpath checks + if args.abspath and args.relpath: + error("** Cannot specify both --abspath and --relpath; pick one!") + sys.exit(-1) + + if args.relpath or args.abspath and not args.save_manifest_matching: + notify( + "** WARNING: --abspath and --relpath only have effects when saving a manifest" + ) + + relpath = "." + if args.relpath and args.save_manifest_matching: + output_manifest_dir = os.path.dirname(args.save_manifest_matching) + relpath = os.path.relpath(os.curdir, output_manifest_dir) + total_manifest_rows = CollectionManifest([]) # start loading! total_rows_examined = 0 for filename in args.signatures: + # if saving a manifest, think about how to rewrite locations. + if args.abspath: + # convert to abspath + new_iloc = os.path.abspath(filename) + elif args.relpath: + # interpret paths relative to manifest directory. + if filename.startswith("/"): + notify( + f"** WARNING: cannot convert abspath {filename} into relative path." + ) + new_iloc = os.path.join(relpath, filename) + else: + # default: paths are relative to cwd. This breaks when sketches + # are in subdirectories; will be deprecated for v5. + new_iloc = filename + idx = sourmash_args.load_file_as_index(filename, yield_all_files=args.force) idx = idx.select(ksize=args.ksize, moltype=moltype) @@ -1458,7 +1489,7 @@ def check(args): # rewrite locations so that each signature can be found by filename # of its container; this follows `sig collect` logic. for row in sub_manifest.rows: - row["internal_location"] = filename + row["internal_location"] = new_iloc total_manifest_rows.add_row(row) # the len(sub_manifest) here should only be run when needed :) @@ -1535,6 +1566,11 @@ def collect(args): f"WARNING: --merge-previous specified, but output file '{args.output}' does not already exist?" ) + # abspath/relpath checks + if args.abspath and args.relpath: + error("** Cannot specify both --abspath and --relpath; pick one!") + sys.exit(-1) + # load previous manifest for --merge-previous. This gets tricky with # mismatched manifest types, which we forbid. try: @@ -1579,15 +1615,16 @@ def collect(args): # load from_file _extend_signatures_with_from_file(args, target_attr="locations") - # convert to abspath - if args.abspath: - args.locations = [os.path.abspath(iloc) for iloc in args.locations] + relpath = None + if args.relpath: + output_manifest_dir = os.path.dirname(args.output) + relpath = os.path.relpath(os.curdir, output_manifest_dir) # iterate through, loading all the manifests from all the locations. for n_files, loc in enumerate(args.locations): notify(f"Loading signature information from {loc}.") - if n_files % 100 == 0: + if n_files and n_files % 100 == 0: notify(f"... loaded {len(collected_mf)} sigs from {n_files} files") idx = sourmash.load_file_as_index(loc) if idx.manifest is None and require_manifest: @@ -1600,8 +1637,22 @@ def collect(args): mf = sourmash_args.get_manifest(idx) + # decide how to rewrite locations to container: + if args.abspath: + # convert to abspath + new_iloc = os.path.abspath(loc) + elif args.relpath: + # interpret paths relative to manifest directory + if loc.startswith("/"): + notify(f"** WARNING: cannot convert abspath {loc} into relative path.") + new_iloc = os.path.join(relpath, loc) + else: + # default: paths are relative to cwd. This breaks when sketches + # are in subdirectories; will be deprecated for v5. + new_iloc = loc + for row in mf.rows: - row["internal_location"] = loc + row["internal_location"] = new_iloc collected_mf.add_row(row) if args.manifest_format == "csv": diff --git a/tests/conftest.py b/tests/conftest.py index 00476e8a7d..4274382e74 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -89,6 +89,21 @@ def sig_save_extension_abund(request): return request.param +# these should both always succeed for 'sig check' and 'sig collect' output +# manifests. +@pytest.fixture(params=["--abspath", "--relpath"]) +def abspath_or_relpath(request): + return request.param + + +# this will fail if subdirs used; see #3008. but this ensures v4 behavior of +# sig collect/sig check works, where manifest paths are interpreted relative +# to cwd. +@pytest.fixture(params=["--no-abspath", "--abspath", "--relpath"]) +def abspath_relpath_v4(request): + return request.param + + # --- BEGIN - Only run tests using a particular fixture --- # # Cribbed from: http://pythontesting.net/framework/pytest/pytest-run-tests-using-particular-fixture/ def pytest_collection_modifyitems(items, config): diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 7f8365118f..9f14b6df58 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -4799,13 +4799,20 @@ def test_sig_kmers_2_hp(runtmp): assert check_mh2.similarity(mh) == 1.0 -def test_sig_check_1(runtmp): +def test_sig_check_1(runtmp, abspath_relpath_v4): # basic check functionality sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) picklist = utils.get_test_data("gather/salmonella-picklist.csv") runtmp.sourmash( - "sig", "check", *sigfiles, "--picklist", f"{picklist}::manifest", "-m", "mf.csv" + "sig", + "check", + *sigfiles, + "--picklist", + f"{picklist}::manifest", + "-m", + "mf.csv", + abspath_relpath_v4, ) out_mf = runtmp.output("mf.csv") @@ -4826,7 +4833,50 @@ def test_sig_check_1(runtmp): assert 31 in ksizes -def test_sig_check_1_mf_csv_gz(runtmp): +def test_sig_check_1_fail_abspath_relpath(runtmp): + # basic check functionality + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + with pytest.raises( + SourmashCommandFailed, + match="Cannot specify both --abspath and --relpath; pick one!", + ): + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "--picklist", + f"{picklist}::manifest", + "-m", + "mf.csv", + "--abspath", + "--relpath", + ) + + +def test_sig_check_1_warn_abspath_relpath(runtmp, abspath_or_relpath): + # warn that without -m, --abspath/--relpath are not helpful + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "--picklist", + f"{picklist}::manifest", + abspath_or_relpath, + ) + + err = runtmp.last_result.err + assert ( + " WARNING: --abspath and --relpath only have effects when saving a manifest" + in err + ) + + +def test_sig_check_1_mf_csv_gz(runtmp, abspath_relpath_v4): # basic check functionality, with gzipped manifest output sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) picklist = utils.get_test_data("gather/salmonella-picklist.csv") @@ -4839,6 +4889,7 @@ def test_sig_check_1_mf_csv_gz(runtmp): f"{picklist}::manifest", "-m", "mf.csv.gz", + abspath_relpath_v4, ) out_mf = runtmp.output("mf.csv.gz") @@ -4859,7 +4910,7 @@ def test_sig_check_1_mf_csv_gz(runtmp): assert 31 in ksizes -def test_sig_check_1_gz(runtmp): +def test_sig_check_1_gz(runtmp, abspath_relpath_v4): # basic check functionality with gzipped picklist sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) picklist = utils.get_test_data("gather/salmonella-picklist.csv") @@ -4877,6 +4928,7 @@ def test_sig_check_1_gz(runtmp): "salmonella.csv.gz::manifest", "-m", "mf.csv", + abspath_relpath_v4, ) out_mf = runtmp.output("mf.csv") @@ -4897,7 +4949,7 @@ def test_sig_check_1_gz(runtmp): assert 31 in ksizes -def test_sig_check_1_nofail(runtmp): +def test_sig_check_1_nofail(runtmp, abspath_relpath_v4): # basic check functionality with --fail-if-missing sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) picklist = utils.get_test_data("gather/salmonella-picklist.csv") @@ -4911,6 +4963,7 @@ def test_sig_check_1_nofail(runtmp): "-m", "mf.csv", "--fail-if-missing", + abspath_relpath_v4, ) out_mf = runtmp.output("mf.csv") @@ -4952,7 +5005,7 @@ def test_sig_check_1_no_picklist(runtmp): ("name", "identprefix"), ), ) -def test_sig_check_1_column(runtmp, column, coltype): +def test_sig_check_1_column(runtmp, column, coltype, abspath_relpath_v4): # basic check functionality for various columns/coltypes sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) picklist = utils.get_test_data("gather/salmonella-picklist.csv") @@ -4967,6 +5020,7 @@ def test_sig_check_1_column(runtmp, column, coltype): "mf.csv", "-o", "missing.csv", + abspath_relpath_v4, ) out_mf = runtmp.output("mf.csv") @@ -4987,7 +5041,7 @@ def test_sig_check_1_column(runtmp, column, coltype): assert 31 in ksizes -def test_sig_check_1_diff_col_name(runtmp): +def test_sig_check_1_diff_col_name(runtmp, abspath_relpath_v4): # 'sig check' with 'name2' column instead of default name sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) picklist = utils.get_test_data("gather/salmonella-picklist-diffcolumn.csv") @@ -5002,6 +5056,7 @@ def test_sig_check_1_diff_col_name(runtmp): "missing.csv", "-m", "mf.csv", + abspath_relpath_v4, ) out_mf = runtmp.output("mf.csv") @@ -5036,7 +5091,7 @@ def test_sig_check_1_diff_col_name(runtmp): assert rows[1][0] == "NOT THERE" -def test_sig_check_1_diff_col_name_zip(runtmp): +def test_sig_check_1_diff_col_name_zip(runtmp, abspath_relpath_v4): # 'sig check' with 'name2' column instead of default name, on a zip file sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) picklist = utils.get_test_data("gather/salmonella-picklist-diffcolumn.csv") @@ -5055,6 +5110,7 @@ def test_sig_check_1_diff_col_name_zip(runtmp): "missing.csv", "-m", "mf.csv", + abspath_relpath_v4, ) out_mf = runtmp.output("mf.csv") @@ -5089,7 +5145,7 @@ def test_sig_check_1_diff_col_name_zip(runtmp): assert rows[1][0] == "NOT THERE" -def test_sig_check_1_diff_col_name_exclude(runtmp): +def test_sig_check_1_diff_col_name_exclude(runtmp, abspath_relpath_v4): # 'sig check' with 'name2' column, :exclude picklist sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) picklist = utils.get_test_data("gather/salmonella-picklist-diffcolumn.csv") @@ -5102,6 +5158,7 @@ def test_sig_check_1_diff_col_name_exclude(runtmp): f"{picklist}:name2:name:exclude", "-m", "mf.csv", + abspath_relpath_v4, ) out_mf = runtmp.output("mf.csv") @@ -5122,7 +5179,7 @@ def test_sig_check_1_diff_col_name_exclude(runtmp): assert 31 in ksizes -def test_sig_check_1_ksize(runtmp): +def test_sig_check_1_ksize(runtmp, abspath_relpath_v4): # basic check functionality with selection for ksize sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) picklist = utils.get_test_data("gather/salmonella-picklist.csv") @@ -5137,6 +5194,7 @@ def test_sig_check_1_ksize(runtmp): f"{picklist}::manifest", "-m", "mf.csv", + abspath_relpath_v4, ) out_mf = runtmp.output("mf.csv") @@ -5155,7 +5213,7 @@ def test_sig_check_1_ksize(runtmp): assert 31 in ksizes -def test_sig_check_1_ksize_output_sql(runtmp): +def test_sig_check_1_ksize_output_sql(runtmp, abspath_relpath_v4): # basic check functionality with selection for ksize sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) picklist = utils.get_test_data("gather/salmonella-picklist.csv") @@ -5172,6 +5230,7 @@ def test_sig_check_1_ksize_output_sql(runtmp): "mf.mfsql", "-F", "sql", + abspath_relpath_v4, ) out_mf = runtmp.output("mf.mfsql") @@ -5190,7 +5249,7 @@ def test_sig_check_1_ksize_output_sql(runtmp): assert 31 in ksizes -def test_sig_check_2_output_missing(runtmp): +def test_sig_check_2_output_missing(runtmp, abspath_relpath_v4): # output missing all as identical to input picklist sigfiles = utils.get_test_data("gather/combined.sig") picklist = utils.get_test_data("gather/salmonella-picklist.csv") @@ -5205,6 +5264,7 @@ def test_sig_check_2_output_missing(runtmp): "missing.csv", "-m", "mf.csv", + abspath_relpath_v4, ) out_csv = runtmp.output("missing.csv") @@ -5265,7 +5325,7 @@ def test_sig_check_2_output_missing_error_exit(runtmp): ("name", "identprefix"), ), ) -def test_sig_check_2_output_missing_column(runtmp, column, coltype): +def test_sig_check_2_output_missing_column(runtmp, column, coltype, abspath_relpath_v4): # output missing all as identical to input picklist sigfiles = utils.get_test_data("gather/combined.sig") picklist = utils.get_test_data("gather/salmonella-picklist.csv") @@ -5278,6 +5338,7 @@ def test_sig_check_2_output_missing_column(runtmp, column, coltype): f"{picklist}::manifest", "-o", "missing.csv", + abspath_relpath_v4, ) out_csv = runtmp.output("missing.csv") @@ -5328,7 +5389,7 @@ def test_sig_check_3_no_manifest(runtmp): assert "sig check requires a manifest by default, but no manifest present." in err -def test_sig_check_3_no_manifest_ok(runtmp): +def test_sig_check_3_no_manifest_ok(runtmp, abspath_relpath_v4): # generate manifest if --no-require-manifest sbt = utils.get_test_data("v6.sbt.zip") picklist = utils.get_test_data("v6.sbt.zip.mf.csv") @@ -5340,6 +5401,7 @@ def test_sig_check_3_no_manifest_ok(runtmp): "--no-require-manifest", "--picklist", f"{picklist}::manifest", + abspath_relpath_v4, ) print(runtmp.last_result.out) @@ -5348,3 +5410,276 @@ def test_sig_check_3_no_manifest_ok(runtmp): "for given picklist, found 7 matches to 7 distinct values" in runtmp.last_result.err ) + + +def test_sig_check_4_manifest_cwd_cwd(runtmp, abspath_relpath_v4): + # check: manifest and sigs in cwd + prot_zip = utils.get_test_data("prot/all.zip") + + shutil.copyfile(prot_zip, runtmp.output("prot.zip")) + + # generate a picklist, whatever + runtmp.sourmash("sig", "manifest", "prot.zip", "-o", "picklist.csv") + assert os.path.exists(runtmp.output("picklist.csv")) + + # use picklist with sig check to generate a manifest + runtmp.sourmash( + "sig", + "check", + "-m", + "mf.csv", + "--picklist", + "picklist.csv::manifest", + "prot.zip", + abspath_relpath_v4, + ) + + # check that it all works + runtmp.sourmash("sig", "cat", "mf.csv") + + +def test_sig_check_4_manifest_subdir_cwd(runtmp, abspath_or_relpath): + # check: manifest in subdir and sigs in cwd. note, + # fails with default v4 behavior. see #3008. + prot_zip = utils.get_test_data("prot/all.zip") + + shutil.copyfile(prot_zip, runtmp.output("prot.zip")) + os.mkdir(runtmp.output("mf_dir")) + + # generate a picklist, whatever + runtmp.sourmash("sig", "manifest", "prot.zip", "-o", "picklist.csv") + assert os.path.exists(runtmp.output("picklist.csv")) + + # use picklist with sig check to generate a manifest + runtmp.sourmash( + "sig", + "check", + "-m", + "mf_dir/mf.csv", + "--picklist", + "picklist.csv::manifest", + "prot.zip", + abspath_or_relpath, + ) + + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + # check that it all works + runtmp.sourmash("sig", "cat", "mf_dir/mf.csv") + + +def test_sig_check_4_manifest_cwd_subdir(runtmp, abspath_relpath_v4): + # check: manifest in cwd and sigs in subdir + prot_zip = utils.get_test_data("prot/all.zip") + + os.mkdir(runtmp.output("zip_dir")) + shutil.copyfile(prot_zip, runtmp.output("zip_dir/prot.zip")) + + # generate a picklist, whatever + runtmp.sourmash("sig", "manifest", "zip_dir/prot.zip", "-o", "picklist.csv") + assert os.path.exists(runtmp.output("picklist.csv")) + + # use picklist with sig check to generate a manifest + runtmp.sourmash( + "sig", + "check", + "-m", + "mf.csv", + "--picklist", + "picklist.csv::manifest", + "zip_dir/prot.zip", + abspath_relpath_v4, + ) + + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + # check that it all works + runtmp.sourmash("sig", "cat", "mf.csv") + + +def test_sig_check_4_manifest_subdir_subdir(runtmp, abspath_or_relpath): + # check: manifest and sigs in subdir. note, fails with default v4 behavior. + # see #3008. + prot_zip = utils.get_test_data("prot/all.zip") + + os.mkdir(runtmp.output("zip_dir")) + shutil.copyfile(prot_zip, runtmp.output("zip_dir/prot.zip")) + os.mkdir(runtmp.output("mf_dir")) + + # generate a picklist, whatever + runtmp.sourmash("sig", "manifest", "zip_dir/prot.zip", "-o", "picklist.csv") + assert os.path.exists(runtmp.output("picklist.csv")) + + # use picklist with sig check to generate a manifest + runtmp.sourmash( + "sig", + "check", + "-m", + "mf_dir/mf.csv", + "--picklist", + "picklist.csv::manifest", + "zip_dir/prot.zip", + abspath_or_relpath, + ) + + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + # check that it all works + runtmp.sourmash("sig", "cat", "mf_dir/mf.csv") + + +def test_sig_check_5_relpath(runtmp): + # check path rewriting when sketches are in a subdir. + # this will be the default behavior in v5 => remove --relpath. + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + os.mkdir(runtmp.output("mf_dir")) + + os.mkdir(runtmp.output("sigs_dir")) + new_names = [] + for f in sigfiles: + basename = os.path.basename(f) + filename = os.path.join("sigs_dir", basename) + + shutil.copyfile(f, runtmp.output(filename)) + new_names.append(filename) + + runtmp.sourmash( + "sig", + "check", + *new_names, + "--picklist", + f"{picklist}::manifest", + "-m", + "mf_dir/mf.csv", + "--relpath", + ) + + out_mf = runtmp.output("mf_dir/mf.csv") + assert os.path.exists(out_mf) + + # all should match. + with open(out_mf, newline="") as fp: + mf = CollectionManifest.load_from_csv(fp) + assert len(mf) == 24 + + locations = [row["internal_location"] for row in mf.rows] + expected_names = ["../" + f for f in new_names] + assert set(locations).issubset(expected_names), (locations, expected_names) + + +def test_sig_check_5_relpath_subdir(runtmp): + # check path rewriting when both sigs and mf are in different subdirs. + # this will be the default behavior in v5 => can remove --relpath then. + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + os.mkdir(runtmp.output("sigs_dir")) + new_names = [] + for f in sigfiles: + basename = os.path.basename(f) + filename = os.path.join("sigs_dir", basename) + + shutil.copyfile(f, runtmp.output(filename)) + new_names.append(filename) + + runtmp.sourmash( + "sig", + "check", + *new_names, + "--picklist", + f"{picklist}::manifest", + "-m", + "mf.csv", + "--relpath", + ) + + out_mf = runtmp.output("mf.csv") + assert os.path.exists(out_mf) + + # all should match. + with open(out_mf, newline="") as fp: + mf = CollectionManifest.load_from_csv(fp) + assert len(mf) == 24 + + locations = [row["internal_location"] for row in mf.rows] + print("XXX", locations) + print("YYY", new_names) + expected_names = ["./" + f for f in new_names] + assert set(locations).issubset(expected_names), (locations, expected_names) + + +def test_sig_check_5_abspath(runtmp): + # check path rewriting with `--abspath` => absolute paths. + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + for f in sigfiles: + shutil.copyfile(f, runtmp.output(os.path.basename(f))) + + # strip off abspath + sigfiles = [os.path.basename(f) for f in sigfiles] + + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "--picklist", + f"{picklist}::manifest", + "-m", + "mf.csv", + "--abspath", + ) + + out_mf = runtmp.output("mf.csv") + assert os.path.exists(out_mf) + + # all should match. + with open(out_mf, newline="") as fp: + mf = CollectionManifest.load_from_csv(fp) + assert len(mf) == 24 + + locations = [row["internal_location"] for row in mf.rows] + for k in locations: + assert k.startswith("/") # absolute + assert os.path.basename(k) in sigfiles # converts back to basic + + +def test_sig_check_5_no_abspath(runtmp): + # check path rewriting for default (--no-relpath --no-abspath) + # this behavior will change in v5; specify `--no-abspath` then? + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + for f in sigfiles: + shutil.copyfile(f, runtmp.output(os.path.basename(f))) + + # strip off abspath + sigfiles = [os.path.basename(f) for f in sigfiles] + + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "--picklist", + f"{picklist}::manifest", + "-m", + "mf.csv", + # "--no-abspath" # => default behavior + ) + + out_mf = runtmp.output("mf.csv") + assert os.path.exists(out_mf) + + # all should match. + with open(out_mf, newline="") as fp: + mf = CollectionManifest.load_from_csv(fp) + assert len(mf) == 24 + + locations = [row["internal_location"] for row in mf.rows] + # no rewriting + assert set(locations).issubset(sigfiles) diff --git a/tests/test_cmd_signature_collect.py b/tests/test_cmd_signature_collect.py index edd7c16a29..917286e881 100644 --- a/tests/test_cmd_signature_collect.py +++ b/tests/test_cmd_signature_collect.py @@ -13,13 +13,21 @@ from sourmash_tst_utils import SourmashCommandFailed -def test_sig_collect_0_nothing(runtmp, manifest_db_format): +def test_sig_collect_0_nothing(runtmp, manifest_db_format, abspath_relpath_v4): # run with just output ext = "sqlmf" if manifest_db_format == "sql" else "csv" if manifest_db_format != "sql": return - runtmp.sourmash("sig", "collect", "-o", f"mf.{ext}", "-F", manifest_db_format) + runtmp.sourmash( + "sig", + "collect", + "-o", + f"mf.{ext}", + "-F", + manifest_db_format, + abspath_relpath_v4, + ) manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) @@ -27,14 +35,43 @@ def test_sig_collect_0_nothing(runtmp, manifest_db_format): assert len(manifest) == 0 -def test_sig_collect_1_zipfile(runtmp, manifest_db_format): +def test_sig_collect_0_fail_abspath_relpath(runtmp, manifest_db_format): + # check that it complains if both --abspath and --relpath are specified + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + if manifest_db_format != "sql": + return + + with pytest.raises( + SourmashCommandFailed, + match="Cannot specify both --abspath and --relpath; pick one!", + ): + runtmp.sourmash( + "sig", + "collect", + "-o", + f"mf.{ext}", + "-F", + manifest_db_format, + "--abspath", + "--relpath", + ) + + +def test_sig_collect_1_zipfile(runtmp, manifest_db_format, abspath_relpath_v4): # collect a manifest from a .zip file protzip = utils.get_test_data("prot/protein.zip") ext = "sqlmf" if manifest_db_format == "sql" else "csv" runtmp.sourmash( - "sig", "collect", protzip, "-o", f"mf.{ext}", "-F", manifest_db_format + "sig", + "collect", + protzip, + "-o", + f"mf.{ext}", + "-F", + manifest_db_format, + abspath_relpath_v4, ) manifest_fn = runtmp.output(f"mf.{ext}") @@ -46,11 +83,13 @@ def test_sig_collect_1_zipfile(runtmp, manifest_db_format): assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list -def test_sig_collect_1_zipfile_csv_gz(runtmp): +def test_sig_collect_1_zipfile_csv_gz(runtmp, abspath_relpath_v4): # collect a manifest from a .zip file, save to csv.gz protzip = utils.get_test_data("prot/protein.zip") - runtmp.sourmash("sig", "collect", protzip, "-o", "mf.csv.gz", "-F", "csv") + runtmp.sourmash( + "sig", "collect", protzip, "-o", "mf.csv.gz", "-F", "csv", abspath_relpath_v4 + ) manifest_fn = runtmp.output("mf.csv.gz") @@ -67,11 +106,13 @@ def test_sig_collect_1_zipfile_csv_gz(runtmp): assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list -def test_sig_collect_1_zipfile_csv_gz_roundtrip(runtmp): +def test_sig_collect_1_zipfile_csv_gz_roundtrip(runtmp, abspath_relpath_v4): # collect a manifest from a .zip file, save to csv.gz; then load again protzip = utils.get_test_data("prot/protein.zip") - runtmp.sourmash("sig", "collect", protzip, "-o", "mf.csv.gz", "-F", "csv") + runtmp.sourmash( + "sig", "collect", protzip, "-o", "mf.csv.gz", "-F", "csv", abspath_relpath_v4 + ) manifest_fn = runtmp.output("mf.csv.gz") @@ -125,7 +166,7 @@ def test_sig_collect_2_exists_fail(runtmp, manifest_db_format): ) -def test_sig_collect_2_exists_merge(runtmp, manifest_db_format): +def test_sig_collect_2_exists_merge(runtmp, manifest_db_format, abspath_relpath_v4): # collect a manifest from two .zip files protzip = utils.get_test_data("prot/protein.zip") allzip = utils.get_test_data("prot/all.zip") @@ -133,7 +174,14 @@ def test_sig_collect_2_exists_merge(runtmp, manifest_db_format): ext = "sqlmf" if manifest_db_format == "sql" else "csv" runtmp.sourmash( - "sig", "collect", protzip, "-o", f"mf.{ext}", "-F", manifest_db_format + "sig", + "collect", + protzip, + "-o", + f"mf.{ext}", + "-F", + manifest_db_format, + abspath_relpath_v4, ) manifest_fn = runtmp.output(f"mf.{ext}") @@ -205,7 +253,7 @@ def test_sig_collect_2_exists_csv_merge_sql(runtmp): assert "ERROR loading" in runtmp.last_result.err -def test_sig_collect_2_no_exists_merge(runtmp, manifest_db_format): +def test_sig_collect_2_no_exists_merge(runtmp, manifest_db_format, abspath_relpath_v4): # test 'merge' when args.output doesn't already exist => warning utils.get_test_data("prot/protein.zip") allzip = utils.get_test_data("prot/all.zip") @@ -215,7 +263,15 @@ def test_sig_collect_2_no_exists_merge(runtmp, manifest_db_format): # run with --merge but no previous: runtmp.sourmash( - "sig", "collect", allzip, "-o", manifest_fn, "-F", manifest_db_format, "--merge" + "sig", + "collect", + allzip, + "-o", + manifest_fn, + "-F", + manifest_db_format, + "--merge", + abspath_relpath_v4, ) manifest = BaseCollectionManifest.load_from_filename(manifest_fn) @@ -411,6 +467,94 @@ def test_sig_collect_4_multiple_no_abspath(runtmp, manifest_db_format): assert "47.fa.sig" in locations assert "63.fa.sig" in locations + runtmp.sourmash("sig", "cat", f"mf.{ext}") + + +def test_sig_collect_4_multiple_subdir_subdir_no_abspath(runtmp, manifest_db_format): + # collect a manifest from sig files, no abspath; use a subdir for sketches + # this should work with default behavior. + sig43 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + + # copy files to tmp, where they will not have full paths + os.mkdir(runtmp.output("sigs_dir")) + shutil.copyfile(sig43, runtmp.output("sigs_dir/47.fa.sig")) + shutil.copyfile(sig63, runtmp.output("sigs_dir/63.fa.sig")) + + # put manifest in subdir too. + os.mkdir(runtmp.output("mf_dir")) + + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + + runtmp.sourmash( + "sig", + "collect", + "sigs_dir/47.fa.sig", + "sigs_dir/63.fa.sig", + "-o", + f"mf_dir/mf.{ext}", + "-F", + manifest_db_format, + "--relpath", + ) + + manifest_fn = runtmp.output(f"mf_dir/mf.{ext}") + manifest = BaseCollectionManifest.load_from_filename(manifest_fn) + + assert len(manifest) == 2 + md5_list = [row["md5"] for row in manifest.rows] + assert "09a08691ce52952152f0e866a59f6261" in md5_list + assert "38729c6374925585db28916b82a6f513" in md5_list + + locations = set([row["internal_location"] for row in manifest.rows]) + print(locations) + assert len(locations) == 2, locations + assert "../sigs_dir/47.fa.sig" in locations + assert "../sigs_dir/63.fa.sig" in locations + + runtmp.sourmash("sig", "cat", f"mf_dir/mf.{ext}") + + +def test_sig_collect_4_multiple_cwd_subdir_no_abspath(runtmp, manifest_db_format): + # collect a manifest from sig files, no abspath; use a subdir for sketches + # this should work with default behavior. + sig43 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + + # copy files to tmp, where they will not have full paths + os.mkdir(runtmp.output("sigs_dir")) + shutil.copyfile(sig43, runtmp.output("sigs_dir/47.fa.sig")) + shutil.copyfile(sig63, runtmp.output("sigs_dir/63.fa.sig")) + + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + + runtmp.sourmash( + "sig", + "collect", + "sigs_dir/47.fa.sig", + "sigs_dir/63.fa.sig", + "-o", + f"mf.{ext}", + "-F", + manifest_db_format, + ) + + manifest_fn = runtmp.output(f"mf.{ext}") + manifest = BaseCollectionManifest.load_from_filename(manifest_fn) + + assert len(manifest) == 2 + md5_list = [row["md5"] for row in manifest.rows] + assert "09a08691ce52952152f0e866a59f6261" in md5_list + assert "38729c6374925585db28916b82a6f513" in md5_list + + locations = set([row["internal_location"] for row in manifest.rows]) + print(locations) + assert len(locations) == 2, locations + assert "sigs_dir/47.fa.sig" in locations + assert "sigs_dir/63.fa.sig" in locations + + runtmp.sourmash("sig", "cat", f"mf.{ext}") + def test_sig_collect_5_no_manifest_sbt_fail(runtmp, manifest_db_format): # collect a manifest from files that don't have one @@ -424,7 +568,9 @@ def test_sig_collect_5_no_manifest_sbt_fail(runtmp, manifest_db_format): ) -def test_sig_collect_5_no_manifest_sbt_succeed(runtmp, manifest_db_format): +def test_sig_collect_5_no_manifest_sbt_succeed( + runtmp, manifest_db_format, abspath_relpath_v4 +): # generate a manifest from files that don't have one when --no-require sbt_zip = utils.get_test_data("v6.sbt.zip") @@ -439,6 +585,7 @@ def test_sig_collect_5_no_manifest_sbt_succeed(runtmp, manifest_db_format): f"mf.{ext}", "-F", manifest_db_format, + abspath_relpath_v4, ) manifest_fn = runtmp.output(f"mf.{ext}") @@ -448,3 +595,111 @@ def test_sig_collect_5_no_manifest_sbt_succeed(runtmp, manifest_db_format): locations = set([row["internal_location"] for row in manifest.rows]) assert len(locations) == 1, locations assert sbt_zip in locations + + +def test_sig_collect_6_path_cwd_cwd(runtmp, manifest_db_format, abspath_relpath_v4): + # check: manifest and sigs in cwd + protzip = utils.get_test_data("prot/protein.zip") + + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + + protzip_path = "protein.zip" + shutil.copyfile(protzip, runtmp.output(protzip_path)) + + mf_path = f"mf.{ext}" + + runtmp.sourmash( + "sig", + "collect", + protzip_path, + "-o", + mf_path, + "-F", + manifest_db_format, + abspath_relpath_v4, + ) + + runtmp.sourmash("sig", "cat", mf_path) + + +def test_sig_collect_6_path_cwd_subdir(runtmp, manifest_db_format, abspath_relpath_v4): + # check: manifest in cwd, sigs in subdir + protzip = utils.get_test_data("prot/protein.zip") + + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + + os.mkdir(runtmp.output("sigs_dir")) + protzip_path = "sigs_dir/protein.zip" + shutil.copyfile(protzip, runtmp.output(protzip_path)) + + mf_path = f"mf.{ext}" + + runtmp.sourmash( + "sig", + "collect", + protzip_path, + "-o", + mf_path, + "-F", + manifest_db_format, + abspath_relpath_v4, + ) + + runtmp.sourmash("sig", "cat", mf_path) + + +def test_sig_collect_6_path_subdir_cwd(runtmp, manifest_db_format, abspath_or_relpath): + # check: manifest in cwd, sigs in subdir. note, fails with default v4 + # behavior. see #3008. + protzip = utils.get_test_data("prot/protein.zip") + + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + + protzip_path = "protein.zip" + shutil.copyfile(protzip, runtmp.output(protzip_path)) + + os.mkdir(runtmp.output("mf_dir")) + mf_path = f"mf_dir/mf.{ext}" + + runtmp.sourmash( + "sig", + "collect", + protzip_path, + "-o", + mf_path, + "-F", + manifest_db_format, + abspath_or_relpath, + ) + + runtmp.sourmash("sig", "cat", mf_path) + + +def test_sig_collect_6_path_subdir_subdir( + runtmp, manifest_db_format, abspath_or_relpath +): + # check: manifest and sigs in subdir. note, fails with default v4 + # behavior. see #3008. + protzip = utils.get_test_data("prot/protein.zip") + + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + + os.mkdir(runtmp.output("sigs_dir")) + protzip_path = "sigs_dir/protein.zip" + shutil.copyfile(protzip, runtmp.output(protzip_path)) + + os.mkdir(runtmp.output("mf_dir")) + mf_path = f"mf_dir/mf.{ext}" + + runtmp.sourmash( + "sig", + "collect", + protzip_path, + "-o", + mf_path, + "-F", + manifest_db_format, + abspath_or_relpath, + ) + + runtmp.sourmash("sig", "cat", mf_path)