From 69edb6090fea99ea4a207e1829f1afbc2f8d6172 Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Tue, 12 Mar 2024 14:05:13 -0700 Subject: [PATCH 01/19] MRG: enable loading lineages from annotated gather with match_name instead of name (#3078) This PR enables loading from gather lineages files that contain 'match_name' instead of name. Rationale: - we generally plan to replace 'name' with 'match_name' in gather output (https://github.com/sourmash-bio/sourmash/issues/1555) - branchwater plugin's fastmultigather already uses 'match_name' --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- src/sourmash/tax/tax_utils.py | 15 ++++++++--- .../tax/test1.gather.with-lineages.csv | 5 ++++ tests/test_tax_utils.py | 25 ++++++++++++++++++- 3 files changed, 40 insertions(+), 5 deletions(-) create mode 100644 tests/test-data/tax/test1.gather.with-lineages.csv diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 55feed66d2..1615c90d74 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1274,7 +1274,9 @@ def load( elif "accession" in header: identifier = "accession" header = ["ident" if "accession" == x else x for x in header] - elif "name" in header and "lineage" in header: + elif "lineage" in header and any( + ["name" in header, "match_name" in header] + ): return cls.load_from_gather_with_lineages( filename, force=force, lins=lins, ictv=ictv ) @@ -1390,9 +1392,14 @@ def load_from_gather_with_lineages( if not header: raise ValueError(f"cannot read taxonomy assignments from {filename}") - if "name" not in header or "lineage" not in header: + ident_col = None + if "name" in header: + ident_col = "name" + elif "match_name" in header: + ident_col = "match_name" + if "lineage" not in header or ident_col is None: raise ValueError( - "Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?" 
+ "Expected headers 'name'/'match_name' and 'lineage' not found. Is this a with-lineages file?" ) ranks = None @@ -1405,7 +1412,7 @@ def load_from_gather_with_lineages( for n, row in enumerate(r): num_rows += 1 - name = row["name"] + name = row[ident_col] ident = get_ident(name) if lins: diff --git a/tests/test-data/tax/test1.gather.with-lineages.csv b/tests/test-data/tax/test1.gather.with-lineages.csv new file mode 100644 index 0000000000..1c81221737 --- /dev/null +++ b/tests/test-data/tax/test1.gather.with-lineages.csv @@ -0,0 +1,5 @@ +intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename,query_bp,ksize,scaled,query_n_hashes,lineage +442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,5014000,31,1000,2507,Bacteria;Pseudomonadota;Gammaproteobacteria;Enterobacterales;Enterobacteriaceae;Escherichia;Escherichia coli +390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,5014000,31,1000,2507,Bacteria;Bacteroidota;Bacteroidia;Bacteroidales;Prevotellaceae;Prevotella;Prevotella copri +138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, 
ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,5014000,31,1000,2507,Bacteria;Bacteroidota;Bacteroidia;Bacteroidales;Bacteroidaceae;Phocaeicola;Phocaeicola vulgatus +338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,5014000,31,1000,2507,Bacteria;Bacteroidota;Bacteroidia;Bacteroidales;Prevotellaceae;Prevotella;Prevotella copri diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 192406e251..dfca20628a 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1014,7 +1014,7 @@ def test_check_and_load_gather_lineage_csvs_bad_header(runtmp): with pytest.raises(ValueError) as exc: LineageDB.load_from_gather_with_lineages(g_res) assert ( - "Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?" + "Expected headers 'name'/'match_name' and 'lineage' not found. Is this a with-lineages file?" 
in str(exc.value) ) @@ -1038,6 +1038,29 @@ def test_check_and_load_gather_lineage_csvs_isdir(runtmp): assert "is a directory" in str(exc.value) +def test_check_and_load_gather_lineage_csvs_name(runtmp): + # test loading a with-lineage file that has 'name', not 'match_name' + g_res = utils.get_test_data("tax/test1.gather.with-lineages.csv") + + lins = LineageDB.load_from_gather_with_lineages(g_res) + assert len(lins) == 4 + + +def test_check_and_load_gather_lineage_csvs_match_name(runtmp): + # test loading a with-lineage file that has 'match_name' instead of 'name' + g_res = utils.get_test_data("tax/test1.gather.with-lineages.csv") + out_lins = runtmp.output("match-name.lineages.csv") + with open(g_res) as f_in: + first_line = f_in.readline().replace("name", "match_name") + with open(out_lins, "w") as f_out: + f_out.write(first_line) + for line in f_in: + f_out.write(line) + + lins = LineageDB.load_from_gather_with_lineages(out_lins) + assert len(lins) == 4 + + def test_check_and_load_gather_csvs_fail_on_missing(runtmp): g_csv = utils.get_test_data("tax/test1.gather.csv") # make gather results with taxonomy name not in tax_assign From ac21e1d927697a33479b206c91ff363051a5b2b5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 18 Mar 2024 15:26:22 -0700 Subject: [PATCH 02/19] Bump pypa/cibuildwheel from 2.16.5 to 2.17.0 (#3084) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.16.5 to 2.17.0.
Release notes

Sourced from pypa/cibuildwheel's releases.

v2.17.0

  • 🌟 Adds the ability to inherit configuration in TOML overrides. This makes certain configurations much simpler. If you're overriding an option like before-build or environment, and you just want to add an extra command or environment variable, you can just append (or prepend) to the previous config. See the docs for more information. (#1730)
  • 🌟 Adds official support for native arm64 macOS GitHub runners. To use them, just specify macos-14 as an os of your job in your workflow file. You can also keep macos-13 in your build matrix to build x86_64. Check out the new GitHub Actions example config.
  • ✨ You no longer need to specify --platform to run cibuildwheel locally! Instead it will detect your platform automatically. This was a safety feature, no longer necessary. (#1727)
  • 🛠 Removed setuptools and wheel pinned versions. This only affects old-style projects without a pyproject.toml, projects with pyproject.toml are already getting fresh versions of their build-system.requires installed into an isolated environment. (#1725)
  • 🛠 Improve how the GitHub Action passes arguments (#1757)
  • 🛠 Remove a system-wide install of pipx in the GitHub Action (#1745)
  • 🐛 No longer will cibuildwheel override the PIP_CONSTRAINT environment variable when using the build frontend. Instead it will be extended. (#1675)
  • 🐛 Fix a bug where building and testing both x86_64 and arm64 wheels on the same runner caused the wrong architectures in the test environment (#1750)
  • 🐛 Fix a bug that prevented testing a CPython 3.8 wheel targeting macOS 11+ on x86_64 (#1768)
  • 📚 Moved the docs onto the official PyPA domain - they're now available at https://cibuildwheel.pypa.io . (#1775)
  • 📚 Docs and examples improvements (#1762, #1734)
Changelog

Sourced from pypa/cibuildwheel's changelog.

v2.17.0

11 March 2024

  • 🌟 Adds the ability to inherit configuration in TOML overrides. This makes certain configurations much simpler. If you're overriding an option like before-build or environment, and you just want to add an extra command or environment variable, you can just append (or prepend) to the previous config. See the docs for more information. (#1730)
  • 🌟 Adds official support for native arm64 macOS GitHub runners. To use them, just specify macos-14 as an os of your job in your workflow file. You can also keep macos-13 in your build matrix to build x86_64. Check out the new GitHub Actions example config.
  • ✨ You no longer need to specify --platform to run cibuildwheel locally! Instead it will detect your platform automatically. This was a safety feature, no longer necessary. (#1727)
  • 🛠 Removed setuptools and wheel pinned versions. This only affects old-style projects without a pyproject.toml, projects with pyproject.toml are already getting fresh versions of their build-system.requires installed into an isolated environment. (#1725)
  • 🛠 Improve how the GitHub Action passes arguments (#1757)
  • 🛠 Remove a system-wide install of pipx in the GitHub Action (#1745)
  • 🐛 No longer will cibuildwheel override the PIP_CONSTRAINT environment variable when using the build frontend. Instead it will be extended. (#1675)
  • 🐛 Fix a bug where building and testing both x86_64 and arm64 wheels on the same runner caused the wrong architectures in the test environment (#1750)
  • 🐛 Fix a bug that prevented testing a CPython 3.8 wheel targeting macOS 11+ on x86_64 (#1768)
  • 📚 Moved the docs onto the official PyPA domain - they're now available at https://cibuildwheel.pypa.io . (#1775)
  • 📚 Docs and examples improvements (#1762, #1734)
Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=pypa/cibuildwheel&package-manager=github_actions&previous-version=2.16.5&new-version=2.17.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build_wheel.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml index 841c0a03b9..1f85234218 100644 --- a/.github/workflows/build_wheel.yml +++ b/.github/workflows/build_wheel.yml @@ -46,7 +46,7 @@ jobs: python-version: '3.10' - name: Build wheels - uses: pypa/cibuildwheel@v2.16.5 + uses: pypa/cibuildwheel@v2.17.0 env: CIBW_ENVIRONMENT_MACOS: ${{ matrix.macos_target }} CIBW_ARCHS_LINUX: ${{ matrix.arch }} From d0b3c4737571486d04b659029920cffbfc096ac3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 18 Mar 2024 23:45:38 +0000 Subject: [PATCH 03/19] Bump thiserror from 1.0.57 to 1.0.58 (#3082) Bumps [thiserror](https://github.com/dtolnay/thiserror) from 1.0.57 to 1.0.58.
Release notes

Sourced from thiserror's releases.

1.0.58

  • Make backtrace support available when using -Dwarnings (#292)
Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=thiserror&package-manager=cargo&previous-version=1.0.57&new-version=1.0.58)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f39774cc5b..2db96670de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1736,18 +1736,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.57" +version = "1.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" +checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.57" +version = "1.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" +checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" dependencies = [ "proc-macro2", "quote", From feaa9b155dd186c0694d9d457762e965d6756bd0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 19 Mar 2024 12:12:32 +0000 Subject: [PATCH 04/19] Bump DeterminateSystems/nix-installer-action from 9 to 10 (#3083) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [DeterminateSystems/nix-installer-action](https://github.com/determinatesystems/nix-installer-action) from 9 to 10.
Release notes

Sourced from DeterminateSystems/nix-installer-action's releases.

v10

What's Changed

Full Changelog: https://github.com/DeterminateSystems/nix-installer-action/compare/v9...v10

Commits
  • de22e16 DETERMINATE_NIX_KVM fixup, support Magic Nix Cache + FlakeHub Cache on Namesp...
  • e279ba5 Merge pull request #67 from DeterminateSystems/hoverbear/fh-161-after-running...
  • f4a0ffe Don't use docker shim if only using a mounted docker.sock instead of docker-i...
  • ffea801 Merge pull request #66 from DeterminateSystems/hoverbear/fh-160-action-should...
  • 4126bb8 Merge branch 'main' into hoverbear/fh-160-action-should-work-under-nektosact-...
  • 81ee88f Handle docker not existing
  • 0f8fa3d Merge pull request #64 from DeterminateSystems/hoverbear/fh-156-installer-act...
  • f576e90 Fix logic inversion
  • 161c1f6 Use uid not username
  • 0e5b724 No longer require sudo
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=DeterminateSystems/nix-installer-action&package-manager=github_actions&previous-version=9&new-version=10)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/dev_envs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dev_envs.yml b/.github/workflows/dev_envs.yml index a34c4e5301..0936086444 100644 --- a/.github/workflows/dev_envs.yml +++ b/.github/workflows/dev_envs.yml @@ -15,7 +15,7 @@ jobs: fetch-depth: 0 - name: Install Nix - uses: DeterminateSystems/nix-installer-action@v9 + uses: DeterminateSystems/nix-installer-action@v10 - name: Run the Magic Nix Cache uses: DeterminateSystems/magic-nix-cache-action@v3 From 571904dfbc9af01a7ad195e01f91acf1feba91a0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 19 Mar 2024 12:49:27 +0000 Subject: [PATCH 05/19] Bump DeterminateSystems/magic-nix-cache-action from 3 to 4 (#3085) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [DeterminateSystems/magic-nix-cache-action](https://github.com/determinatesystems/magic-nix-cache-action) from 3 to 4.
Release notes

Sourced from DeterminateSystems/magic-nix-cache-action's releases.

v4

What's Changed

Full Changelog: https://github.com/DeterminateSystems/magic-nix-cache-action/compare/v3...v4

Commits
  • fc6aace Merge pull request #38 from DeterminateSystems/flakehub-cache-readme
  • f7df689 Don't publish to ids
  • b505f49 Update readme for flakehub cache
  • 122e91d Merge pull request #35 from DeterminateSystems/flakehub-cache
  • 2c553b2 Add shellcheck checks to CI
  • 455b918 Use -closure ids
  • 1b7beca Merge remote-tracking branch 'origin/main' into flakehub-cache
  • 54acdd1 Merge pull request #36 from DeterminateSystems/grahamc-patch-1
  • 79f590d /latest/ -> /stable/
  • 14fda4e Stop setting the default source branch to 'main'
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=DeterminateSystems/magic-nix-cache-action&package-manager=github_actions&previous-version=3&new-version=4)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/dev_envs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dev_envs.yml b/.github/workflows/dev_envs.yml index 0936086444..8ffb98db64 100644 --- a/.github/workflows/dev_envs.yml +++ b/.github/workflows/dev_envs.yml @@ -17,7 +17,7 @@ jobs: - name: Install Nix uses: DeterminateSystems/nix-installer-action@v10 - name: Run the Magic Nix Cache - uses: DeterminateSystems/magic-nix-cache-action@v3 + uses: DeterminateSystems/magic-nix-cache-action@v4 - run: nix run .# -- --version From 8d5b6bfbf9438aae90d2e0954133daac42330fd3 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 20 Mar 2024 14:14:29 -0700 Subject: [PATCH 06/19] MRG: fix clippy beta issues (#3088) Fixes https://github.com/sourmash-bio/sourmash/issues/3087 --- src/core/src/collection.rs | 3 +- src/core/src/encodings.rs | 2 -- src/core/src/manifest.rs | 3 +- src/core/src/signature.rs | 3 -- src/core/src/sketch/minhash.rs | 52 +--------------------------------- 5 files changed, 3 insertions(+), 60 deletions(-) diff --git a/src/core/src/collection.rs b/src/core/src/collection.rs index 8cc6129cf4..9f708381ef 100644 --- a/src/core/src/collection.rs +++ b/src/core/src/collection.rs @@ -6,8 +6,7 @@ use camino::Utf8PathBuf as PathBuf; use crate::encodings::Idx; use crate::manifest::{Manifest, Record}; use crate::prelude::*; -use crate::signature::Signature; -use crate::storage::{FSStorage, InnerStorage, MemStorage, SigStore, Storage, ZipStorage}; +use crate::storage::{FSStorage, InnerStorage, MemStorage, SigStore, ZipStorage}; use crate::{Error, Result}; #[cfg(feature = "parallel")] diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index ac69cd58eb..f8934596dc 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -1,8 +1,6 @@ use serde::{Deserialize, Serialize}; use std::collections::HashMap; -use 
std::convert::TryFrom; use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}; -use std::iter::Iterator; use std::str; use nohash_hasher::BuildNoHashHasher; diff --git a/src/core/src/manifest.rs b/src/core/src/manifest.rs index c82ca6ee1e..7441a9b69f 100644 --- a/src/core/src/manifest.rs +++ b/src/core/src/manifest.rs @@ -1,4 +1,3 @@ -use std::convert::TryInto; use std::fs::File; use std::io::{BufRead, BufReader, Read, Write}; use std::ops::Deref; @@ -12,7 +11,7 @@ use serde::{Deserialize, Serialize}; use crate::encodings::HashFunctions; use crate::prelude::*; -use crate::signature::{Signature, SigsTrait}; +use crate::signature::SigsTrait; use crate::sketch::Sketch; use crate::Result; diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index da38587dc3..0ab8190f98 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -6,7 +6,6 @@ use core::iter::FusedIterator; use std::fs::File; use std::io; -use std::iter::Iterator; use std::path::Path; use std::str; @@ -18,7 +17,6 @@ use typed_builder::TypedBuilder; use crate::encodings::{aa_to_dayhoff, aa_to_hp, revcomp, to_aa, HashFunctions, VALID}; use crate::prelude::*; -use crate::selection::{Select, Selection}; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; use crate::Error; @@ -891,7 +889,6 @@ impl PartialEq for Signature { #[cfg(test)] mod test { - use std::convert::TryInto; use std::fs::File; use std::io::{BufReader, Read}; use std::path::PathBuf; diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 24cdc9539f..1ee747745a 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -2,7 +2,7 @@ use std::cmp::Ordering; use std::collections::{BTreeMap, BTreeSet}; use std::f64::consts::PI; use std::fmt::Write; -use std::iter::{Iterator, Peekable}; +use std::iter::Peekable; use std::str; use std::sync::Mutex; @@ -942,56 +942,6 @@ impl> Iterator for Intersection { } } -struct Union> { - iter: Peekable, - other: 
Peekable, -} - -impl> Iterator for Union { - type Item = T; - - fn next(&mut self) -> Option { - let res = match (self.iter.peek(), self.other.peek()) { - (Some(ref left_key), Some(ref right_key)) => left_key.cmp(right_key), - (None, Some(_)) => { - return self.other.next(); - } - (Some(_), None) => { - return self.iter.next(); - } - _ => return None, - }; - - match res { - Ordering::Less => self.iter.next(), - Ordering::Greater => self.other.next(), - Ordering::Equal => { - self.other.next(); - self.iter.next() - } - } - } -} - -#[cfg(test)] -mod test { - use super::Union; - - #[test] - fn test_union() { - let v1 = [1u64, 2, 4, 10]; - let v2 = [1u64, 3, 4, 9]; - - let union: Vec = Union { - iter: v1.iter().peekable(), - other: v2.iter().peekable(), - } - .cloned() - .collect(); - assert_eq!(union, [1, 2, 3, 4, 9, 10]); - } -} - //############# // A MinHash implementation for low scaled or large cardinalities From cfe6a968aadea555bcbd265047f93b61fcad2b60 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 20 Mar 2024 15:08:17 -0700 Subject: [PATCH 07/19] MRG: rework the manifest documentation; do misc cleanup (#3027) This PR: * fixes a minor nit in `sourmash sig collect` output where it said "loaded 0 signatures" * updates a lot of the documentation around standalone manifests to encourage their use * in tandem, modifies docs to discourage loading from pathlists/from-files and directory hierarchies TODO: - [x] look at TODO item re directories in sig collect - [x] think about adding https://github.com/sourmash-bio/sourmash/issues/3023 information into docs about lazy loading; maybe in the advanced databases document? 
- [x] update `sig manifest` docs to point out that they do not generate standalone manifests - [x] revisit branchwater plugin documentation to, to either make issues or make changes - [x] update `sig check` and `sig collect` to tell people to expand their paths ref https://github.com/sourmash-bio/sourmash/issues/3039 - [x] update docs more to recommend against pathlists and directories per https://github.com/sourmash-bio/sourmash/issues/3040 Related issues: * https://github.com/sourmash-bio/sourmash_plugin_branchwater/issues/235 * Fixes https://github.com/sourmash-bio/sourmash/issues/3048 * Fixes https://github.com/sourmash-bio/sourmash/issues/3009 by recommending `sig collect` and `sig check` instead of `sig manifest` for making standalone manifests * https://github.com/sourmash-bio/sourmash/issues/3053 * Fixes https://github.com/sourmash-bio/sourmash/issues/3023 * Fixes https://github.com/sourmash-bio/sourmash/issues/3039 * Fixes https://github.com/sourmash-bio/sourmash/issues/3040 --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Tessa Pierce Ward --- doc/command-line.md | 165 +++++++++++++++++++----------- doc/databases-advanced.md | 82 +++++++++++---- doc/faq.md | 2 +- doc/release-notes/sourmash-2.0.md | 2 +- doc/sourmash-sketch.md | 6 +- doc/using-sourmash-a-guide.md | 2 +- src/sourmash/sig/__main__.py | 2 - 7 files changed, 173 insertions(+), 88 deletions(-) diff --git a/doc/command-line.md b/doc/command-line.md index 71173792cf..90633d342e 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -1914,7 +1914,10 @@ will continue processing input sequences. ### `sourmash signature manifest` - output a manifest for a file -Output a manifest for a file, database, or collection. +Output a manifest for a file, database, or collection. 
Note that +these manifests are not usually suitable for use as standalone +manifests; the `sourmash sig collect` and `sourmash sig check` +commands produce standalone manifests. For example, ``` @@ -1942,8 +1945,10 @@ CSV and SQLite manifest files. ### `sourmash signature check` - compare picklists and manifests -Compare picklists and manifests across databases, and optionally output matches -and missing items. +Compare picklists and manifests across databases, and optionally +output matches and missing items. In particular, `sig check` can be +used to create standalone manifests for a subset of a large collection, +using picklists. For example, ``` @@ -1962,17 +1967,28 @@ collections of signatures and identifiers. With `-m/--save-manifest-matching`, `sig check` creates a standalone manifest. In these manifests, sourmash v4 will by default write paths to the matched elements that are relative to the current working -directory. In some cases - when the output manifest is in different +directory. In some cases - when the output manifest is in a different directory - this will create manifests that do not work properly with sourmash. The `--relpath` argument will rewrite the paths to be relative to the manifest, while the `--abspath` argument will rewrite paths to be absolute. The `--relpath` behavior will be the default in sourmash v5. +Standalone manifests created with `-m/--save-manifest-matching` will +use the paths given to `sig check` on the command line; we recommend +using zip files and sig files, and avoiding directory hierarchies or +path lists. You can use `--from-file` to pass in long lists of +filenames via a text file. + ### `sourmash signature collect` - collect manifests across databases Collect manifests from across (many) files and merge into a single -standalone manifest. +standalone manifest. 
Standalone manifests can be used directly as a +sourmash database; they support efficient searching and selection of +sketches, as well as lazy loading of individual sketches from large +collections. See +[advanced usage information on sourmash databases](databases-advanced.md) +for more information. For example, ``` @@ -1987,20 +2003,30 @@ This manifest file can be loaded directly from the command line by sourmash. particularly useful when working with large collections of signatures and identifiers, and has command line options for merging and updating manifests. +The standalone manifests created by `sig collect` will reference the +paths given on the command line; we recommend using zip files and sig +files, and avoiding directory hierarchies or path lists. You can also +use `--from-file` to pass in long lists of filenames. + +Standalone manifests produced by `sig collect` work most efficiently +when constructed from many small zip file collections. + As with `sig check`, the standalone manifests created by `sig collect` in sourmash v4 will by default write paths to the matched elements relative to the current working directory. When the output manifest -is in a different directory, this will create manifests that do not work -properly with sourmash. The `--relpath` argument will rewrite the -paths to be relative to the manifest, while the `--abspath` argument -will rewrite paths to be absolute. The `--relpath` behavior will be -the default in sourmash v5. +is in a different directory, this will create manifests that do not +work properly with sourmash. The `--relpath` argument will rewrite +the paths to be relative to the manifest, while the `--abspath` +argument will rewrite paths to be absolute. The `--relpath` behavior +will be the default in sourmash v5. ## Advanced command-line usage ### Loading signatures and databases -sourmash uses several different command-line styles. +sourmash uses several different command-line styles. 
Most sourmash +commands can load sketches from any standard collection type; we +primarily recommend using zipfiles (but read on!). Briefly, @@ -2011,22 +2037,18 @@ Briefly, need to provide a selector (ksize with `-k`, moltype with `--dna` etc, or md5sum with `--query-md5`) that picks out a single signature. -* `compare` takes multiple signatures and can load them from files, - directories, and indexed databases (SBT or LCA). It can also take - a list of file paths in a text file, using `--from-file` (see below). +* `compare` takes multiple signatures and can load them from any + sourmash collection type. * the `lca classify` and `lca summarize` commands take multiple signatures with `--query`, and multiple LCA databases, with `--db`. `sourmash multigather` also uses this style. This allows these commands to specify multiple queries **and** multiple databases without - (too much) confusion. These commands will take files containing - signature files using `--query-from-file` (see below). + (too much) confusion. The databases must be LCA databases. * `index` and `lca index` take a few fixed parameters (database name, and for `lca index`, a taxonomy file) and then an arbitrary number of - other files that contain signatures, including files, directories, - and indexed databases. These commands will also take `--from-file` - (see below). + other files that contain signatures.
None of these commands currently support searching, comparing, or indexing signatures with multiple ksizes or moltypes at the same time; you need @@ -2092,7 +2114,7 @@ The following `coltype`s are currently supported for picklists: * `gather` - use the CSV output of `sourmash gather` as a picklist * `prefetch` - use the CSV output of `sourmash prefetch` as a picklist * `search` - use the CSV output of `sourmash search` as a picklist -* `manifest` - use the CSV output of `sourmash sig manifest` as a picklist +* `manifest` - use CSV manifests produced by `sig manifest` as a picklist Identifiers are constructed by using the first space delimited word in the signature name. @@ -2101,7 +2123,7 @@ One way to build a picklist is to use `sourmash sig grep --csv out.csv` to construct a CSV file containing a list of all sketches that match the pattern (which can be a string or regexp). The `out.csv` file can be used as a picklist via the picklist -manifest format with `--picklist out.csv::manifest`. +manifest CSV format with `--picklist out.csv::manifest`. You can also use `sourmash sig describe --csv out.csv <signatures>` or `sourmash sig manifest -o out.csv <filename>` to construct an @@ -2144,7 +2166,9 @@ slow, especially for many (100s or 1000s) of signatures. All of the `sourmash` commands support loading collections of signatures from zip files. You can create a compressed collection of signatures using `sourmash sig cat *.sig -o collections.zip` and then -specifying `collections.zip` on the command line in place of `*.sig`. +specifying `collections.zip` on the command line in place of `*.sig`; +you can also sketch FASTA/FASTQ files directly into a zip file with +`-o collections.zip`. ### Choosing signature output formats @@ -2171,7 +2195,7 @@ to stdout. All of these save formats can be loaded by sourmash commands.
**We strongly suggest using .zip files to store signatures: they are fast, -small, and fully supported by all the sourmash commands.** +small, and fully supported by all the sourmash commands and API.** Note that when outputting large collections of signatures, some save formats require holding all the sketches in memory until they can be @@ -2186,19 +2210,6 @@ databases!](databases-advanced.md) ### Loading many signatures -#### Loading signatures within a directory hierarchy - -All of the `sourmash` commands support loading signatures from -beneath directories; provide the paths on the command line. - -#### Passing in lists of files - -Most sourmash commands will also take a `--from-file` or -`--query-from-file`, which will take the location of a text file containing -a list of file paths. This can be useful for situations where you want -to specify thousands of queries, or a subset of signatures produced by -some other command. - #### Indexed databases Indexed databases can make searching signatures much faster. SBT @@ -2209,9 +2220,6 @@ SQLite databases (new in sourmash v4.4.0) are typically larger on disk than SBTs and LCAs, but in turn are fast to load and support very low memory search. -(LCA databases also directly permit taxonomic searches using `sourmash lca` -functions.) - Commands that take multiple signatures or collections of signatures will also work with indexed databases. @@ -2223,9 +2231,9 @@ only at one scaled value. If the database signature type is incompatible with the other signatures, sourmash will complain appropriately. -In contrast, signature files, zip collections, and directory -hierarchies can contain many different types of signatures, and -compatible ones will be selected automatically. +In contrast, signature files and zip collections can contain many +different types of signatures, and compatible ones will be selected +automatically. Use the `sourmash index` command to create an SBT. 
@@ -2235,6 +2243,29 @@ database can be saved in JSON or SQL format with `-F json` or `-F sql`. Use `sourmash sig cat -o .sqldb` to create a SQLite indexed database. +#### Loading signatures within a directory hierarchy + +All of the `sourmash` commands support loading signatures (`.sig` or +`.sig.gz` files) from within directory hierarchies; you can just +provide the paths to the top-level directory on the command line. + +However, this is no longer recommended because it can be very +inefficient; we instead suggest passing all of the sketch files in +the directory into `sig collect` to build a standalone manifest, or +using `sig cat` on the directory to generate a zip file. + +#### Passing in lists of files + +sourmash commands support `--from-file` or `--query-from-file`, which +will take the location of a text file containing a list of file +paths. This can be useful for situations where you want to specify +thousands of queries, or a subset of signatures produced by some other +command. + +This is no longer recommended when using large collections; we instead +suggest using standalone manifests built with `sig collect` and `sig +check`, which will include extra metadata that supports fast loading. + ### Combining search databases on the command line All of the commands in sourmash operate in "online" mode, so you can @@ -2242,7 +2273,7 @@ combine multiple databases and signatures on the command line and get the same answer as if you built a single large database from all of them. The only caveat to this rule is that if you have multiple identical matches present across the databases, the order in which -they are found will differ depending on the order that the files are +they are used may depend on the order that the files are passed in on the command line. ### Using stdin @@ -2250,11 +2281,12 @@ passed in on the command line. Most commands will take signature JSON data via stdin using the usual UNIX convention, `-`. 
Moreover, `sourmash sketch` and the `sourmash sig` commands will output to stdout. So, for example, +``` +sourmash sketch ... -o - | sourmash sig describe - +``` +will describe the signatures that were just created. -`sourmash sketch ... -o - | sourmash sig describe -` will describe the -signatures that were just created. - -### Using manifests to explicitly refer to collections of files +### Using standalone manifests to explicitly refer to collections of files (sourmash v4.4 and later) @@ -2264,9 +2296,9 @@ internals to speed up signature selection through picklists and pattern matching. Manifests can _also_ be used externally (via the command-line), and -may be useful for organizing large collections of signatures. They can -be generated with the `sig collect`, `sig manifest`, and `sig check` -subcommands. +these "standalone manifests" may be useful for organizing large +collections of signatures. They can be generated with the `sig +collect`, `sig manifest`, and `sig check` subcommands. Suppose you have a large collection of signatures (`.sig` or `.sig.gz` files) in a location (e.g., under a directory, or in a zip file). You @@ -2280,21 +2312,32 @@ sourmash sig fileinfo manifest.sqlmf ``` This manifest contains _references_ to the signatures (but not the signatures themselves) and can then be used as a database target for most -sourmash operations - search, gather, etc. +sourmash operations - search, gather, etc. Manifests support +fast selection and lazy loading of sketches in many situations. + +The `sig check` command can also be used to create standalone manifests +from collections using a picklist, with the `-m/--save-manifest-matching` +option. This is useful for commands that don't support picklists natively, +e.g. plugins and extensions. 
-Note that `sig collect` will generate manifests containing the -pathnames given to it - so if you use relative paths, the references -will be relative to the working directory in which `sig collect` was +Note that `sig collect` and `sig check` will generate manifests containing the +pathnames given to them - so if you use relative paths, the references +will be relative to the working directory in which the command was run. You can use `sig collect --abspath` to rewrite the paths -into absolute paths. +into absolute paths, or `sig collect --relpath` to rewrite the paths +relative to the manifest file. **Our advice:** We suggest using zip file collections for most -situations; we primarily recommend using explicit manifests for -situations where you have a **very large** collection of signatures -(1000s or more), and don't want to make multiple copies of signatures -in the collection (as you would have to, with a zipfile). This can be -useful if you want to refer to different subsets of the collection -without making multiple copies in a zip file. +situations; we strongly recommend using standalone manifests for +situations where you have **very large** sketches or a **very large** +collection of sketches (1000s or more), and don't want to make +multiple copies of signatures in the collection (as you would have to, +with a zipfile). This is particularly useful if you want to refer to different +subsets of the collection without making multiple copies in a zip +file. + +You can read more about the details of zip files and manifests in +[the advanced usage information for databases](databases-advanced.md). 
### Using sourmash plugins diff --git a/doc/databases-advanced.md b/doc/databases-advanced.md index 9e4d1c25d7..2a1f61fd28 100644 --- a/doc/databases-advanced.md +++ b/doc/databases-advanced.md @@ -54,39 +54,83 @@ Both SBTs and LCA databases can only store homogenous collections of signature t We recommend SBT and LCA databases for use only in specific situations - e.g. SBTs are great for single-genome "best match" search for SBTs, and `sourmash lca` commands require LCA databases. -### Manifests - -Manifests are catalogs of signature metadata - name, molecule type, k-mer size, and other information - that can be used to select specific signatures for searching or processing. Typically when using manifests the actual signatures themselves are not loaded until they are needed, although the efficiency of this depends on the signature storage mechanism; for example, JSON-format containers (`.sig` and `.lca.json` files) must be entirely loaded before any signature in the file them can be used, unlike zip containers. - -As of sourmash 4.4 manifests can be *directly* loaded from the command line as standalone collections. This lets manifests serve as a catalog of signatures stored in many different locations. - -Standalone manifests are preferable to both directory storage and pathlists (below), because they support fast selection and direct lazy loading. They are the most effective solution for managing custom collections of thousands to millions of signatures. - -Standalone manifests can be created with `sourmash sig collect` -(sourmash v4.4 and later). - -Sourmash supports two manifest file formats - CSV and SQLite. SQLite manifests are much faster and lower-memory than CSV manifests in exchange for consuming some extra disk space. +### Standalone manifests + +Manifests are catalogs of signature metadata - name, molecule type, +k-mer size, and other information - that can be used to select +specific signatures for searching or processing. 
Typically when using +manifests the actual signatures themselves are not loaded until they +are needed, although the efficiency of this depends on the signature +storage mechanism; for example, JSON-format containers (`.sig` and +`.lca.json` files) must be entirely loaded before any signature in the +file can be used, unlike zip containers. + +As of sourmash 4.4 manifests can be *directly* loaded from the command +line as standalone collections. This lets manifests serve as a catalog +of signatures stored in many different locations. Sketches can be +selected by name, k-mer size, molecule type, and other features +without loading the actual sketch data. + +Standalone manifests are preferable to both directory storage and +pathlists (below), because they support fast selection and direct lazy +loading. This means that sourmash operations that support streaming or +online search (such as `prefetch` and `gather`, among others) can +avoid loading everything all at once. + +Standalone manifests are the most effective solution for managing custom +collections of thousands to millions of signatures, as well as working +with multiple large sketches. + +They can be created with `sourmash sig collect` and `sourmash sig +check` (sourmash v4.4 and later). + +Sourmash supports two manifest file formats - CSV and SQLite. SQLite +manifests are much faster and lower-memory than CSV manifests. ### Directories -Directory hierarchies of signatures are read natively by sourmash, and can be created or extended by specifying `-o dirname/` (with a trailing slash). +Directory hierarchies of signatures are read natively by sourmash, and +can be created or extended by specifying `-o dirname/` (with a +trailing slash). -To read from a directory, specify the directory name on the sourmash command line. When reading from directories, the entire directory hierarchy is traversed and all `.sig` and `.sig.gz` files are loaded as signatures. 
If `--force` is specified, _all_ files will be read, and failures will be ignored. +To read from a directory, specify the directory name on the sourmash +command line. When reading from directories, the entire directory +hierarchy is traversed and all `.sig` and `.sig.gz` files are loaded +as signatures. If `--force` is specified, _all_ files will be read, +and failures will be ignored. -When directories are specified as outputs, the signatures will be saved by their complete md5sum underneath the directory. +When directories are specified as outputs, the signatures will be +saved by their complete md5sum underneath the directory. -We don't particularly recommend storing signatures in directory hierarchies, since most of their use cases are now covered by other approaches. +We don't recommend loading signatures from directory hierarchies, +since the implementation is not particularly memory efficient and most +of the use cases for directories are now covered by other approaches - +in particular, standalone manifests. ### Pathlists -Pathlists are text files containing paths to one or more sourmash databases; any type of sourmash-readable collection can be listed. +Pathlists are text files containing paths to one or more sourmash +databases; any type of sourmash-readable collection can be listed. -The paths in pathlists can be relative or absolute within the file system. If they are relative, they must resolve with respect to the current working directory of the sourmash command. +The paths in pathlists can be relative or absolute within the file +system. If they are relative, they must resolve with respect to the +current working directory of the sourmash command. -We don't recommend using pathlists any more, since the original use cases are now supported with picklists, but they are still supported! +We don't recommend using pathlists, since the original use cases are +now supported with picklists and standalone manifests, but they are +still supported. 
Loading sketches from pathlists is also not very +efficient. Pathlists are not output by any sourmash commands. +Many commands support `--query-from-file` or `--from-file` as a way to +pass in a file containing many paths to sketches or collections. The +internal implementation of sourmash simply adds these to the +command-line arguments, and this is an effective and efficient way to +provide long lists of files to commands like `sig check` and `sig +collect` that create standalone manifests to support efficient lazy +loading. + ## Storing taxonomies sourmash supports taxonomic information output via the `sourmash lca` and `sourmash tax` subcommands. Both sets of commands rely on the same 7 taxonomic ranks: superkingdom, phylum, class, order, family, genus, and species (with limited support for a 'strain' rank). And both sets of subcommands take lineage spreadsheets that link specific identifiers to taxonomic lineages. diff --git a/doc/faq.md b/doc/faq.md index d8d9da0622..227952ff40 100644 --- a/doc/faq.md +++ b/doc/faq.md @@ -139,7 +139,7 @@ you use [the precomputed databases](databases.md), you will always end up using your query sketches at a minimum scaled of 1000, even if you created them with a lower scaled value. -Please also see [What resolution should my signatures be?](using-sourmash-a-guide.md#what-resolution-should-my-signatures-be-how-should-i-create-them). +Please also see [What resolution should my signatures be?](using-sourmash-a-guide.md#what-resolution-should-my-signatures-be-and-how-should-i-create-them). ## What threshold-bp value should I use with `sourmash prefetch` and `sourmash gather`? diff --git a/doc/release-notes/sourmash-2.0.md b/doc/release-notes/sourmash-2.0.md index c3b8647dd5..fbb541ad49 100644 --- a/doc/release-notes/sourmash-2.0.md +++ b/doc/release-notes/sourmash-2.0.md @@ -23,7 +23,7 @@ This is a list of substantial new features and functionality in sourmash 2.0. 
* Created [precomputed databases](../databases.md) for most of GenBank genomes. * Added taxonomic reporting functionality in the `sourmash lca` submodule - [see command-line docs](../command-line.md#sourmash-lca-subcommands-for-in-memory-taxonomy-integration). * Added signature manipulation utilities in the `sourmash signature` submodule - [see command-line docs](../command-line.md#sourmash-signature-subcommands-for-signature-manipulation) -* Introduced new modulo hash or "scaled" signatures for containment analysis; see [Using sourmash: a practical guide](../using-sourmash-a-guide.md#what-resolution-should-my-signatures-be--how-should-i-create-them) and [more details in the Python API examples](../api-example.md#advanced-features-of-sourmash-minhash-objects---scaled-and-num). +* Introduced new modulo hash or "scaled" signatures for containment analysis; see [Using sourmash: a practical guide](../using-sourmash-a-guide.md#what-resolution-should-my-signatures-be-and-how-should-i-create-them) and [more details in the Python API examples](../api-example.md#advanced-features-of-sourmash-minhash-objects---scaled-and-num). * Switched to using JSON instead of YAML for signatures. * Many performance optimizations! * Many more tests! diff --git a/doc/sourmash-sketch.md b/doc/sourmash-sketch.md index caba1a19a8..5ad43d266e 100644 --- a/doc/sourmash-sketch.md +++ b/doc/sourmash-sketch.md @@ -146,7 +146,7 @@ Some of the key command-line options supported by `fromfile` are: * `-o/--output-signatures` will save generated signatures to any of the [standard supported output formats](command-line.md#choosing-signature-output-formats). * `-o/--output-csv-info` will save a CSV file of input filenames and parameter strings for use with the `sourmash sketch` command line; this can be used to construct signatures in parallel. * `--already-done` will take a list of existing signatures/databases to check against; signatures with matching names and parameter strings will not be rebuilt. 
-* `--output-manifest-matching` will output a manifest of already-existing signatures, which can then be used with `sourmash sig cat` to collate signatures across databases; see [using manifests](command-line.md#using-manifests-to-explicitly-refer-to-collections-of-files). (This provides [`sourmash sig check` functionality](command-line.md#sourmash-signature-check---compare-picklists-and-manifests) in `sketch fromfile`.) +* `--output-manifest-matching` will output a manifest of already-existing signatures, which can then be used with `sourmash sig cat` to collate signatures across databases; see [using manifests](command-line.md#using-standalone-manifests-to-explicitly-refer-to-collections-of-files). (This provides [`sourmash sig check` functionality](command-line.md#sourmash-signature-check---compare-picklists-and-manifests) in `sketch fromfile`.) If you would like help and advice on constructing large databases, or pointers to code for generating the `fromfile` CSV format, please ask @@ -200,8 +200,8 @@ The `-p` argument to `sourmash sketch` provides parameter strings to sourmash, a A parameter string is a space-delimited collection that can contain one or more fields, comma-separated. * `k=` - create a sketch at this k-mer size; can provide more than one time in a parameter string. Typically `ksize` is between 4 and 100. -* `scaled=` - create a scaled MinHash with k-mers sampled deterministically at 1 per `` value. This controls sketch compression rates and resolution; for example, a 5 Mbp genome sketched with a scaled of 1000 would yield approximately 5,000 k-mers. `scaled` is incompatible with `num`. See [our guide to signature resolution](using-sourmash-a-guide.md#what-resolution-should-my-signatures-be--how-should-i-create-them) for more information. -* `num=` - create a standard MinHash with no more than `` k-mers kept. This will produce sketches identical to [mash sketches](https://mash.readthedocs.io/en/latest/). `num` is incompatible with `scaled`. 
See [our guide to signature resolution](using-sourmash-a-guide.md#what-resolution-should-my-signatures-be--how-should-i-create-them) for more information. +* `scaled=` - create a scaled MinHash with k-mers sampled deterministically at 1 per `` value. This controls sketch compression rates and resolution; for example, a 5 Mbp genome sketched with a scaled of 1000 would yield approximately 5,000 k-mers. `scaled` is incompatible with `num`. See [our guide to signature resolution](using-sourmash-a-guide.md#what-resolution-should-my-signatures-be-and-how-should-i-create-them) for more information. +* `num=` - create a standard MinHash with no more than `` k-mers kept. This will produce sketches identical to [mash sketches](https://mash.readthedocs.io/en/latest/). `num` is incompatible with `scaled`. See [our guide to signature resolution](using-sourmash-a-guide.md#what-resolution-should-my-signatures-be-and-how-should-i-create-them) for more information. * `abund` / `noabund` - create abundance-weighted (or not) sketches. See [Classify signatures: Abundance Weighting](classifying-signatures.md#abundance-weighting) for details of how this works. * `dna`, `protein`, `dayhoff`, `hp` - create this kind of sketch. Note that `sourmash sketch dna -p protein` and `sourmash sketch protein -p dna` are invalid; please use `sourmash sketch translate` for the former. * `seed=` - set the random number seed used for k-mer hashing. This is for advanced users who want to choose a completely different set of k-mers for sketches! The default is 42. diff --git a/doc/using-sourmash-a-guide.md b/doc/using-sourmash-a-guide.md index 29ccc52ec1..a3600c1337 100644 --- a/doc/using-sourmash-a-guide.md +++ b/doc/using-sourmash-a-guide.md @@ -41,7 +41,7 @@ however, and it probably doesn't really matter. (When we have blog posts or publications providing more formal guidance, we'll link to them here!) -## What resolution should my signatures be / how should I create them? 
+## What resolution should my signatures be and how should I create them? sourmash supports two ways of choosing the resolution or size of your signatures: using `num` to specify the maximum number of hashes, diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 94e1928175..0a9cd4bc9e 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -1550,8 +1550,6 @@ def check(args): def collect(args): "Collect signature metadata across many locations, save to manifest" - # TODO: - # test what happens with directories :) set_quiet(False, args.debug) if os.path.exists(args.output): From 24ab89cb1240c125c1cdf0b1517d6e9a1c07b691 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 21 Mar 2024 09:40:37 -0700 Subject: [PATCH 08/19] MRG: 4.8.7 release branch (#3090) # Release notes for sourmash v4.8.7 Note: This release changes the way `sourmash multigather` names output files in some situations. Please see https://github.com/sourmash-bio/sourmash/pull/2722 for details. Minor new features: * support proper manifest creation with `--relpath` for `sig check` and `sig collect` (#3054) * fix `multigather` output by adding md5sum along with `-U/--output-add-query-md5sum` (#2722) * enable loading lineages from annotated gather with match_name instead of name (#3078) Bug fixes: * fix output for `sketch ... 
--singleton` (#3066) * fix `calculate_gather_stats` `threshold=0` bug (#3052) Cleanup and documentation updates: * adjust protein ksize for record/manifest (#3019) * Resolve `sourmash gather --help` issue (#3032) * rework the manifest documentation; do misc cleanup (#3027) * add branchwater web to docs (#3018) Developer updates: * make core Manifest booleans python compatible (core) (#3007) * safer ksize selection while still accommodating k=k*3 (#3028) * fix clippy beta issues (#3088) * tell dependabot to ignore upgrades to `byteorder`, `chrono`, `once_cell`, and `wasm-bindgen` (#3065) * update rust changelog for r0.13.0 in preparation for release (#3033) * Allow changing storage location for a collection in RevIndex (#3015) * Fix tox and nix configs so all tox tests execute correctly (#2992) * Calculate all gather stats in rust; use for rocksdb gather (#2943) * bump screed req to 1.1.3 (#3067) * bump to v4.8.7-dev (#2989) Dependabot updates: * Bump DeterminateSystems/magic-nix-cache-action from 1 to 3 (#2994) * Bump DeterminateSystems/magic-nix-cache-action from 3 to 4 (#3085) * Bump DeterminateSystems/nix-installer-action from 4 to 9 (#2995) * Bump DeterminateSystems/nix-installer-action from 9 to 10 (#3083) * Bump chrono from 0.4.33 to 0.4.34 (#3000) * Bump conda-incubator/setup-miniconda from 3.0.1 to 3.0.2 (#3046) * Bump conda-incubator/setup-miniconda from 3.0.2 to 3.0.3 (#3057) * Bump histogram from 0.9.0 to 0.9.1 (#3002) * Bump itertools from 0.12.0 to 0.12.1 (#3043) * Bump log from 0.4.20 to 0.4.21 (#3062) * Bump num-iter from 0.1.43 to 0.1.44 (#2997) * Bump pypa/cibuildwheel from 2.16.5 to 2.17.0 (#3084) * Bump rayon from 1.8.1 to 1.9.0 (#3058) * Bump roaring from 0.10.2 to 0.10.3 (#3014) * Bump serde from 1.0.196 to 1.0.197 (#3045) * Bump serde_json from 1.0.113 to 1.0.114 (#3044) * Bump tempfile from 3.10.0 to 3.10.1 (#3059) * Bump thiserror from 1.0.56 to 1.0.57 (#2999) * Bump thiserror from 1.0.57 to 1.0.58 (#3082) * Bump wasm-bindgen from 0.2.91 to 
0.2.92 (#3060) * Bump wasm-bindgen-test from 0.3.40 to 0.3.41 (#2996) * Bump wasm-bindgen-test from 0.3.41 to 0.3.42 (#3063) * Bump web-sys from 0.3.67 to 0.3.68 (#2998) * Bump web-sys from 0.3.68 to 0.3.69 (#3061) * Revert "Bump wasm-bindgen from 0.2.91 to 0.2.92 (#3060)" (#3064) * Update asv to 0.6.2 (#3025) * Update pytest requirement from <8.1.0,>=6.2.4 to >=6.2.4,<8.2.0 (#3075) --- flake.nix | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flake.nix b/flake.nix index 06ecc32fb4..8d4fae898e 100644 --- a/flake.nix +++ b/flake.nix @@ -68,7 +68,7 @@ sourmash = python.buildPythonPackage ( commonArgs // rec { pname = "sourmash"; - version = "4.8.6"; + version = "4.8.7"; format = "pyproject"; cargoDeps = rustPlatform.importCargoLock { diff --git a/pyproject.toml b/pyproject.toml index 291c732093..3e93f49d9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ build-backend = 'maturin' name = "sourmash" description = "tools for comparing biological sequences with k-mer sketches" readme = "README.md" -version = "4.8.7-dev" +version = "4.8.7" authors = [ { name="Luiz Irber", orcid="0000-0003-4371-9659" }, From 2b1bf0ddc27b342fdd686a01e7a6a0bdbe0e8585 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 23 Mar 2024 20:31:50 +0000 Subject: [PATCH 09/19] Implement file parsing for webassembly (#3047) Address https://github.com/sourmash-bio/sourmash/issues/1577#issuecomment-916891602 This PR implements `Read` for `File` in browsers, which allows using `niffler` + `needletail` to parse FASTA/Q, `.gz`compressed or not, in browsers. I also added error handling, so the browser can print nicer error messages instead of something cryptic to `console.log`. 
--- Cargo.lock | 3 +- Makefile | 3 + flake.nix | 1 + src/core/CHANGELOG.md | 30 ++++- src/core/Cargo.toml | 27 ++-- src/core/src/wasm.rs | 193 ++++++++++++++++++++++++++--- src/core/tests/dedicated_worker.rs | 5 + src/core/tests/node.rs | 8 ++ src/core/tests/service_worker.rs | 5 + src/core/tests/shared_worker.rs | 5 + src/core/tests/web.rs | 5 + 11 files changed, 254 insertions(+), 31 deletions(-) create mode 100644 src/core/tests/dedicated_worker.rs create mode 100644 src/core/tests/node.rs create mode 100644 src/core/tests/service_worker.rs create mode 100644 src/core/tests/shared_worker.rs create mode 100644 src/core/tests/web.rs diff --git a/Cargo.lock b/Cargo.lock index 2db96670de..0795b19d9b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1607,7 +1607,7 @@ checksum = "9f1341053f34bb13b5e9590afb7d94b48b48d4b87467ec28e3c238693bb553de" [[package]] name = "sourmash" -version = "0.13.0" +version = "0.13.1" dependencies = [ "az", "byteorder", @@ -1624,6 +1624,7 @@ dependencies = [ "getset", "histogram", "itertools 0.12.1", + "js-sys", "log", "md5", "memmap2", diff --git a/Makefile b/Makefile index 9b26d91331..891b710732 100644 --- a/Makefile +++ b/Makefile @@ -56,6 +56,9 @@ last-tag: wasm: wasm-pack build src/core -d ../../pkg +wasm-test: + wasm-pack test --node src/core + wasi: cargo wasi build diff --git a/flake.nix b/flake.nix index 8d4fae898e..57213ac6aa 100644 --- a/flake.nix +++ b/flake.nix @@ -128,6 +128,7 @@ cargo-outdated cargo-udeps cargo-deny + cargo-wasi #cargo-semver-checks nixpkgs-fmt ]; diff --git a/src/core/CHANGELOG.md b/src/core/CHANGELOG.md index 67a3134144..ac4d169e80 100644 --- a/src/core/CHANGELOG.md +++ b/src/core/CHANGELOG.md @@ -5,6 +5,29 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [unreleased] + +## [0.13.1] - 2024-03-23 + +MSRV: 1.65 + +Changes/additions: + +* Implement file parsing for webassembly (#3047) +* fix `calculate_gather_stats` `threshold=0` bug (#3052) +* fix clippy beta issues (#3088) + +Updates: + +* Bump wasm-bindgen-test from 0.3.41 to 0.3.42 (#3063) +* Bump web-sys from 0.3.68 to 0.3.69 (#3061) +* Bump log from 0.4.20 to 0.4.21 (#3062) +* Bump rayon from 1.8.1 to 1.9.0 (#3058) +* Bump tempfile from 3.10.0 to 3.10.1 (#3059) +* Bump serde_json from 1.0.113 to 1.0.114 (#3044) +* Bump serde from 1.0.196 to 1.0.197 (#3045) +* Bump itertools from 0.12.0 to 0.12.1 (#3043) + ## [0.13.0] - 2024-02-23 MSRV: 1.65 @@ -17,6 +40,7 @@ Changes/additions: * make core Manifest booleans python compatible (core) (#3007) Updates: + * Bump roaring from 0.10.2 to 0.10.3 (#3014) * Bump histogram from 0.9.0 to 0.9.1 (#3002) * Bump chrono from 0.4.33 to 0.4.34 (#3000) @@ -287,7 +311,11 @@ Fixed: - Fix mem leak in get_mins (#807) - Fixes for WASI and WASM compilation (#771) (#723) -[unreleased]: https://github.com/sourmash-bio/sourmash/compare/r0.11.0...HEAD +[unreleased]: https://github.com/sourmash-bio/sourmash/compare/r0.13.1...HEAD +[0.13.1]: https://github.com/sourmash-bio/sourmash/compare/r0.13.0...r0.13.1 +[0.13.0]: https://github.com/sourmash-bio/sourmash/compare/r0.12.1...r0.13.0 +[0.12.1]: https://github.com/sourmash-bio/sourmash/compare/r0.12.0...r0.12.1 +[0.12.0]: https://github.com/sourmash-bio/sourmash/compare/r0.11.0...r0.12.0 [0.11.0]: https://github.com/sourmash-bio/sourmash/compare/r0.10.0...r0.11.0 [0.10.0]: https://github.com/sourmash-bio/sourmash/compare/r0.9.0...r0.10.0 [0.9.0]: https://github.com/sourmash-bio/sourmash/compare/r0.9.0...r0.10.0 diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 0f292db6d6..2b4ae08b59 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -1,8 +1,8 @@ [package] name = "sourmash" -version = "0.13.0" -authors = ["Luiz Irber "] -description = "MinHash sketches for genomic data" 
+version = "0.13.1" +authors = ["Luiz Irber ", "N. Tessa Pierce-Ward "] +description = "tools for comparing biological sequences with k-mer sketches" repository = "https://github.com/sourmash-bio/sourmash" keywords = ["minhash", "bioinformatics"] categories = ["science", "algorithms", "data-structures"] @@ -43,6 +43,7 @@ log = "0.4.21" md5 = "0.7.0" memmap2 = "0.9.4" murmurhash3 = "0.0.5" +needletail = { version = "0.5.1", default-features = false } niffler = { version = "2.3.1", default-features = false, features = [ "gz" ] } nohash-hasher = "0.2.0" num-iter = "0.1.44" @@ -64,8 +65,6 @@ typed-builder = "0.18.0" vec-collections = "0.4.3" [dev-dependencies] -criterion = "0.5.1" -needletail = { version = "0.5.1", default-features = false } proptest = { version = "1.4.0", default-features = false, features = ["std"]} rand = "0.8.2" tempfile = "3.10.1" @@ -95,17 +94,13 @@ skip_feature_sets = [ ## Wasm section. Crates only used for WASM, as well as specific configurations -[target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dependencies.wasm-bindgen] -version = "0.2.89" -features = ["serde-serialize"] +[target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dependencies] +js-sys = "0.3.68" +web-sys = { version = "0.3.69", features = ["console", "File", "FileReaderSync"] } +wasm-bindgen = { version = "0.2.89", features = ["serde-serialize"] } -[target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dependencies.web-sys] -version = "0.3.69" -features = ["console", "File"] - -[target.'cfg(all(target_arch = "wasm32"))'.dependencies.chrono] -version = "0.4.32" -features = ["wasmbind"] +[target.'cfg(all(target_arch = "wasm32"))'.dependencies] +chrono = { version = "0.4.32", features = ["wasmbind"] } [target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dev-dependencies] wasm-bindgen-test = "0.3.42" @@ -113,3 +108,5 @@ wasm-bindgen-test = "0.3.42" ### These crates don't compile on wasm [target.'cfg(not(target_arch = 
"wasm32"))'.dependencies] rocksdb = { version = "0.21.0", optional = true } +[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies] +criterion = "0.5.1" diff --git a/src/core/src/wasm.rs b/src/core/src/wasm.rs index c2a0eb6c30..cd9efec091 100644 --- a/src/core/src/wasm.rs +++ b/src/core/src/wasm.rs @@ -4,6 +4,7 @@ #[global_allocator] static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT; +use needletail::parse_fastx_reader; use wasm_bindgen::prelude::*; use crate::cmd::ComputeParameters as _ComputeParameters; @@ -57,15 +58,15 @@ impl KmerMinHash { } #[wasm_bindgen] - pub fn add_sequence_js(&mut self, buf: &str) { - self.0 - .add_sequence(buf.as_bytes(), true) - .expect("Error adding sequence"); + pub fn add_sequence_js(&mut self, buf: &str) -> Result<(), JsErrors> { + self.0.add_sequence(buf.as_bytes(), true)?; + Ok(()) } #[wasm_bindgen] - pub fn to_json(&mut self) -> String { - serde_json::to_string(&self.0).unwrap() + pub fn to_json(&mut self) -> Result { + let json = serde_json::to_string(&self.0)?; + Ok(json) } } @@ -81,6 +82,40 @@ impl ComputeParameters { pub fn set_ksizes(&mut self, ksizes: Vec) { self.0.set_ksizes(ksizes); } + + #[wasm_bindgen] + pub fn set_scaled(&mut self, scaled: u32) { + self.0.set_scaled(scaled as u64); + } + + #[wasm_bindgen] + pub fn set_num(&mut self, num: u32) { + self.0.set_num_hashes(num); + } + + #[wasm_bindgen] + pub fn set_protein(&mut self, is_protein: bool) { + self.0.set_protein(is_protein); + } + + #[wasm_bindgen] + pub fn set_dayhoff(&mut self, dayhoff: bool) { + self.0.set_dayhoff(dayhoff); + } + + #[wasm_bindgen] + pub fn set_hp(&mut self, hp: bool) { + self.0.set_hp(hp); + } + + #[wasm_bindgen] + pub fn set_track_abundance(&mut self, track: bool) { + self.0.set_track_abundance(track); + } + #[wasm_bindgen] + pub fn set_seed(&mut self, seed: u32) { + self.0.set_seed(seed.into()); + } } #[wasm_bindgen] @@ -93,20 +128,39 @@ impl Signature { } #[wasm_bindgen] - pub fn add_sequence_js(&mut self, buf: &str) { - 
self.0 - .add_sequence(buf.as_bytes(), true) - .expect("Error adding sequence"); + pub fn add_sequence_js(&mut self, buf: &str) -> Result<(), JsErrors> { + self.0.add_sequence(buf.as_bytes(), true)?; + + Ok(()) } #[wasm_bindgen] - pub fn add_from_file(&mut self, fp: web_sys::File) { - unimplemented!() + pub fn add_from_file( + &mut self, + fp: web_sys::File, + callback: Option, + ) -> Result<(), JsErrors> { + let wf = SyncFile::new(fp, callback); + + let (rdr, _format) = niffler::send::get_reader(Box::new(wf))?; + + let mut parser = parse_fastx_reader(std::io::BufReader::with_capacity( + 1024 << 14, // 16 MiB + rdr, + ))?; + + while let Some(record) = parser.next() { + let record = record?; + self.0.add_sequence(&record.seq(), true)?; + } + + Ok(()) } #[wasm_bindgen] - pub fn to_json(&mut self) -> String { - serde_json::to_string(&self.0).unwrap() + pub fn to_json(&mut self) -> Result { + let json = serde_json::to_string(&self.0)?; + Ok(json) } pub fn size(&self) -> usize { @@ -114,6 +168,28 @@ impl Signature { } } +#[derive(thiserror::Error, Debug)] +pub enum JsErrors { + #[error(transparent)] + SourmashError(#[from] crate::Error), + + #[error(transparent)] + SerdeError(#[from] serde_json::error::Error), + + #[error(transparent)] + NifflerError(#[from] niffler::Error), + + #[error(transparent)] + NeedletailError(#[from] needletail::errors::ParseError), +} + +impl Into for JsErrors { + fn into(self) -> JsValue { + let error = js_sys::Error::new(&self.to_string()); + error.into() + } +} + #[cfg(test)] mod test { use super::*; @@ -127,3 +203,92 @@ mod test { assert_eq!(sig.size(), 3); } } + +// ============================== + +use js_sys::Number; +use js_sys::Uint8Array; +use once_cell::sync::Lazy; +use web_sys::FileReaderSync; + +thread_local! { + static FILE_READER_SYNC: Lazy = Lazy::new(|| { + FileReaderSync::new().expect("Failed to create FileReaderSync. 
Is it running in a web worker context?") + }); +} + +/// Wrapper around a `web_sys::File` that implements `Read` and `Seek`. +pub struct SyncFile { + file: web_sys::File, + pos: u64, + cb: Option, +} + +/// Because this needs to be initialized in a Web Worker, it is safe to make it Send. +/// (hopefully. I don't think they can be sent across Web Workers, nor accessed from other WW) +unsafe impl Send for SyncFile {} + +impl SyncFile { + pub fn new(file: web_sys::File, cb: Option) -> Self { + Self { file, pos: 0, cb } + } + + /// File size in bytes. + pub fn size(&self) -> u64 { + let size = self.file.size(); + if size <= Number::MAX_SAFE_INTEGER { + return size as u64; + } else { + panic!("size is not safe to convert to integer from float") + } + } + + fn set_pos(&mut self, pos: u64) { + self.pos = pos; + self.cb.as_ref().map(|f| { + let arr = js_sys::Array::new_with_length(1); + arr.set(0, self.progress().into()); + f.apply(&JsValue::null(), &arr) + .expect("Error calling progress callback"); + }); + } + + /// Current progress on the file + pub fn progress(&self) -> f64 { + self.pos as f64 / self.file.size() + } +} + +impl std::io::Read for SyncFile { + fn read(&mut self, buf: &mut [u8]) -> Result { + let current_offset = self.pos; + let new_offset_f64 = current_offset as f64; + let new_offset_end_f64 = current_offset.saturating_add( + u64::try_from(buf.len()).map_err(|_| std::io::Error::other("Can't convert to u64"))?, + ) as f64; + + let blob = self + .file + .slice_with_f64_and_f64(new_offset_f64, new_offset_end_f64) + .map_err(|_| std::io::Error::other("failed to slice file"))?; + let array_buffer = FILE_READER_SYNC + .with(|frs| frs.read_as_array_buffer(&blob)) + .map_err(|_| std::io::Error::other("failed to read as array buffer"))?; + + let array = Uint8Array::new(&array_buffer); + let read_bytes = usize::try_from(array.byte_length()) + .map_err(|_| std::io::Error::other("read too many bytes at once"))?; + + // Copy to output buffer + array.copy_to(&mut 
buf[..read_bytes]); + + // Update position + self.set_pos( + current_offset + .checked_add(read_bytes as u64) + .ok_or_else(|| std::io::Error::other("new position too large"))?, + ); + + Ok(read_bytes) + } +} diff --git a/src/core/tests/dedicated_worker.rs b/src/core/tests/dedicated_worker.rs new file mode 100644 index 0000000000..f7186a003f --- /dev/null +++ b/src/core/tests/dedicated_worker.rs @@ -0,0 +1,5 @@ +#![cfg(all(target_arch = "wasm32", target_os = "unknown"))] + +use wasm_bindgen_test::wasm_bindgen_test_configure; + +wasm_bindgen_test_configure!(run_in_dedicated_worker); diff --git a/src/core/tests/node.rs b/src/core/tests/node.rs new file mode 100644 index 0000000000..f846433061 --- /dev/null +++ b/src/core/tests/node.rs @@ -0,0 +1,8 @@ +#![cfg(all(target_arch = "wasm32", target_os = "unknown"))] + +use wasm_bindgen_test::*; + +#[wasm_bindgen_test] +fn pass() { + assert_eq!(1, 1); +} diff --git a/src/core/tests/service_worker.rs b/src/core/tests/service_worker.rs new file mode 100644 index 0000000000..dae9341d9e --- /dev/null +++ b/src/core/tests/service_worker.rs @@ -0,0 +1,5 @@ +#![cfg(all(target_arch = "wasm32", target_os = "unknown"))] + +use wasm_bindgen_test::wasm_bindgen_test_configure; + +wasm_bindgen_test_configure!(run_in_service_worker); diff --git a/src/core/tests/shared_worker.rs b/src/core/tests/shared_worker.rs new file mode 100644 index 0000000000..8d8bfc7a4f --- /dev/null +++ b/src/core/tests/shared_worker.rs @@ -0,0 +1,5 @@ +#![cfg(all(target_arch = "wasm32", target_os = "unknown"))] + +use wasm_bindgen_test::wasm_bindgen_test_configure; + +wasm_bindgen_test_configure!(run_in_shared_worker); diff --git a/src/core/tests/web.rs b/src/core/tests/web.rs new file mode 100644 index 0000000000..3bbc3dad61 --- /dev/null +++ b/src/core/tests/web.rs @@ -0,0 +1,5 @@ +#![cfg(all(target_arch = "wasm32", target_os = "unknown"))] + +use wasm_bindgen_test::wasm_bindgen_test_configure; + +wasm_bindgen_test_configure!(run_in_browser); From 
9d9fe98089c708776114dfd5ccb291d15978a9e2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 Mar 2024 05:43:40 -0700 Subject: [PATCH 10/19] Bump rayon from 1.9.0 to 1.10.0 (#3098) Bumps [rayon](https://github.com/rayon-rs/rayon) from 1.9.0 to 1.10.0.
Changelog

Sourced from rayon's changelog.

Release rayon 1.10.0 (2024-03-23)

  • The new methods ParallelSlice::par_chunk_by and ParallelSliceMut::par_chunk_by_mut work like the slice methods chunk_by and chunk_by_mut added in Rust 1.77.
Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=rayon&package-manager=cargo&previous-version=1.9.0&new-version=1.10.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- src/core/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0795b19d9b..3d3f72ec05 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1347,9 +1347,9 @@ checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" [[package]] name = "rayon" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4963ed1bc86e4f3ee217022bd855b297cef07fb9eac5dfa1f788b220b49b3bd" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" dependencies = [ "either", "rayon-core", diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 2b4ae08b59..1161e21428 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -51,7 +51,7 @@ once_cell = "1.18.0" ouroboros = "0.18.3" piz = "0.5.0" primal-check = "0.3.1" -rayon = { version = "1.9.0", optional = true } +rayon = { version = "1.10.0", optional = true } rkyv = { version = "0.7.44", optional = true } roaring = "0.10.3" roots = "0.0.8" From e0ed4c3c86046316aa725a0b56ea7e7f0c06d7df Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 Mar 2024 13:25:30 +0000 Subject: [PATCH 11/19] Update pytest-cov requirement from <5.0,>=4 to >=4,<6.0 (#3097) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates the requirements on [pytest-cov](https://github.com/pytest-dev/pytest-cov) to permit the latest version.
Changelog

Sourced from pytest-cov's changelog.

5.0.0 (2024-03-24)

  • Removed support for xdist rsync (now deprecated). Contributed by Matthias Reichenbach in [#623](https://github.com/pytest-dev/pytest-cov/pull/623).
  • Switched docs theme to Furo.
  • Various legacy Python cleanup and CI improvements. Contributed by Christian Clauss and Hugo van Kemenade in [#630](https://github.com/pytest-dev/pytest-cov/pull/630), [#631](https://github.com/pytest-dev/pytest-cov/pull/631), [#632](https://github.com/pytest-dev/pytest-cov/pull/632) and [#633](https://github.com/pytest-dev/pytest-cov/pull/633).
  • Added a pyproject.toml example in the docs. Contributed by Dawn James in [#626](https://github.com/pytest-dev/pytest-cov/pull/626).
  • Modernized project's pre-commit hooks to use ruff. Initial POC contributed by Christian Clauss in [#584](https://github.com/pytest-dev/pytest-cov/pull/584).

4.1.0 (2023-05-24)

  • Updated CI with new Pythons and dependencies.
  • Removed rsyncdir support. This makes pytest-cov compatible with xdist 3.0. Contributed by Sorin Sbarnea in [#558](https://github.com/pytest-dev/pytest-cov/pull/558).
  • Optimized summary generation to not be performed if no reporting is active (for example, when --cov-report='' is used without --cov-fail-under). Contributed by Jonathan Stewmon in [#589](https://github.com/pytest-dev/pytest-cov/pull/589).
  • Added support for JSON reporting. Contributed by Matthew Gamble in [#582](https://github.com/pytest-dev/pytest-cov/pull/582).
  • Refactored code to use f-strings. Contributed by Mark Mayo in [#572](https://github.com/pytest-dev/pytest-cov/pull/572).
  • Fixed a skip in the test suite for some old xdist. Contributed by a bunch of people in [#565](https://github.com/pytest-dev/pytest-cov/pull/565).

4.0.0 (2022-09-28)

Note that this release drops support for multiprocessing.

  • --cov-fail-under no longer causes pytest --collect-only to fail. Contributed by Zac Hatfield-Dodds in [#511](https://github.com/pytest-dev/pytest-cov/pull/511).

  • Dropped support for multiprocessing (mostly because of [issue 82408](https://github.com/python/cpython/issues/82408)). This feature was mostly working but very broken in certain scenarios and made the test suite very flaky and slow.

    There is built-in multiprocessing support in coverage and you can migrate to that. All you need is this in your .coveragerc:

    [run]
    concurrency = multiprocessing

... (truncated)

Commits
  • 5295ce0 Bump version: 4.1.0 → 5.0.0
  • 1181b06 Update changelog.
  • 9757222 Fix a minor grammar error (#636)
  • 9f5cd81 Cleanup releasing instructions. Closes #616.
  • 93b5047 Add test for pyproject.toml loading without explicit --cov-config. Ref #508.
  • ff50860 docs: add config instructions for pyproject.toml.
  • 4a5a4b5 Keep GitHub Actions up to date with GitHub's Dependabot
  • 1d7f559 Fix or remove URLs that are causing docs tests to fail
  • 6a5af8e Update changelog.
  • d9fe8df Switch to furo. Closes #618.
  • Additional commits viewable in compare view

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3e93f49d9c..3a141d4b79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,7 +101,7 @@ license = { text = "BSD 3-Clause License" } [project.optional-dependencies] test = [ "pytest>=6.2.4,<8.2.0", - "pytest-cov>=4,<5.0", + "pytest-cov>=4,<6.0", "pytest-xdist>=3.1", "pyyaml>=6,<7", "diff-cover>=7.3", From 534a3dbf2aba6b7c1d2c75b60698aba8a1651689 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Apr 2024 22:14:08 +0000 Subject: [PATCH 12/19] Bump serde_json from 1.0.114 to 1.0.115 (#3101) Bumps [serde_json](https://github.com/serde-rs/json) from 1.0.114 to 1.0.115.
Release notes

Sourced from serde_json's releases.

v1.0.115

  • Documentation improvements
Commits
  • b1ebf38 Release 1.0.115
  • c3dc153 Merge pull request #1119 from titaniumtraveler/pr
  • 218770b Explicitly install a Rust toolchain for cargo-outdated job
  • 840da8e Fix missing backticks in doc comments
  • 3a3f61b Temporarily disable miri on doctests
  • 4a0be88 Format regression tests with rustfmt
  • d2dbbf7 Ignore dead code lint in tests
  • 8e7b37b Merge pull request #1118 from serde-rs/transparent
  • a25f6c6 Remove conditional on repr(transparent)
  • fedf834 Ignore non_local_definitions false positive in test
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=serde_json&package-manager=cargo&previous-version=1.0.114&new-version=1.0.115)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- src/core/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3d3f72ec05..a6ea3e12ad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1559,9 +1559,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.114" +version = "1.0.115" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0" +checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" dependencies = [ "itoa", "ryu", diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 1161e21428..574d38be65 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -56,7 +56,7 @@ rkyv = { version = "0.7.44", optional = true } roaring = "0.10.3" roots = "0.0.8" serde = { version = "1.0.197", features = ["derive"] } -serde_json = "1.0.114" +serde_json = "1.0.115" statrs = "0.16.0" streaming-stats = "0.2.3" thiserror = "1.0" From 0af3cbb828e77676496abd1dfe0196818c54c511 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Apr 2024 16:01:36 -0700 Subject: [PATCH 13/19] Bump enum_dispatch from 0.3.12 to 0.3.13 (#3102) Bumps [enum_dispatch](https://gitlab.com/antonok/enum_dispatch) from 0.3.12 to 0.3.13.
Changelog

Sourced from enum_dispatch's changelog.

0.3.13

  • Fix namespace collision with imports named core (!35)
Commits
  • c20b482 v0.3.13
  • 36b13dc add test for ::core namespace collision
  • e7e6ce5 Merge branch 'master' into 'master'
  • 9a2e6e0 added prefix to specifer to fix naming conflicts
  • 38d9dd5 Merge branch 'include-tests' into 'master'
  • 5239382 Cargo.toml: include tests in crate
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=enum_dispatch&package-manager=cargo&previous-version=0.3.12&new-version=0.3.13)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- src/core/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a6ea3e12ad..2e9f16247f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -535,9 +535,9 @@ checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" [[package]] name = "enum_dispatch" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e" +checksum = "aa18ce2bc66555b3218614519ac839ddb759a7d6720732f979ef8d13be147ecd" dependencies = [ "once_cell", "proc-macro2", diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 574d38be65..23a30b3b57 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -32,7 +32,7 @@ camino = { version = "1.1.6", features = ["serde1"] } cfg-if = "1.0" counter = "0.5.7" csv = "1.3.0" -enum_dispatch = "0.3.12" +enum_dispatch = "0.3.13" finch = { version = "0.6.0", optional = true } fixedbitset = "0.4.0" getrandom = { version = "0.2", features = ["js"] } From 20e61952e1926711d6e9f28c021bc9bcad21271a Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Sat, 6 Apr 2024 07:25:51 -0700 Subject: [PATCH 14/19] MRG: add pyopensci review badge (#3105) Add pyopensci review badge, linking to accepted review: https://github.com/pyOpenSci/software-submission/issues/129 --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index f12d6a65ce..702a729dd9 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,9 @@ Quickly search, compare, and analyze genomic and metagenomic data sets. 
License: 3-Clause BSD [![Documentation](https://readthedocs.org/projects/sourmash/badge/?version=latest)](http://sourmash.readthedocs.io/en/latest/) [![Gitter](https://badges.gitter.im/sourmash-bio/community.svg)](https://gitter.im/sourmash-bio/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) + [![DOI](http://joss.theoj.org/papers/10.21105/joss.00027/status.svg)](http://joss.theoj.org/papers/10.21105/joss.00027) +[![pyOpenSci](https://tinyurl.com/y22nb8up)](https://github.com/pyOpenSci/software-submission/issues/129) [![Bioconda install](https://img.shields.io/conda/dn/bioconda/sourmash.svg?style=flag&label=Bioconda)](https://anaconda.org/bioconda/sourmash) PyPI From 5b5337e14f8e7f08ec7d85eb98e1bd40ffca7464 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Apr 2024 14:42:57 -0700 Subject: [PATCH 15/19] Bump getrandom from 0.2.12 to 0.2.14 (#3108) Bumps [getrandom](https://github.com/rust-random/getrandom) from 0.2.12 to 0.2.14.
Changelog

Sourced from getrandom's changelog.

[0.2.14] - 2024-04-08

Fixed

  • Enable /dev/urandom fallback for MUSL-based Linux targets #408

#408: rust-random/getrandom#408

[0.2.13] - 2024-04-06

Added

  • linux_disable_fallback crate feature to disable /dev/urandom-based fallback on Linux and Android targets. Enabling this feature bumps minimum supported Linux kernel version to 3.17 and Android API level to 23 (Marshmallow). #396

Changed

  • Disable /dev/urandom fallback for Linux targets outside of the following target_arches: aarch64, arm, powerpc, powerpc64, s390x, x86, x86_64 #396
  • Do not catch EPERM error code on Android while checking availability of the getrandom syscall #396

#396: rust-random/getrandom#396

Commits
  • a39033a Enable /dev/urandom fallback for MUSL-based Linux targets (#408)
  • 968dd48 Release v0.2.13 (#405)
  • 8ffd43e Conditionally disable file fallback for Android and Linux (#396)
  • 6b7bcb5 Replace man7.org links with manned.org (#404)
  • 5f0701f CI: Run tests on aarch64-apple-darwin and aarch64-apple-ios-sim. (#398)
  • 489eeee Fix nightly build by removing redundant use (#399)
  • d102c36 Use doc_auto_cfg instead of doc_cfg (#392)
  • 2e4bb4d Correct comments regarding LazyUsize (#391)
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=getrandom&package-manager=cargo&previous-version=0.2.12&new-version=0.2.14)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2e9f16247f..6743510def 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -604,9 +604,9 @@ checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" [[package]] name = "getrandom" -version = "0.2.12" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "94b22e06ecb0110981051723910cbf0b5f5e09a2062dd7663334ee79a9d1286c" dependencies = [ "cfg-if", "js-sys", From 9d472a1828a7c361929d3731dcf12cc3ac1d840e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Apr 2024 15:56:04 -0700 Subject: [PATCH 16/19] Bump histogram from 0.9.1 to 0.10.0 (#3109) Bumps [histogram](https://github.com/pelikan-io/rustcommon) from 0.9.1 to 0.10.0.
Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=histogram&package-manager=cargo&previous-version=0.9.1&new-version=0.10.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- src/core/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6743510def..51ef233466 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -662,9 +662,9 @@ checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" [[package]] name = "histogram" -version = "0.9.1" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b634390eb8a63662e127836d4e2f26d7ae930600d4e05ee0fd85a009eeb1175" +checksum = "f4d3bddd75a32b17e75762f128ffc7a33158b933b6eb27424da9be4a58f30eb9" dependencies = [ "thiserror", ] diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 23a30b3b57..7be417cfd6 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -37,7 +37,7 @@ finch = { version = "0.6.0", optional = true } fixedbitset = "0.4.0" getrandom = { version = "0.2", features = ["js"] } getset = "0.1.1" -histogram = "0.9.1" +histogram = "0.10.0" itertools = "0.12.1" log = "0.4.21" md5 = "0.7.0" From a387d222677e81c1736739a24a7a4a57be3c55df Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Tue, 9 Apr 2024 15:05:14 -0700 Subject: [PATCH 17/19] MRG: 4.8.8 release branch (#3110) Release candidate testing: - [x] Command line tests pass for a release candidate - [x] All eight release candidate wheels are built Releasing to PyPI: - [ ] RC tag(s) deleted on github - [ ] Release tag cut - [ ] Release notes written - [ ] All eight release wheels built - [ ] Release wheels uploaded to pypi - [ ] tar.gz distribution uploaded to pypi After release to PyPI and conda-forge/bioconda packages built: - [ ] [PyPI page](https://pypi.org/project/sourmash/) updated - [ ] Zenodo DOI successfully minted upon new github release - [see search results](https://zenodo.org/search?page=1&size=20&q=sourmash&sort=mostrecent) - [ ] `pip install sourmash` installs the correct version - [ ] [conda-forge sourmash-minimal-feedstock](https://github.com/conda-forge/sourmash-minimal-feedstock) has updated `sourmash-minimal` to the correct version - [ ] `mamba create -n smash-release -y sourmash` installs the correct version Optional but recommended: - [ ] PR submitted to update pyodide version - [ ] PR submitted to update spack version --- ## Release notes: - Bump histogram from 0.9.1 to 0.10.0 (#3109) - Bump getrandom from 0.2.12 to 0.2.14 (#3108) - MRG: add pyopensci review badge (#3105) - Bump enum_dispatch from 0.3.12 to 0.3.13 (#3102) - Bump serde_json from 1.0.114 to 1.0.115 (#3101) - Update pytest-cov requirement from <5.0,>=4 to >=4,<6.0 (#3097) - Bump rayon from 1.9.0 to 1.10.0 (#3098) - Implement file parsing for webassembly (#3047) --- flake.nix | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flake.nix b/flake.nix index 57213ac6aa..5c3795f1d4 100644 --- a/flake.nix +++ b/flake.nix @@ -68,7 +68,7 @@ sourmash = python.buildPythonPackage ( commonArgs // rec { pname = "sourmash"; - version = "4.8.7"; + version = "4.8.8"; format = "pyproject"; cargoDeps = rustPlatform.importCargoLock { diff --git a/pyproject.toml 
b/pyproject.toml index 3a141d4b79..083016d1c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ build-backend = 'maturin' name = "sourmash" description = "tools for comparing biological sequences with k-mer sketches" readme = "README.md" -version = "4.8.7" +version = "4.8.8" authors = [ { name="Luiz Irber", orcid="0000-0003-4371-9659" }, From e0d002a55c67dfb62c73ba76b3e01addefa88cf6 Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Fri, 12 Apr 2024 10:34:00 -0700 Subject: [PATCH 18/19] MRG: force continue past `tax genome` classification errors (#3100) When we were doing one or a few genome classifications, it made sense to error out completely if there was an issue. Now that we have fastmultigather and can do 10s of thousands at once, it would be nice to be able to continue past errors (logging them). **Changed behavior:** - If there is a failed classification, notify the error and do not write that result. Continue with classification. - Finish classification and write output file, BUT exit with an error code if there were errors, except if --force is used. - Remove some previously useful reporting about the classification ranks, because it's too much output for large-scale classification. --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- src/sourmash/tax/__main__.py | 25 ++++++- src/sourmash/tax/tax_utils.py | 3 - tests/test_tax.py | 133 ++++++++++++++++++++++++++++------ 3 files changed, 131 insertions(+), 30 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 073977cb79..1a5d22940a 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -313,6 +313,9 @@ def genome(args): sys.exit(-1) # for each queryResult, summarize at rank and classify according to thresholds, reporting any errors that occur. 
+ n_total = len(query_gather_results) + classified_results = [] + found_error = False for queryResult in query_gather_results: try: queryResult.build_classification_result( @@ -322,10 +325,21 @@ def genome(args): lingroup_ranks=lg_ranks, lingroups=all_lgs, ) + classified_results.append(queryResult) except ValueError as exc: - error(f"ERROR: {str(exc)}") - sys.exit(-1) + found_error = True + notify(f"ERROR: {str(exc)}") + + n_classified = len(classified_results) + if n_classified == 0: + notify("No queries could be classified. Exiting.") + sys.exit(-1) + else: + classif_perc = (float(n_classified) / float(n_total)) * 100 + notify( + f"classified {n_classified}/{n_total} queries ({classif_perc :.2f}%). Writing results" + ) # write outputs if "csv_summary" in args.output_format: @@ -334,7 +348,7 @@ def genome(args): ) with FileOutputCSV(summary_outfile) as out_fp: tax_utils.write_summary( - query_gather_results, + classified_results, out_fp, limit_float_decimals=limit_float, classification=True, @@ -389,6 +403,11 @@ def genome(args): with FileOutputCSV(lineage_outfile) as out_fp: tax_utils.write_output(header, lineage_results, out_fp) + # if there was a classification error, exit with err code + if found_error: + if not args.force: + sys.exit(-1) + def annotate(args): """ diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 1615c90d74..a2fbeb3f30 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -2328,9 +2328,6 @@ def summarize_up_ranks(self, single_rank=None, force_resummarize=False): f"Error: rank '{single_rank}' not in available ranks ({', '.join(self.summarized_ranks)})" ) self.summarized_ranks = [single_rank] - notify( - f"Starting summarization up rank(s): {', '.join(self.summarized_ranks)} " - ) for taxres in self.raw_taxresults: lininfo = taxres.lineageInfo if ( diff --git a/tests/test_tax.py b/tests/test_tax.py index 70b4f14fc0..fc68b6448d 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ 
-2700,6 +2700,115 @@ def test_genome_gather_two_files_empty_force(runtmp): ) +def test_genome_gather_two_files_one_classif_fail(runtmp): + # if one query cant be classified still get classif for second + # no --force = fail but still write file + c = runtmp + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") + + # make test2 results (identical to test1 except query_name and filename) + g_res2 = runtmp.output("test2.gather.csv") + test2_results = [ + x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines() + ] + test2_results[1] = test2_results[1].replace( + "0.08815317112086159", "1.1" + ) # make test2 f_unique_to_query sum to >1 + for line in test2_results: + print(line) + with open(g_res2, "w") as fp: + fp.writelines(test2_results) + + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "tax", + "genome", + "-g", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + ) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == -1 + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) + assert "test2" not in c.last_result.out + assert ( + "ERROR: Summarized fraction is > 100% of the query! This should not be possible. Please check that your input files come directly from a single gather run per query." 
+ in c.last_result.err + ) + + +def test_genome_gather_two_files_one_classif(runtmp): + # if one query cant be classified, still get classif for second + c = runtmp + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") + + # make test2 results (identical to test1 except query_name and filename) + g_res2 = runtmp.output("test2.gather.csv") + test2_results = [ + x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines() + ] + test2_results[1] = test2_results[1].replace( + "0.08815317112086159", "1.1" + ) # make test2 f_unique_to_query sum to >1 + for line in test2_results: + print(line) + with open(g_res2, "w") as fp: + fp.writelines(test2_results) + + c.run_sourmash( + "tax", + "genome", + "-g", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + "--force", + ) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) + assert "test2" not in c.last_result.out + assert ( + "ERROR: Summarized fraction is > 100% of the query! This should not be possible. Please check that your input files come directly from a single gather run per query." 
+ in c.last_result.err + ) + + def test_genome_gather_duplicate_filename(runtmp): c = runtmp taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") @@ -5936,10 +6045,6 @@ def test_metagenome_LIN_lingroups(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert ( - "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" - in c.last_result.err - ) assert ( "Read 5 lingroup rows and found 5 distinct lingroup prefixes." in c.last_result.err @@ -5970,10 +6075,6 @@ def test_metagenome_LIN_human_summary_no_lin_position(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert ( - "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" - in c.last_result.err - ) assert "sample name proportion cANI lineage" in c.last_result.out assert "----------- ---------- ---- -------" in c.last_result.out assert "test1 86.9% - unclassified" in c.last_result.out @@ -6020,10 +6121,6 @@ def test_metagenome_LIN_human_summary_lin_position_5(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert ( - "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" - in c.last_result.err - ) assert "sample name proportion cANI lineage" in c.last_result.out assert "----------- ---------- ---- -------" in c.last_result.out assert "test1 86.9% - unclassified" in c.last_result.out @@ -6058,10 +6155,6 @@ def test_metagenome_LIN_krona_lin_position_5(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert ( - "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" - in c.last_result.err - ) assert "fraction 0 1 2 3 4 5" in c.last_result.out assert "0.08815317112086159 0 0 0 0 0 0" in c.last_result.out assert "0.07778220981252493 1 0 0 0 0 0" in c.last_result.out @@ -6133,10 +6226,6 @@ def 
test_metagenome_LIN_lingroups_empty_lg_file(runtmp): print(c.last_result.err) assert c.last_result.status != 0 - assert ( - "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" - in c.last_result.err - ) assert ( f"Cannot read lingroups from '{lg_file}'. Is file empty?" in c.last_result.err ) @@ -6302,8 +6391,4 @@ def test_metagenome_LIN_lingroups_lg_only_header(runtmp): print(c.last_result.err) assert c.last_result.status != 0 - assert ( - "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" - in c.last_result.err - ) assert f"No lingroups loaded from {lg_file}" in c.last_result.err From f4e720552c0e6dc22a9670289ef201ad0176225a Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 12 Apr 2024 11:44:18 -0700 Subject: [PATCH 19/19] MRG: prepare to remove `sourmash compute` for sourmash v5.0 (#3103) This PR refactors code to eliminate any internal dependencies on the `compute` command or codebase, in preparation for removing `sourmash compute` in v5.0, per https://github.com/sourmash-bio/sourmash/issues/1286. Specifically, this PR: * shifts common sketching code from `src/sourmash/command_compute.py` to `src/sourmash/command_sketch.py`; * refactors three tests that were still using `sourmash compute` to use `sourmash sketch` instead. No functionality is altered or adjusted in this PR; it's all just code refactoring. The next step for #1286 would maybe be to adjust the code and the tests to respect the `--v4` and `--v5` flags (as used in https://github.com/sourmash-bio/sourmash/pull/3072 / https://github.com/sourmash-bio/sourmash/pull/3074). The actual compute code and command wouldn't be removed until after a 5.0 release, I think. 
--- src/sourmash/command_compute.py | 12 +- src/sourmash/command_sketch.py | 461 +++++++++++++++++++++++++++++++- tests/test_cmd_signature.py | 34 ++- tests/test_sourmash.py | 38 ++- tests/test_sourmash_sketch.py | 3 +- 5 files changed, 519 insertions(+), 29 deletions(-) diff --git a/src/sourmash/command_compute.py b/src/sourmash/command_compute.py index aac66def13..dbb3c42ad1 100644 --- a/src/sourmash/command_compute.py +++ b/src/sourmash/command_compute.py @@ -13,9 +13,15 @@ from .utils import RustObject from ._lowlevel import ffi, lib -DEFAULT_COMPUTE_K = "21,31,51" -DEFAULT_MMHASH_SEED = 42 -DEFAULT_LINE_COUNT = 1500 + +from .command_sketch import ( + _compute_individual, + _compute_merged, + ComputeParameters, + add_seq, + set_sig_name, + DEFAULT_MMHASH_SEED, +) def compute(args): diff --git a/src/sourmash/command_sketch.py b/src/sourmash/command_sketch.py index 508cac7c01..e98212f8c1 100644 --- a/src/sourmash/command_sketch.py +++ b/src/sourmash/command_sketch.py @@ -12,18 +12,14 @@ import sourmash from .signature import SourmashSignature from .logging import notify, error, set_quiet, print_results -from .command_compute import ( - _compute_individual, - _compute_merged, - ComputeParameters, - add_seq, - set_sig_name, - DEFAULT_MMHASH_SEED, -) from sourmash import sourmash_args from sourmash.sourmash_args import check_scaled_bounds, check_num_bounds from sourmash.sig.__main__ import _summarize_manifest, _SketchInfo from sourmash.manifest import CollectionManifest +from .utils import RustObject +from ._lowlevel import ffi, lib + +DEFAULT_MMHASH_SEED = 42 DEFAULTS = dict( dna="k=31,scaled=1000,noabund", @@ -637,3 +633,452 @@ def fromfile(args): notify( f"** {total_sigs} total requested; output {total_sigs - skipped_sigs}, skipped {skipped_sigs}" ) + + +class _signatures_for_compute_factory: + "Build signatures on demand, based on args input to 'compute'." 
+ + def __init__(self, args): + self.args = args + + def __call__(self): + args = self.args + params = ComputeParameters( + ksizes=args.ksizes, + seed=args.seed, + protein=args.protein, + dayhoff=args.dayhoff, + hp=args.hp, + dna=args.dna, + num_hashes=args.num_hashes, + track_abundance=args.track_abundance, + scaled=args.scaled, + ) + sig = SourmashSignature.from_params(params) + return [sig] + + +def _compute_individual(args, signatures_factory): + # this is where output signatures will go. + save_sigs = None + + # track: is this the first file? in cases where we have empty inputs, + # we don't want to open any outputs. + first_file_for_output = True + + # if args.output is set, we are aggregating all output to a single file. + # do not open a new output file for each input. + open_output_each_time = True + if args.output: + open_output_each_time = False + + for filename in args.filenames: + if open_output_each_time: + # for each input file, construct output filename + sigfile = os.path.basename(filename) + ".sig" + if args.output_dir: + sigfile = os.path.join(args.output_dir, sigfile) + + # does it already exist? skip if so. + if os.path.exists(sigfile) and not args.force: + notify("skipping {} - already done", filename) + continue # go on to next file. + + # nope? ok, let's save to it. + assert not save_sigs + save_sigs = sourmash_args.SaveSignaturesToLocation(sigfile) + + # + # calculate signatures! + # + + # now, set up to iterate over sequences. + with screed.open(filename) as screed_iter: + if not screed_iter: + notify(f"no sequences found in '{filename}'?!") + continue + + # open output for signatures + if open_output_each_time: + save_sigs.open() + # or... is this the first time to write something to args.output? + elif first_file_for_output: + save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) + save_sigs.open() + first_file_for_output = False + + # make a new signature for each sequence? 
+ if args.singleton: + n_calculated = 0 + for n, record in enumerate(screed_iter): + sigs = signatures_factory() + try: + add_seq( + sigs, + record.sequence, + args.input_is_protein, + args.check_sequence, + ) + except ValueError as exc: + error(f"ERROR when reading from '{filename}' - ") + error(str(exc)) + sys.exit(-1) + + n_calculated += len(sigs) + set_sig_name(sigs, filename, name=record.name) + save_sigs_to_location(sigs, save_sigs) + + notify( + "calculated {} signatures for {} sequences in {}", + n_calculated, + n + 1, + filename, + ) + + # nope; make a single sig for the whole file + else: + sigs = signatures_factory() + + # consume & calculate signatures + notify(f"... reading sequences from {filename}") + name = None + for n, record in enumerate(screed_iter): + if n % 10000 == 0: + if n: + notify("\r...{} {}", filename, n, end="") + elif args.name_from_first: + name = record.name + + try: + add_seq( + sigs, + record.sequence, + args.input_is_protein, + args.check_sequence, + ) + except ValueError as exc: + error(f"ERROR when reading from '{filename}' - ") + error(str(exc)) + sys.exit(-1) + + notify("...{} {} sequences", filename, n, end="") + + set_sig_name(sigs, filename, name) + save_sigs_to_location(sigs, save_sigs) + + notify( + f"calculated {len(sigs)} signatures for {n+1} sequences in {filename}" + ) + + # if not args.output, close output for every input filename. + if open_output_each_time: + save_sigs.close() + notify( + f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0." + ) + save_sigs = None + + # if --output-dir specified, all collected signatures => args.output, + # and we need to close here. + if args.output and save_sigs is not None: + save_sigs.close() + notify( + f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0." 
+ ) + + +def _compute_merged(args, signatures_factory): + # make a signature for the whole file + sigs = signatures_factory() + + total_seq = 0 + for filename in args.filenames: + # consume & calculate signatures + notify("... reading sequences from {}", filename) + + n = None + with screed.open(filename) as f: + for n, record in enumerate(f): + if n % 10000 == 0 and n: + notify("\r... {} {}", filename, n, end="") + + add_seq( + sigs, record.sequence, args.input_is_protein, args.check_sequence + ) + if n is not None: + notify("... {} {} sequences", filename, n + 1) + total_seq += n + 1 + else: + notify(f"no sequences found in '{filename}'?!") + + if total_seq: + set_sig_name(sigs, filename, name=args.merge) + notify( + "calculated 1 signature for {} sequences taken from {} files", + total_seq, + len(args.filenames), + ) + + # at end, save! + save_siglist(sigs, args.output) + + +def add_seq(sigs, seq, input_is_protein, check_sequence): + for sig in sigs: + if input_is_protein: + sig.add_protein(seq) + else: + sig.add_sequence(seq, not check_sequence) + + +def set_sig_name(sigs, filename, name=None): + if filename == "-": # if stdin, set filename to empty. + filename = "" + for sig in sigs: + if name is not None: + sig._name = name + + sig.filename = filename + + +def save_siglist(siglist, sigfile_name): + "Save multiple signatures to a filename." + + # save! + with sourmash_args.SaveSignaturesToLocation(sigfile_name) as save_sig: + for ss in siglist: + save_sig.add(ss) + + notify(f"saved {len(save_sig)} signature(s) to '{save_sig.location}'") + + +def save_sigs_to_location(siglist, save_sig): + "Save multiple signatures to an already-open location." 
+ import sourmash + + for ss in siglist: + save_sig.add(ss) + + +class ComputeParameters(RustObject): + __dealloc_func__ = lib.computeparams_free + + def __init__( + self, + *, + ksizes=(21, 31, 51), + seed=42, + protein=False, + dayhoff=False, + hp=False, + dna=True, + num_hashes=500, + track_abundance=False, + scaled=0, + ): + self._objptr = lib.computeparams_new() + + self.seed = seed + self.ksizes = ksizes + self.protein = protein + self.dayhoff = dayhoff + self.hp = hp + self.dna = dna + self.num_hashes = num_hashes + self.track_abundance = track_abundance + self.scaled = scaled + + @classmethod + def from_manifest_row(cls, row): + "convert a CollectionManifest row into a ComputeParameters object" + is_dna = is_protein = is_dayhoff = is_hp = False + if row["moltype"] == "DNA": + is_dna = True + elif row["moltype"] == "protein": + is_protein = True + elif row["moltype"] == "hp": + is_hp = True + elif row["moltype"] == "dayhoff": + is_dayhoff = True + else: + assert 0 + + if is_dna: + ksize = row["ksize"] + else: + ksize = row["ksize"] * 3 + + p = cls( + ksizes=[ksize], + seed=DEFAULT_MMHASH_SEED, + protein=is_protein, + dayhoff=is_dayhoff, + hp=is_hp, + dna=is_dna, + num_hashes=row["num"], + track_abundance=row["with_abundance"], + scaled=row["scaled"], + ) + + return p + + def to_param_str(self): + "Convert object to equivalent params str." + pi = [] + + if self.dna: + pi.append("dna") + elif self.protein: + pi.append("protein") + elif self.hp: + pi.append("hp") + elif self.dayhoff: + pi.append("dayhoff") + else: + assert 0 # must be one of the previous + + if self.dna: + kstr = [f"k={k}" for k in self.ksizes] + else: + # for protein, divide ksize by three. 
+ kstr = [f"k={k//3}" for k in self.ksizes] + assert kstr + pi.extend(kstr) + + if self.num_hashes != 0: + pi.append(f"num={self.num_hashes}") + elif self.scaled != 0: + pi.append(f"scaled={self.scaled}") + else: + assert 0 + + if self.track_abundance: + pi.append("abund") + # noabund is default + + if self.seed != DEFAULT_MMHASH_SEED: + pi.append(f"seed={self.seed}") + # self.seed + + return ",".join(pi) + + def __repr__(self): + return f"ComputeParameters(ksizes={self.ksizes}, seed={self.seed}, protein={self.protein}, dayhoff={self.dayhoff}, hp={self.hp}, dna={self.dna}, num_hashes={self.num_hashes}, track_abundance={self.track_abundance}, scaled={self.scaled})" + + def __eq__(self, other): + return ( + self.ksizes == other.ksizes + and self.seed == other.seed + and self.protein == other.protein + and self.dayhoff == other.dayhoff + and self.hp == other.hp + and self.dna == other.dna + and self.num_hashes == other.num_hashes + and self.track_abundance == other.track_abundance + and self.scaled == other.scaled + ) + + @staticmethod + def from_args(args): + ptr = lib.computeparams_new() + ret = ComputeParameters._from_objptr(ptr) + + for arg, value in vars(args).items(): + try: + getattr(type(ret), arg).fset(ret, value) + except AttributeError: + pass + + return ret + + @property + def seed(self): + return self._methodcall(lib.computeparams_seed) + + @seed.setter + def seed(self, v): + return self._methodcall(lib.computeparams_set_seed, v) + + @property + def ksizes(self): + size = ffi.new("uintptr_t *") + ksizes_ptr = self._methodcall(lib.computeparams_ksizes, size) + size = size[0] + ksizes = ffi.unpack(ksizes_ptr, size) + lib.computeparams_ksizes_free(ksizes_ptr, size) + return ksizes + + @ksizes.setter + def ksizes(self, v): + return self._methodcall(lib.computeparams_set_ksizes, list(v), len(v)) + + @property + def protein(self): + return self._methodcall(lib.computeparams_protein) + + @protein.setter + def protein(self, v): + return 
self._methodcall(lib.computeparams_set_protein, v) + + @property + def dayhoff(self): + return self._methodcall(lib.computeparams_dayhoff) + + @dayhoff.setter + def dayhoff(self, v): + return self._methodcall(lib.computeparams_set_dayhoff, v) + + @property + def hp(self): + return self._methodcall(lib.computeparams_hp) + + @hp.setter + def hp(self, v): + return self._methodcall(lib.computeparams_set_hp, v) + + @property + def dna(self): + return self._methodcall(lib.computeparams_dna) + + @dna.setter + def dna(self, v): + return self._methodcall(lib.computeparams_set_dna, v) + + @property + def moltype(self): + if self.dna: + moltype = "DNA" + elif self.protein: + moltype = "protein" + elif self.hp: + moltype = "hp" + elif self.dayhoff: + moltype = "dayhoff" + else: + assert 0 + + return moltype + + @property + def num_hashes(self): + return self._methodcall(lib.computeparams_num_hashes) + + @num_hashes.setter + def num_hashes(self, v): + return self._methodcall(lib.computeparams_set_num_hashes, v) + + @property + def track_abundance(self): + return self._methodcall(lib.computeparams_track_abundance) + + @track_abundance.setter + def track_abundance(self, v): + return self._methodcall(lib.computeparams_set_track_abundance, v) + + @property + def scaled(self): + return self._methodcall(lib.computeparams_scaled) + + @scaled.setter + def scaled(self, v): + return self._methodcall(lib.computeparams_set_scaled, int(v)) diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 9f14b6df58..8dfe8dc74a 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -3355,16 +3355,35 @@ def test_sig_describe_dayhoff(c): ) -@utils.in_tempdir -def test_sig_describe_1_hp(c): +def test_sig_describe_1_hp(runtmp): + c = runtmp + # get basic info on a signature testdata = utils.get_test_data("short.fa") - c.run_sourmash( - "compute", "-k", "21,30", "--dayhoff", "--hp", "--protein", "--dna", testdata + + # run four separate commands to make 4 
different sets of sigs... + c.sourmash("sketch", "dna", "-p", "k=21,k=30,num=500", "-o", "out.zip", testdata) + c.sourmash( + "sketch", "translate", "-p", "k=7,k=10,num=500", "-o", "out.zip", testdata + ) + c.sourmash( + "sketch", "translate", "-p", "k=7,k=10,num=500,hp", "-o", "out.zip", testdata + ) + c.sourmash( + "sketch", + "translate", + "-p", + "k=7,k=10,num=500,dayhoff", + "-o", + "out.zip", + testdata, ) - # stdout should be new signature - computed_sig = os.path.join(c.location, "short.fa.sig") - c.run_sourmash("sig", "describe", computed_sig) + + # then combine into one .sig file + c.sourmash("sig", "cat", "out.zip", "-o", "short.fa.sig") + + # & run sig describe + c.run_sourmash("sig", "describe", "short.fa.sig") out = c.last_result.out print(c.last_result.out) @@ -3444,7 +3463,6 @@ def test_sig_describe_1_hp(c): signature license: CC0 --- -signature filename: short.fa.sig signature: ** no name ** source file: short.fa md5: 71f7c111c01785e5f38efad45b00a0e1 diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index fc083a21e5..23647e517b 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -1791,26 +1791,48 @@ def test_compare_deduce_molecule(runtmp): def test_compare_choose_molecule_dna(runtmp): - # choose molecule type + # choose molecule type with --dna, ignoring protein testdata1 = utils.get_test_data("short.fa") testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash("compute", "-k", "30", "--dna", "--protein", testdata1, testdata2) - - runtmp.sourmash("compare", "--dna", "short.fa.sig", "short2.fa.sig") + runtmp.sourmash( + "sketch", "dna", "-p", "k=30,num=500", testdata1, testdata2, "-o", "sigs.zip" + ) + runtmp.sourmash( + "sketch", + "translate", + "-p", + "k=10,num=500", + testdata1, + testdata2, + "-o", + "sigs.zip", + ) + runtmp.sourmash("compare", "--dna", "sigs.zip") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) assert "min similarity in matrix: 0.938" in 
runtmp.last_result.out def test_compare_choose_molecule_protein(runtmp): - # choose molecule type + # choose molecule type with --protein, ignoring DNA testdata1 = utils.get_test_data("short.fa") testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash("compute", "-k", "30", "--dna", "--protein", testdata1, testdata2) - - runtmp.sourmash("compare", "--protein", "short.fa.sig", "short2.fa.sig") + runtmp.sourmash( + "sketch", "dna", "-p", "k=30,num=500", testdata1, testdata2, "-o", "sigs.zip" + ) + runtmp.sourmash( + "sketch", + "translate", + "-p", + "k=10,num=500", + testdata1, + testdata2, + "-o", + "sigs.zip", + ) + runtmp.sourmash("compare", "--protein", "sigs.zip") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) assert "min similarity in matrix: 0.91" in runtmp.last_result.out diff --git a/tests/test_sourmash_sketch.py b/tests/test_sourmash_sketch.py index 87460dcbcb..98448e4d6b 100644 --- a/tests/test_sourmash_sketch.py +++ b/tests/test_sourmash_sketch.py @@ -15,8 +15,7 @@ from sourmash import MinHash from sourmash.sbt import SBT, Node from sourmash.sbtmh import SigLeaf, load_sbt_index -from sourmash.command_compute import ComputeParameters -from sourmash.cli.compute import subparser +from sourmash.command_sketch import ComputeParameters from sourmash.cli import SourmashParser from sourmash import manifest