diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml
index 841c0a03b9..1f85234218 100644
--- a/.github/workflows/build_wheel.yml
+++ b/.github/workflows/build_wheel.yml
@@ -46,7 +46,7 @@ jobs:
python-version: '3.10'
- name: Build wheels
- uses: pypa/cibuildwheel@v2.16.5
+ uses: pypa/cibuildwheel@v2.17.0
env:
CIBW_ENVIRONMENT_MACOS: ${{ matrix.macos_target }}
CIBW_ARCHS_LINUX: ${{ matrix.arch }}
diff --git a/.github/workflows/dev_envs.yml b/.github/workflows/dev_envs.yml
index a34c4e5301..8ffb98db64 100644
--- a/.github/workflows/dev_envs.yml
+++ b/.github/workflows/dev_envs.yml
@@ -15,9 +15,9 @@ jobs:
fetch-depth: 0
- name: Install Nix
- uses: DeterminateSystems/nix-installer-action@v9
+ uses: DeterminateSystems/nix-installer-action@v10
- name: Run the Magic Nix Cache
- uses: DeterminateSystems/magic-nix-cache-action@v3
+ uses: DeterminateSystems/magic-nix-cache-action@v4
- run: nix run .# -- --version
diff --git a/Cargo.lock b/Cargo.lock
index f39774cc5b..51ef233466 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -535,9 +535,9 @@ checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
[[package]]
name = "enum_dispatch"
-version = "0.3.12"
+version = "0.3.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e"
+checksum = "aa18ce2bc66555b3218614519ac839ddb759a7d6720732f979ef8d13be147ecd"
dependencies = [
"once_cell",
"proc-macro2",
@@ -604,9 +604,9 @@ checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
[[package]]
name = "getrandom"
-version = "0.2.12"
+version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5"
+checksum = "94b22e06ecb0110981051723910cbf0b5f5e09a2062dd7663334ee79a9d1286c"
dependencies = [
"cfg-if",
"js-sys",
@@ -662,9 +662,9 @@ checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b"
[[package]]
name = "histogram"
-version = "0.9.1"
+version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b634390eb8a63662e127836d4e2f26d7ae930600d4e05ee0fd85a009eeb1175"
+checksum = "f4d3bddd75a32b17e75762f128ffc7a33158b933b6eb27424da9be4a58f30eb9"
dependencies = [
"thiserror",
]
@@ -1347,9 +1347,9 @@ checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
[[package]]
name = "rayon"
-version = "1.9.0"
+version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e4963ed1bc86e4f3ee217022bd855b297cef07fb9eac5dfa1f788b220b49b3bd"
+checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
dependencies = [
"either",
"rayon-core",
@@ -1559,9 +1559,9 @@ dependencies = [
[[package]]
name = "serde_json"
-version = "1.0.114"
+version = "1.0.115"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0"
+checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd"
dependencies = [
"itoa",
"ryu",
@@ -1607,7 +1607,7 @@ checksum = "9f1341053f34bb13b5e9590afb7d94b48b48d4b87467ec28e3c238693bb553de"
[[package]]
name = "sourmash"
-version = "0.13.0"
+version = "0.13.1"
dependencies = [
"az",
"byteorder",
@@ -1624,6 +1624,7 @@ dependencies = [
"getset",
"histogram",
"itertools 0.12.1",
+ "js-sys",
"log",
"md5",
"memmap2",
@@ -1736,18 +1737,18 @@ dependencies = [
[[package]]
name = "thiserror"
-version = "1.0.57"
+version = "1.0.58"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b"
+checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
-version = "1.0.57"
+version = "1.0.58"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81"
+checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7"
dependencies = [
"proc-macro2",
"quote",
diff --git a/Makefile b/Makefile
index 9b26d91331..891b710732 100644
--- a/Makefile
+++ b/Makefile
@@ -56,6 +56,9 @@ last-tag:
wasm:
wasm-pack build src/core -d ../../pkg
+wasm-test:
+ wasm-pack test --node src/core
+
wasi:
cargo wasi build
diff --git a/README.md b/README.md
index f12d6a65ce..702a729dd9 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,9 @@ Quickly search, compare, and analyze genomic and metagenomic data sets.
[![Documentation](https://readthedocs.org/projects/sourmash/badge/?version=latest)](http://sourmash.readthedocs.io/en/latest/)
[![Gitter](https://badges.gitter.im/sourmash-bio/community.svg)](https://gitter.im/sourmash-bio/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
+
[![DOI](http://joss.theoj.org/papers/10.21105/joss.00027/status.svg)](http://joss.theoj.org/papers/10.21105/joss.00027)
+[![pyOpenSci](https://tinyurl.com/y22nb8up)](https://github.com/pyOpenSci/software-submission/issues/129)
[![Bioconda install](https://img.shields.io/conda/dn/bioconda/sourmash.svg?style=flag&label=Bioconda)](https://anaconda.org/bioconda/sourmash)
diff --git a/doc/command-line.md b/doc/command-line.md
index 71173792cf..90633d342e 100644
--- a/doc/command-line.md
+++ b/doc/command-line.md
@@ -1914,7 +1914,10 @@ will continue processing input sequences.
### `sourmash signature manifest` - output a manifest for a file
-Output a manifest for a file, database, or collection.
+Output a manifest for a file, database, or collection. Note that
+these manifests are not usually suitable for use as standalone
+manifests; the `sourmash sig collect` and `sourmash sig check`
+commands produce standalone manifests.
For example,
```
@@ -1942,8 +1945,10 @@ CSV and SQLite manifest files.
### `sourmash signature check` - compare picklists and manifests
-Compare picklists and manifests across databases, and optionally output matches
-and missing items.
+Compare picklists and manifests across databases, and optionally
+output matches and missing items. In particular, `sig check` can be
+used to create standalone manifests for a subset of a large collection,
+using picklists.
For example,
```
@@ -1962,17 +1967,28 @@ collections of signatures and identifiers.
With `-m/--save-manifest-matching`, `sig check` creates a standalone
manifest. In these manifests, sourmash v4 will by default write paths
to the matched elements that are relative to the current working
-directory. In some cases - when the output manifest is in different
+directory. In some cases - when the output manifest is in a different
directory - this will create manifests that do not work properly
with sourmash. The `--relpath` argument will rewrite the paths to be
relative to the manifest, while the `--abspath` argument will rewrite
paths to be absolute. The `--relpath` behavior will be the default in
sourmash v5.
+Standalone manifests created with `-m/--save-manifest-matching` will
+use the paths given to `sig check` on the command line; we recommend
+using zip files and sig files, and avoiding directory hierarchies or
+path lists. You can use `--from-file` to pass in long lists of
+filenames via a text file.
+
### `sourmash signature collect` - collect manifests across databases
Collect manifests from across (many) files and merge into a single
-standalone manifest.
+standalone manifest. Standalone manifests can be used directly as a
+sourmash database; they support efficient searching and selection of
+sketches, as well as lazy loading of individual sketches from large
+collections. See
+[advanced usage information on sourmash databases](databases-advanced.md)
+for more information.
For example,
```
@@ -1987,20 +2003,30 @@ This manifest file can be loaded directly from the command line by sourmash.
particularly useful when working with large collections of signatures and
identifiers, and has command line options for merging and updating manifests.
+The standalone manifests created by `sig collect` will reference the
+paths given on the command line; we recommend using zip files and sig
+files, and avoiding directory hierarchies or path lists. You can also
+use `--from-file` to pass in long lists of filenames.
+
+Standalone manifests produced by `sig collect` work most efficiently
+when constructed from many small zip file collections.
+
As with `sig check`, the standalone manifests created by `sig collect`
in sourmash v4 will by default write paths to the matched elements
relative to the current working directory. When the output manifest
-is in a different directory, this will create manifests that do not work
-properly with sourmash. The `--relpath` argument will rewrite the
-paths to be relative to the manifest, while the `--abspath` argument
-will rewrite paths to be absolute. The `--relpath` behavior will be
-the default in sourmash v5.
+is in a different directory, this will create manifests that do not
+work properly with sourmash. The `--relpath` argument will rewrite
+the paths to be relative to the manifest, while the `--abspath`
+argument will rewrite paths to be absolute. The `--relpath` behavior
+will be the default in sourmash v5.
## Advanced command-line usage
### Loading signatures and databases
-sourmash uses several different command-line styles.
+sourmash uses several different command-line styles. Most sourmash
+commands can load sketches from any standard collection type; we
+primarily recommend using zip files (but read on!).
Briefly,
@@ -2011,22 +2037,18 @@ Briefly,
need to provide a selector (ksize with `-k`, moltype with `--dna` etc,
or md5sum with `--query-md5`) that picks out a single signature.
-* `compare` takes multiple signatures and can load them from files,
- directories, and indexed databases (SBT or LCA). It can also take
- a list of file paths in a text file, using `--from-file` (see below).
+* `compare` takes multiple signatures and can load them from any
+ sourmash collection type.
* the `lca classify` and `lca summarize` commands take multiple
signatures with `--query`, and multiple LCA databases, with
`--db`. `sourmash multigather` also uses this style. This allows these
commands to specify multiple queries **and** multiple databases without
- (too much) confusion. These commands will take files containing
- signature files using `--query-from-file` (see below).
+ (too much) confusion. The databases must be LCA databases.
* `index` and `lca index` take a few fixed parameters (database name,
and for `lca index`, a taxonomy file) and then an arbitrary number of
- other files that contain signatures, including files, directories,
- and indexed databases. These commands will also take `--from-file`
- (see below).
+ other files that contain signatures.
None of these commands currently support searching, comparing, or indexing
signatures with multiple ksizes or moltypes at the same time; you need
@@ -2092,7 +2114,7 @@ The following `coltype`s are currently supported for picklists:
* `gather` - use the CSV output of `sourmash gather` as a picklist
* `prefetch` - use the CSV output of `sourmash prefetch` as a picklist
* `search` - use the CSV output of `sourmash prefetch` as a picklist
-* `manifest` - use the CSV output of `sourmash sig manifest` as a picklist
+* `manifest` - use CSV manifests produced by `sig manifest` as a picklist
Identifiers are constructed by using the first space delimited word in
the signature name.
@@ -2101,7 +2123,7 @@ One way to build a picklist is to use `sourmash sig grep
--csv out.csv` to construct a CSV file containing a list
of all sketches that match the pattern (which can be a string or
regexp). The `out.csv` file can be used as a picklist via the picklist
-manifest format with `--picklist out.csv::manifest`.
+manifest CSV format with `--picklist out.csv::manifest`.
You can also use `sourmash sig describe --csv out.csv ` or
`sourmash sig manifest -o out.csv ` to construct an
@@ -2144,7 +2166,9 @@ slow, especially for many (100s or 1000s) of signatures.
All of the `sourmash` commands support loading collections of
signatures from zip files. You can create a compressed collection of
signatures using `sourmash sig cat *.sig -o collections.zip` and then
-specifying `collections.zip` on the command line in place of `*.sig`.
+specifying `collections.zip` on the command line in place of `*.sig`;
+you can also sketch FASTA/FASTQ files directly into a zip file with
+`-o collections.zip`.
### Choosing signature output formats
@@ -2171,7 +2195,7 @@ to stdout.
All of these save formats can be loaded by sourmash commands.
**We strongly suggest using .zip files to store signatures: they are fast,
-small, and fully supported by all the sourmash commands.**
+small, and fully supported by all the sourmash commands and API.**
Note that when outputting large collections of signatures, some save
formats require holding all the sketches in memory until they can be
@@ -2186,19 +2210,6 @@ databases!](databases-advanced.md)
### Loading many signatures
-#### Loading signatures within a directory hierarchy
-
-All of the `sourmash` commands support loading signatures from
-beneath directories; provide the paths on the command line.
-
-#### Passing in lists of files
-
-Most sourmash commands will also take a `--from-file` or
-`--query-from-file`, which will take the location of a text file containing
-a list of file paths. This can be useful for situations where you want
-to specify thousands of queries, or a subset of signatures produced by
-some other command.
-
#### Indexed databases
Indexed databases can make searching signatures much faster. SBT
@@ -2209,9 +2220,6 @@ SQLite databases (new in sourmash v4.4.0) are typically larger on disk
than SBTs and LCAs, but in turn are fast to load and support very low
memory search.
-(LCA databases also directly permit taxonomic searches using `sourmash lca`
-functions.)
-
Commands that take multiple signatures or collections of signatures
will also work with indexed databases.
@@ -2223,9 +2231,9 @@ only at one scaled value. If the database signature type is
incompatible with the other signatures, sourmash will complain
appropriately.
-In contrast, signature files, zip collections, and directory
-hierarchies can contain many different types of signatures, and
-compatible ones will be selected automatically.
+In contrast, signature files and zip collections can contain many
+different types of signatures, and compatible ones will be selected
+automatically.
Use the `sourmash index` command to create an SBT.
@@ -2235,6 +2243,29 @@ database can be saved in JSON or SQL format with `-F json` or `-F sql`.
Use `sourmash sig cat -o