diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml index 841c0a03b9..1f85234218 100644 --- a/.github/workflows/build_wheel.yml +++ b/.github/workflows/build_wheel.yml @@ -46,7 +46,7 @@ jobs: python-version: '3.10' - name: Build wheels - uses: pypa/cibuildwheel@v2.16.5 + uses: pypa/cibuildwheel@v2.17.0 env: CIBW_ENVIRONMENT_MACOS: ${{ matrix.macos_target }} CIBW_ARCHS_LINUX: ${{ matrix.arch }} diff --git a/.github/workflows/dev_envs.yml b/.github/workflows/dev_envs.yml index a34c4e5301..8ffb98db64 100644 --- a/.github/workflows/dev_envs.yml +++ b/.github/workflows/dev_envs.yml @@ -15,9 +15,9 @@ jobs: fetch-depth: 0 - name: Install Nix - uses: DeterminateSystems/nix-installer-action@v9 + uses: DeterminateSystems/nix-installer-action@v10 - name: Run the Magic Nix Cache - uses: DeterminateSystems/magic-nix-cache-action@v3 + uses: DeterminateSystems/magic-nix-cache-action@v4 - run: nix run .# -- --version diff --git a/Cargo.lock b/Cargo.lock index f39774cc5b..51ef233466 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -535,9 +535,9 @@ checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" [[package]] name = "enum_dispatch" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e" +checksum = "aa18ce2bc66555b3218614519ac839ddb759a7d6720732f979ef8d13be147ecd" dependencies = [ "once_cell", "proc-macro2", @@ -604,9 +604,9 @@ checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" [[package]] name = "getrandom" -version = "0.2.12" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "94b22e06ecb0110981051723910cbf0b5f5e09a2062dd7663334ee79a9d1286c" dependencies = [ "cfg-if", "js-sys", @@ -662,9 +662,9 @@ checksum = 
"443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" [[package]] name = "histogram" -version = "0.9.1" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b634390eb8a63662e127836d4e2f26d7ae930600d4e05ee0fd85a009eeb1175" +checksum = "f4d3bddd75a32b17e75762f128ffc7a33158b933b6eb27424da9be4a58f30eb9" dependencies = [ "thiserror", ] @@ -1347,9 +1347,9 @@ checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" [[package]] name = "rayon" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4963ed1bc86e4f3ee217022bd855b297cef07fb9eac5dfa1f788b220b49b3bd" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" dependencies = [ "either", "rayon-core", @@ -1559,9 +1559,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.114" +version = "1.0.115" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0" +checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" dependencies = [ "itoa", "ryu", @@ -1607,7 +1607,7 @@ checksum = "9f1341053f34bb13b5e9590afb7d94b48b48d4b87467ec28e3c238693bb553de" [[package]] name = "sourmash" -version = "0.13.0" +version = "0.13.1" dependencies = [ "az", "byteorder", @@ -1624,6 +1624,7 @@ dependencies = [ "getset", "histogram", "itertools 0.12.1", + "js-sys", "log", "md5", "memmap2", @@ -1736,18 +1737,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.57" +version = "1.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" +checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.57" +version = "1.0.58" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" +checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" dependencies = [ "proc-macro2", "quote", diff --git a/Makefile b/Makefile index 9b26d91331..891b710732 100644 --- a/Makefile +++ b/Makefile @@ -56,6 +56,9 @@ last-tag: wasm: wasm-pack build src/core -d ../../pkg +wasm-test: + wasm-pack test --node src/core + wasi: cargo wasi build diff --git a/README.md b/README.md index f12d6a65ce..702a729dd9 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,9 @@ Quickly search, compare, and analyze genomic and metagenomic data sets. License: 3-Clause BSD [![Documentation](https://readthedocs.org/projects/sourmash/badge/?version=latest)](http://sourmash.readthedocs.io/en/latest/) [![Gitter](https://badges.gitter.im/sourmash-bio/community.svg)](https://gitter.im/sourmash-bio/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) + [![DOI](http://joss.theoj.org/papers/10.21105/joss.00027/status.svg)](http://joss.theoj.org/papers/10.21105/joss.00027) +[![pyOpenSci](https://tinyurl.com/y22nb8up)](https://github.com/pyOpenSci/software-submission/issues/129) [![Bioconda install](https://img.shields.io/conda/dn/bioconda/sourmash.svg?style=flag&label=Bioconda)](https://anaconda.org/bioconda/sourmash) PyPI diff --git a/doc/command-line.md b/doc/command-line.md index 71173792cf..90633d342e 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -1914,7 +1914,10 @@ will continue processing input sequences. ### `sourmash signature manifest` - output a manifest for a file -Output a manifest for a file, database, or collection. +Output a manifest for a file, database, or collection. Note that +these manifests are not usually suitable for use as standalone +manifests; the `sourmash sig collect` and `sourmash sig check` +commands produce standalone manifests. 
For example, ``` @@ -1942,8 +1945,10 @@ CSV and SQLite manifest files. ### `sourmash signature check` - compare picklists and manifests -Compare picklists and manifests across databases, and optionally output matches -and missing items. +Compare picklists and manifests across databases, and optionally +output matches and missing items. In particular, `sig check` can be +used to create standalone manifests for a subset of a large collection, +using picklists. For example, ``` @@ -1962,17 +1967,28 @@ collections of signatures and identifiers. With `-m/--save-manifest-matching`, `sig check` creates a standalone manifest. In these manifests, sourmash v4 will by default write paths to the matched elements that are relative to the current working -directory. In some cases - when the output manifest is in different +directory. In some cases - when the output manifest is in a different directory - this will create manifests that do not work properly with sourmash. The `--relpath` argument will rewrite the paths to be relative to the manifest, while the `--abspath` argument will rewrite paths to be absolute. The `--relpath` behavior will be the default in sourmash v5. +Standalone manifests created with `-m/--save-manifest-matching` will +use the paths given to `sig check` on the command line; we recommend +using zip files and sig files, and avoiding directory hierarchies or +path lists. You can use `--from-file` to pass in long lists of +filenames via a text file. + ### `sourmash signature collect` - collect manifests across databases Collect manifests from across (many) files and merge into a single -standalone manifest. +standalone manifest. Standalone manifests can be used directly as a +sourmash database; they support efficient searching and selection of +sketches, as well as lazy loading of individual sketches from large +collections. See +[advanced usage information on sourmash databases](databases-advanced.md) +for more information. 
For example, ``` @@ -1987,20 +2003,30 @@ This manifest file can be loaded directly from the command line by sourmash. particularly useful when working with large collections of signatures and identifiers, and has command line options for merging and updating manifests. +The standalone manifests created by `sig collect` will reference the +paths given on the command line; we recommend using zip files and sig +files, and avoiding directory hierarchies or path lists. You can also +use `--from-file` to pass in long lists of filenames. + +Standalone manifests produced by `sig collect` work most efficiently +when constructed from many small zip file collections. + As with `sig check`, the standalone manifests created by `sig collect` in sourmash v4 will by default write paths to the matched elements relative to the current working directory. When the output manifest -is in a different directory, this will create manifests that do not work -properly with sourmash. The `--relpath` argument will rewrite the -paths to be relative to the manifest, while the `--abspath` argument -will rewrite paths to be absolute. The `--relpath` behavior will be -the default in sourmash v5. +is in a different directory, this will create manifests that do not +work properly with sourmash. The `--relpath` argument will rewrite +the paths to be relative to the manifest, while the `--abspath` +argument will rewrite paths to be absolute. The `--relpath` behavior +will be the default in sourmash v5. ## Advanced command-line usage ### Loading signatures and databases -sourmash uses several different command-line styles. +sourmash uses several different command-line styles. Most sourmash +commands can load sketches from any standard collection type; we +primarily recommend using zipfiles (but read on!) Briefly, @@ -2011,22 +2037,18 @@ Briefly, need to provide a selector (ksize with `-k`, moltype with `--dna` etc, or md5sum with `--query-md5`) that picks out a single signature. 
-* `compare` takes multiple signatures and can load them from files, - directories, and indexed databases (SBT or LCA). It can also take - a list of file paths in a text file, using `--from-file` (see below). +* `compare` takes multiple signatures and can load them from any + sourmash collection type. * the `lca classify` and `lca summarize` commands take multiple signatures with `--query`, and multiple LCA databases, with `--db`. `sourmash multigather` also uses this style. This allows these commands to specify multiple queries **and** multiple databases without - (too much) confusion. These commands will take files containing - signature files using `--query-from-file` (see below). + (too much) confusion. The databases must be LCA databases. * `index` and `lca index` take a few fixed parameters (database name, and for `lca index`, a taxonomy file) and then an arbitrary number of - other files that contain signatures, including files, directories, - and indexed databases. These commands will also take `--from-file` - (see below). + other files that contain signatures. None of these commands currently support searching, comparing, or indexing signatures with multiple ksizes or moltypes at the same time; you need @@ -2092,7 +2114,7 @@ The following `coltype`s are currently supported for picklists: * `gather` - use the CSV output of `sourmash gather` as a picklist * `prefetch` - use the CSV output of `sourmash prefetch` as a picklist * `search` - use the CSV output of `sourmash prefetch` as a picklist -* `manifest` - use the CSV output of `sourmash sig manifest` as a picklist +* `manifest` - use CSV manifests produced by `sig manifest` as a picklist Identifiers are constructed by using the first space delimited word in the signature name. @@ -2101,7 +2123,7 @@ One way to build a picklist is to use `sourmash sig grep --csv out.csv` to construct a CSV file containing a list of all sketches that match the pattern (which can be a string or regexp). 
The `out.csv` file can be used as a picklist via the picklist -manifest format with `--picklist out.csv::manifest`. +manifest CSV format with `--picklist out.csv::manifest`. You can also use `sourmash sig describe --csv out.csv ` or `sourmash sig manifest -o out.csv ` to construct an @@ -2144,7 +2166,9 @@ slow, especially for many (100s or 1000s) of signatures. All of the `sourmash` commands support loading collections of signatures from zip files. You can create a compressed collection of signatures using `sourmash sig cat *.sig -o collections.zip` and then -specifying `collections.zip` on the command line in place of `*.sig`. +specifying `collections.zip` on the command line in place of `*.sig`; +you can also sketch FASTA/FASTQ files directly into a zip file with +`-o collections.zip`. ### Choosing signature output formats @@ -2171,7 +2195,7 @@ to stdout. All of these save formats can be loaded by sourmash commands. **We strongly suggest using .zip files to store signatures: they are fast, -small, and fully supported by all the sourmash commands.** +small, and fully supported by all the sourmash commands and API.** Note that when outputting large collections of signatures, some save formats require holding all the sketches in memory until they can be @@ -2186,19 +2210,6 @@ databases!](databases-advanced.md) ### Loading many signatures -#### Loading signatures within a directory hierarchy - -All of the `sourmash` commands support loading signatures from -beneath directories; provide the paths on the command line. - -#### Passing in lists of files - -Most sourmash commands will also take a `--from-file` or -`--query-from-file`, which will take the location of a text file containing -a list of file paths. This can be useful for situations where you want -to specify thousands of queries, or a subset of signatures produced by -some other command. - #### Indexed databases Indexed databases can make searching signatures much faster. 
SBT @@ -2209,9 +2220,6 @@ SQLite databases (new in sourmash v4.4.0) are typically larger on disk than SBTs and LCAs, but in turn are fast to load and support very low memory search. -(LCA databases also directly permit taxonomic searches using `sourmash lca` -functions.) - Commands that take multiple signatures or collections of signatures will also work with indexed databases. @@ -2223,9 +2231,9 @@ only at one scaled value. If the database signature type is incompatible with the other signatures, sourmash will complain appropriately. -In contrast, signature files, zip collections, and directory -hierarchies can contain many different types of signatures, and -compatible ones will be selected automatically. +In contrast, signature files and zip collections can contain many +different types of signatures, and compatible ones will be selected +automatically. Use the `sourmash index` command to create an SBT. @@ -2235,6 +2243,29 @@ database can be saved in JSON or SQL format with `-F json` or `-F sql`. Use `sourmash sig cat -o .sqldb` to create a SQLite indexed database. +#### Loading signatures within a directory hierarchy + +All of the `sourmash` commands support loading signatures (`.sig` or +`.sig.gz` files) from within directory hierarchies; you can just +provide the paths to the top-level directory on the command line. + +However, this is no longer recommended because it can be very +inefficient; we instead suggest passing all of the sketch files in +the directory into `sig collect` to build a standalone manifest, or +using `sig cat` on the directory to generate a zip file. + +#### Passing in lists of files + +sourmash commands support `--from-file` or `--query-from-file`, which +will take the location of a text file containing a list of file +paths. This can be useful for situations where you want to specify +thousands of queries, or a subset of signatures produced by some other +command. 
+ +This is no longer recommended when using large collections; we instead +suggest using standalone manifests built with `sig collect` and `sig +check`, which will include extra metadata that supports fast loading. + ### Combining search databases on the command line All of the commands in sourmash operate in "online" mode, so you can @@ -2242,7 +2273,7 @@ combine multiple databases and signatures on the command line and get the same answer as if you built a single large database from all of them. The only caveat to this rule is that if you have multiple identical matches present across the databases, the order in which -they are found will differ depending on the order that the files are +they are used may depend on the order that the files are passed in on the command line. ### Using stdin @@ -2250,11 +2281,12 @@ passed in on the command line. Most commands will take signature JSON data via stdin using the usual UNIX convention, `-`. Moreover, `sourmash sketch` and the `sourmash sig` commands will output to stdout. So, for example, +``` +sourmash sketch ... -o - | sourmash sig describe - +``` +will describe the signatures that were just created. -`sourmash sketch ... -o - | sourmash sig describe -` will describe the -signatures that were just created. - -### Using manifests to explicitly refer to collections of files +### Using standalone manifests to explicitly refer to collections of files (sourmash v4.4 and later) @@ -2264,9 +2296,9 @@ internals to speed up signature selection through picklists and pattern matching. Manifests can _also_ be used externally (via the command-line), and -may be useful for organizing large collections of signatures. They can -be generated with the `sig collect`, `sig manifest`, and `sig check` -subcommands. +these "standalone manifests" may be useful for organizing large +collections of signatures. They can be generated with the `sig +collect`, `sig manifest`, and `sig check` subcommands. 
Suppose you have a large collection of signatures (`.sig` or `.sig.gz` files) in a location (e.g., under a directory, or in a zip file). You @@ -2280,21 +2312,32 @@ sourmash sig fileinfo manifest.sqlmf ``` This manifest contains _references_ to the signatures (but not the signatures themselves) and can then be used as a database target for most -sourmash operations - search, gather, etc. +sourmash operations - search, gather, etc. Manifests support +fast selection and lazy loading of sketches in many situations. + +The `sig check` command can also be used to create standalone manifests +from collections using a picklist, with the `-m/--save-manifest-matching` +option. This is useful for commands that don't support picklists natively, +e.g. plugins and extensions. -Note that `sig collect` will generate manifests containing the -pathnames given to it - so if you use relative paths, the references -will be relative to the working directory in which `sig collect` was +Note that `sig collect` and `sig check` will generate manifests containing the +pathnames given to them - so if you use relative paths, the references +will be relative to the working directory in which the command was run. You can use `sig collect --abspath` to rewrite the paths -into absolute paths. +into absolute paths, or `sig collect --relpath` to rewrite the paths +relative to the manifest file. **Our advice:** We suggest using zip file collections for most -situations; we primarily recommend using explicit manifests for -situations where you have a **very large** collection of signatures -(1000s or more), and don't want to make multiple copies of signatures -in the collection (as you would have to, with a zipfile). This can be -useful if you want to refer to different subsets of the collection -without making multiple copies in a zip file. 
+situations; we strongly recommend using standalone manifests for +situations where you have **very large** sketches or a **very large** +collection of sketches (1000s or more), and don't want to make +multiple copies of signatures in the collection (as you would have to, +with a zipfile). This is particularly useful if you want to refer to different +subsets of the collection without making multiple copies in a zip +file. + +You can read more about the details of zip files and manifests in +[the advanced usage information for databases](databases-advanced.md). ### Using sourmash plugins diff --git a/doc/databases-advanced.md b/doc/databases-advanced.md index 9e4d1c25d7..2a1f61fd28 100644 --- a/doc/databases-advanced.md +++ b/doc/databases-advanced.md @@ -54,39 +54,83 @@ Both SBTs and LCA databases can only store homogenous collections of signature t We recommend SBT and LCA databases for use only in specific situations - e.g. SBTs are great for single-genome "best match" search for SBTs, and `sourmash lca` commands require LCA databases. -### Manifests - -Manifests are catalogs of signature metadata - name, molecule type, k-mer size, and other information - that can be used to select specific signatures for searching or processing. Typically when using manifests the actual signatures themselves are not loaded until they are needed, although the efficiency of this depends on the signature storage mechanism; for example, JSON-format containers (`.sig` and `.lca.json` files) must be entirely loaded before any signature in the file them can be used, unlike zip containers. - -As of sourmash 4.4 manifests can be *directly* loaded from the command line as standalone collections. This lets manifests serve as a catalog of signatures stored in many different locations. - -Standalone manifests are preferable to both directory storage and pathlists (below), because they support fast selection and direct lazy loading. 
They are the most effective solution for managing custom collections of thousands to millions of signatures. - -Standalone manifests can be created with `sourmash sig collect` -(sourmash v4.4 and later). - -Sourmash supports two manifest file formats - CSV and SQLite. SQLite manifests are much faster and lower-memory than CSV manifests in exchange for consuming some extra disk space. +### Standalone manifests + +Manifests are catalogs of signature metadata - name, molecule type, +k-mer size, and other information - that can be used to select +specific signatures for searching or processing. Typically when using +manifests the actual signatures themselves are not loaded until they +are needed, although the efficiency of this depends on the signature +storage mechanism; for example, JSON-format containers (`.sig` and +`.lca.json` files) must be entirely loaded before any signature in the +file can be used, unlike zip containers. + +As of sourmash 4.4 manifests can be *directly* loaded from the command +line as standalone collections. This lets manifests serve as a catalog +of signatures stored in many different locations. Sketches can be +selected by name, k-mer size, molecule type, and other features +without loading the actual sketch data. + +Standalone manifests are preferable to both directory storage and +pathlists (below), because they support fast selection and direct lazy +loading. This means that sourmash operations that support streaming or +online search (such as `prefetch` and `gather`, among others) can +avoid loading everything all at once. + +Standalone manifests are the most effective solution for managing custom +collections of thousands to millions of signatures, as well as working +with multiple large sketches. + +They can be created with `sourmash sig collect` and `sourmash sig +check` (sourmash v4.4 and later). + +Sourmash supports two manifest file formats - CSV and SQLite. 
SQLite +manifests are much faster and lower-memory than CSV manifests. ### Directories -Directory hierarchies of signatures are read natively by sourmash, and can be created or extended by specifying `-o dirname/` (with a trailing slash). +Directory hierarchies of signatures are read natively by sourmash, and +can be created or extended by specifying `-o dirname/` (with a +trailing slash). -To read from a directory, specify the directory name on the sourmash command line. When reading from directories, the entire directory hierarchy is traversed and all `.sig` and `.sig.gz` files are loaded as signatures. If `--force` is specified, _all_ files will be read, and failures will be ignored. +To read from a directory, specify the directory name on the sourmash +command line. When reading from directories, the entire directory +hierarchy is traversed and all `.sig` and `.sig.gz` files are loaded +as signatures. If `--force` is specified, _all_ files will be read, +and failures will be ignored. -When directories are specified as outputs, the signatures will be saved by their complete md5sum underneath the directory. +When directories are specified as outputs, the signatures will be +saved by their complete md5sum underneath the directory. -We don't particularly recommend storing signatures in directory hierarchies, since most of their use cases are now covered by other approaches. +We don't recommend loading signatures from directory hierarchies, +since the implementation is not particularly memory efficient and most +of the use cases for directories are now covered by other approaches - +in particular, standalone manifests. ### Pathlists -Pathlists are text files containing paths to one or more sourmash databases; any type of sourmash-readable collection can be listed. +Pathlists are text files containing paths to one or more sourmash +databases; any type of sourmash-readable collection can be listed. 
-The paths in pathlists can be relative or absolute within the file system. If they are relative, they must resolve with respect to the current working directory of the sourmash command. +The paths in pathlists can be relative or absolute within the file +system. If they are relative, they must resolve with respect to the +current working directory of the sourmash command. -We don't recommend using pathlists any more, since the original use cases are now supported with picklists, but they are still supported! +We don't recommend using pathlists, since the original use cases are +now supported with picklists and standalone manifests, but they are +still supported. Loading sketches from pathlists is also not very +efficient. Pathlists are not output by any sourmash commands. +Many commands support `--query-from-file` or `--from-file` as a way to +pass in a file containing many paths to sketches or collections. The +internal implementation of sourmash simply adds these to the +command-line arguments, and this is an effective and efficient way to +provide long lists of files to commands like `sig check` and `sig +collect` that create standalone manifests to support efficient lazy +loading. + ## Storing taxonomies sourmash supports taxonomic information output via the `sourmash lca` and `sourmash tax` subcommands. Both sets of commands rely on the same 7 taxonomic ranks: superkingdom, phylum, class, order, family, genus, and species (with limited support for a 'strain' rank). And both sets of subcommands take lineage spreadsheets that link specific identifiers to taxonomic lineages. diff --git a/doc/faq.md b/doc/faq.md index d8d9da0622..227952ff40 100644 --- a/doc/faq.md +++ b/doc/faq.md @@ -139,7 +139,7 @@ you use [the precomputed databases](databases.md), you will always end up using your query sketches at a minimum scaled of 1000, even if you created them with a lower scaled value. 
-Please also see [What resolution should my signatures be?](using-sourmash-a-guide.md#what-resolution-should-my-signatures-be-how-should-i-create-them). +Please also see [What resolution should my signatures be?](using-sourmash-a-guide.md#what-resolution-should-my-signatures-be-and-how-should-i-create-them). ## What threshold-bp value should I use with `sourmash prefetch` and `sourmash gather`? diff --git a/doc/release-notes/sourmash-2.0.md b/doc/release-notes/sourmash-2.0.md index c3b8647dd5..fbb541ad49 100644 --- a/doc/release-notes/sourmash-2.0.md +++ b/doc/release-notes/sourmash-2.0.md @@ -23,7 +23,7 @@ This is a list of substantial new features and functionality in sourmash 2.0. * Created [precomputed databases](../databases.md) for most of GenBank genomes. * Added taxonomic reporting functionality in the `sourmash lca` submodule - [see command-line docs](../command-line.md#sourmash-lca-subcommands-for-in-memory-taxonomy-integration). * Added signature manipulation utilities in the `sourmash signature` submodule - [see command-line docs](../command-line.md#sourmash-signature-subcommands-for-signature-manipulation) -* Introduced new modulo hash or "scaled" signatures for containment analysis; see [Using sourmash: a practical guide](../using-sourmash-a-guide.md#what-resolution-should-my-signatures-be--how-should-i-create-them) and [more details in the Python API examples](../api-example.md#advanced-features-of-sourmash-minhash-objects---scaled-and-num). +* Introduced new modulo hash or "scaled" signatures for containment analysis; see [Using sourmash: a practical guide](../using-sourmash-a-guide.md#what-resolution-should-my-signatures-be-and-how-should-i-create-them) and [more details in the Python API examples](../api-example.md#advanced-features-of-sourmash-minhash-objects---scaled-and-num). * Switched to using JSON instead of YAML for signatures. * Many performance optimizations! * Many more tests! 
diff --git a/doc/sourmash-sketch.md b/doc/sourmash-sketch.md index caba1a19a8..5ad43d266e 100644 --- a/doc/sourmash-sketch.md +++ b/doc/sourmash-sketch.md @@ -146,7 +146,7 @@ Some of the key command-line options supported by `fromfile` are: * `-o/--output-signatures` will save generated signatures to any of the [standard supported output formats](command-line.md#choosing-signature-output-formats). * `-o/--output-csv-info` will save a CSV file of input filenames and parameter strings for use with the `sourmash sketch` command line; this can be used to construct signatures in parallel. * `--already-done` will take a list of existing signatures/databases to check against; signatures with matching names and parameter strings will not be rebuilt. -* `--output-manifest-matching` will output a manifest of already-existing signatures, which can then be used with `sourmash sig cat` to collate signatures across databases; see [using manifests](command-line.md#using-manifests-to-explicitly-refer-to-collections-of-files). (This provides [`sourmash sig check` functionality](command-line.md#sourmash-signature-check---compare-picklists-and-manifests) in `sketch fromfile`.) +* `--output-manifest-matching` will output a manifest of already-existing signatures, which can then be used with `sourmash sig cat` to collate signatures across databases; see [using manifests](command-line.md#using-standalone-manifests-to-explicitly-refer-to-collections-of-files). (This provides [`sourmash sig check` functionality](command-line.md#sourmash-signature-check---compare-picklists-and-manifests) in `sketch fromfile`.) If you would like help and advice on constructing large databases, or pointers to code for generating the `fromfile` CSV format, please ask @@ -200,8 +200,8 @@ The `-p` argument to `sourmash sketch` provides parameter strings to sourmash, a A parameter string is a space-delimited collection that can contain one or more fields, comma-separated. 
* `k=` - create a sketch at this k-mer size; can provide more than one time in a parameter string. Typically `ksize` is between 4 and 100. -* `scaled=` - create a scaled MinHash with k-mers sampled deterministically at 1 per `` value. This controls sketch compression rates and resolution; for example, a 5 Mbp genome sketched with a scaled of 1000 would yield approximately 5,000 k-mers. `scaled` is incompatible with `num`. See [our guide to signature resolution](using-sourmash-a-guide.md#what-resolution-should-my-signatures-be--how-should-i-create-them) for more information. -* `num=` - create a standard MinHash with no more than `` k-mers kept. This will produce sketches identical to [mash sketches](https://mash.readthedocs.io/en/latest/). `num` is incompatible with `scaled`. See [our guide to signature resolution](using-sourmash-a-guide.md#what-resolution-should-my-signatures-be--how-should-i-create-them) for more information. +* `scaled=` - create a scaled MinHash with k-mers sampled deterministically at 1 per `` value. This controls sketch compression rates and resolution; for example, a 5 Mbp genome sketched with a scaled of 1000 would yield approximately 5,000 k-mers. `scaled` is incompatible with `num`. See [our guide to signature resolution](using-sourmash-a-guide.md#what-resolution-should-my-signatures-be-and-how-should-i-create-them) for more information. +* `num=` - create a standard MinHash with no more than `` k-mers kept. This will produce sketches identical to [mash sketches](https://mash.readthedocs.io/en/latest/). `num` is incompatible with `scaled`. See [our guide to signature resolution](using-sourmash-a-guide.md#what-resolution-should-my-signatures-be-and-how-should-i-create-them) for more information. * `abund` / `noabund` - create abundance-weighted (or not) sketches. See [Classify signatures: Abundance Weighting](classifying-signatures.md#abundance-weighting) for details of how this works. 
* `dna`, `protein`, `dayhoff`, `hp` - create this kind of sketch. Note that `sourmash sketch dna -p protein` and `sourmash sketch protein -p dna` are invalid; please use `sourmash sketch translate` for the former. * `seed=` - set the random number seed used for k-mer hashing. This is for advanced users who want to choose a completely different set of k-mers for sketches! The default is 42. diff --git a/doc/using-sourmash-a-guide.md b/doc/using-sourmash-a-guide.md index 29ccc52ec1..a3600c1337 100644 --- a/doc/using-sourmash-a-guide.md +++ b/doc/using-sourmash-a-guide.md @@ -41,7 +41,7 @@ however, and it probably doesn't really matter. (When we have blog posts or publications providing more formal guidance, we'll link to them here!) -## What resolution should my signatures be / how should I create them? +## What resolution should my signatures be and how should I create them? sourmash supports two ways of choosing the resolution or size of your signatures: using `num` to specify the maximum number of hashes, diff --git a/flake.nix b/flake.nix index 06ecc32fb4..5c3795f1d4 100644 --- a/flake.nix +++ b/flake.nix @@ -68,7 +68,7 @@ sourmash = python.buildPythonPackage ( commonArgs // rec { pname = "sourmash"; - version = "4.8.6"; + version = "4.8.8"; format = "pyproject"; cargoDeps = rustPlatform.importCargoLock { @@ -128,6 +128,7 @@ cargo-outdated cargo-udeps cargo-deny + cargo-wasi #cargo-semver-checks nixpkgs-fmt ]; diff --git a/pyproject.toml b/pyproject.toml index 291c732093..083016d1c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ build-backend = 'maturin' name = "sourmash" description = "tools for comparing biological sequences with k-mer sketches" readme = "README.md" -version = "4.8.7-dev" +version = "4.8.8" authors = [ { name="Luiz Irber", orcid="0000-0003-4371-9659" }, @@ -101,7 +101,7 @@ license = { text = "BSD 3-Clause License" } [project.optional-dependencies] test = [ "pytest>=6.2.4,<8.2.0", - "pytest-cov>=4,<5.0", + "pytest-cov>=4,<6.0", 
"pytest-xdist>=3.1", "pyyaml>=6,<7", "diff-cover>=7.3", diff --git a/src/core/CHANGELOG.md b/src/core/CHANGELOG.md index 67a3134144..ac4d169e80 100644 --- a/src/core/CHANGELOG.md +++ b/src/core/CHANGELOG.md @@ -5,6 +5,29 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [unreleased] + +## [0.13.1] - 2024-03-23 + +MSRV: 1.65 + +Changes/additions: + +* Implement file parsing for webassembly (#3047) +* fix `calculate_gather_stats` `threshold=0` bug (#3052) +* fix clippy beta issues (#3088) + +Updates: + +* Bump wasm-bindgen-test from 0.3.41 to 0.3.42 (#3063) +* Bump web-sys from 0.3.68 to 0.3.69 (#3061) +* Bump log from 0.4.20 to 0.4.21 (#3062) +* Bump rayon from 1.8.1 to 1.9.0 (#3058) +* Bump tempfile from 3.10.0 to 3.10.1 (#3059) +* Bump serde_json from 1.0.113 to 1.0.114 (#3044) +* Bump serde from 1.0.196 to 1.0.197 (#3045) +* Bump itertools from 0.12.0 to 0.12.1 (#3043) + ## [0.13.0] - 2024-02-23 MSRV: 1.65 @@ -17,6 +40,7 @@ Changes/additions: * make core Manifest booleans python compatible (core) (#3007) Updates: + * Bump roaring from 0.10.2 to 0.10.3 (#3014) * Bump histogram from 0.9.0 to 0.9.1 (#3002) * Bump chrono from 0.4.33 to 0.4.34 (#3000) @@ -287,7 +311,11 @@ Fixed: - Fix mem leak in get_mins (#807) - Fixes for WASI and WASM compilation (#771) (#723) -[unreleased]: https://github.com/sourmash-bio/sourmash/compare/r0.11.0...HEAD +[unreleased]: https://github.com/sourmash-bio/sourmash/compare/r0.13.1...HEAD +[0.13.1]: https://github.com/sourmash-bio/sourmash/compare/r0.13.0...r0.13.1 +[0.13.0]: https://github.com/sourmash-bio/sourmash/compare/r0.12.1...r0.13.0 +[0.12.1]: https://github.com/sourmash-bio/sourmash/compare/r0.12.0...r0.12.1 +[0.12.0]: https://github.com/sourmash-bio/sourmash/compare/r0.11.0...r0.12.0 [0.11.0]: 
https://github.com/sourmash-bio/sourmash/compare/r0.10.0...r0.11.0 [0.10.0]: https://github.com/sourmash-bio/sourmash/compare/r0.9.0...r0.10.0 [0.9.0]: https://github.com/sourmash-bio/sourmash/compare/r0.9.0...r0.10.0 diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 0f292db6d6..7be417cfd6 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -1,8 +1,8 @@ [package] name = "sourmash" -version = "0.13.0" -authors = ["Luiz Irber "] -description = "MinHash sketches for genomic data" +version = "0.13.1" +authors = ["Luiz Irber ", "N. Tessa Pierce-Ward "] +description = "tools for comparing biological sequences with k-mer sketches" repository = "https://github.com/sourmash-bio/sourmash" keywords = ["minhash", "bioinformatics"] categories = ["science", "algorithms", "data-structures"] @@ -32,17 +32,18 @@ camino = { version = "1.1.6", features = ["serde1"] } cfg-if = "1.0" counter = "0.5.7" csv = "1.3.0" -enum_dispatch = "0.3.12" +enum_dispatch = "0.3.13" finch = { version = "0.6.0", optional = true } fixedbitset = "0.4.0" getrandom = { version = "0.2", features = ["js"] } getset = "0.1.1" -histogram = "0.9.1" +histogram = "0.10.0" itertools = "0.12.1" log = "0.4.21" md5 = "0.7.0" memmap2 = "0.9.4" murmurhash3 = "0.0.5" +needletail = { version = "0.5.1", default-features = false } niffler = { version = "2.3.1", default-features = false, features = [ "gz" ] } nohash-hasher = "0.2.0" num-iter = "0.1.44" @@ -50,12 +51,12 @@ once_cell = "1.18.0" ouroboros = "0.18.3" piz = "0.5.0" primal-check = "0.3.1" -rayon = { version = "1.9.0", optional = true } +rayon = { version = "1.10.0", optional = true } rkyv = { version = "0.7.44", optional = true } roaring = "0.10.3" roots = "0.0.8" serde = { version = "1.0.197", features = ["derive"] } -serde_json = "1.0.114" +serde_json = "1.0.115" statrs = "0.16.0" streaming-stats = "0.2.3" thiserror = "1.0" @@ -64,8 +65,6 @@ typed-builder = "0.18.0" vec-collections = "0.4.3" [dev-dependencies] -criterion = "0.5.1" -needletail 
= { version = "0.5.1", default-features = false } proptest = { version = "1.4.0", default-features = false, features = ["std"]} rand = "0.8.2" tempfile = "3.10.1" @@ -95,17 +94,13 @@ skip_feature_sets = [ ## Wasm section. Crates only used for WASM, as well as specific configurations -[target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dependencies.wasm-bindgen] -version = "0.2.89" -features = ["serde-serialize"] +[target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dependencies] +js-sys = "0.3.68" +web-sys = { version = "0.3.69", features = ["console", "File", "FileReaderSync"] } +wasm-bindgen = { version = "0.2.89", features = ["serde-serialize"] } -[target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dependencies.web-sys] -version = "0.3.69" -features = ["console", "File"] - -[target.'cfg(all(target_arch = "wasm32"))'.dependencies.chrono] -version = "0.4.32" -features = ["wasmbind"] +[target.'cfg(all(target_arch = "wasm32"))'.dependencies] +chrono = { version = "0.4.32", features = ["wasmbind"] } [target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dev-dependencies] wasm-bindgen-test = "0.3.42" @@ -113,3 +108,5 @@ wasm-bindgen-test = "0.3.42" ### These crates don't compile on wasm [target.'cfg(not(target_arch = "wasm32"))'.dependencies] rocksdb = { version = "0.21.0", optional = true } +[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies] +criterion = "0.5.1" diff --git a/src/core/src/collection.rs b/src/core/src/collection.rs index 8cc6129cf4..9f708381ef 100644 --- a/src/core/src/collection.rs +++ b/src/core/src/collection.rs @@ -6,8 +6,7 @@ use camino::Utf8PathBuf as PathBuf; use crate::encodings::Idx; use crate::manifest::{Manifest, Record}; use crate::prelude::*; -use crate::signature::Signature; -use crate::storage::{FSStorage, InnerStorage, MemStorage, SigStore, Storage, ZipStorage}; +use crate::storage::{FSStorage, InnerStorage, MemStorage, SigStore, ZipStorage}; use crate::{Error, Result}; #[cfg(feature 
= "parallel")] diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index ac69cd58eb..f8934596dc 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -1,8 +1,6 @@ use serde::{Deserialize, Serialize}; use std::collections::HashMap; -use std::convert::TryFrom; use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}; -use std::iter::Iterator; use std::str; use nohash_hasher::BuildNoHashHasher; diff --git a/src/core/src/manifest.rs b/src/core/src/manifest.rs index c82ca6ee1e..7441a9b69f 100644 --- a/src/core/src/manifest.rs +++ b/src/core/src/manifest.rs @@ -1,4 +1,3 @@ -use std::convert::TryInto; use std::fs::File; use std::io::{BufRead, BufReader, Read, Write}; use std::ops::Deref; @@ -12,7 +11,7 @@ use serde::{Deserialize, Serialize}; use crate::encodings::HashFunctions; use crate::prelude::*; -use crate::signature::{Signature, SigsTrait}; +use crate::signature::SigsTrait; use crate::sketch::Sketch; use crate::Result; diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index da38587dc3..0ab8190f98 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -6,7 +6,6 @@ use core::iter::FusedIterator; use std::fs::File; use std::io; -use std::iter::Iterator; use std::path::Path; use std::str; @@ -18,7 +17,6 @@ use typed_builder::TypedBuilder; use crate::encodings::{aa_to_dayhoff, aa_to_hp, revcomp, to_aa, HashFunctions, VALID}; use crate::prelude::*; -use crate::selection::{Select, Selection}; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; use crate::Error; @@ -891,7 +889,6 @@ impl PartialEq for Signature { #[cfg(test)] mod test { - use std::convert::TryInto; use std::fs::File; use std::io::{BufReader, Read}; use std::path::PathBuf; diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 24cdc9539f..1ee747745a 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -2,7 +2,7 @@ use std::cmp::Ordering; use 
std::collections::{BTreeMap, BTreeSet}; use std::f64::consts::PI; use std::fmt::Write; -use std::iter::{Iterator, Peekable}; +use std::iter::Peekable; use std::str; use std::sync::Mutex; @@ -942,56 +942,6 @@ impl> Iterator for Intersection { } } -struct Union> { - iter: Peekable, - other: Peekable, -} - -impl> Iterator for Union { - type Item = T; - - fn next(&mut self) -> Option { - let res = match (self.iter.peek(), self.other.peek()) { - (Some(ref left_key), Some(ref right_key)) => left_key.cmp(right_key), - (None, Some(_)) => { - return self.other.next(); - } - (Some(_), None) => { - return self.iter.next(); - } - _ => return None, - }; - - match res { - Ordering::Less => self.iter.next(), - Ordering::Greater => self.other.next(), - Ordering::Equal => { - self.other.next(); - self.iter.next() - } - } - } -} - -#[cfg(test)] -mod test { - use super::Union; - - #[test] - fn test_union() { - let v1 = [1u64, 2, 4, 10]; - let v2 = [1u64, 3, 4, 9]; - - let union: Vec = Union { - iter: v1.iter().peekable(), - other: v2.iter().peekable(), - } - .cloned() - .collect(); - assert_eq!(union, [1, 2, 3, 4, 9, 10]); - } -} - //############# // A MinHash implementation for low scaled or large cardinalities diff --git a/src/core/src/wasm.rs b/src/core/src/wasm.rs index c2a0eb6c30..cd9efec091 100644 --- a/src/core/src/wasm.rs +++ b/src/core/src/wasm.rs @@ -4,6 +4,7 @@ #[global_allocator] static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT; +use needletail::parse_fastx_reader; use wasm_bindgen::prelude::*; use crate::cmd::ComputeParameters as _ComputeParameters; @@ -57,15 +58,15 @@ impl KmerMinHash { } #[wasm_bindgen] - pub fn add_sequence_js(&mut self, buf: &str) { - self.0 - .add_sequence(buf.as_bytes(), true) - .expect("Error adding sequence"); + pub fn add_sequence_js(&mut self, buf: &str) -> Result<(), JsErrors> { + self.0.add_sequence(buf.as_bytes(), true)?; + Ok(()) } #[wasm_bindgen] - pub fn to_json(&mut self) -> String { - serde_json::to_string(&self.0).unwrap() 
+ pub fn to_json(&mut self) -> Result { + let json = serde_json::to_string(&self.0)?; + Ok(json) } } @@ -81,6 +82,40 @@ impl ComputeParameters { pub fn set_ksizes(&mut self, ksizes: Vec) { self.0.set_ksizes(ksizes); } + + #[wasm_bindgen] + pub fn set_scaled(&mut self, scaled: u32) { + self.0.set_scaled(scaled as u64); + } + + #[wasm_bindgen] + pub fn set_num(&mut self, num: u32) { + self.0.set_num_hashes(num); + } + + #[wasm_bindgen] + pub fn set_protein(&mut self, is_protein: bool) { + self.0.set_protein(is_protein); + } + + #[wasm_bindgen] + pub fn set_dayhoff(&mut self, dayhoff: bool) { + self.0.set_dayhoff(dayhoff); + } + + #[wasm_bindgen] + pub fn set_hp(&mut self, hp: bool) { + self.0.set_hp(hp); + } + + #[wasm_bindgen] + pub fn set_track_abundance(&mut self, track: bool) { + self.0.set_track_abundance(track); + } + #[wasm_bindgen] + pub fn set_seed(&mut self, seed: u32) { + self.0.set_seed(seed.into()); + } } #[wasm_bindgen] @@ -93,20 +128,39 @@ impl Signature { } #[wasm_bindgen] - pub fn add_sequence_js(&mut self, buf: &str) { - self.0 - .add_sequence(buf.as_bytes(), true) - .expect("Error adding sequence"); + pub fn add_sequence_js(&mut self, buf: &str) -> Result<(), JsErrors> { + self.0.add_sequence(buf.as_bytes(), true)?; + + Ok(()) } #[wasm_bindgen] - pub fn add_from_file(&mut self, fp: web_sys::File) { - unimplemented!() + pub fn add_from_file( + &mut self, + fp: web_sys::File, + callback: Option, + ) -> Result<(), JsErrors> { + let wf = SyncFile::new(fp, callback); + + let (rdr, _format) = niffler::send::get_reader(Box::new(wf))?; + + let mut parser = parse_fastx_reader(std::io::BufReader::with_capacity( + 1024 << 14, // 16 MiB + rdr, + ))?; + + while let Some(record) = parser.next() { + let record = record?; + self.0.add_sequence(&record.seq(), true)?; + } + + Ok(()) } #[wasm_bindgen] - pub fn to_json(&mut self) -> String { - serde_json::to_string(&self.0).unwrap() + pub fn to_json(&mut self) -> Result { + let json = serde_json::to_string(&self.0)?; 
+ Ok(json) } pub fn size(&self) -> usize { @@ -114,6 +168,28 @@ impl Signature { } } +#[derive(thiserror::Error, Debug)] +pub enum JsErrors { + #[error(transparent)] + SourmashError(#[from] crate::Error), + + #[error(transparent)] + SerdeError(#[from] serde_json::error::Error), + + #[error(transparent)] + NifflerError(#[from] niffler::Error), + + #[error(transparent)] + NeedletailError(#[from] needletail::errors::ParseError), +} + +impl Into for JsErrors { + fn into(self) -> JsValue { + let error = js_sys::Error::new(&self.to_string()); + error.into() + } +} + #[cfg(test)] mod test { use super::*; @@ -127,3 +203,92 @@ mod test { assert_eq!(sig.size(), 3); } } + +// ============================== + +use js_sys::Number; +use js_sys::Uint8Array; +use once_cell::sync::Lazy; +use web_sys::FileReaderSync; + +thread_local! { + static FILE_READER_SYNC: Lazy = Lazy::new(|| { + FileReaderSync::new().expect("Failed to create FileReaderSync. Is it running in a web worker context?") + }); +} + +/// Wrapper around a `web_sys::File` that implements `Read` and `Seek`. +pub struct SyncFile { + file: web_sys::File, + pos: u64, + cb: Option, +} + +/// Because this needs to be initialized in a Web Worker, it is safe to make it Send. +/// (hopefully. I don't think they can be sent across Web Workers, nor accessed from other WW) +unsafe impl Send for SyncFile {} + +impl SyncFile { + pub fn new(file: web_sys::File, cb: Option) -> Self { + Self { file, pos: 0, cb } + } + + /// File size in bytes. 
+ pub fn size(&self) -> u64 { + let size = self.file.size(); + if size <= Number::MAX_SAFE_INTEGER { + return size as u64; + } else { + panic!("size is not safe to convert to integer from float") + } + } + + fn set_pos(&mut self, pos: u64) { + self.pos = pos; + self.cb.as_ref().map(|f| { + let arr = js_sys::Array::new_with_length(1); + arr.set(0, self.progress().into()); + f.apply(&JsValue::null(), &arr) + .expect("Error calling progress callback"); + }); + } + + /// Current progress on the file + pub fn progress(&self) -> f64 { + self.pos as f64 / self.file.size() + } +} + +impl std::io::Read for SyncFile { + fn read(&mut self, buf: &mut [u8]) -> Result { + let current_offset = self.pos; + let new_offset_f64 = current_offset as f64; + let new_offset_end_f64 = current_offset.saturating_add( + u64::try_from(buf.len()).map_err(|_| std::io::Error::other("Can't convert to u64"))?, + ) as f64; + + let blob = self + .file + .slice_with_f64_and_f64(new_offset_f64, new_offset_end_f64) + .map_err(|_| std::io::Error::other("failed to slice file"))?; + let array_buffer = FILE_READER_SYNC + .with(|frs| frs.read_as_array_buffer(&blob)) + .map_err(|_| std::io::Error::other("failed to read as array buffer"))?; + + let array = Uint8Array::new(&array_buffer); + let read_bytes = usize::try_from(array.byte_length()) + .map_err(|_| std::io::Error::other("read too many bytes at once"))?; + + // Copy to output buffer + array.copy_to(&mut buf[..read_bytes]); + + // Update position + self.set_pos( + current_offset + .checked_add(read_bytes as u64) + .ok_or_else(|| std::io::Error::other("new position too large"))?, + ); + + Ok(read_bytes) + } +} diff --git a/src/core/tests/dedicated_worker.rs b/src/core/tests/dedicated_worker.rs new file mode 100644 index 0000000000..f7186a003f --- /dev/null +++ b/src/core/tests/dedicated_worker.rs @@ -0,0 +1,5 @@ +#![cfg(all(target_arch = "wasm32", target_os = "unknown"))] + +use wasm_bindgen_test::wasm_bindgen_test_configure; + 
+wasm_bindgen_test_configure!(run_in_dedicated_worker); diff --git a/src/core/tests/node.rs b/src/core/tests/node.rs new file mode 100644 index 0000000000..f846433061 --- /dev/null +++ b/src/core/tests/node.rs @@ -0,0 +1,8 @@ +#![cfg(all(target_arch = "wasm32", target_os = "unknown"))] + +use wasm_bindgen_test::*; + +#[wasm_bindgen_test] +fn pass() { + assert_eq!(1, 1); +} diff --git a/src/core/tests/service_worker.rs b/src/core/tests/service_worker.rs new file mode 100644 index 0000000000..dae9341d9e --- /dev/null +++ b/src/core/tests/service_worker.rs @@ -0,0 +1,5 @@ +#![cfg(all(target_arch = "wasm32", target_os = "unknown"))] + +use wasm_bindgen_test::wasm_bindgen_test_configure; + +wasm_bindgen_test_configure!(run_in_service_worker); diff --git a/src/core/tests/shared_worker.rs b/src/core/tests/shared_worker.rs new file mode 100644 index 0000000000..8d8bfc7a4f --- /dev/null +++ b/src/core/tests/shared_worker.rs @@ -0,0 +1,5 @@ +#![cfg(all(target_arch = "wasm32", target_os = "unknown"))] + +use wasm_bindgen_test::wasm_bindgen_test_configure; + +wasm_bindgen_test_configure!(run_in_shared_worker); diff --git a/src/core/tests/web.rs b/src/core/tests/web.rs new file mode 100644 index 0000000000..3bbc3dad61 --- /dev/null +++ b/src/core/tests/web.rs @@ -0,0 +1,5 @@ +#![cfg(all(target_arch = "wasm32", target_os = "unknown"))] + +use wasm_bindgen_test::wasm_bindgen_test_configure; + +wasm_bindgen_test_configure!(run_in_browser); diff --git a/src/sourmash/command_compute.py b/src/sourmash/command_compute.py index aac66def13..dbb3c42ad1 100644 --- a/src/sourmash/command_compute.py +++ b/src/sourmash/command_compute.py @@ -13,9 +13,15 @@ from .utils import RustObject from ._lowlevel import ffi, lib -DEFAULT_COMPUTE_K = "21,31,51" -DEFAULT_MMHASH_SEED = 42 -DEFAULT_LINE_COUNT = 1500 + +from .command_sketch import ( + _compute_individual, + _compute_merged, + ComputeParameters, + add_seq, + set_sig_name, + DEFAULT_MMHASH_SEED, +) def compute(args): diff --git 
a/src/sourmash/command_sketch.py b/src/sourmash/command_sketch.py index 508cac7c01..e98212f8c1 100644 --- a/src/sourmash/command_sketch.py +++ b/src/sourmash/command_sketch.py @@ -12,18 +12,14 @@ import sourmash from .signature import SourmashSignature from .logging import notify, error, set_quiet, print_results -from .command_compute import ( - _compute_individual, - _compute_merged, - ComputeParameters, - add_seq, - set_sig_name, - DEFAULT_MMHASH_SEED, -) from sourmash import sourmash_args from sourmash.sourmash_args import check_scaled_bounds, check_num_bounds from sourmash.sig.__main__ import _summarize_manifest, _SketchInfo from sourmash.manifest import CollectionManifest +from .utils import RustObject +from ._lowlevel import ffi, lib + +DEFAULT_MMHASH_SEED = 42 DEFAULTS = dict( dna="k=31,scaled=1000,noabund", @@ -637,3 +633,452 @@ def fromfile(args): notify( f"** {total_sigs} total requested; output {total_sigs - skipped_sigs}, skipped {skipped_sigs}" ) + + +class _signatures_for_compute_factory: + "Build signatures on demand, based on args input to 'compute'." + + def __init__(self, args): + self.args = args + + def __call__(self): + args = self.args + params = ComputeParameters( + ksizes=args.ksizes, + seed=args.seed, + protein=args.protein, + dayhoff=args.dayhoff, + hp=args.hp, + dna=args.dna, + num_hashes=args.num_hashes, + track_abundance=args.track_abundance, + scaled=args.scaled, + ) + sig = SourmashSignature.from_params(params) + return [sig] + + +def _compute_individual(args, signatures_factory): + # this is where output signatures will go. + save_sigs = None + + # track: is this the first file? in cases where we have empty inputs, + # we don't want to open any outputs. + first_file_for_output = True + + # if args.output is set, we are aggregating all output to a single file. + # do not open a new output file for each input. 
+ open_output_each_time = True + if args.output: + open_output_each_time = False + + for filename in args.filenames: + if open_output_each_time: + # for each input file, construct output filename + sigfile = os.path.basename(filename) + ".sig" + if args.output_dir: + sigfile = os.path.join(args.output_dir, sigfile) + + # does it already exist? skip if so. + if os.path.exists(sigfile) and not args.force: + notify("skipping {} - already done", filename) + continue # go on to next file. + + # nope? ok, let's save to it. + assert not save_sigs + save_sigs = sourmash_args.SaveSignaturesToLocation(sigfile) + + # + # calculate signatures! + # + + # now, set up to iterate over sequences. + with screed.open(filename) as screed_iter: + if not screed_iter: + notify(f"no sequences found in '{filename}'?!") + continue + + # open output for signatures + if open_output_each_time: + save_sigs.open() + # or... is this the first time to write something to args.output? + elif first_file_for_output: + save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) + save_sigs.open() + first_file_for_output = False + + # make a new signature for each sequence? + if args.singleton: + n_calculated = 0 + for n, record in enumerate(screed_iter): + sigs = signatures_factory() + try: + add_seq( + sigs, + record.sequence, + args.input_is_protein, + args.check_sequence, + ) + except ValueError as exc: + error(f"ERROR when reading from '{filename}' - ") + error(str(exc)) + sys.exit(-1) + + n_calculated += len(sigs) + set_sig_name(sigs, filename, name=record.name) + save_sigs_to_location(sigs, save_sigs) + + notify( + "calculated {} signatures for {} sequences in {}", + n_calculated, + n + 1, + filename, + ) + + # nope; make a single sig for the whole file + else: + sigs = signatures_factory() + + # consume & calculate signatures + notify(f"... 
reading sequences from {filename}") + name = None + for n, record in enumerate(screed_iter): + if n % 10000 == 0: + if n: + notify("\r...{} {}", filename, n, end="") + elif args.name_from_first: + name = record.name + + try: + add_seq( + sigs, + record.sequence, + args.input_is_protein, + args.check_sequence, + ) + except ValueError as exc: + error(f"ERROR when reading from '{filename}' - ") + error(str(exc)) + sys.exit(-1) + + notify("...{} {} sequences", filename, n, end="") + + set_sig_name(sigs, filename, name) + save_sigs_to_location(sigs, save_sigs) + + notify( + f"calculated {len(sigs)} signatures for {n+1} sequences in {filename}" + ) + + # if not args.output, close output for every input filename. + if open_output_each_time: + save_sigs.close() + notify( + f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0." + ) + save_sigs = None + + # if --output-dir specified, all collected signatures => args.output, + # and we need to close here. + if args.output and save_sigs is not None: + save_sigs.close() + notify( + f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0." + ) + + +def _compute_merged(args, signatures_factory): + # make a signature for the whole file + sigs = signatures_factory() + + total_seq = 0 + for filename in args.filenames: + # consume & calculate signatures + notify("... reading sequences from {}", filename) + + n = None + with screed.open(filename) as f: + for n, record in enumerate(f): + if n % 10000 == 0 and n: + notify("\r... {} {}", filename, n, end="") + + add_seq( + sigs, record.sequence, args.input_is_protein, args.check_sequence + ) + if n is not None: + notify("... 
{} {} sequences", filename, n + 1) + total_seq += n + 1 + else: + notify(f"no sequences found in '{filename}'?!") + + if total_seq: + set_sig_name(sigs, filename, name=args.merge) + notify( + "calculated 1 signature for {} sequences taken from {} files", + total_seq, + len(args.filenames), + ) + + # at end, save! + save_siglist(sigs, args.output) + + +def add_seq(sigs, seq, input_is_protein, check_sequence): + for sig in sigs: + if input_is_protein: + sig.add_protein(seq) + else: + sig.add_sequence(seq, not check_sequence) + + +def set_sig_name(sigs, filename, name=None): + if filename == "-": # if stdin, set filename to empty. + filename = "" + for sig in sigs: + if name is not None: + sig._name = name + + sig.filename = filename + + +def save_siglist(siglist, sigfile_name): + "Save multiple signatures to a filename." + + # save! + with sourmash_args.SaveSignaturesToLocation(sigfile_name) as save_sig: + for ss in siglist: + save_sig.add(ss) + + notify(f"saved {len(save_sig)} signature(s) to '{save_sig.location}'") + + +def save_sigs_to_location(siglist, save_sig): + "Save multiple signatures to an already-open location." 
+ import sourmash + + for ss in siglist: + save_sig.add(ss) + + +class ComputeParameters(RustObject): + __dealloc_func__ = lib.computeparams_free + + def __init__( + self, + *, + ksizes=(21, 31, 51), + seed=42, + protein=False, + dayhoff=False, + hp=False, + dna=True, + num_hashes=500, + track_abundance=False, + scaled=0, + ): + self._objptr = lib.computeparams_new() + + self.seed = seed + self.ksizes = ksizes + self.protein = protein + self.dayhoff = dayhoff + self.hp = hp + self.dna = dna + self.num_hashes = num_hashes + self.track_abundance = track_abundance + self.scaled = scaled + + @classmethod + def from_manifest_row(cls, row): + "convert a CollectionManifest row into a ComputeParameters object" + is_dna = is_protein = is_dayhoff = is_hp = False + if row["moltype"] == "DNA": + is_dna = True + elif row["moltype"] == "protein": + is_protein = True + elif row["moltype"] == "hp": + is_hp = True + elif row["moltype"] == "dayhoff": + is_dayhoff = True + else: + assert 0 + + if is_dna: + ksize = row["ksize"] + else: + ksize = row["ksize"] * 3 + + p = cls( + ksizes=[ksize], + seed=DEFAULT_MMHASH_SEED, + protein=is_protein, + dayhoff=is_dayhoff, + hp=is_hp, + dna=is_dna, + num_hashes=row["num"], + track_abundance=row["with_abundance"], + scaled=row["scaled"], + ) + + return p + + def to_param_str(self): + "Convert object to equivalent params str." + pi = [] + + if self.dna: + pi.append("dna") + elif self.protein: + pi.append("protein") + elif self.hp: + pi.append("hp") + elif self.dayhoff: + pi.append("dayhoff") + else: + assert 0 # must be one of the previous + + if self.dna: + kstr = [f"k={k}" for k in self.ksizes] + else: + # for protein, divide ksize by three. 
+ kstr = [f"k={k//3}" for k in self.ksizes] + assert kstr + pi.extend(kstr) + + if self.num_hashes != 0: + pi.append(f"num={self.num_hashes}") + elif self.scaled != 0: + pi.append(f"scaled={self.scaled}") + else: + assert 0 + + if self.track_abundance: + pi.append("abund") + # noabund is default + + if self.seed != DEFAULT_MMHASH_SEED: + pi.append(f"seed={self.seed}") + # self.seed + + return ",".join(pi) + + def __repr__(self): + return f"ComputeParameters(ksizes={self.ksizes}, seed={self.seed}, protein={self.protein}, dayhoff={self.dayhoff}, hp={self.hp}, dna={self.dna}, num_hashes={self.num_hashes}, track_abundance={self.track_abundance}, scaled={self.scaled})" + + def __eq__(self, other): + return ( + self.ksizes == other.ksizes + and self.seed == other.seed + and self.protein == other.protein + and self.dayhoff == other.dayhoff + and self.hp == other.hp + and self.dna == other.dna + and self.num_hashes == other.num_hashes + and self.track_abundance == other.track_abundance + and self.scaled == other.scaled + ) + + @staticmethod + def from_args(args): + ptr = lib.computeparams_new() + ret = ComputeParameters._from_objptr(ptr) + + for arg, value in vars(args).items(): + try: + getattr(type(ret), arg).fset(ret, value) + except AttributeError: + pass + + return ret + + @property + def seed(self): + return self._methodcall(lib.computeparams_seed) + + @seed.setter + def seed(self, v): + return self._methodcall(lib.computeparams_set_seed, v) + + @property + def ksizes(self): + size = ffi.new("uintptr_t *") + ksizes_ptr = self._methodcall(lib.computeparams_ksizes, size) + size = size[0] + ksizes = ffi.unpack(ksizes_ptr, size) + lib.computeparams_ksizes_free(ksizes_ptr, size) + return ksizes + + @ksizes.setter + def ksizes(self, v): + return self._methodcall(lib.computeparams_set_ksizes, list(v), len(v)) + + @property + def protein(self): + return self._methodcall(lib.computeparams_protein) + + @protein.setter + def protein(self, v): + return 
self._methodcall(lib.computeparams_set_protein, v) + + @property + def dayhoff(self): + return self._methodcall(lib.computeparams_dayhoff) + + @dayhoff.setter + def dayhoff(self, v): + return self._methodcall(lib.computeparams_set_dayhoff, v) + + @property + def hp(self): + return self._methodcall(lib.computeparams_hp) + + @hp.setter + def hp(self, v): + return self._methodcall(lib.computeparams_set_hp, v) + + @property + def dna(self): + return self._methodcall(lib.computeparams_dna) + + @dna.setter + def dna(self, v): + return self._methodcall(lib.computeparams_set_dna, v) + + @property + def moltype(self): + if self.dna: + moltype = "DNA" + elif self.protein: + moltype = "protein" + elif self.hp: + moltype = "hp" + elif self.dayhoff: + moltype = "dayhoff" + else: + assert 0 + + return moltype + + @property + def num_hashes(self): + return self._methodcall(lib.computeparams_num_hashes) + + @num_hashes.setter + def num_hashes(self, v): + return self._methodcall(lib.computeparams_set_num_hashes, v) + + @property + def track_abundance(self): + return self._methodcall(lib.computeparams_track_abundance) + + @track_abundance.setter + def track_abundance(self, v): + return self._methodcall(lib.computeparams_set_track_abundance, v) + + @property + def scaled(self): + return self._methodcall(lib.computeparams_scaled) + + @scaled.setter + def scaled(self, v): + return self._methodcall(lib.computeparams_set_scaled, int(v)) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index aede904a04..02099b54d9 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -1554,7 +1554,6 @@ def check(args): def collect(args): "Collect signature metadata across many locations, save to manifest" - # TODO: set_quiet(False, args.debug) if args.cli_version == "v5": diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 073977cb79..1a5d22940a 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ 
-313,6 +313,9 @@ def genome(args): sys.exit(-1) # for each queryResult, summarize at rank and classify according to thresholds, reporting any errors that occur. + n_total = len(query_gather_results) + classified_results = [] + found_error = False for queryResult in query_gather_results: try: queryResult.build_classification_result( @@ -322,10 +325,21 @@ def genome(args): lingroup_ranks=lg_ranks, lingroups=all_lgs, ) + classified_results.append(queryResult) except ValueError as exc: - error(f"ERROR: {str(exc)}") - sys.exit(-1) + found_error = True + notify(f"ERROR: {str(exc)}") + + n_classified = len(classified_results) + if n_classified == 0: + notify("No queries could be classified. Exiting.") + sys.exit(-1) + else: + classif_perc = (float(n_classified) / float(n_total)) * 100 + notify( + f"classified {n_classified}/{n_total} queries ({classif_perc :.2f}%). Writing results" + ) # write outputs if "csv_summary" in args.output_format: @@ -334,7 +348,7 @@ def genome(args): ) with FileOutputCSV(summary_outfile) as out_fp: tax_utils.write_summary( - query_gather_results, + classified_results, out_fp, limit_float_decimals=limit_float, classification=True, @@ -389,6 +403,11 @@ def genome(args): with FileOutputCSV(lineage_outfile) as out_fp: tax_utils.write_output(header, lineage_results, out_fp) + # if there was a classification error, exit with err code + if found_error: + if not args.force: + sys.exit(-1) + def annotate(args): """ diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 55feed66d2..a2fbeb3f30 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1274,7 +1274,9 @@ def load( elif "accession" in header: identifier = "accession" header = ["ident" if "accession" == x else x for x in header] - elif "name" in header and "lineage" in header: + elif "lineage" in header and any( + ["name" in header, "match_name" in header] + ): return cls.load_from_gather_with_lineages( filename, force=force, lins=lins, 
ictv=ictv ) @@ -1390,9 +1392,14 @@ def load_from_gather_with_lineages( if not header: raise ValueError(f"cannot read taxonomy assignments from {filename}") - if "name" not in header or "lineage" not in header: + ident_col = None + if "name" in header: + ident_col = "name" + elif "match_name" in header: + ident_col = "match_name" + if "lineage" not in header or ident_col is None: raise ValueError( - "Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?" + "Expected headers 'name'/'match_name' and 'lineage' not found. Is this a with-lineages file?" ) ranks = None @@ -1405,7 +1412,7 @@ def load_from_gather_with_lineages( for n, row in enumerate(r): num_rows += 1 - name = row["name"] + name = row[ident_col] ident = get_ident(name) if lins: @@ -2321,9 +2328,6 @@ def summarize_up_ranks(self, single_rank=None, force_resummarize=False): f"Error: rank '{single_rank}' not in available ranks ({', '.join(self.summarized_ranks)})" ) self.summarized_ranks = [single_rank] - notify( - f"Starting summarization up rank(s): {', '.join(self.summarized_ranks)} " - ) for taxres in self.raw_taxresults: lininfo = taxres.lineageInfo if ( diff --git a/tests/test-data/tax/test1.gather.with-lineages.csv b/tests/test-data/tax/test1.gather.with-lineages.csv new file mode 100644 index 0000000000..1c81221737 --- /dev/null +++ b/tests/test-data/tax/test1.gather.with-lineages.csv @@ -0,0 +1,5 @@ +intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename,query_bp,ksize,scaled,query_n_hashes,lineage +442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, 
ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,5014000,31,1000,2507,Bacteria;Pseudomonadota;Gammaproteobacteria;Enterobacterales;Enterobacteriaceae;Escherichia;Escherichia coli +390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,5014000,31,1000,2507,Bacteria;Bacteroidota;Bacteroidia;Bacteroidales;Prevotellaceae;Prevotella;Prevotella copri +138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,5014000,31,1000,2507,Bacteria;Bacteroidota;Bacteroidia;Bacteroidales;Bacteroidaceae;Phocaeicola;Phocaeicola vulgatus +338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,5014000,31,1000,2507,Bacteria;Bacteroidota;Bacteroidia;Bacteroidales;Prevotellaceae;Prevotella;Prevotella copri diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 6809c8ff25..49df253f19 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -3355,16 +3355,35 @@ def test_sig_describe_dayhoff(c): ) -@utils.in_tempdir -def test_sig_describe_1_hp(c): +def 
test_sig_describe_1_hp(runtmp): + c = runtmp + # get basic info on a signature testdata = utils.get_test_data("short.fa") - c.run_sourmash( - "compute", "-k", "21,30", "--dayhoff", "--hp", "--protein", "--dna", testdata + + # run four separate commands to make 4 different sets of sigs... + c.sourmash("sketch", "dna", "-p", "k=21,k=30,num=500", "-o", "out.zip", testdata) + c.sourmash( + "sketch", "translate", "-p", "k=7,k=10,num=500", "-o", "out.zip", testdata + ) + c.sourmash( + "sketch", "translate", "-p", "k=7,k=10,num=500,hp", "-o", "out.zip", testdata + ) + c.sourmash( + "sketch", + "translate", + "-p", + "k=7,k=10,num=500,dayhoff", + "-o", + "out.zip", + testdata, ) - # stdout should be new signature - computed_sig = os.path.join(c.location, "short.fa.sig") - c.run_sourmash("sig", "describe", computed_sig) + + # then combine into one .sig file + c.sourmash("sig", "cat", "out.zip", "-o", "short.fa.sig") + + # & run sig describe + c.run_sourmash("sig", "describe", "short.fa.sig") out = c.last_result.out print(c.last_result.out) @@ -3444,7 +3463,6 @@ def test_sig_describe_1_hp(c): signature license: CC0 --- -signature filename: short.fa.sig signature: ** no name ** source file: short.fa md5: 71f7c111c01785e5f38efad45b00a0e1 diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index fc083a21e5..23647e517b 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -1791,26 +1791,48 @@ def test_compare_deduce_molecule(runtmp): def test_compare_choose_molecule_dna(runtmp): - # choose molecule type + # choose molecule type with --dna, ignoring protein testdata1 = utils.get_test_data("short.fa") testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash("compute", "-k", "30", "--dna", "--protein", testdata1, testdata2) - - runtmp.sourmash("compare", "--dna", "short.fa.sig", "short2.fa.sig") + runtmp.sourmash( + "sketch", "dna", "-p", "k=30,num=500", testdata1, testdata2, "-o", "sigs.zip" + ) + runtmp.sourmash( + "sketch", + "translate", + "-p", + 
"k=10,num=500", + testdata1, + testdata2, + "-o", + "sigs.zip", + ) + runtmp.sourmash("compare", "--dna", "sigs.zip") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) assert "min similarity in matrix: 0.938" in runtmp.last_result.out def test_compare_choose_molecule_protein(runtmp): - # choose molecule type + # choose molecule type with --protein, ignoring DNA testdata1 = utils.get_test_data("short.fa") testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash("compute", "-k", "30", "--dna", "--protein", testdata1, testdata2) - - runtmp.sourmash("compare", "--protein", "short.fa.sig", "short2.fa.sig") + runtmp.sourmash( + "sketch", "dna", "-p", "k=30,num=500", testdata1, testdata2, "-o", "sigs.zip" + ) + runtmp.sourmash( + "sketch", + "translate", + "-p", + "k=10,num=500", + testdata1, + testdata2, + "-o", + "sigs.zip", + ) + runtmp.sourmash("compare", "--protein", "sigs.zip") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) assert "min similarity in matrix: 0.91" in runtmp.last_result.out diff --git a/tests/test_sourmash_sketch.py b/tests/test_sourmash_sketch.py index 87460dcbcb..98448e4d6b 100644 --- a/tests/test_sourmash_sketch.py +++ b/tests/test_sourmash_sketch.py @@ -15,8 +15,7 @@ from sourmash import MinHash from sourmash.sbt import SBT, Node from sourmash.sbtmh import SigLeaf, load_sbt_index -from sourmash.command_compute import ComputeParameters -from sourmash.cli.compute import subparser +from sourmash.command_sketch import ComputeParameters from sourmash.cli import SourmashParser from sourmash import manifest diff --git a/tests/test_tax.py b/tests/test_tax.py index 70b4f14fc0..fc68b6448d 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -2700,6 +2700,115 @@ def test_genome_gather_two_files_empty_force(runtmp): ) +def test_genome_gather_two_files_one_classif_fail(runtmp): + # if one query cant be classified still get classif for second + # no --force = fail but still write file + 
c = runtmp + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") + + # make test2 results (identical to test1 except query_name and filename) + g_res2 = runtmp.output("test2.gather.csv") + test2_results = [ + x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines() + ] + test2_results[1] = test2_results[1].replace( + "0.08815317112086159", "1.1" + ) # make test2 f_unique_to_query sum to >1 + for line in test2_results: + print(line) + with open(g_res2, "w") as fp: + fp.writelines(test2_results) + + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "tax", + "genome", + "-g", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + ) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == -1 + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) + assert "test2" not in c.last_result.out + assert ( + "ERROR: Summarized fraction is > 100% of the query! This should not be possible. Please check that your input files come directly from a single gather run per query." 
+ in c.last_result.err + ) + + +def test_genome_gather_two_files_one_classif(runtmp): + # if one query cant be classified, still get classif for second + c = runtmp + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") + + # make test2 results (identical to test1 except query_name and filename) + g_res2 = runtmp.output("test2.gather.csv") + test2_results = [ + x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines() + ] + test2_results[1] = test2_results[1].replace( + "0.08815317112086159", "1.1" + ) # make test2 f_unique_to_query sum to >1 + for line in test2_results: + print(line) + with open(g_res2, "w") as fp: + fp.writelines(test2_results) + + c.run_sourmash( + "tax", + "genome", + "-g", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + "--force", + ) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) + assert "test2" not in c.last_result.out + assert ( + "ERROR: Summarized fraction is > 100% of the query! This should not be possible. Please check that your input files come directly from a single gather run per query." 
+ in c.last_result.err + ) + + def test_genome_gather_duplicate_filename(runtmp): c = runtmp taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") @@ -5936,10 +6045,6 @@ def test_metagenome_LIN_lingroups(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert ( - "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" - in c.last_result.err - ) assert ( "Read 5 lingroup rows and found 5 distinct lingroup prefixes." in c.last_result.err @@ -5970,10 +6075,6 @@ def test_metagenome_LIN_human_summary_no_lin_position(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert ( - "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" - in c.last_result.err - ) assert "sample name proportion cANI lineage" in c.last_result.out assert "----------- ---------- ---- -------" in c.last_result.out assert "test1 86.9% - unclassified" in c.last_result.out @@ -6020,10 +6121,6 @@ def test_metagenome_LIN_human_summary_lin_position_5(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert ( - "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" - in c.last_result.err - ) assert "sample name proportion cANI lineage" in c.last_result.out assert "----------- ---------- ---- -------" in c.last_result.out assert "test1 86.9% - unclassified" in c.last_result.out @@ -6058,10 +6155,6 @@ def test_metagenome_LIN_krona_lin_position_5(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert ( - "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" - in c.last_result.err - ) assert "fraction 0 1 2 3 4 5" in c.last_result.out assert "0.08815317112086159 0 0 0 0 0 0" in c.last_result.out assert "0.07778220981252493 1 0 0 0 0 0" in c.last_result.out @@ -6133,10 +6226,6 @@ def 
test_metagenome_LIN_lingroups_empty_lg_file(runtmp): print(c.last_result.err) assert c.last_result.status != 0 - assert ( - "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" - in c.last_result.err - ) assert ( f"Cannot read lingroups from '{lg_file}'. Is file empty?" in c.last_result.err ) @@ -6302,8 +6391,4 @@ def test_metagenome_LIN_lingroups_lg_only_header(runtmp): print(c.last_result.err) assert c.last_result.status != 0 - assert ( - "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" - in c.last_result.err - ) assert f"No lingroups loaded from {lg_file}" in c.last_result.err diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 192406e251..dfca20628a 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1014,7 +1014,7 @@ def test_check_and_load_gather_lineage_csvs_bad_header(runtmp): with pytest.raises(ValueError) as exc: LineageDB.load_from_gather_with_lineages(g_res) assert ( - "Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?" + "Expected headers 'name'/'match_name' and 'lineage' not found. Is this a with-lineages file?" 
in str(exc.value) ) @@ -1038,6 +1038,29 @@ def test_check_and_load_gather_lineage_csvs_isdir(runtmp): assert "is a directory" in str(exc.value) +def test_check_and_load_gather_lineage_csvs_name(runtmp): + # test loading a with-lineage file that has 'name', not 'match_name' + g_res = utils.get_test_data("tax/test1.gather.with-lineages.csv") + + lins = LineageDB.load_from_gather_with_lineages(g_res) + assert len(lins) == 4 + + +def test_check_and_load_gather_lineage_csvs_match_name(runtmp): + # test loading a with-lineage file that has 'match_name' instead of 'name' + g_res = utils.get_test_data("tax/test1.gather.with-lineages.csv") + out_lins = runtmp.output("match-name.lineages.csv") + with open(g_res) as f_in: + first_line = f_in.readline().replace("name", "match_name") + with open(out_lins, "w") as f_out: + f_out.write(first_line) + for line in f_in: + f_out.write(line) + + lins = LineageDB.load_from_gather_with_lineages(out_lins) + assert len(lins) == 4 + + def test_check_and_load_gather_csvs_fail_on_missing(runtmp): g_csv = utils.get_test_data("tax/test1.gather.csv") # make gather results with taxonomy name not in tax_assign