Skip to content

Commit

Permalink
MRG: fix manifest n_hashes + test (#171)
Browse files Browse the repository at this point in the history
- fixes #170
  • Loading branch information
bluegenes authored Jan 8, 2025
1 parent bdb7319 commit 1c9ff80
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 4 deletions.
1 change: 0 additions & 1 deletion src/directsketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ use sourmash::collection::Collection;
use std::cmp::max;
use std::collections::HashMap;
use std::fs::{self, create_dir_all};
use std::panic;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use tokio::fs::File;
Expand Down
6 changes: 4 additions & 2 deletions src/utils/buildutils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use sourmash::encodings::{HashFunctions, Idx};
use sourmash::errors::SourmashError;
use sourmash::manifest::Record;
use sourmash::selection::Selection;
use sourmash::signature::Signature;
use sourmash::signature::{Signature, SigsTrait};
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt::Display;
Expand Down Expand Up @@ -871,7 +871,9 @@ impl BuildCollection {
record.set_filename(Some(filename.clone()));
record.set_md5(Some(sig.md5sum()));
record.set_md5short(Some(sig.md5sum()[0..8].into()));
record.set_n_hashes(Some(sig.size()));
record.set_n_hashes(Some(
sig.get_sketch().expect("cannot retrieve sketch").size(),
));

// note, this needs to be set when writing sigs (not here)
// record.set_internal_location("")
Expand Down
45 changes: 44 additions & 1 deletion tests/test_gbsketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pytest

import sourmash
from sourmash import sourmash_args
import sourmash_tst_utils as utils
from sourmash_tst_utils import SourmashCommandFailed

Expand Down Expand Up @@ -74,6 +75,48 @@ def test_gbsketch_simple(runtmp, capfd):
assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz"


def test_gbsketch_manifest(runtmp, capfd):
    """Check the manifest written by ``gbsketch`` into the output zip.

    Sketches the accessions listed in ``acc.csv`` with one DNA and one
    protein parameter string, then verifies that the manifest of the
    resulting zip holds three rows whose ``md5`` and ``n_hashes`` values
    agree with the reference signatures shipped as test data.
    """
    accessions = get_test_data('acc.csv')
    output = runtmp.output('simple.zip')
    failed_csv = runtmp.output('failed.csv')
    checksum_failed_csv = runtmp.output('checksum_dl_failed.csv')

    # Resolve all reference signature paths before loading any of them.
    dna_path_a, dna_path_b, prot_path_b = [
        get_test_data(sig_name)
        for sig_name in (
            'GCA_000175535.1.sig.gz',
            'GCA_000961135.2.sig.gz',
            'GCA_000961135.2.protein.sig.gz',
        )
    ]
    dna_a = sourmash.load_one_signature(dna_path_a, ksize=31)
    dna_b = sourmash.load_one_signature(dna_path_b, ksize=31)
    # NOTE(review): the protein sketch is requested with k=10 below but has
    # to be loaded with ksize=30 here -- presumably the stored ksize for
    # protein sketches is 3x the amino-acid k; confirm against sourmash.
    prot_b = sourmash.load_one_signature(
        prot_path_b, ksize=30, select_moltype='protein'
    )

    runtmp.sourmash(
        'scripts', 'gbsketch', accessions, '-o', output,
        '--failed', failed_csv, '-r', '1',
        '--checksum-fail', checksum_failed_csv,
        '--param-str', "dna,k=31,scaled=1000",
        '-p', "protein,k=10,scaled=200",
    )

    assert os.path.exists(output)
    assert not runtmp.last_result.out  # stdout should be empty
    print(capfd.readouterr().err)
    print(f"looking for path: {output}")

    manifest = sourmash_args.get_manifest(sourmash.load_file_as_index(output))
    assert len(manifest) == 3
    assert manifest._md5_set == {
        dna_a.md5sum(),
        dna_b.md5sum(),
        prot_b.md5sum(),
    }

    for row in manifest.rows:
        print(row)
        if 'GCA_000175535.1' in row["name"]:
            assert row["md5"] == dna_a.md5sum()
            assert row["n_hashes"] == 1047
        if "GCA_000961135.2" in row["name"]:
            # This accession has both a DNA and a protein sketch; the
            # moltype column tells the two rows apart.
            want_sig, want_n_hashes = (
                (dna_b, 1776) if row["moltype"] == 'DNA' else (prot_b, 2596)
            )
            assert row["md5"] == want_sig.md5sum()
            assert row["n_hashes"] == want_n_hashes


def test_gbsketch_simple_url(runtmp):
acc_csv = get_test_data('acc-with-ftppath.csv')
output = runtmp.output('simple.zip')
Expand Down Expand Up @@ -898,7 +941,7 @@ def test_gbsketch_simple_skipmer(runtmp, capfd):
print(captured.err)
print(f"looking for path: {output}")

# read the file with python and check sigs
# read the file with python and check sigs
import zipfile, gzip, json

with zipfile.ZipFile(output, "r") as zf:
Expand Down
41 changes: 41 additions & 0 deletions tests/test_urlsketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import csv
import sourmash
from sourmash import sourmash_args
import sourmash_tst_utils as utils
from sourmash_tst_utils import SourmashCommandFailed

Expand Down Expand Up @@ -70,6 +71,46 @@ def test_urlsketch_simple(runtmp):
assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz"


def test_urlsketch_manifest(runtmp, capfd):
    """Check the manifest written by ``urlsketch`` into the output zip.

    Sketches the entries listed in ``acc-url.csv`` with one DNA and one
    protein parameter string, then verifies that the manifest of the
    resulting zip holds three rows whose ``md5`` and ``n_hashes`` values
    agree with the reference signatures shipped as test data.
    """
    url_csv = get_test_data('acc-url.csv')
    output = runtmp.output('simple.zip')
    failed_csv = runtmp.output('failed.csv')

    # Resolve all reference signature paths before loading any of them.
    dna_path_a, dna_path_b, prot_path_b = [
        get_test_data(sig_name)
        for sig_name in (
            'GCA_000175535.1.sig.gz',
            'GCA_000961135.2.sig.gz',
            'GCA_000961135.2.protein.sig.gz',
        )
    ]
    dna_a = sourmash.load_one_signature(dna_path_a, ksize=31)
    dna_b = sourmash.load_one_signature(dna_path_b, ksize=31)
    # NOTE(review): loaded with ksize=30 although the sketch is requested
    # with protein k=10 below -- presumably the stored ksize for protein
    # sketches is 3x the amino-acid k; confirm against sourmash.
    prot_b = sourmash.load_one_signature(
        prot_path_b, ksize=30, select_moltype='protein'
    )

    runtmp.sourmash(
        'scripts', 'urlsketch', url_csv, '-o', output,
        '--failed', failed_csv, '-r', '1',
        '--param-str', "dna,k=31,scaled=1000",
        '-p', "protein,k=10,scaled=200",
    )

    assert os.path.exists(output)
    assert not runtmp.last_result.out  # stdout should be empty
    print(capfd.readouterr().err)
    print(f"looking for path: {output}")

    manifest = sourmash_args.get_manifest(sourmash.load_file_as_index(output))
    assert len(manifest) == 3
    assert manifest._md5_set == {
        dna_a.md5sum(),
        dna_b.md5sum(),
        prot_b.md5sum(),
    }

    for row in manifest.rows:
        print(row)
        if 'GCA_000175535.1' in row["name"]:
            assert row["md5"] == dna_a.md5sum()
            assert row["n_hashes"] == 1047
        if "GCA_000961135.2" in row["name"]:
            # This accession has both a DNA and a protein sketch; the
            # moltype column tells the two rows apart.
            want_sig, want_n_hashes = (
                (dna_b, 1776) if row["moltype"] == 'DNA' else (prot_b, 2596)
            )
            assert row["md5"] == want_sig.md5sum()
            assert row["n_hashes"] == want_n_hashes


def test_urlsketch_save_fastas(runtmp):
acc_csv = get_test_data('acc-url.csv')
output = runtmp.output('simple.zip')
Expand Down

0 comments on commit 1c9ff80

Please sign in to comment.