Skip to content

Commit

Permalink
MRG: try using sourmash_utils functions. (#15)
Browse files Browse the repository at this point in the history
* try using new sourmash_utils stuff

* add sourmash_utils as a dep

* upd sourmash utils usage

* refactor around sourmash_utils

* bump version

* fix CI
  • Loading branch information
ctb authored Jun 11, 2024
1 parent 8b42d09 commit a0e2970
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 58 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,4 @@ jobs:

- name: build examples
shell: bash -l {0}
run: make cleanrun
run: make cleanall
6 changes: 4 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.PHONY: dist test_workflow
.PHONY: dist test_workflow clean cleanall

all: test_workflow

Expand All @@ -14,7 +14,9 @@ install:
test_workflow:
cd test_workflow && make

cleanrun:
# Delete the test workflow's outputs by delegating to test_workflow/Makefile.
# The recipe must be on its own tab-indented line: anything after the colon
# on the rule line is parsed as a prerequisite list, not as a command.
clean:
	cd test_workflow && make clean

cleanall:
cd test_workflow && make cleanall

dist:
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@ name = "sourmash_plugin_pangenomics"
description = "sourmash plugin to do pangenomics."
readme = "README.md"
requires-python = ">=3.10"
version = "0.2"
version = "0.2.1"
authors = [
{name = "Colton Baumler", email = "[email protected]"},
{name = "Titus Brown", email = "[email protected]"},
]

dependencies = ["sourmash>=4.8.8,<5"]
dependencies = ["sourmash>=4.8.8,<5", "sourmash_utils>=0.2"]

[metadata]
license = { text = "BSD 3-Clause License" }
Expand Down
85 changes: 35 additions & 50 deletions src/sourmash_plugin_pangenomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import pprint

import sourmash
import sourmash_utils
from sourmash import sourmash_args
from sourmash.tax import tax_utils
from sourmash.logging import debug_literal
Expand Down Expand Up @@ -66,12 +67,6 @@ def __init__(self, subparser):
help="database lineages file",
)
p.add_argument("sketches", nargs="+", help="sketches to combine")
p.add_argument("--scaled", default=1000, type=int)
p.add_argument("-k", "--ksize", default=31, type=int)
p.add_argument(
"-m", "--moltype",
default="DNA"
)
p.add_argument(
"-o",
"--output",
Expand All @@ -89,6 +84,7 @@ def __init__(self, subparser):
action="store_true",
help="Enable abundance tracking of hashes across rank selection.",
)
sourmash_utils.add_standard_minhash_args(p)

def main(self, args):
super().main(args)
Expand All @@ -107,18 +103,13 @@ def __init__(self, subparser):
p = subparser

p.add_argument("sketches", nargs="+", help="sketches to combine")
p.add_argument("--scaled", default=1000, type=int)
p.add_argument("-k", "--ksize", default=31, type=int)
p.add_argument(
"-m", "--moltype",
default="DNA"
)
p.add_argument(
"-o",
"--output",
required=True,
help="Define a filename for the pangenome signatures (.zip preferred).",
)
sourmash_utils.add_standard_minhash_args(p)

def main(self, args):
super().main(args)
Expand All @@ -140,13 +131,6 @@ def __init__(self, subparser):
metavar="SOURMASH_DATABASE",
help="The sourmash dictionary created from 'pangenome_creatdb --abund'",
)
p.add_argument(
"-k",
"--ksize",
type=int,
default=31,
help="The ksize of the sourmash pangenome database",
)
p.add_argument(
"-l",
"--lineage",
Expand All @@ -164,6 +148,7 @@ def __init__(self, subparser):
required=False,
help="CSV file containing classification of each hash",
)
sourmash_utils.add_standard_minhash_args(p)

def main(self, args):
super().main(args)
Expand All @@ -181,9 +166,9 @@ def __init__(self, subparser):
super().__init__(subparser)
p = subparser
p.add_argument("metagenome_sig")
p.add_argument("-k", "--ksize", default=31, help="k-mer size", type=int)
p.add_argument("ranktable_csv_files", nargs="+",
help="rank tables produced by pangenome_ranktable")
sourmash_utils.add_standard_minhash_args(p)

def main(self, args):
super().main(args)
Expand All @@ -207,14 +192,13 @@ def pangenome_createdb_main(args):
if args.csv:
csv_file = check_csv(args.csv)

select_mh = sourmash_utils.create_minhash_from_args(args)
print(f"selecting sketches: {select_mh}")

# Load the database
for filename in args.sketches:
print(f"loading file {filename} as index => manifest")
db = sourmash_args.load_file_as_index(filename)
db = db.select(ksize=args.ksize)
# @CTB check moltype
mf = db.manifest
assert mf, "no matching sketches for given ksize!?"
print(f"loading sketches from file {filename}")
db = sourmash_utils.load_index_and_select(filename, select_mh)

if args.csv:
chunk = []
Expand Down Expand Up @@ -322,7 +306,7 @@ def pangenome_createdb_main(args):

ss = sourmash.SourmashSignature(abund_mh, name=sig_name)
else:
ss = sourmash/SourmashSignature(mh, name=sig_name)
ss = sourmash.SourmashSignature(mh, name=sig_name)

save_sigs.add(ss)

Expand Down Expand Up @@ -352,14 +336,13 @@ def check_csv(csv_file):
#

def pangenome_merge_main(args):
select_mh = sourmash_utils.create_minhash_from_args(args)
print(f"selecting sketches: {select_mh}")

# Load the database
for filename in args.sketches:
print(f"loading file {filename} as index => manifest")
db = sourmash_args.load_file_as_index(filename)
db = db.select(ksize=args.ksize)
# @CTB check moltype
mf = db.manifest
assert mf, "no matching sketches for given ksize!?"
print(f"loading sketches from file {filename}")
db = sourmash_utils.load_index_and_select(filename, select_mh)

c = Counter()
mh = None
Expand Down Expand Up @@ -393,21 +376,18 @@ def pangenome_merge_main(args):


def db_process(
filename,
ignore_case,
k=31,
lineage_name="None",
filename,
ignore_case,
select_mh=None,
lineage_name="None",
):
bname = os.path.basename(filename)
ss_dict = {}
print(f"\nloading file {bname} as index => manifest")
print(f"selecting sketches: {select_mh}")

db = sourmash_args.load_file_as_index(filename)
db = db.select(ksize=k)
mf = db.manifest
print(f"{bname} contains {len(mf)} signatures")
ss_dict = {}
print(f"loading sketches from file '{filename}'")
db = sourmash_utils.load_index_and_select(filename, select_mh)
print(f"'{filename}' contains {len(db)} signatures")

assert mf, "no matching sketches for given ksize!?"
if lineage_name:
print(f"Looking for {lineage_name} signature\n")

Expand All @@ -422,10 +402,11 @@ def search_pattern(vals):
return any(pattern.search(val) for val in vals)

# find all matching rows.
mf = db.manifest
sub_mf = mf.filter_on_columns(search_pattern, ["name", "filename", "md5"])

selected_sigs = []
print(f"Found {len(sub_mf)} signatures in {bname}:")
print(f"Found {len(sub_mf)} signatures in '{filename}':")

for n, row in enumerate(sub_mf.rows, start=1):
print(f'{n:<15} \033[0;31m{row.get("name")}\033[0m')
Expand All @@ -441,7 +422,7 @@ def search_pattern(vals):
for ss in db.signatures():
name = ss.name

print(f"Found \033[0;31m{name}\033[0m in {bname}")
print(f"Found \033[0;31m{name}\033[0m in '{filename}'")

mh = ss.minhash
hashes = mh.hashes
Expand Down Expand Up @@ -514,9 +495,11 @@ def pangenome_elements(data):
#

def pangenome_ranktable_main(args):
select_mh = sourmash_utils.create_minhash_from_args(args)

ss_dict = db_process(
filename=args.data,
k=args.ksize,
select_mh=select_mh,
lineage_name=args.lineage,
ignore_case=args.ignore_case,
)
Expand Down Expand Up @@ -547,8 +530,10 @@ def pangenome_ranktable_main(args):
#

def classify_hashes_main(args):
db = sourmash.load_file_as_index(args.metagenome_sig)
db = db.select(ksize=args.ksize)
select_mh = sourmash_utils.create_minhash_from_args(args)
print(f"selecting sketches: {select_mh}")

db = sourmash_utils.load_index_and_select(args.metagenome_sig, select_mh)
sketches = list(db.signatures())
assert len(sketches) == 1
sketch = sketches[0]
Expand Down
6 changes: 4 additions & 2 deletions test_workflow/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
.PHONY: all clean cleanall
all:
snakemake -c 4 -p

cleanall:
clean:
snakemake -c 4 --delete-all-output
snakemake -c 4 -p

cleanall: clean all
2 changes: 1 addition & 1 deletion test_workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ rule merge_agatha:
pangenome_sig="test_output/agatha-merged.sig.zip",
shell: """
sourmash scripts pangenome_createdb {input.db} -t {input.tax} \
-o {output} --abund -k 21
-o {output} --abund -k 21 --dna --scaled=1000
"""

rule merge_agatha_2:
Expand Down

0 comments on commit a0e2970

Please sign in to comment.