From e2b8aa352dd3da83082272af9381333dfc865eeb Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 3 Jul 2020 10:17:55 -0700 Subject: [PATCH 1/2] autodetect FASTA/FASTQ and complain if try to feed in as signatures --- sourmash/sig/__main__.py | 2 -- sourmash/sourmash_args.py | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py index 0ae1842b6d..23168cbaa8 100644 --- a/sourmash/sig/__main__.py +++ b/sourmash/sig/__main__.py @@ -73,7 +73,6 @@ def cat(args): try: this_siglist = sourmash_args.load_file_as_signatures(sigfile, traverse=True) except Exception as exc: - error('\nError while reading signatures from {}:'.format(sigfile)) error(str(exc)) error('(continuing)') @@ -177,7 +176,6 @@ def describe(args): for k in this_siglist: siglist.append((k, sigfile)) except Exception as exc: - error('\nError while reading signatures from {}:'.format(sigfile)) error(str(exc)) error('(continuing)') diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py index 094a535077..1d41145286 100644 --- a/sourmash/sourmash_args.py +++ b/sourmash/sourmash_args.py @@ -7,6 +7,8 @@ import itertools from enum import Enum +import screed + from sourmash import load_sbt_index from sourmash.lca.lca_db import load_single_database import sourmash.exceptions @@ -65,6 +67,11 @@ def calculate_moltype(args, default=None): def load_query_signature(filename, ksize, select_moltype, select_md5=None): + """Load a single signature to use as a query. + + Uses load_file_as_signatures underneath, so can load from collections + and indexed databases. + """ try: sl = load_file_as_signatures(filename, ksize=ksize, select_moltype=select_moltype) @@ -404,6 +411,21 @@ def _load_database(filename, traverse, traverse_yield_all): except: pass + if not loaded: + successful_screed_load = False + try: + # CTB: could be kind of time consuming for big record. + # maybe use screed format detection instead? 
+ # CTB: also, close this. + it = screed.open(filename) + record = next(iter(it)) + successful_screed_load = True + except Exception: + pass + + if successful_screed_load: + raise OSError("Error while reading signatures from '{}' - got sequences instead! Is this a FASTA/FASTQ file?".format(filename)) + if not loaded: raise OSError("Error while reading signatures from '{}'.".format(filename)) From e632bb9b16402b94bbf9ce07aca828146a9bfdbf Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 3 Jul 2020 20:50:04 -0700 Subject: [PATCH 2/2] add test for trying to load a screed-parseable (sequence) file --- sourmash/sourmash_args.py | 10 +++++----- tests/test_api.py | 10 ++++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py index 1d41145286..fd57f5ae27 100644 --- a/sourmash/sourmash_args.py +++ b/sourmash/sourmash_args.py @@ -413,12 +413,12 @@ def _load_database(filename, traverse, traverse_yield_all): if not loaded: successful_screed_load = False + it = None try: - # CTB: could be kind of time consuming for big record. - # maybe use screed format detection instead? - # CTB: also, close this. - it = screed.open(filename) - record = next(iter(it)) + # CTB: could be kind of time consuming for a big record, but at the + # moment screed doesn't expose format detection cleanly. 
+ with screed.open(filename) as it: + record = next(iter(it)) successful_screed_load = True except Exception: pass diff --git a/tests/test_api.py b/tests/test_api.py index 1ff6c04cd9..243cc7ca62 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -46,3 +46,13 @@ def test_load_index_3(): sigs = list(idx.signatures()) assert len(sigs) == 2 + + +def test_load_fasta_as_signature(): + # loading a FASTA file must fail with an informative OSError; match on e.value + testfile = utils.get_test_data('short.fa') + + with pytest.raises(OSError) as e: + idx = sourmash.load_file_as_index(testfile) + + assert "Error while reading signatures from '{}' - got sequences instead! Is this a FASTA/FASTQ file?".format(testfile) in str(e.value)