From 9c1c46928db0168bb7204f0becba36d1daa54708 Mon Sep 17 00:00:00 2001 From: pchaumeil Date: Thu, 29 Apr 2021 09:26:46 +1000 Subject: [PATCH 1/4] in de_novo_wf, '--skip_gtdb_refs' requires '--custom_taxonomy_file' --- gtdbtk/main.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gtdbtk/main.py b/gtdbtk/main.py index a85e2531..e83c6194 100644 --- a/gtdbtk/main.py +++ b/gtdbtk/main.py @@ -620,6 +620,10 @@ def parse_options(self, options): check_dependencies(['prodigal', 'hmmalign']) check_dependencies(['FastTree' + ('MP' if options.cpus > 1 else '')]) + if options.skip_gtdb_refs and options.custom_taxonomy_file is None: + raise GTDBTkExit("When running de_novo_wf, The '--skip_gtdb_refs' flag requires" + "'--custom_taxonomy_file' to be included to the command line.") + options.write_single_copy_genes = False self.identify(options) From bb3637c2ca18e1b2c06edd88b5d100e470ff218b Mon Sep 17 00:00:00 2001 From: Aaron Mussig Date: Tue, 22 Jun 2021 16:14:25 +1000 Subject: [PATCH 2/4] Disallow spaces in genome names/file paths due to downstream application issues (#327) --- gtdbtk/main.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/gtdbtk/main.py b/gtdbtk/main.py index e83c6194..30fb874d 100644 --- a/gtdbtk/main.py +++ b/gtdbtk/main.py @@ -76,7 +76,7 @@ def _check_package_compatibility(self): f'intended for this release: {Config.MIN_REF_DATA_VERSION}', ['bright'], fg='yellow')) - def _verify_genome_id(self, genome_id): + def _verify_genome_id(self, genome_id: str) -> bool: """Ensure genome ID will be valid in Newick tree. Parameters @@ -95,7 +95,7 @@ def _verify_genome_id(self, genome_id): If the genome identifier contains illegal characters. 
""" - invalid_chars = set('()[],;=') + invalid_chars = set('()[],;= ') if any((c in invalid_chars) for c in genome_id): self.logger.error(f'Invalid genome ID: {genome_id}') self.logger.error(f'The following characters are invalid: ' @@ -103,6 +103,13 @@ def _verify_genome_id(self, genome_id): raise GenomeNameInvalid(f'Invalid genome ID: {genome_id}') return True + @staticmethod + def _verify_file_path(file_path: str) -> bool: + if ' ' in file_path: + raise GTDBTkExit(f'The genome path contains a space, this is ' + f'unsupported by downstream applications: {file_path}') + return True + def _genomes_to_process(self, genome_dir, batchfile, extension): """Get genomes to process. @@ -136,6 +143,10 @@ def _genomes_to_process(self, genome_dir, batchfile, extension): for genome_key in genomic_files: self._verify_genome_id(genome_key) + # Check that there are no illegal characters in the file path + for file_path in genomic_files.values(): + self._verify_file_path(file_path) + # Check that the prefix is valid and the path exists invalid_paths = list() for genome_key, genome_path in genomic_files.items(): From c7ffe3ce446d75bb3a416a976ef143da15ae6593 Mon Sep 17 00:00:00 2001 From: Aaron Mussig Date: Tue, 22 Jun 2021 16:16:55 +1000 Subject: [PATCH 3/4] Prepare for 1.5.1 --- docs/src/changelog.rst | 6 ++++++ gtdbtk/__init__.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/src/changelog.rst b/docs/src/changelog.rst index 5dbd2b58..f8c390ba 100644 --- a/docs/src/changelog.rst +++ b/docs/src/changelog.rst @@ -2,6 +2,12 @@ Change log ========== +1.5.1 +----- + +* (`#327 <https://github.com/Ecogenomics/GTDBTk/issues/327>`_) Disallow spaces in genome names/file paths due to downstream application issues. 
+ + 1.5.0 ----- diff --git a/gtdbtk/__init__.py b/gtdbtk/__init__.py index 70d0f0f5..f72eec57 100644 --- a/gtdbtk/__init__.py +++ b/gtdbtk/__init__.py @@ -29,4 +29,4 @@ __status__ = 'Production' __title__ = 'GTDB-Tk' __url__ = 'https://github.com/Ecogenomics/GTDBTk' -__version__ = '1.5.0' +__version__ = '1.5.1' From 5d9ef03cb6d0b59f022e30a9a7870be71f62bce8 Mon Sep 17 00:00:00 2001 From: Aaron Mussig Date: Thu, 24 Jun 2021 14:10:33 +1000 Subject: [PATCH 4/4] fix(#326): Disallow blank/zero length genome names. --- docs/src/changelog.rst | 2 +- gtdbtk/main.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/docs/src/changelog.rst b/docs/src/changelog.rst index f8c390ba..ca9954a0 100644 --- a/docs/src/changelog.rst +++ b/docs/src/changelog.rst @@ -6,7 +6,7 @@ Change log ----- * (`#327 <https://github.com/Ecogenomics/GTDBTk/issues/327>`_) Disallow spaces in genome names/file paths due to downstream application issues. - +* (`#326 <https://github.com/Ecogenomics/GTDBTk/issues/326>`_) Disallow genome names that are blank. 1.5.0 ----- diff --git a/gtdbtk/main.py b/gtdbtk/main.py index 30fb874d..4dee14fd 100644 --- a/gtdbtk/main.py +++ b/gtdbtk/main.py @@ -91,16 +91,20 @@ def _verify_genome_id(self, genome_id: str) -> bool: Raises ------ - GenomeNameInvalid + GTDBTkExit If the genome identifier contains illegal characters. """ - - invalid_chars = set('()[],;= ') + if genome_id is None or not isinstance(genome_id, str): + raise GTDBTkExit(f'The genome name is not a valid string: {genome_id}') + if len(genome_id) == 0: + raise GTDBTkExit('Genome name cannot be blank, check for input files ' + 'without a name, or empty columns in the batchfile.') + invalid_chars = frozenset('()[],;= ') if any((c in invalid_chars) for c in genome_id): self.logger.error(f'Invalid genome ID: {genome_id}') self.logger.error(f'The following characters are invalid: ' f'{" ".join(invalid_chars)}') - raise GenomeNameInvalid(f'Invalid genome ID: {genome_id}') + raise GTDBTkExit(f'Invalid genome ID: {genome_id}') return True @staticmethod