From 9c1c46928db0168bb7204f0becba36d1daa54708 Mon Sep 17 00:00:00 2001
From: pchaumeil
Date: Thu, 29 Apr 2021 09:26:46 +1000
Subject: [PATCH 1/4] in de_novo_wf, '--skip_gtdb_refs' requires
'--custom_taxonomy_file'
---
gtdbtk/main.py | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/gtdbtk/main.py b/gtdbtk/main.py
index a85e2531..e83c6194 100644
--- a/gtdbtk/main.py
+++ b/gtdbtk/main.py
@@ -620,6 +620,10 @@ def parse_options(self, options):
check_dependencies(['prodigal', 'hmmalign'])
check_dependencies(['FastTree' + ('MP' if options.cpus > 1 else '')])
+ if options.skip_gtdb_refs and options.custom_taxonomy_file is None:
+ raise GTDBTkExit("When running de_novo_wf, The '--skip_gtdb_refs' flag requires "
+ "'--custom_taxonomy_file' to be included to the command line.")
+
options.write_single_copy_genes = False
self.identify(options)
From bb3637c2ca18e1b2c06edd88b5d100e470ff218b Mon Sep 17 00:00:00 2001
From: Aaron Mussig
Date: Tue, 22 Jun 2021 16:14:25 +1000
Subject: [PATCH 2/4] Disallow spaces in genome names/file paths due to
downstream application issues (#327)
---
gtdbtk/main.py | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/gtdbtk/main.py b/gtdbtk/main.py
index e83c6194..30fb874d 100644
--- a/gtdbtk/main.py
+++ b/gtdbtk/main.py
@@ -76,7 +76,7 @@ def _check_package_compatibility(self):
f'intended for this release: {Config.MIN_REF_DATA_VERSION}',
['bright'], fg='yellow'))
- def _verify_genome_id(self, genome_id):
+ def _verify_genome_id(self, genome_id: str) -> bool:
"""Ensure genome ID will be valid in Newick tree.
Parameters
@@ -95,7 +95,7 @@ def _verify_genome_id(self, genome_id):
If the genome identifier contains illegal characters.
"""
- invalid_chars = set('()[],;=')
+ invalid_chars = set('()[],;= ')
if any((c in invalid_chars) for c in genome_id):
self.logger.error(f'Invalid genome ID: {genome_id}')
self.logger.error(f'The following characters are invalid: '
@@ -103,6 +103,13 @@ def _verify_genome_id(self, genome_id):
raise GenomeNameInvalid(f'Invalid genome ID: {genome_id}')
return True
+ @staticmethod
+ def _verify_file_path(file_path: str) -> bool:
+ if ' ' in file_path:
+ raise GTDBTkExit(f'The genome path contains a space, this is '
+ f'unsupported by downstream applications: {file_path}')
+ return True
+
def _genomes_to_process(self, genome_dir, batchfile, extension):
"""Get genomes to process.
@@ -136,6 +143,10 @@ def _genomes_to_process(self, genome_dir, batchfile, extension):
for genome_key in genomic_files:
self._verify_genome_id(genome_key)
+ # Check that there are no illegal characters in the file path
+ for file_path in genomic_files.values():
+ self._verify_file_path(file_path)
+
# Check that the prefix is valid and the path exists
invalid_paths = list()
for genome_key, genome_path in genomic_files.items():
From c7ffe3ce446d75bb3a416a976ef143da15ae6593 Mon Sep 17 00:00:00 2001
From: Aaron Mussig
Date: Tue, 22 Jun 2021 16:16:55 +1000
Subject: [PATCH 3/4] Prepare for 1.5.1
---
docs/src/changelog.rst | 6 ++++++
gtdbtk/__init__.py | 2 +-
2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/docs/src/changelog.rst b/docs/src/changelog.rst
index 5dbd2b58..f8c390ba 100644
--- a/docs/src/changelog.rst
+++ b/docs/src/changelog.rst
@@ -2,6 +2,12 @@
Change log
==========
+1.5.1
+-----
+
+* (`#327 <https://github.com/Ecogenomics/GTDBTk/issues/327>`_) Disallow spaces in genome names/file paths due to downstream application issues.
+
+
1.5.0
-----
diff --git a/gtdbtk/__init__.py b/gtdbtk/__init__.py
index 70d0f0f5..f72eec57 100644
--- a/gtdbtk/__init__.py
+++ b/gtdbtk/__init__.py
@@ -29,4 +29,4 @@
__status__ = 'Production'
__title__ = 'GTDB-Tk'
__url__ = 'https://github.com/Ecogenomics/GTDBTk'
-__version__ = '1.5.0'
+__version__ = '1.5.1'
From 5d9ef03cb6d0b59f022e30a9a7870be71f62bce8 Mon Sep 17 00:00:00 2001
From: Aaron Mussig
Date: Thu, 24 Jun 2021 14:10:33 +1000
Subject: [PATCH 4/4] fix(#326): Disallow blank/zero length genome names.
---
docs/src/changelog.rst | 2 +-
gtdbtk/main.py | 12 ++++++++----
2 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/docs/src/changelog.rst b/docs/src/changelog.rst
index f8c390ba..ca9954a0 100644
--- a/docs/src/changelog.rst
+++ b/docs/src/changelog.rst
@@ -6,7 +6,7 @@ Change log
-----
* (`#327 <https://github.com/Ecogenomics/GTDBTk/issues/327>`_) Disallow spaces in genome names/file paths due to downstream application issues.
-
+* (`#326 <https://github.com/Ecogenomics/GTDBTk/issues/326>`_) Disallow genome names that are blank.
1.5.0
-----
diff --git a/gtdbtk/main.py b/gtdbtk/main.py
index 30fb874d..4dee14fd 100644
--- a/gtdbtk/main.py
+++ b/gtdbtk/main.py
@@ -91,16 +91,20 @@ def _verify_genome_id(self, genome_id: str) -> bool:
Raises
------
- GenomeNameInvalid
+ GTDBTkExit
If the genome identifier contains illegal characters.
"""
-
- invalid_chars = set('()[],;= ')
+ if genome_id is None or not isinstance(genome_id, str):
+ raise GTDBTkExit(f'The genome name is not a valid string: {genome_id}')
+ if len(genome_id) == 0:
+ raise GTDBTkExit('Genome name cannot be blank, check for input files '
+ 'without a name, or empty columns in the batchfile.')
+ invalid_chars = frozenset('()[],;= ')
if any((c in invalid_chars) for c in genome_id):
self.logger.error(f'Invalid genome ID: {genome_id}')
self.logger.error(f'The following characters are invalid: '
f'{" ".join(invalid_chars)}')
- raise GenomeNameInvalid(f'Invalid genome ID: {genome_id}')
+ raise GTDBTkExit(f'Invalid genome ID: {genome_id}')
return True
@staticmethod