diff --git a/docs/src/changelog.rst b/docs/src/changelog.rst index 5dbd2b58..ca9954a0 100644 --- a/docs/src/changelog.rst +++ b/docs/src/changelog.rst @@ -2,6 +2,12 @@ Change log ========== +1.5.1 +----- + +* (`#327 <https://github.com/Ecogenomics/GTDBTk/issues/327>`_) Disallow spaces in genome names/file paths due to downstream application issues. +* (`#326 <https://github.com/Ecogenomics/GTDBTk/issues/326>`_) Disallow genome names that are blank. + 1.5.0 ----- diff --git a/gtdbtk/__init__.py b/gtdbtk/__init__.py index 70d0f0f5..f72eec57 100644 --- a/gtdbtk/__init__.py +++ b/gtdbtk/__init__.py @@ -29,4 +29,4 @@ __status__ = 'Production' __title__ = 'GTDB-Tk' __url__ = 'https://github.com/Ecogenomics/GTDBTk' -__version__ = '1.5.0' +__version__ = '1.5.1' diff --git a/gtdbtk/main.py b/gtdbtk/main.py index a85e2531..4dee14fd 100644 --- a/gtdbtk/main.py +++ b/gtdbtk/main.py @@ -76,7 +76,7 @@ def _check_package_compatibility(self): f'intended for this release: {Config.MIN_REF_DATA_VERSION}', ['bright'], fg='yellow')) - def _verify_genome_id(self, genome_id): + def _verify_genome_id(self, genome_id: str) -> bool: """Ensure genome ID will be valid in Newick tree. Parameters @@ -91,16 +91,27 @@ def _verify_genome_id(self, genome_id): Raises ------ - GenomeNameInvalid + GTDBTkExit If the genome identifier contains illegal characters. 
""" - - invalid_chars = set('()[],;=') + if genome_id is None or not isinstance(genome_id, str): + raise GTDBTkExit(f'The genome name is not a valid string: {genome_id}') + if len(genome_id) == 0: + raise GTDBTkExit('Genome name cannot be blank, check for input files ' + 'without a name, or empty columns in the batchfile.') + invalid_chars = frozenset('()[],;= ') if any((c in invalid_chars) for c in genome_id): self.logger.error(f'Invalid genome ID: {genome_id}') self.logger.error(f'The following characters are invalid: ' f'{" ".join(invalid_chars)}') - raise GenomeNameInvalid(f'Invalid genome ID: {genome_id}') + raise GTDBTkExit(f'Invalid genome ID: {genome_id}') + return True + + @staticmethod + def _verify_file_path(file_path: str) -> bool: + if ' ' in file_path: + raise GTDBTkExit(f'The genome path contains a space, this is ' + f'unsupported by downstream applications: {file_path}') return True def _genomes_to_process(self, genome_dir, batchfile, extension): @@ -136,6 +147,10 @@ def _genomes_to_process(self, genome_dir, batchfile, extension): for genome_key in genomic_files: self._verify_genome_id(genome_key) + # Check that there are no illegal characters in the file path + for file_path in genomic_files.values(): + self._verify_file_path(file_path) + # Check that the prefix is valid and the path exists invalid_paths = list() for genome_key, genome_path in genomic_files.items(): @@ -620,6 +635,10 @@ def parse_options(self, options): check_dependencies(['prodigal', 'hmmalign']) check_dependencies(['FastTree' + ('MP' if options.cpus > 1 else '')]) + if options.skip_gtdb_refs and options.custom_taxonomy_file is None: + raise GTDBTkExit("When running de_novo_wf, The '--skip_gtdb_refs' flag requires " + "'--custom_taxonomy_file' to be included to the command line.") + options.write_single_copy_genes = False self.identify(options)