Skip to content

Commit

Permalink
Disallow spaces in genome names/file paths due to downstream application issues (#327)
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronmussig committed Jun 22, 2021
1 parent 9c1c469 commit bb3637c
Showing 1 changed file with 13 additions and 2 deletions.
15 changes: 13 additions & 2 deletions gtdbtk/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def _check_package_compatibility(self):
f'intended for this release: {Config.MIN_REF_DATA_VERSION}',
['bright'], fg='yellow'))

def _verify_genome_id(self, genome_id):
def _verify_genome_id(self, genome_id: str) -> bool:
"""Ensure genome ID will be valid in Newick tree.
Parameters
Expand All @@ -95,14 +95,21 @@ def _verify_genome_id(self, genome_id):
If the genome identifier contains illegal characters.
"""

invalid_chars = set('()[],;=')
invalid_chars = set('()[],;= ')
if any((c in invalid_chars) for c in genome_id):
self.logger.error(f'Invalid genome ID: {genome_id}')
self.logger.error(f'The following characters are invalid: '
f'{" ".join(invalid_chars)}')
raise GenomeNameInvalid(f'Invalid genome ID: {genome_id}')
return True

@staticmethod
def _verify_file_path(file_path: str) -> bool:
if ' ' in file_path:
raise GTDBTkExit(f'The genome path contains a space, this is '
f'unsupported by downstream applications: {file_path}')
return True

def _genomes_to_process(self, genome_dir, batchfile, extension):
"""Get genomes to process.
Expand Down Expand Up @@ -136,6 +143,10 @@ def _genomes_to_process(self, genome_dir, batchfile, extension):
for genome_key in genomic_files:
self._verify_genome_id(genome_key)

# Check that there are no illegal characters in the file path
for file_path in genomic_files.values():
self._verify_file_path(file_path)

# Check that the prefix is valid and the path exists
invalid_paths = list()
for genome_key, genome_path in genomic_files.items():
Expand Down

0 comments on commit bb3637c

Please sign in to comment.