users can use GCA and GCF genomes.

Ecogenomics · Feb 25, 2020 · ec31c86 · ec31c86
1 parent 25010e0
commit ec31c86
Show file tree

Hide file tree

Showing 5 changed files with 65 additions and 28 deletions.
diff --git a/gtdbtk/classify.py b/gtdbtk/classify.py
@@ -39,7 +39,7 @@
 from gtdbtk.external.pplacer import Pplacer
 from gtdbtk.markers import Markers
 from gtdbtk.relative_distance import RelativeDistance
-from gtdbtk.tools import add_ncbi_prefix, symlink_f, get_memory_gb
+from gtdbtk.tools import add_ncbi_prefix, symlink_f, get_memory_gb, get_reference_ids
 
 sys.setrecursionlimit(15000)
 
@@ -64,6 +64,8 @@ def __init__(self, cpus=1, pplacer_cpus=None):
 
         self.species_radius = self.parse_radius_file()
 
+        self.reference_ids = get_reference_ids()
+
     def parse_radius_file(self):
         results = {}
         with open(Config.RADII_FILE) as f:
@@ -402,12 +404,12 @@ def run(self,
                 # if, while going up the tree, we find a node with only one
                 # reference genome, we select this reference genome as
                 # leaf_reference.
-                if userleaf.taxon.label[0:3] not in ['RS_', 'GB_', 'UBA']:
+                if userleaf.taxon.label not in self.reference_ids:
 
                     par_node = userleaf.parent_node
                     leaf_ref_genome = None
                     leaf_ref_genomes = [subnd for subnd in par_node.leaf_iter(
-                    ) if subnd.taxon.label.replace("'", '')[0:3] in ['RS_', 'GB_', 'UBA']]
+                    ) if subnd.taxon.label.replace("'", '') in self.reference_ids]
                     if len(leaf_ref_genomes) == 1:
                         leaf_ref_genome = leaf_ref_genomes[0]
 
@@ -419,7 +421,7 @@ def run(self,
                         par_node = par_node.parent_node
                         if leaf_ref_genome is None:
                             leaf_ref_genomes = [subnd for subnd in par_node.leaf_iter(
-                            ) if subnd.taxon.label.replace("'", '')[0:3] in ['RS_', 'GB_', 'UBA']]
+                            ) if subnd.taxon.label.replace("'", '') in self.reference_ids]
                             if len(leaf_ref_genomes) == 1:
                                 leaf_ref_genome = leaf_ref_genomes[0]
                         _support, parent_taxon, _aux_info = parse_label(
@@ -430,15 +432,14 @@ def run(self,
                     if parent_rank.startswith('g__'):
                         # we get all the reference genomes under this genus
                         list_subnode_initials = [subnd.taxon.label.replace(
-                            "'", '')[0:3] for subnd in par_node.leaf_iter()]
-                        if (list_subnode_initials.count('RS_') + list_subnode_initials.count(
-                                'GB_') + list_subnode_initials.count('UBA')) < 1:
+                            "'", '') for subnd in par_node.leaf_iter()]
+                        if len(set(list_subnode_initials) & set(self.reference_ids)) < 1:
                             raise Exception(
                                 "There is no reference genomes under '{}'".format('parent_rank'))
                         else:
                             dict_dist_refgenomes = {}
                             list_ref_genomes = [subnd for subnd in par_node.leaf_iter(
-                            ) if subnd.taxon.label.replace("'", '')[0:3] in ['RS_', 'GB_', 'UBA']]
+                            ) if subnd.taxon.label.replace("'", '') in self.reference_ids]
                             # we pick the first 100 genomes closest (patristic distance) to the
                             # user genome under the same genus
                             for ref_genome in list_ref_genomes:
@@ -498,12 +499,12 @@ def run(self,
                     # on the same parent node so we need to go up the tree
                     # to find a node with a reference genome as leaf.
                     cur_node = leaf.parent_node
-                    list_subnode_initials = [subnd.taxon.label.replace(
-                        "'", '')[0:3] for subnd in cur_node.leaf_iter()]
-                    while 'RS_' not in list_subnode_initials and 'GB_' not in list_subnode_initials and 'UBA' not in list_subnode_initials:
+                    list_subnode = [subnd.taxon.label.replace(
+                        "'", '') for subnd in cur_node.leaf_iter()]
+                    while len(set(list_subnode) & set(self.reference_ids)) < 1:
                         cur_node = cur_node.parent_node
                         list_subnode_initials = [subnd.taxon.label.replace(
-                            "'", '')[0:3] for subnd in cur_node.leaf_iter()]
+                            "'", '') for subnd in cur_node.leaf_iter()]
 
                     current_rel_list = cur_node.rel_dist
 
@@ -537,7 +538,7 @@ def run(self,
 
                         # get all reference genomes under the current node
                         list_subnode = [childnd.taxon.label.replace("'", '') for childnd in cur_node.leaf_iter(
-                        ) if childnd.taxon.label[0:3] in ['RS_', 'UBA', 'GB_']]
+                        ) if childnd.taxon.label in self.reference_ids]
 
                         # get all names for the child rank
                         list_ranks = [self.gtdb_taxonomy.get(
@@ -567,7 +568,7 @@ def run(self,
                     # case 1b
                     if len(child_taxons) == 0 and closest_rank is None:
                         list_leaves = [childnd.taxon.label.replace("'", '') for childnd in cur_node.leaf_iter(
-                        ) if childnd.taxon.label[0:3] in ['RS_', 'UBA', 'GB_']]
+                        ) if childnd.taxon.label in self.reference_ids]
                         if len(list_leaves) != 1:
                             list_subrank = []
                             for leaf_subrank in list_leaves:
@@ -1131,7 +1132,7 @@ def _parse_subnodes(self, list_subnode, closest_rank):
         initial_loop = True
         for item in list_subnode:
             # We get the taxonomy of all reference genomes
-            if item.startswith('RS_') or item.startswith('GB_') or item.startswith('UBA'):
+            if item in self.reference_ids:
                 taxonomy_from_file = self.gtdb_taxonomy.get(item)
                 # we store the selected rank (i.e. order) for each reference
                 # genome

diff --git a/gtdbtk/exceptions.py b/gtdbtk/exceptions.py
@@ -65,6 +65,13 @@ def __init__(self, message=''):
         GTDBTkException.__init__(self, message)
 
 
+class InconsistentGenomeBatch(GTDBTkException):
+    """ Thrown when number of genomes in the identify directory is different than the number of genomes to process. """
+
+    def __init__(self, message=''):
+        GTDBTkException.__init__(self, message)
+
+
 class FileNotFound(GTDBTkException):
     """ Thrown when a file is not found. """
 

diff --git a/gtdbtk/main.py b/gtdbtk/main.py
@@ -37,7 +37,7 @@
 from gtdbtk.markers import Markers
 from gtdbtk.misc import Misc
 from gtdbtk.reroot_tree import RerootTree
-from gtdbtk.tools import symlink_f
+from gtdbtk.tools import symlink_f, get_reference_ids
 
 
 class OptionsParser(object):
@@ -55,6 +55,8 @@ def __init__(self, version):
         self.version = version
         self._check_package_compatibility()
 
+        self.genomes_to_process = None
+
     def _check_package_compatibility(self):
         """Check that GTDB-Tk is using the most up-to-date reference package."""
         pkg_ver = float(Config.VERSION_DATA.replace('r', ''))
@@ -160,13 +162,6 @@ def _genomes_to_process(self, genome_dir, batchfile, extension):
         # Check that the prefix is valid and the path exists
         invalid_paths = list()
         for genome_key, genome_path in genomic_files.items():
-            if genome_key.startswith("RS_") or genome_key.startswith("GB_") \
-                    or genome_key.startswith("UBA"):
-                self.logger.error("Submitted genomes start with the same prefix"
-                                  " (RS_,GB_,UBA) as reference genomes in"
-                                  " GTDB-Tk. This will cause issues for"
-                                  " downstream analysis.")
-                raise GTDBTkExit
 
             if not os.path.isfile(genome_path):
                 invalid_paths.append((genome_key, genome_path))
@@ -191,6 +186,12 @@ def _genomes_to_process(self, genome_dir, batchfile, extension):
                                   'check the format of this file.' % batchfile)
             raise GTDBTkExit
 
+        if set(genomic_files.keys()) & set(get_reference_ids()):
+            self.logger.error(
+                'The following ids are reserved for GTDB-Tk reference genomes: {}'.format(set(genomic_files.keys()) & set(get_reference_ids())))
+            self.logger.error('Please rename those genomes.')
+            raise GTDBTkExit
+
         return genomic_files, tln_tables
 
     def _marker_set_id(self, bac120_ms, ar122_ms, rps23_ms):
@@ -243,6 +244,7 @@ def identify(self, options):
 
         genomes, tln_tables = self._genomes_to_process(
             options.genome_dir, options.batchfile, options.extension)
+        self.genomes_to_process = genomes
 
         markers = Markers(options.cpus)
         markers.identify(genomes,
@@ -282,7 +284,8 @@ def align(self, options):
                       options.min_perc_taxa,
                       options.out_dir,
                       options.prefix,
-                      options.outgroup_taxon)
+                      options.outgroup_taxon,
+                      self.genomes_to_process)
 
         self.logger.info('Done.')
 
@@ -633,6 +636,7 @@ def parse_options(self, options):
             options.max_consensus = None
             options.rnd_seed = None
             options.skip_trimming = False
+
             self.align(options)
 
             self.classify(options)

diff --git a/gtdbtk/markers.py b/gtdbtk/markers.py
@@ -28,7 +28,7 @@
 from gtdbtk.biolib_lite.seq_io import read_fasta
 from gtdbtk.biolib_lite.taxonomy import Taxonomy
 from gtdbtk.config.output import *
-from gtdbtk.exceptions import GenomeMarkerSetUnknown, MSAMaskLengthMismatch
+from gtdbtk.exceptions import GenomeMarkerSetUnknown, MSAMaskLengthMismatch, InconsistentGenomeBatch
 from gtdbtk.external.hmm_aligner import HmmAligner
 from gtdbtk.external.pfam_search import PfamSearch
 from gtdbtk.external.prodigal import Prodigal
@@ -463,7 +463,8 @@ def align(self,
               min_per_taxa,
               out_dir,
               prefix,
-              outgroup_taxon):
+              outgroup_taxon,
+              genomes_to_process=None):
         """Align marker genes in genomes."""
 
         if identify_dir != out_dir:
@@ -492,6 +493,12 @@ def align(self,
 
         genomic_files = self._path_to_identify_data(
             identify_dir, identify_dir != out_dir)
+        if genomes_to_process is not None and len(genomic_files) != len(genomes_to_process):
+            self.logger.error('{} are not present in the input list of genome to process.'.format(
+                list(set(genomic_files.keys()) - set(genomes_to_process.keys()))))
+            raise InconsistentGenomeBatch(
+                'Number of processed genomes in the identify directory is unequal to the list of input genomes.')
+
         self.logger.info('Aligning markers in %d genomes with %d threads.' % (len(genomic_files),
                                                                               self.cpus))
 
@@ -554,7 +561,8 @@ def align(self,
 
             # Write the individual marker alignments to disk
             if self.debug:
-                self._write_individual_markers(user_msa, marker_set_id, marker_info_file, out_dir, prefix)
+                self._write_individual_markers(
+                    user_msa, marker_set_id, marker_info_file, out_dir, prefix)
 
             # filter columns without sufficient representation across taxa
             if skip_trimming:
@@ -680,7 +688,8 @@ def _write_individual_markers(self, user_msa, marker_set_id, marker_list, out_di
             offset += marker_len
 
         if total_msa_len != offset:
-            self.logger.warning('Internal error: the total MSA length is not equal to the offset.')
+            self.logger.warning(
+                'Internal error: the total MSA length is not equal to the offset.')
         for path_marker, gid_dict in marker_to_msa.items():
             with open(path_marker, 'w') as fh:
                 for genome_id, genome_msa in gid_dict.items():

diff --git a/gtdbtk/tools.py b/gtdbtk/tools.py
@@ -9,12 +9,28 @@
 
 from gtdbtk.config.output import CHECKSUM_SUFFIX
 
+import gtdbtk.config.config as Config
+
 
 ##################################################
 ############MISC UTILITIES########################
 ##################################################
 
 
+def get_reference_ids():
+    results = []
+    with open(Config.TAXONOMY_FILE) as tf:
+        for line in tf:
+            raw_id = line.split('\t')[0]
+            results.append(raw_id)
+            if raw_id[0:4] in ['GCF_', 'GCA_']:
+                results.append(add_ncbi_prefix(raw_id))
+            elif raw_id[0:3] in ['RS_', 'GB_']:
+                results.append(raw_id[3:])
+
+    return results
+
+
 def add_ncbi_prefix(refname):
     if refname.startswith("GCF_"):
         return "RS_" + refname