Skip to content

Commit

Permalink
users can use GCA and GCF genomes.
Browse files Browse the repository at this point in the history
  • Loading branch information
pchaumeil committed Feb 25, 2020
1 parent 25010e0 commit ec31c86
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 28 deletions.
31 changes: 16 additions & 15 deletions gtdbtk/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from gtdbtk.external.pplacer import Pplacer
from gtdbtk.markers import Markers
from gtdbtk.relative_distance import RelativeDistance
from gtdbtk.tools import add_ncbi_prefix, symlink_f, get_memory_gb
from gtdbtk.tools import add_ncbi_prefix, symlink_f, get_memory_gb, get_reference_ids

sys.setrecursionlimit(15000)

Expand All @@ -64,6 +64,8 @@ def __init__(self, cpus=1, pplacer_cpus=None):

self.species_radius = self.parse_radius_file()

self.reference_ids = get_reference_ids()

def parse_radius_file(self):
results = {}
with open(Config.RADII_FILE) as f:
Expand Down Expand Up @@ -402,12 +404,12 @@ def run(self,
# if, while going up the tree, we find a node with only one
# reference genome, we select this reference genome as
# leaf_reference.
if userleaf.taxon.label[0:3] not in ['RS_', 'GB_', 'UBA']:
if userleaf.taxon.label not in self.reference_ids:

par_node = userleaf.parent_node
leaf_ref_genome = None
leaf_ref_genomes = [subnd for subnd in par_node.leaf_iter(
) if subnd.taxon.label.replace("'", '')[0:3] in ['RS_', 'GB_', 'UBA']]
) if subnd.taxon.label.replace("'", '') in self.reference_ids]
if len(leaf_ref_genomes) == 1:
leaf_ref_genome = leaf_ref_genomes[0]

Expand All @@ -419,7 +421,7 @@ def run(self,
par_node = par_node.parent_node
if leaf_ref_genome is None:
leaf_ref_genomes = [subnd for subnd in par_node.leaf_iter(
) if subnd.taxon.label.replace("'", '')[0:3] in ['RS_', 'GB_', 'UBA']]
) if subnd.taxon.label.replace("'", '') in self.reference_ids]
if len(leaf_ref_genomes) == 1:
leaf_ref_genome = leaf_ref_genomes[0]
_support, parent_taxon, _aux_info = parse_label(
Expand All @@ -430,15 +432,14 @@ def run(self,
if parent_rank.startswith('g__'):
# we get all the reference genomes under this genus
list_subnode_initials = [subnd.taxon.label.replace(
"'", '')[0:3] for subnd in par_node.leaf_iter()]
if (list_subnode_initials.count('RS_') + list_subnode_initials.count(
'GB_') + list_subnode_initials.count('UBA')) < 1:
"'", '') for subnd in par_node.leaf_iter()]
if len(set(list_subnode_initials) & set(self.reference_ids)) < 1:
raise Exception(
"There is no reference genomes under '{}'".format('parent_rank'))
else:
dict_dist_refgenomes = {}
list_ref_genomes = [subnd for subnd in par_node.leaf_iter(
) if subnd.taxon.label.replace("'", '')[0:3] in ['RS_', 'GB_', 'UBA']]
) if subnd.taxon.label.replace("'", '') in self.reference_ids]
# we pick the first 100 genomes closest (patristic distance) to the
# user genome under the same genus
for ref_genome in list_ref_genomes:
Expand Down Expand Up @@ -498,12 +499,12 @@ def run(self,
# on the same parent node so we need to go up the tree
# to find a node with a reference genome as leaf.
cur_node = leaf.parent_node
list_subnode_initials = [subnd.taxon.label.replace(
"'", '')[0:3] for subnd in cur_node.leaf_iter()]
while 'RS_' not in list_subnode_initials and 'GB_' not in list_subnode_initials and 'UBA' not in list_subnode_initials:
list_subnode = [subnd.taxon.label.replace(
"'", '') for subnd in cur_node.leaf_iter()]
while len(set(list_subnode) & set(self.reference_ids)) < 1:
cur_node = cur_node.parent_node
list_subnode_initials = [subnd.taxon.label.replace(
"'", '')[0:3] for subnd in cur_node.leaf_iter()]
"'", '') for subnd in cur_node.leaf_iter()]

current_rel_list = cur_node.rel_dist

Expand Down Expand Up @@ -537,7 +538,7 @@ def run(self,

# get all reference genomes under the current node
list_subnode = [childnd.taxon.label.replace("'", '') for childnd in cur_node.leaf_iter(
) if childnd.taxon.label[0:3] in ['RS_', 'UBA', 'GB_']]
) if childnd.taxon.label in self.reference_ids]

# get all names for the child rank
list_ranks = [self.gtdb_taxonomy.get(
Expand Down Expand Up @@ -567,7 +568,7 @@ def run(self,
# case 1b
if len(child_taxons) == 0 and closest_rank is None:
list_leaves = [childnd.taxon.label.replace("'", '') for childnd in cur_node.leaf_iter(
) if childnd.taxon.label[0:3] in ['RS_', 'UBA', 'GB_']]
) if childnd.taxon.label in self.reference_ids]
if len(list_leaves) != 1:
list_subrank = []
for leaf_subrank in list_leaves:
Expand Down Expand Up @@ -1131,7 +1132,7 @@ def _parse_subnodes(self, list_subnode, closest_rank):
initial_loop = True
for item in list_subnode:
# We get the taxonomy of all reference genomes
if item.startswith('RS_') or item.startswith('GB_') or item.startswith('UBA'):
if item in self.reference_ids:
taxonomy_from_file = self.gtdb_taxonomy.get(item)
# we store the selected rank (i.e. order) for each reference
# genome
Expand Down
7 changes: 7 additions & 0 deletions gtdbtk/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,13 @@ def __init__(self, message=''):
GTDBTkException.__init__(self, message)


class InconsistentGenomeBatch(GTDBTkException):
""" Thrown when number of genomes in the identify directory is different than the number of genomes to process. """

def __init__(self, message=''):
GTDBTkException.__init__(self, message)


class FileNotFound(GTDBTkException):
""" Thrown when a file is not found. """

Expand Down
22 changes: 13 additions & 9 deletions gtdbtk/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
from gtdbtk.markers import Markers
from gtdbtk.misc import Misc
from gtdbtk.reroot_tree import RerootTree
from gtdbtk.tools import symlink_f
from gtdbtk.tools import symlink_f, get_reference_ids


class OptionsParser(object):
Expand All @@ -55,6 +55,8 @@ def __init__(self, version):
self.version = version
self._check_package_compatibility()

self.genomes_to_process = None

def _check_package_compatibility(self):
"""Check that GTDB-Tk is using the most up-to-date reference package."""
pkg_ver = float(Config.VERSION_DATA.replace('r', ''))
Expand Down Expand Up @@ -160,13 +162,6 @@ def _genomes_to_process(self, genome_dir, batchfile, extension):
# Check that the prefix is valid and the path exists
invalid_paths = list()
for genome_key, genome_path in genomic_files.items():
if genome_key.startswith("RS_") or genome_key.startswith("GB_") \
or genome_key.startswith("UBA"):
self.logger.error("Submitted genomes start with the same prefix"
" (RS_,GB_,UBA) as reference genomes in"
" GTDB-Tk. This will cause issues for"
" downstream analysis.")
raise GTDBTkExit

if not os.path.isfile(genome_path):
invalid_paths.append((genome_key, genome_path))
Expand All @@ -191,6 +186,12 @@ def _genomes_to_process(self, genome_dir, batchfile, extension):
'check the format of this file.' % batchfile)
raise GTDBTkExit

if set(genomic_files.keys()) & set(get_reference_ids()):
self.logger.error(
'The following ids are reserved for GTDB-Tk reference genomes: {}'.format(set(genomic_files.keys()) & set(get_reference_ids())))
self.logger.error('Please rename those genomes.')
raise GTDBTkExit

return genomic_files, tln_tables

def _marker_set_id(self, bac120_ms, ar122_ms, rps23_ms):
Expand Down Expand Up @@ -243,6 +244,7 @@ def identify(self, options):

genomes, tln_tables = self._genomes_to_process(
options.genome_dir, options.batchfile, options.extension)
self.genomes_to_process = genomes

markers = Markers(options.cpus)
markers.identify(genomes,
Expand Down Expand Up @@ -282,7 +284,8 @@ def align(self, options):
options.min_perc_taxa,
options.out_dir,
options.prefix,
options.outgroup_taxon)
options.outgroup_taxon,
self.genomes_to_process)

self.logger.info('Done.')

Expand Down Expand Up @@ -633,6 +636,7 @@ def parse_options(self, options):
options.max_consensus = None
options.rnd_seed = None
options.skip_trimming = False

self.align(options)

self.classify(options)
Expand Down
17 changes: 13 additions & 4 deletions gtdbtk/markers.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from gtdbtk.biolib_lite.seq_io import read_fasta
from gtdbtk.biolib_lite.taxonomy import Taxonomy
from gtdbtk.config.output import *
from gtdbtk.exceptions import GenomeMarkerSetUnknown, MSAMaskLengthMismatch
from gtdbtk.exceptions import GenomeMarkerSetUnknown, MSAMaskLengthMismatch, InconsistentGenomeBatch
from gtdbtk.external.hmm_aligner import HmmAligner
from gtdbtk.external.pfam_search import PfamSearch
from gtdbtk.external.prodigal import Prodigal
Expand Down Expand Up @@ -463,7 +463,8 @@ def align(self,
min_per_taxa,
out_dir,
prefix,
outgroup_taxon):
outgroup_taxon,
genomes_to_process=None):
"""Align marker genes in genomes."""

if identify_dir != out_dir:
Expand Down Expand Up @@ -492,6 +493,12 @@ def align(self,

genomic_files = self._path_to_identify_data(
identify_dir, identify_dir != out_dir)
if genomes_to_process is not None and len(genomic_files) != len(genomes_to_process):
self.logger.error('{} are not present in the input list of genome to process.'.format(
list(set(genomic_files.keys()) - set(genomes_to_process.keys()))))
raise InconsistentGenomeBatch(
'Number of processed genomes in the identify directory is unequal to the list of input genomes.')

self.logger.info('Aligning markers in %d genomes with %d threads.' % (len(genomic_files),
self.cpus))

Expand Down Expand Up @@ -554,7 +561,8 @@ def align(self,

# Write the individual marker alignments to disk
if self.debug:
self._write_individual_markers(user_msa, marker_set_id, marker_info_file, out_dir, prefix)
self._write_individual_markers(
user_msa, marker_set_id, marker_info_file, out_dir, prefix)

# filter columns without sufficient representation across taxa
if skip_trimming:
Expand Down Expand Up @@ -680,7 +688,8 @@ def _write_individual_markers(self, user_msa, marker_set_id, marker_list, out_di
offset += marker_len

if total_msa_len != offset:
self.logger.warning('Internal error: the total MSA length is not equal to the offset.')
self.logger.warning(
'Internal error: the total MSA length is not equal to the offset.')
for path_marker, gid_dict in marker_to_msa.items():
with open(path_marker, 'w') as fh:
for genome_id, genome_msa in gid_dict.items():
Expand Down
16 changes: 16 additions & 0 deletions gtdbtk/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,28 @@

from gtdbtk.config.output import CHECKSUM_SUFFIX

import gtdbtk.config.config as Config


##################################################
############MISC UTILITIES########################
##################################################


def get_reference_ids():
results = []
with open(Config.TAXONOMY_FILE) as tf:
for line in tf:
raw_id = line.split('\t')[0]
results.append(raw_id)
if raw_id[0:4] in ['GCF_', 'GCA_']:
results.append(add_ncbi_prefix(raw_id))
elif raw_id[0:3] in ['RS_', 'GB_']:
results.append(raw_id[3:])

return results


def add_ncbi_prefix(refname):
if refname.startswith("GCF_"):
return "RS_" + refname
Expand Down

0 comments on commit ec31c86

Please sign in to comment.