Skip to content

Commit

Permalink
Add the remove_labels function for #373, removing internal labels mak…
Browse files Browse the repository at this point in the history
…es the tree compatible with Itol
  • Loading branch information
pchaumeil committed Apr 6, 2022
1 parent 525dea1 commit 1199ecc
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 7 deletions.
4 changes: 2 additions & 2 deletions docs/src/installing/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@ Hardware requirements
- Time
* - Archaea
- ~34 GB
- ~30 GB
- ~65 GB
- ~1 hour / 1,000 genomes @ 64 CPUs
* - Bacteria
- ~320 GB ( 20GB for divide-and-conquer)
- ~30 GB
- ~65 GB
- ~1 hour / 1,000 genomes @ 64 CPUs

.. note::
Expand Down
9 changes: 5 additions & 4 deletions gtdbtk/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,11 @@ def print_help():
decorate -> Decorate tree with GTDB taxonomy
Tools:
infer_ranks -> Establish taxonomic ranks of internal nodes using RED
ani_rep -> Calculates ANI to GTDB representative genomes
trim_msa -> Trim an untrimmed MSA file based on a mask
export_msa -> Export the untrimmed archaeal or bacterial MSA file
infer_ranks -> Establish taxonomic ranks of internal nodes using RED
ani_rep -> Calculates ANI to GTDB representative genomes
trim_msa -> Trim an untrimmed MSA file based on a mask
export_msa -> Export the untrimmed archaeal or bacterial MSA file
remove_labels -> Remove labels (bootstrap values, node labels) from a Newick tree.
Testing:
test -> Validate the classify_wf pipeline with 3 archaeal genomes
Expand Down
10 changes: 10 additions & 0 deletions gtdbtk/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,16 @@ def get_main_parser():
__debug(grp)
__help(grp)

# Remove labels: register the `remove_labels` sub-command. It takes an input
# and an output tree as required arguments, plus the usual debug/help flags.
# The description is split across two literals; the first ends with a single
# trailing space so the pieces join without repeating a word.
with subparser(sub_parsers, 'remove_labels', 'Remove labels (bootstrap values, node labels) from a Newick tree '
                                             'to improve compatibility with tree viewers.') as parser:
    with arg_group(parser, 'required named arguments') as grp:
        __input_tree(grp, required=True)
        __output_tree(grp, required=True)
    with arg_group(parser, 'optional arguments') as grp:
        __debug(grp)
        __help(grp)

# Export MSA.
with subparser(sub_parsers, 'export_msa', 'Export the untrimmed archaeal or bacterial MSA file.') as parser:
with arg_group(parser, 'required named arguments') as grp:
Expand Down
17 changes: 17 additions & 0 deletions gtdbtk/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,21 @@ def ani_rep(self, options):

self.logger.info('Done.')

def remove_labels(self, options):
    """Remove labels from the tree given on the command line.

    Verifies the input tree path with ``check_file_exists`` and then
    delegates the actual work to ``Misc.remove_labels``.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user. The ``input_tree`` and
        ``output_tree`` attributes are read.
    """
    source_tree = options.input_tree

    # Validate the input before any processing is attempted.
    check_file_exists(source_tree)

    Misc().remove_labels(source_tree, options.output_tree)
    self.logger.info('Done.')

def remove_intermediate_files(self,out_dir,workflow_name):
"""Remove intermediate files from the output directory.
Parameters
Expand Down Expand Up @@ -809,6 +824,8 @@ def parse_options(self, options):
self.infer_ranks(options)
elif options.subparser_name == 'ani_rep':
self.ani_rep(options)
elif options.subparser_name == 'remove_labels':
self.remove_labels(options)
elif options.subparser_name == 'trim_msa':
self.trim_msa(options)
elif options.subparser_name == 'export_msa':
Expand Down
25 changes: 25 additions & 0 deletions gtdbtk/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@

import shutil

import dendropy

import gtdbtk.config.config as Config
from gtdbtk.biolib_lite.execute import check_dependencies
from gtdbtk.biolib_lite.logger import colour
Expand Down Expand Up @@ -104,6 +106,29 @@ def checkfolder(self, folder_path, folder_name):
folder_name, folder_path, colour('MISSING', ['bright'], fg='red')))
return False

def remove_labels(self, input_file, output_file):
    """Remove internal node labels from a Newick tree.

    Every internal node has its label cleared (this is where bootstrap
    values and other node annotations end up when the tree is parsed).
    Leaf nodes are not visited, and no branch length is modified here.

    Parameters
    ----------
    input_file : str
        The path to the input Newick tree.
    output_file : str
        The path to the output Newick tree.
    """

    self.logger.info("Removing labels from tree {}".format(input_file))

    # preserve_underscores=True keeps underscores in labels verbatim
    # instead of letting the Newick reader convert them to spaces.
    intree = dendropy.Tree.get_from_path(input_file,
                                         schema='newick',
                                         rooting='force-rooted',
                                         preserve_underscores=True)

    for node in intree.internal_nodes():
        node.label = None

    # suppress_rooting=True omits the leading [&R]/[&U] rooting token.
    # unquoted_underscores=True mirrors preserve_underscores on the read
    # side: without it DendroPy wraps every label containing an underscore
    # in single quotes, so leaf labels would differ from the input tree.
    intree.write_to_path(output_file,
                         schema='newick',
                         suppress_rooting=True,
                         unquoted_underscores=True)


def remove_intermediate_files(self,output_dir,wf_name):
"""Remove intermediate files.
Expand Down
11 changes: 10 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,15 @@ def test_identify_align_classify(self):
self.assertEqual(len(infos), 20)
self.assertTrue(infos[1].startswith('d__Archaea'))

self.assertTrue(os.path.isdir(os.path.join(classify_options.out_dir, DIR_IDENTIFY_INTERMEDIATE)))
self.assertTrue(os.path.isdir(os.path.join(classify_options.out_dir, DIR_ALIGN_INTERMEDIATE)))
self.assertTrue(os.path.isdir(os.path.join(classify_options.out_dir, DIR_CLASSIFY_INTERMEDIATE)))
self.optionparser.remove_intermediate_files(classify_options.out_dir,'classify_wf')
self.assertFalse(os.path.exists(os.path.join(classify_options.out_dir, DIR_IDENTIFY_INTERMEDIATE)))
self.assertFalse(os.path.exists(os.path.join(classify_options.out_dir, DIR_ALIGN_INTERMEDIATE)))
self.assertFalse(os.path.exists(os.path.join(classify_options.out_dir, DIR_CLASSIFY_INTERMEDIATE)))


def test_classify_wf(self):
tmp_folder = ''.join(random.choice(
string.ascii_uppercase + string.digits) for _ in range(10))
Expand Down Expand Up @@ -265,7 +274,7 @@ def test_root(self):
"""Test that rooting is successful when called through the CLI"""
options = argparse.ArgumentParser()
options.input_tree = 'tests/data/pplacer_dir_reference/gtdbtk.ar53.classify.tree'
options.outgroup_taxon = 'p__Altarchaeota'
options.outgroup_taxon = 'p__Altiarchaeota'
options.output_tree = os.path.join(self.generic_out_path, 'test.rooted.tree')
options.custom_taxonomy_file = None
options.gtdbtk_classification_file = None
Expand Down

0 comments on commit 1199ecc

Please sign in to comment.