From cab360b281128a531f53e34ebe4851e420387133 Mon Sep 17 00:00:00 2001
From: pchaumeil
Date: Tue, 9 Mar 2021 13:11:57 +1000
Subject: [PATCH 01/16] Update de_novo_wf documentation with valid taxon
---
docs/src/commands/de_novo_wf.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/src/commands/de_novo_wf.rst b/docs/src/commands/de_novo_wf.rst
index 674f896e..9d6d0dbf 100644
--- a/docs/src/commands/de_novo_wf.rst
+++ b/docs/src/commands/de_novo_wf.rst
@@ -68,7 +68,7 @@ Input
.. code-block:: bash
- gtdbtk de_novo_wf --genome_dir genomes/ --outgroup_taxon p__Nanobacteria --ar122_ms --out_dir de_novo_wf --cpus 3
+ gtdbtk de_novo_wf --genome_dir genomes/ --outgroup_taxon p__Undinarchaeota --ar122_ms --out_dir de_novo_wf --cpus 3
gtdbtk de_novo_wf --genome_dir ./genomes --bac120_ms --outgroup_taxon p__Chloroflexota --taxa_filter p__Firmicutes --out_dir de_novo_output
From 38d2ed3f6975987e229557336664182d233be840 Mon Sep 17 00:00:00 2001
From: pchaumeil
Date: Mon, 15 Mar 2021 12:43:46 +1000
Subject: [PATCH 02/16] Change the database directory structure (same as NCBI),
 i.e. GCA/123/456/789.
---
gtdbtk/ani_rep.py | 13 ++++++----
gtdbtk/classify.py | 24 ++++++++++++++++++-
gtdbtk/config/config.py | 1 +
scripts/create_genome_paths.sh | 9 +++++++
.../test_gtdbtk/test_external/test_fastani.py | 12 +++++-----
5 files changed, 47 insertions(+), 12 deletions(-)
create mode 100644 scripts/create_genome_paths.sh
diff --git a/gtdbtk/ani_rep.py b/gtdbtk/ani_rep.py
index 82750183..d6548d21 100644
--- a/gtdbtk/ani_rep.py
+++ b/gtdbtk/ani_rep.py
@@ -5,8 +5,9 @@
from gtdbtk.biolib_lite.common import canonical_gid
from gtdbtk.biolib_lite.execute import check_dependencies
from gtdbtk.biolib_lite.taxonomy import Taxonomy
-from gtdbtk.config.config import (FASTANI_GENOMES,
+from gtdbtk.config.config import (FASTANI_DIR,
FASTANI_GENOMES_EXT,
+ FASTANI_GENOME_LIST,
TAXONOMY_FILE,
AF_THRESHOLD)
from gtdbtk.config.output import DIR_ANI_REP_INT_MASH
@@ -53,10 +54,12 @@ def _get_ref_genomes():
Dict[genome_id] = fasta_path
"""
ref_genomes = dict()
- for f_name in os.listdir(FASTANI_GENOMES):
- if f_name.endswith(FASTANI_GENOMES_EXT):
- accession = f_name.split(FASTANI_GENOMES_EXT)[0]
- ref_genomes[accession] = os.path.join(FASTANI_GENOMES, f_name)
+ with open(FASTANI_GENOME_LIST) as g_path_file:
+ for line in g_path_file:
+ (full_name, path) = line.strip().split()
+ if full_name.endswith(FASTANI_GENOMES_EXT):
+ accession = full_name.split(FASTANI_GENOMES_EXT)[0]
+ ref_genomes[accession] = os.path.join(FASTANI_DIR, path, full_name)
return ref_genomes
def run(self, genomes, no_mash, max_d, out_dir, prefix, mash_k, mash_v, mash_s, min_af, mash_db):
diff --git a/gtdbtk/classify.py b/gtdbtk/classify.py
index b78533fd..0e9b0b1b 100644
--- a/gtdbtk/classify.py
+++ b/gtdbtk/classify.py
@@ -101,6 +101,26 @@ def parse_radius_file(self):
results[gid] = float(infos[2])
return results
+ def parse_leaf_to_dir_path(self,genome_id):
+ """ Convert a genome id to a path.
+ e.g. GCA_123456789.0 would be converted to GCA/123/456/789
+
+ Parameters
+ ----------
+ genome_id: str
+ NCBI genome id GCF/GCA_xxxxxxxxx
+ :return: str
+ path to the genome id path
+ """
+ try:
+ genome_path = '/'.join([genome_id[0:3],genome_id[4:7],
+ genome_id[7:10],genome_id[10:13]])
+ return genome_path
+ except IndexError:
+ logger = logging.getLogger('timestamp')
+ logger.error('Specified path could not be created for reference genome: ' + genome_id)
+ raise GTDBTkExit('Specified path could not be created for reference genome: ' + genome_id)
+
def place_genomes(self,
user_msa_file,
marker_set_id,
@@ -1655,7 +1675,9 @@ def _get_fastani_genome_path(self, fastani_verification, genomes):
if leafnode.taxon.label.startswith('GB_') or leafnode.taxon.label.startswith('RS_'):
shortleaf = leafnode.taxon.label[3:]
ref_path = os.path.join(
- Config.FASTANI_GENOMES, shortleaf + Config.FASTANI_GENOMES_EXT)
+ Config.FASTANI_GENOMES,
+ self.parse_leaf_to_dir_path(shortleaf),
+ shortleaf + Config.FASTANI_GENOMES_EXT)
if not os.path.isfile(ref_path):
raise GTDBTkExit(f'Reference genome missing from FastANI database: {ref_path}')
diff --git a/gtdbtk/config/config.py b/gtdbtk/config/config.py
index 4a5950dc..e84ff2f7 100644
--- a/gtdbtk/config/config.py
+++ b/gtdbtk/config/config.py
@@ -200,6 +200,7 @@
# Fastani configuration
FASTANI_SPECIES_THRESHOLD = 95.0
FASTANI_GENOMES = os.path.join(FASTANI_DIR, "database/")
+FASTANI_GENOME_LIST = os.path.join(FASTANI_DIR, "genome_paths.tsv")
FASTANI_GENOMES_EXT = "_genomic.fna.gz"
# MRCA RED VALUE
diff --git a/scripts/create_genome_paths.sh b/scripts/create_genome_paths.sh
new file mode 100644
index 00000000..45b3ec42
--- /dev/null
+++ b/scripts/create_genome_paths.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+DATAPATH='database'
+for f in $DATAPATH/*.gz
+do
+ mkdir --parents database/${f:9:3}/${f:13:3}/${f:16:3}/${f:19:3}/ ; mv $f $_
+ filef="$(basename -- $f)"
+ echo "$filef database/${f:9:3}/${f:13:3}/${f:16:3}/${f:19:3}/ " >> genome_paths.tsv
+done
\ No newline at end of file
diff --git a/tests/test_gtdbtk/test_external/test_fastani.py b/tests/test_gtdbtk/test_external/test_fastani.py
index 17d3180c..47c6e14d 100644
--- a/tests/test_gtdbtk/test_external/test_fastani.py
+++ b/tests/test_gtdbtk/test_external/test_fastani.py
@@ -48,12 +48,12 @@ def test_run(self):
d_compare = {'a': {'x', 'y'},
'b': {'x'},
'c': {'z'}}
- d_paths = {'a': os.path.join(self.genome_root, 'GCA_001873845.1_genomic.fna.gz'),
- 'b': os.path.join(self.genome_root, 'GCA_002083985.1_genomic.fna.gz'),
- 'c': os.path.join(self.genome_root, 'GCF_000185805.1_genomic.fna.gz'),
- 'x': os.path.join(self.genome_root, 'GCA_002841105.1_genomic.fna.gz'),
- 'y': os.path.join(self.genome_root, 'GCA_000402295.1_genomic.fna.gz'),
- 'z': os.path.join(self.genome_root, 'GCA_002011165.1_genomic.fna.gz')}
+ d_paths = {'a': os.path.join(self.genome_root,'GCA/001/873/845', 'GCA_001873845.1_genomic.fna.gz'),
+ 'b': os.path.join(self.genome_root,'GCA/002/083/985', 'GCA_002083985.1_genomic.fna.gz'),
+ 'c': os.path.join(self.genome_root,'GCF/000/185/805', 'GCF_000185805.1_genomic.fna.gz'),
+ 'x': os.path.join(self.genome_root,'GCA/002/841/105', 'GCA_002841105.1_genomic.fna.gz'),
+ 'y': os.path.join(self.genome_root,'GCA/000/402/295', 'GCA_000402295.1_genomic.fna.gz'),
+ 'z': os.path.join(self.genome_root,'GCA/002/011/165', 'GCA_002011165.1_genomic.fna.gz')}
result = fa.run(d_compare, d_paths)
From 6e33947bd7bf05afc0a9d2a25e5c3590059185be Mon Sep 17 00:00:00 2001
From: pchaumeil
Date: Fri, 26 Mar 2021 13:31:27 +1000
Subject: [PATCH 03/16] Workflow runs to completion even with failed genomes.
---
gtdbtk/config/output.py | 2 ++
gtdbtk/external/prodigal.py | 8 +++++++-
gtdbtk/markers.py | 22 ++++++++++++++++------
3 files changed, 25 insertions(+), 7 deletions(-)
diff --git a/gtdbtk/config/output.py b/gtdbtk/config/output.py
index 4e76bc0b..7af03915 100644
--- a/gtdbtk/config/output.py
+++ b/gtdbtk/config/output.py
@@ -8,6 +8,7 @@
PATH_BAC120_MARKER_SUMMARY = join(DIR_IDENTIFY, '{prefix}.bac120.markers_summary.tsv')
PATH_AR122_MARKER_SUMMARY = join(DIR_IDENTIFY, '{prefix}.ar122.markers_summary.tsv')
PATH_TLN_TABLE_SUMMARY = join(DIR_IDENTIFY, '{prefix}.translation_table_summary.tsv')
+PATH_FAILS = join(DIR_IDENTIFY,'{prefix}.failed_genomes.tsv')
# Command: identify -> marker genes
GENOME_FILE_SUFFIX = "_genomic.fna"
@@ -91,3 +92,4 @@
# General files
PATH_WARNINGS = '{prefix}.warnings.log'
+
diff --git a/gtdbtk/external/prodigal.py b/gtdbtk/external/prodigal.py
index 381c7e34..631d48a2 100644
--- a/gtdbtk/external/prodigal.py
+++ b/gtdbtk/external/prodigal.py
@@ -33,6 +33,7 @@ class Prodigal(object):
def __init__(self,
threads,
+ failed_genomes_file,
marker_gene_dir,
protein_file_suffix,
nt_gene_file_suffix,
@@ -42,6 +43,7 @@ def __init__(self,
self.logger = logging.getLogger('timestamp')
self.warnings = logging.getLogger('warnings')
+ self.fails = open(failed_genomes_file,'w')
self.threads = threads
@@ -238,13 +240,17 @@ def run(self, genomic_files, tln_tables):
f'been excluded from analysis due to Prodigal '
f'failing to call any genes:')
+
# If there are few low-quality genomes just output to console.
if len(lq_gids) > 10:
for lq_gid in lq_gids:
self.warnings.info(lq_gid)
+ self.fails.write(f'{lq_gid}\tno genes were called by Prodigal\n')
else:
for lq_gid in lq_gids:
- self.logger.warning(f'Skipping: {lq_gid}')
+ self.logger.warning(f'Skipping: ')
self.warnings.info(lq_gid)
+ self.fails.write(f'{lq_gid}\tno genes were called by Prodigal\n')
+ self.fails.close()
return result_dict
diff --git a/gtdbtk/markers.py b/gtdbtk/markers.py
index c4933a8b..4854df55 100644
--- a/gtdbtk/markers.py
+++ b/gtdbtk/markers.py
@@ -110,6 +110,8 @@ def _report_identified_marker_genes(self, gene_dict, outdir, prefix,
os.path.join(outdir, os.path.basename(PATH_AR122_MARKER_SUMMARY.format(prefix=prefix))))
symlink_f(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix),
os.path.join(outdir, os.path.basename(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))))
+ symlink_f(PATH_FAILS.format(prefix=prefix),
+ os.path.join(outdir, os.path.basename(PATH_FAILS.format(prefix=prefix))))
# Write the single copy AR122/BAC120 FASTA files to disk.
if write_single_copy_genes:
@@ -176,7 +178,9 @@ def identify(self, genomes, tln_tables, out_dir, prefix, force, write_single_cop
f'{self.cpus} threads.')
self.marker_gene_dir = os.path.join(out_dir, DIR_MARKER_GENE)
+ self.failed_genomes = os.path.join(out_dir, PATH_FAILS.format(prefix=prefix))
prodigal = Prodigal(self.cpus,
+ self.failed_genomes,
self.marker_gene_dir,
self.protein_file_suffix,
self.nt_gene_file_suffix,
@@ -415,15 +419,21 @@ def align(self,
genomes_to_process=None):
"""Align marker genes in genomes."""
+ # read genomes that failed identify steps to skip them
+ failed_genomes_file = os.path.join(os.path.join(identify_dir,os.path.basename(PATH_FAILS.format(prefix=prefix))))
+ with open(failed_genomes_file) as fgf:
+ failed_genomes = [row.split()[0] for row in fgf]
+
# If the user is re-running this step, check if the identify step is consistent.
genomic_files = self._path_to_identify_data(identify_dir, identify_dir != out_dir)
if genomes_to_process is not None and len(genomic_files) != len(genomes_to_process):
- self.logger.error('{} are not present in the input list of genome to process.'.format(
- list(set(genomic_files.keys()) - set(genomes_to_process.keys()))))
- raise InconsistentGenomeBatch(
- 'You are attempting to run GTDB-Tk on a non-empty directory that contains extra '
- 'genomes not present in your initial identify directory. Remove them, or run '
- 'GTDB-Tk on a new directory.')
+ if list(set(genomic_files.keys()) - set(genomes_to_process.keys())).sort() != failed_genomes.sort():
+ self.logger.error('{} are not present in the input list of genome to process.'.format(
+ list(set(genomic_files.keys()) - set(genomes_to_process.keys()))))
+ raise InconsistentGenomeBatch(
+ 'You are attempting to run GTDB-Tk on a non-empty directory that contains extra '
+ 'genomes not present in your initial identify directory. Remove them, or run '
+ 'GTDB-Tk on a new directory.')
# If this is being run as a part of classify_wf, copy the required files.
if identify_dir != out_dir:
From c3caf87ef0ef0fa404f32bbe9a6a3a312121bdf0 Mon Sep 17 00:00:00 2001
From: pchaumeil
Date: Fri, 26 Mar 2021 15:06:01 +1000
Subject: [PATCH 04/16] Fix CLI issue when using scratch_dir in classify_wf
 (issue #311).
---
gtdbtk/main.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/gtdbtk/main.py b/gtdbtk/main.py
index 807f1e49..a85e2531 100644
--- a/gtdbtk/main.py
+++ b/gtdbtk/main.py
@@ -712,8 +712,6 @@ def parse_options(self, options):
options.max_consensus = None
options.rnd_seed = None
options.skip_trimming = False
- options.scratch_dir = None
- options.recalculate_red = False
self.align(options)
From 292e10c9fe584488ea9bec5586c35bf2bcabcf94 Mon Sep 17 00:00:00 2001
From: pchaumeil
Date: Tue, 6 Apr 2021 14:50:59 +1000
Subject: [PATCH 05/16] fix missing information
---
gtdbtk/external/prodigal.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/gtdbtk/external/prodigal.py b/gtdbtk/external/prodigal.py
index 631d48a2..f5622e61 100644
--- a/gtdbtk/external/prodigal.py
+++ b/gtdbtk/external/prodigal.py
@@ -184,6 +184,7 @@ def run(self, genomic_files, tln_tables):
for _ in range(self.threads):
worker_queue.put(None)
+ worker_proc = []
try:
manager = mp.Manager()
out_dict = manager.dict()
@@ -248,7 +249,7 @@ def run(self, genomic_files, tln_tables):
self.fails.write(f'{lq_gid}\tno genes were called by Prodigal\n')
else:
for lq_gid in lq_gids:
- self.logger.warning(f'Skipping: ')
+ self.logger.warning(f'Skipping: {lq_gid}')
self.warnings.info(lq_gid)
self.fails.write(f'{lq_gid}\tno genes were called by Prodigal\n')
From a8be3a0f48656382f6c69c9c0edccdb56d62e7e7 Mon Sep 17 00:00:00 2001
From: pchaumeil
Date: Thu, 8 Apr 2021 11:13:43 +1000
Subject: [PATCH 06/16] Fix issue with failed_genomes file being created before
folder is created
---
docs/src/changelog.rst | 7 +++++++
gtdbtk/__init__.py | 2 +-
gtdbtk/external/prodigal.py | 9 +++++----
gtdbtk/markers.py | 2 ++
.../identify/gtdbtk.failed_genomes.tsv | 0
.../identify/gtdbtk.failed_genomes.tsv | 0
6 files changed, 15 insertions(+), 5 deletions(-)
create mode 100644 tests/data/align_dir_reference/identify/gtdbtk.failed_genomes.tsv
create mode 100644 tests/data/identify_dir_reference/identify/gtdbtk.failed_genomes.tsv
diff --git a/docs/src/changelog.rst b/docs/src/changelog.rst
index 1c49d574..f0b69d71 100644
--- a/docs/src/changelog.rst
+++ b/docs/src/changelog.rst
@@ -2,6 +2,13 @@
Change log
==========
+1.4.2
+-----
+
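+* (`#311 <https://github.com/Ecogenomics/GTDBTk/issues/311>`_) Fixed --scratch_dir not working in v1.4.1 for classify_wf
+* (`#312 <https://github.com/Ecogenomics/GTDBTk/issues/312>`_) Fixed the automatic dropping of failed genomes causing an error in downstream modules of classify_wf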
+
+
1.4.1
-----
diff --git a/gtdbtk/__init__.py b/gtdbtk/__init__.py
index 2c3ffe01..07c898ba 100644
--- a/gtdbtk/__init__.py
+++ b/gtdbtk/__init__.py
@@ -29,4 +29,4 @@
__status__ = 'Production'
__title__ = 'GTDB-Tk'
__url__ = 'https://github.com/Ecogenomics/GTDBTk'
-__version__ = '1.4.1'
+__version__ = '1.4.2'
diff --git a/gtdbtk/external/prodigal.py b/gtdbtk/external/prodigal.py
index f5622e61..b50e443e 100644
--- a/gtdbtk/external/prodigal.py
+++ b/gtdbtk/external/prodigal.py
@@ -43,7 +43,7 @@ def __init__(self,
self.logger = logging.getLogger('timestamp')
self.warnings = logging.getLogger('warnings')
- self.fails = open(failed_genomes_file,'w')
+ self.failed_genomes_file = failed_genomes_file
self.threads = threads
@@ -227,6 +227,7 @@ def run(self, genomic_files, tln_tables):
# Report on any genomes which failed to have any genes called
result_dict = dict()
lq_gids = list()
+ fails = open(self.failed_genomes_file,'w')
for gid, gid_dict in out_dict.items():
if os.path.getsize(gid_dict['aa_gene_path']) <= 1:
lq_gids.append(gid)
@@ -246,12 +247,12 @@ def run(self, genomic_files, tln_tables):
if len(lq_gids) > 10:
for lq_gid in lq_gids:
self.warnings.info(lq_gid)
- self.fails.write(f'{lq_gid}\tno genes were called by Prodigal\n')
+ fails.write(f'{lq_gid}\tno genes were called by Prodigal\n')
else:
for lq_gid in lq_gids:
self.logger.warning(f'Skipping: {lq_gid}')
self.warnings.info(lq_gid)
- self.fails.write(f'{lq_gid}\tno genes were called by Prodigal\n')
+ fails.write(f'{lq_gid}\tno genes were called by Prodigal\n')
- self.fails.close()
+ fails.close()
return result_dict
diff --git a/gtdbtk/markers.py b/gtdbtk/markers.py
index 4854df55..dcf0b6f1 100644
--- a/gtdbtk/markers.py
+++ b/gtdbtk/markers.py
@@ -55,6 +55,8 @@ def __init__(self, cpus=1, debug=False):
self.cpus = cpus
self.debug = debug
self.marker_gene_dir = None
+ self.failed_genomes = None
+
self.genome_file_suffix = GENOME_FILE_SUFFIX
self.protein_file_suffix = PROTEIN_FILE_SUFFIX
diff --git a/tests/data/align_dir_reference/identify/gtdbtk.failed_genomes.tsv b/tests/data/align_dir_reference/identify/gtdbtk.failed_genomes.tsv
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/data/identify_dir_reference/identify/gtdbtk.failed_genomes.tsv b/tests/data/identify_dir_reference/identify/gtdbtk.failed_genomes.tsv
new file mode 100644
index 00000000..e69de29b
From f7c2ea676d8a83e559211b06f5e3a43c612f06e1 Mon Sep 17 00:00:00 2001
From: Aaron Mussig
Date: Wed, 21 Apr 2021 10:55:36 +1000
Subject: [PATCH 07/16] PFAM markers updated to 33.1
---
gtdbtk/config/config.py | 28 ++++++++++++++--------------
1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/gtdbtk/config/config.py b/gtdbtk/config/config.py
index 4a5950dc..2d198535 100644
--- a/gtdbtk/config/config.py
+++ b/gtdbtk/config/config.py
@@ -83,8 +83,8 @@
RED_MIN_CHILDREN = 2
# Marker information
-BAC120_MARKERS = {"PFAM": ["PF00380.14.hmm", "PF00410.14.hmm", "PF00466.15.hmm",
- "PF01025.14.hmm", "PF02576.12.hmm", "PF03726.9.hmm"],
+BAC120_MARKERS = {"PFAM": ["PF00380.20.hmm", "PF00410.20.hmm", "PF00466.21.hmm",
+ "PF01025.20.hmm", "PF02576.18.hmm", "PF03726.15.hmm"],
"TIGRFAM": ["TIGR00006.HMM", "TIGR00019.HMM", "TIGR00020.HMM",
"TIGR00029.HMM", "TIGR00043.HMM", "TIGR00054.HMM",
"TIGR00059.HMM", "TIGR00061.HMM", "TIGR00064.HMM",
@@ -124,18 +124,18 @@
"TIGR03625.HMM", "TIGR03632.HMM", "TIGR03654.HMM",
"TIGR03723.HMM", "TIGR03725.HMM", "TIGR03953.HMM"]}
-AR122_MARKERS = {"PFAM": ["PF01868.11.hmm", "PF01282.14.hmm", "PF01655.13.hmm",
- "PF01092.14.hmm", "PF01000.21.hmm", "PF00368.13.hmm",
- "PF00827.12.hmm", "PF01269.12.hmm", "PF00466.15.hmm",
- "PF01015.13.hmm", "PF13685.1.hmm", "PF02978.14.hmm",
- "PF04919.7.hmm", "PF01984.15.hmm", "PF04104.9.hmm",
- "PF00410.14.hmm", "PF01798.13.hmm", "PF01864.12.hmm",
- "PF01990.12.hmm", "PF07541.7.hmm", "PF04019.7.hmm",
- "PF00900.15.hmm", "PF01090.14.hmm", "PF02006.11.hmm",
- "PF01157.13.hmm", "PF01191.14.hmm", "PF01866.12.hmm",
- "PF01198.14.hmm", "PF01496.14.hmm", "PF00687.16.hmm",
- "PF03874.11.hmm", "PF01194.12.hmm", "PF01200.13.hmm",
- "PF13656.1.hmm", "PF01280.15.hmm"],
+AR122_MARKERS = {"PFAM": ["PF01868.17.hmm", "PF01282.20.hmm", "PF01655.19.hmm",
+ "PF01092.20.hmm", "PF01000.27.hmm", "PF00368.19.hmm",
+ "PF00827.18.hmm", "PF01269.18.hmm", "PF00466.21.hmm",
+ "PF01015.19.hmm", "PF13685.7.hmm", "PF02978.20.hmm",
+ "PF04919.13.hmm", "PF01984.21.hmm", "PF04104.15.hmm",
+ "PF00410.20.hmm", "PF01798.19.hmm", "PF01864.18.hmm",
+ "PF01990.18.hmm", "PF07541.13.hmm", "PF04019.13.hmm",
+ "PF00900.21.hmm", "PF01090.20.hmm", "PF02006.17.hmm",
+ "PF01157.19.hmm", "PF01191.20.hmm", "PF01866.18.hmm",
+ "PF01198.20.hmm", "PF01496.20.hmm", "PF00687.22.hmm",
+ "PF03874.17.hmm", "PF01194.18.hmm", "PF01200.19.hmm",
+ "PF13656.7.hmm", "PF01280.21.hmm"],
"TIGRFAM": ["TIGR00468.HMM", "TIGR01060.HMM", "TIGR03627.HMM",
"TIGR01020.HMM", "TIGR02258.HMM", "TIGR00293.HMM",
"TIGR00389.HMM", "TIGR01012.HMM", "TIGR00490.HMM",
From f4bca5d044cf30a768d69445faf465215cc8824f Mon Sep 17 00:00:00 2001
From: Aaron Mussig
Date: Thu, 22 Apr 2021 10:06:31 +1000
Subject: [PATCH 08/16] update minimum package requirement
---
gtdbtk/config/config.py | 2 +-
gtdbtk/markers.py | 12 ------------
2 files changed, 1 insertion(+), 13 deletions(-)
diff --git a/gtdbtk/config/config.py b/gtdbtk/config/config.py
index 2d198535..a4db6fd1 100644
--- a/gtdbtk/config/config.py
+++ b/gtdbtk/config/config.py
@@ -21,7 +21,7 @@
If the reference package sub-folders still exist in GTDBTK_DATA_PATH, then there
is no need to edit the variables below.
"""
-MIN_REF_DATA_VERSION = 'r95'
+MIN_REF_DATA_VERSION = 'r202'
MSA_FOLDER = os.path.join(GENERIC_PATH, "msa/")
MASK_DIR = os.path.join(GENERIC_PATH, "masks/")
diff --git a/gtdbtk/markers.py b/gtdbtk/markers.py
index dcf0b6f1..714e458e 100644
--- a/gtdbtk/markers.py
+++ b/gtdbtk/markers.py
@@ -512,18 +512,6 @@ def align(self,
# Generate the user MSA.
user_msa = align.align_marker_set(cur_genome_files, marker_info_file, copy_number_f, self.cpus)
- # self.logger.log(Config.LOG_TASK, f'Aligning {len(cur_genome_files):,} {domain_str} genomes.')
- # hmm_aligner = HmmAligner(self.cpus,
- # self.pfam_top_hit_suffix,
- # self.tigrfam_top_hit_suffix,
- # self.protein_file_suffix,
- # self.pfam_hmm_dir,
- # self.tigrfam_hmms,
- # Config.BAC120_MARKERS,
- # Config.AR122_MARKERS)
- # user_msa = hmm_aligner.align_marker_set(cur_genome_files,
- # marker_set_id)
-
# Write the individual marker alignments to disk
if self.debug:
self._write_individual_markers(
From b0ed0c1a8457e9a9f6bf78d0f09901017a671883 Mon Sep 17 00:00:00 2001
From: Aaron Mussig
Date: Thu, 22 Apr 2021 10:08:05 +1000
Subject: [PATCH 09/16] terminate writer proc if initialised (#313)
---
gtdbtk/external/prodigal.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/gtdbtk/external/prodigal.py b/gtdbtk/external/prodigal.py
index b50e443e..73ddcdb1 100644
--- a/gtdbtk/external/prodigal.py
+++ b/gtdbtk/external/prodigal.py
@@ -185,6 +185,7 @@ def run(self, genomic_files, tln_tables):
worker_queue.put(None)
worker_proc = []
+ writer_proc = None
try:
manager = mp.Manager()
out_dict = manager.dict()
@@ -214,8 +215,8 @@ def run(self, genomic_files, tln_tables):
except Exception:
for p in worker_proc:
p.terminate()
-
- writer_proc.terminate()
+ if writer_proc:
+ writer_proc.terminate()
raise ProdigalException('An exception was caught while running Prodigal.')
# Report if any genomes were skipped due to having already been processed.
From bc89029285a12fbc6e09653cf1d2ddd79fc85ff4 Mon Sep 17 00:00:00 2001
From: Aaron Mussig
Date: Thu, 22 Apr 2021 10:08:59 +1000
Subject: [PATCH 10/16] update exception error message
---
gtdbtk/external/prodigal.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/gtdbtk/external/prodigal.py b/gtdbtk/external/prodigal.py
index 73ddcdb1..9d0f03fb 100644
--- a/gtdbtk/external/prodigal.py
+++ b/gtdbtk/external/prodigal.py
@@ -212,12 +212,12 @@ def run(self, genomic_files, tln_tables):
writer_queue.put(None)
writer_proc.join()
- except Exception:
+ except Exception as e:
for p in worker_proc:
p.terminate()
if writer_proc:
writer_proc.terminate()
- raise ProdigalException('An exception was caught while running Prodigal.')
+ raise ProdigalException(f'An exception was caught while running Prodigal: {e}')
# Report if any genomes were skipped due to having already been processed.
if n_skipped.value > 0:
From 8f684c29160271daccac8db223698cd3c26bb9f6 Mon Sep 17 00:00:00 2001
From: Aaron Mussig
Date: Thu, 22 Apr 2021 10:43:51 +1000
Subject: [PATCH 11/16] update documentation for r202
---
docs/src/changelog.rst | 7 +++++++
docs/src/installing/index.rst | 6 +++++-
gtdbtk/__init__.py | 2 +-
3 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/docs/src/changelog.rst b/docs/src/changelog.rst
index f0b69d71..5dbd2b58 100644
--- a/docs/src/changelog.rst
+++ b/docs/src/changelog.rst
@@ -2,6 +2,13 @@
Change log
==========
+1.5.0
+-----
+
+* (`#311 <https://github.com/Ecogenomics/GTDBTk/issues/311>`_) Updated GTDB-Tk to support R202.
+ See https://ecogenomics.github.io/GTDBTk/installing/index.html#gtdb-tk-reference-data for instructions on downloading R202.
+
+
1.4.2
-----
diff --git a/docs/src/installing/index.rst b/docs/src/installing/index.rst
index 042131ad..792de8a3 100644
--- a/docs/src/installing/index.rst
+++ b/docs/src/installing/index.rst
@@ -122,7 +122,8 @@ GTDB-Tk requires ~27G of external data that needs to be downloaded and unarchive
.. code-block:: bash
- wget https://data.ace.uq.edu.au/public/gtdb/data/releases/release95/95.0/auxillary_files/gtdbtk_r95_data.tar.gz
+ wget https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/gtdbtk_data.tar.gz
+ wget https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/auxillary_files/gtdbtk_data.tar.gz  # mirror; use instead of the URL above if it is unavailable
tar xvzf gtdbtk_r95_data.tar.gz
@@ -136,6 +137,9 @@ Note that different versions of the GTDB release data may not run on all version
* - GTDB Release
- Minimum version
- Maximum version
+ * - R202
+ - 1.5.0
+ - N/A
* - R95
- 1.3.0
- N/A
diff --git a/gtdbtk/__init__.py b/gtdbtk/__init__.py
index 07c898ba..70d0f0f5 100644
--- a/gtdbtk/__init__.py
+++ b/gtdbtk/__init__.py
@@ -29,4 +29,4 @@
__status__ = 'Production'
__title__ = 'GTDB-Tk'
__url__ = 'https://github.com/Ecogenomics/GTDBTk'
-__version__ = '1.4.2'
+__version__ = '1.5.0'
From 393239727de4cb1aee73b37cbf9d492be37864ab Mon Sep 17 00:00:00 2001
From: Aaron Mussig
Date: Thu, 22 Apr 2021 10:44:51 +1000
Subject: [PATCH 12/16] update documentation for r202
---
docs/src/installing/index.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/src/installing/index.rst b/docs/src/installing/index.rst
index 792de8a3..87dee019 100644
--- a/docs/src/installing/index.rst
+++ b/docs/src/installing/index.rst
@@ -142,7 +142,7 @@ Note that different versions of the GTDB release data may not run on all version
- N/A
* - R95
- 1.3.0
- - N/A
+ - 1.4.2
* - R89
- 0.3.0
- 0.1.2
From 61e1e88a1d9ca299f463e7b3896e762ad4a1f0e5 Mon Sep 17 00:00:00 2001
From: Aaron Mussig
Date: Thu, 22 Apr 2021 14:52:52 +1000
Subject: [PATCH 13/16] update memory requirements
---
docs/src/installing/index.rst | 4 ++--
gtdbtk/classify.py | 24 ++++++++++++------------
gtdbtk/config/config.py | 16 +++++++++-------
gtdbtk/misc.py | 2 +-
4 files changed, 24 insertions(+), 22 deletions(-)
diff --git a/docs/src/installing/index.rst b/docs/src/installing/index.rst
index 87dee019..53690ee0 100644
--- a/docs/src/installing/index.rst
+++ b/docs/src/installing/index.rst
@@ -34,11 +34,11 @@ Hardware requirements
- Storage
- Time
* - Archaea
- - ~8 GB
+ - ~13 GB
- ~27 GB
- ~1 hour / 1,000 genomes @ 64 CPUs
* - Bacteria
- - ~150 GB
+ - ~204 GB
- ~27 GB
- ~1 hour / 1,000 genomes @ 64 CPUs
diff --git a/gtdbtk/classify.py b/gtdbtk/classify.py
index 0e9b0b1b..73d33113 100644
--- a/gtdbtk/classify.py
+++ b/gtdbtk/classify.py
@@ -132,21 +132,21 @@ def place_genomes(self,
"""Place genomes into reference tree using pplacer."""
# Warn if the memory is insufficient
+ mem_warning = 'pplacer requires ~{req_gb} GB of RAM to fully load the ' \
+ '{domain} tree into memory. However, {cur_gb:,} GB was ' \
+ 'detected. This may affect pplacer performance, or fail' \
+ ' if there is insufficient swap space.'
mem_gb = get_memory_gb()
if mem_gb is not None:
mem_total = mem_gb['MemTotal']
- if marker_set_id == 'bac120' and mem_total < 145:
- self.logger.warning(f'pplacer requires ~152 GB of RAM to fully '
- f'load the bacterial tree into memory. '
- f'However, {mem_total:,}GB was detected. '
- f'This may affect pplacer performance, '
- f'or fail if there is insufficient scratch space.')
- elif marker_set_id == 'ar122' and mem_total < 6:
- self.logger.warning(f'pplacer requires ~8.2 GB of RAM to fully '
- f'load the archaeal tree into memory. '
- f'However, {mem_total:,}GB was detected. '
- f'This may affect pplacer performance, '
- f'or fail if there is insufficient scratch space.')
+ if marker_set_id == 'bac120' and mem_total < Config.PPLACER_MIN_RAM_BAC:
+ self.logger.warning(mem_warning.format(req_gb=Config.PPLACER_MIN_RAM_BAC,
+ domain='bacterial',
+ cur_gb=mem_total))
+ elif marker_set_id == 'ar122' and mem_total < Config.PPLACER_MIN_RAM_ARC:
+ self.logger.warning(mem_warning.format(req_gb=Config.PPLACER_MIN_RAM_ARC,
+ domain='archaeal',
+ cur_gb=mem_total))
# rename user MSA file for compatibility with pplacer
if not user_msa_file.endswith('.fasta'):
diff --git a/gtdbtk/config/config.py b/gtdbtk/config/config.py
index b72c122f..ac341509 100644
--- a/gtdbtk/config/config.py
+++ b/gtdbtk/config/config.py
@@ -196,6 +196,8 @@
PPLACER_BAC120_REF_PKG = f"gtdb_{VERSION_DATA}_bac120.refpkg"
PPLACER_AR122_REF_PKG = f"gtdb_{VERSION_DATA}_ar122.refpkg"
PPLACER_RPS23_REF_PKG = f"gtdb_{VERSION_DATA}_rps23.refpkg"
+PPLACER_MIN_RAM_BAC = 204
+PPLACER_MIN_RAM_ARC = 13
# Fastani configuration
FASTANI_SPECIES_THRESHOLD = 95.0
@@ -208,13 +210,13 @@
MRCA_RED_AR122 = os.path.join(RED_DIR, f"gtdbtk_{VERSION_DATA}_ar122.tsv")
# Hashing information for validating the reference package.
-REF_HASHES = {PPLACER_DIR: 'f41cfe0284ebaca4485b42e054936190c6a88bd1',
- MASK_DIR: '63551a43333bc6cbc9abf139ce881847ca19240b',
- MARKER_DIR: 'a325720422d8348d7a934143cc86112b6c92ac98',
- RADII_DIR: '1092727925f38a8a2b3f4fb40e3316c0083671f5',
- MSA_FOLDER: 'cf91d712c733e7e2535a41e6153c12b3c37d1ede',
- METADATA_DIR: 'e003b4d5d48302e85c536751f663a70447de83d4',
- TAX_FOLDER: '30c5970b2eaf5df654b2e01bfa39265302c0be89',
+REF_HASHES = {PPLACER_DIR: '4d931b5109a240602f55228029b87ee768da8141',
+ MASK_DIR: '36d6ac371d247b2b952523b9798e78908ea323fa',
+ MARKER_DIR: '2ba5ae35fb272462663651d18fd9e523317e48cd',
+ RADII_DIR: '9f9a2e21e27b9049044d04d731795499414a365c',
+ MSA_FOLDER: 'b426865245c39ee9f01b0392fb8f7867a9f76f0a',
+ METADATA_DIR: '7640aed96fdb13707a2b79b746a94335faabd6df',
+ TAX_FOLDER: '4a7a1e4047c088e92dee9740206499cdb7e5beca',
FASTANI_DIR: '6a3555bb61d9cc3163c26e65772b96b8f58a2d84',
RED_DIR: '6f661eef8e172a8a7e78af2a74fe4d079a3f5b0f'}
diff --git a/gtdbtk/misc.py b/gtdbtk/misc.py
index 28a63174..d3f3d8e6 100644
--- a/gtdbtk/misc.py
+++ b/gtdbtk/misc.py
@@ -158,7 +158,7 @@ def check_install(self):
if user_hash != expected_hash:
self.logger.info(" |-- {:16} {}".format(
- base_name, colour('HASH MISMATCH', ['bright'], fg='yellow')))
+ base_name, colour(f'HASH MISMATCH {user_hash}', ['bright'], fg='yellow')))
ok = False
else:
self.logger.info(" |-- {:16} {}".format(
From e62c5fa6645487ab3fc243fcc52cd472e69d204d Mon Sep 17 00:00:00 2001
From: Aaron Mussig
Date: Thu, 22 Apr 2021 14:56:56 +1000
Subject: [PATCH 14/16] safety when checking for failed genomes
---
gtdbtk/markers.py | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/gtdbtk/markers.py b/gtdbtk/markers.py
index 714e458e..c114e28b 100644
--- a/gtdbtk/markers.py
+++ b/gtdbtk/markers.py
@@ -423,8 +423,11 @@ def align(self,
# read genomes that failed identify steps to skip them
failed_genomes_file = os.path.join(os.path.join(identify_dir,os.path.basename(PATH_FAILS.format(prefix=prefix))))
- with open(failed_genomes_file) as fgf:
- failed_genomes = [row.split()[0] for row in fgf]
+ if os.path.isfile(failed_genomes_file):
+ with open(failed_genomes_file) as fgf:
+ failed_genomes = [row.split()[0] for row in fgf]
+ else:
+ failed_genomes = list()
# If the user is re-running this step, check if the identify step is consistent.
genomic_files = self._path_to_identify_data(identify_dir, identify_dir != out_dir)
From 5503e2c47c122ea7960b1d38ad86fbd43c0b90b4 Mon Sep 17 00:00:00 2001
From: Aaron Mussig
Date: Thu, 22 Apr 2021 15:11:13 +1000
Subject: [PATCH 15/16] update ref hashes
---
gtdbtk/config/config.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/gtdbtk/config/config.py b/gtdbtk/config/config.py
index ac341509..6ba3fb2d 100644
--- a/gtdbtk/config/config.py
+++ b/gtdbtk/config/config.py
@@ -217,8 +217,8 @@
MSA_FOLDER: 'b426865245c39ee9f01b0392fb8f7867a9f76f0a',
METADATA_DIR: '7640aed96fdb13707a2b79b746a94335faabd6df',
TAX_FOLDER: '4a7a1e4047c088e92dee9740206499cdb7e5beca',
- FASTANI_DIR: '6a3555bb61d9cc3163c26e65772b96b8f58a2d84',
- RED_DIR: '6f661eef8e172a8a7e78af2a74fe4d079a3f5b0f'}
+ FASTANI_DIR: '70439cf088d0fa0fdbb4f47b4a6b47e199912139',
+ RED_DIR: 'ad6a184150e7b6e58547912660a17999fadcfbff'}
# Config values for checking GTDB-Tk on startup.
GTDBTK_VER_CHECK = True
From f5dc940b5d6dc54a4ca11648da1e56cee6394ab6 Mon Sep 17 00:00:00 2001
From: pchaumeil
Date: Thu, 22 Apr 2021 16:11:32 +1000
Subject: [PATCH 16/16] New script to restructure genome directory. Update
README.md
---
README.md | 3 ++-
scripts/restructure_reps_fna_folder.sh | 9 +++++++++
2 files changed, 11 insertions(+), 1 deletion(-)
create mode 100644 scripts/restructure_reps_fna_folder.sh
diff --git a/README.md b/README.md
index a0122205..8a41c7df 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,8 @@
[](https://hub.docker.com/r/ecogenomic/gtdbtk)
[](https://hub.docker.com/r/ecogenomic/gtdbtk)
-[GTDB-Tk v1.3.0](https://ecogenomics.github.io/GTDBTk/announcements.html) was released on July 17, 2020 along with new reference data for [GTDB R05-RS95](https://gtdb.ecogenomic.org/). Upgrading is recommended.
+[GTDB-Tk v1.5.0](https://ecogenomics.github.io/GTDBTk/announcements.html) was released on April 23, 2021 along with new reference data for [GTDB R06-RS202](https://gtdb.ecogenomic.org/). Upgrading is recommended.
+ Please note v1.5.0+ is not compatible with GTDB R05-RS95.
GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy [GTDB](https://gtdb.ecogenomic.org/). It is designed to work with recent advances that allow hundreds or thousands of metagenome-assembled genomes (MAGs) to be obtained directly from environmental samples. It can also be applied to isolate and single-cell genomes. The GTDB-Tk is open source and released under the [GNU General Public License (Version 3)](https://www.gnu.org/licenses/gpl-3.0.en.html).
diff --git a/scripts/restructure_reps_fna_folder.sh b/scripts/restructure_reps_fna_folder.sh
new file mode 100644
index 00000000..1aa16810
--- /dev/null
+++ b/scripts/restructure_reps_fna_folder.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+for f in $1/*.gz
+do
+ filef="$(basename -- $f)"
+ mkdir --parents $1/${filef:0:3}/${filef:4:3}/${filef:7:3}/${filef:10:3}/ ; mv $f $_
+ echo "$filef $1/${filef:0:3}/${filef:4:3}/${filef:7:3}/${filef:10:3}/" >> genome_paths.tsv
+
+done
\ No newline at end of file