Ecogenomics · aaronmussig · Apr 26, 2021 · Mar 9, 2021 · Mar 15, 2021 · Mar 26, 2021
diff --git a/README.md b/README.md
@@ -7,7 +7,8 @@
 [![Docker Image Version (latest by date)](https://img.shields.io/docker/v/ecogenomic/gtdbtk?sort=date&color=299bec&label=docker)](https://hub.docker.com/r/ecogenomic/gtdbtk)
 [![Docker Pulls](https://img.shields.io/docker/pulls/ecogenomic/gtdbtk?color=299bec&label=pulls)](https://hub.docker.com/r/ecogenomic/gtdbtk)
 
-<b>[GTDB-Tk v1.3.0](https://ecogenomics.github.io/GTDBTk/announcements.html) was released on July 17, 2020 along with new reference data for [GTDB R05-RS95](https://gtdb.ecogenomic.org/). Upgrading is recommended.</b>
+<b>[GTDB-Tk v1.5.0](https://ecogenomics.github.io/GTDBTk/announcements.html) was released on April 23, 2021 along with new reference data for [GTDB R06-RS202](https://gtdb.ecogenomic.org/). Upgrading is recommended.</b>  
+<b> Please note v1.5.0+ is not compatible with GTDB R05-RS95. </b>
 
 GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy [GTDB](https://gtdb.ecogenomic.org/). It is designed to work with recent advances that allow hundreds or thousands of metagenome-assembled genomes (MAGs) to be obtained directly from environmental samples. It can also be applied to isolate and single-cell genomes. The GTDB-Tk is open source and released under the [GNU General Public License (Version 3)](https://www.gnu.org/licenses/gpl-3.0.en.html).
 

diff --git a/docs/src/changelog.rst b/docs/src/changelog.rst
@@ -2,6 +2,20 @@
 Change log
 ==========
 
+1.5.0
+-----
+
+* (`#311 <https://github.com/Ecogenomics/GTDBTk/issues/311>`_) Updated GTDB-Tk to support R202.
+  See https://ecogenomics.github.io/GTDBTk/installing/index.html#gtdb-tk-reference-data for instructions on downloading R202.
+
+
+1.4.2
+-----
+
+* (`#311 <https://github.com/Ecogenomics/GTDBTk/issues/311>`_) Fixed --scratch_dir not working in v 1.4.1 for classify_wf
+* (`#312 <https://github.com/Ecogenomics/GTDBTk/issues/312>`_) Automatic drop of genome leads to error in downstream modules of classify_wf
+
+
 1.4.1
 -----
 

diff --git a/docs/src/commands/de_novo_wf.rst b/docs/src/commands/de_novo_wf.rst
@@ -68,7 +68,7 @@ Input
 
 .. code-block:: bash
 
-    gtdbtk de_novo_wf --genome_dir genomes/ --outgroup_taxon p__Nanobacteria --ar122_ms --out_dir de_novo_wf --cpus 3
+    gtdbtk de_novo_wf --genome_dir genomes/ --outgroup_taxon p__Undinarchaeota --ar122_ms --out_dir de_novo_wf --cpus 3
 
     gtdbtk de_novo_wf --genome_dir ./genomes --bac120_ms --outgroup_taxon p__Chloroflexota --taxa_filter p__Firmicutes --out_dir de_novo_output
 

diff --git a/docs/src/installing/index.rst b/docs/src/installing/index.rst
@@ -34,11 +34,11 @@ Hardware requirements
      - Storage
      - Time
    * - Archaea
-     - ~8 GB
+     - ~13 GB
      - ~27 GB
      - ~1 hour / 1,000 genomes @ 64 CPUs
    * - Bacteria
-     - ~150 GB
+     - ~204 GB
      - ~27 GB
      - ~1 hour / 1,000 genomes @ 64 CPUs
 
@@ -122,7 +122,8 @@ GTDB-Tk requires ~27G of external data that needs to be downloaded and unarchive
 
 .. code-block:: bash
 
-    wget https://data.ace.uq.edu.au/public/gtdb/data/releases/release95/95.0/auxillary_files/gtdbtk_r95_data.tar.gz
+    wget https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/gtdbtk_data.tar.gz
+    wget https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/auxillary_files/gtdbtk_data.tar.gz  # (or, mirror)
-    tar xvzf gtdbtk_r95_data.tar.gz
+    tar xvzf gtdbtk_data.tar.gz
 
 
@@ -136,9 +137,12 @@ Note that different versions of the GTDB release data may not run on all version
    * - GTDB Release
      - Minimum version
      - Maximum version
+   * - R202
+     - 1.5.0
+     - N/A
    * - R95
      - 1.3.0
-     - N/A
+     - 1.4.2
    * - R89
      - 0.3.0
      - 0.1.2

diff --git a/gtdbtk/__init__.py b/gtdbtk/__init__.py
@@ -29,4 +29,4 @@
 __status__ = 'Production'
 __title__ = 'GTDB-Tk'
 __url__ = 'https://github.com/Ecogenomics/GTDBTk'
-__version__ = '1.4.1'
+__version__ = '1.5.0'
diff --git a/gtdbtk/ani_rep.py b/gtdbtk/ani_rep.py
@@ -5,8 +5,9 @@
 from gtdbtk.biolib_lite.common import canonical_gid
 from gtdbtk.biolib_lite.execute import check_dependencies
 from gtdbtk.biolib_lite.taxonomy import Taxonomy
-from gtdbtk.config.config import (FASTANI_GENOMES,
+from gtdbtk.config.config import (FASTANI_DIR,
                                   FASTANI_GENOMES_EXT,
+                                  FASTANI_GENOME_LIST,
                                   TAXONOMY_FILE,
                                   AF_THRESHOLD)
 from gtdbtk.config.output import DIR_ANI_REP_INT_MASH
@@ -53,10 +54,12 @@ def _get_ref_genomes():
             Dict[genome_id] = fasta_path
         """
         ref_genomes = dict()
-        for f_name in os.listdir(FASTANI_GENOMES):
-            if f_name.endswith(FASTANI_GENOMES_EXT):
-                accession = f_name.split(FASTANI_GENOMES_EXT)[0]
-                ref_genomes[accession] = os.path.join(FASTANI_GENOMES, f_name)
+        with open(FASTANI_GENOME_LIST) as g_path_file:
+            for line in g_path_file:
+                (full_name, path) = line.strip().split()
+                if full_name.endswith(FASTANI_GENOMES_EXT):
+                    accession = full_name.split(FASTANI_GENOMES_EXT)[0]
+                ref_genomes[accession] = os.path.join(FASTANI_DIR, path, full_name)
         return ref_genomes
 
     def run(self, genomes, no_mash, max_d, out_dir, prefix, mash_k, mash_v, mash_s, min_af, mash_db):

diff --git a/gtdbtk/classify.py b/gtdbtk/classify.py
@@ -101,6 +101,26 @@ def parse_radius_file(self):
                 results[gid] = float(infos[2])
         return results
 
+    def parse_leaf_to_dir_path(self, genome_id):
+        """Convert a genome id to its relative database directory.
+
+        e.g. GCA_123456789.1 is converted to GCA/123/456/789
+
+        Parameters
+        ----------
+        genome_id: str
+            NCBI genome id GCF/GCA_xxxxxxxxx
+        :return: str
+            relative directory for the genome (no trailing separator)
+        :raises GTDBTkExit: if the id is too short to hold 9 digits
+        """
+        # Slices never raise IndexError, so a short id must be checked for.
+        if len(genome_id) < 13:
+            msg = f'Specified path could not be created for reference genome: {genome_id}'
+            logging.getLogger('timestamp').error(msg)
+            raise GTDBTkExit(msg)
+        return '/'.join((genome_id[0:3], genome_id[4:7], genome_id[7:10], genome_id[10:13]))
+
     def place_genomes(self,
                       user_msa_file,
                       marker_set_id,
@@ -112,21 +132,21 @@ def place_genomes(self,
         """Place genomes into reference tree using pplacer."""
 
         # Warn if the memory is insufficient
+        mem_warning = 'pplacer requires ~{req_gb} GB of RAM to fully load the ' \
+                      '{domain} tree into memory. However, {cur_gb:,} GB was ' \
+                      'detected. This may affect pplacer performance, or fail' \
+                      ' if there is insufficient swap space.'
         mem_gb = get_memory_gb()
         if mem_gb is not None:
             mem_total = mem_gb['MemTotal']
-            if marker_set_id == 'bac120' and mem_total < 145:
-                self.logger.warning(f'pplacer requires ~152 GB of RAM to fully '
-                                    f'load the bacterial tree into memory. '
-                                    f'However, {mem_total:,}GB was detected. '
-                                    f'This may affect pplacer performance, '
-                                    f'or fail if there is insufficient scratch space.')
-            elif marker_set_id == 'ar122' and mem_total < 6:
-                self.logger.warning(f'pplacer requires ~8.2 GB of RAM to fully '
-                                    f'load the archaeal tree into memory. '
-                                    f'However, {mem_total:,}GB was detected. '
-                                    f'This may affect pplacer performance, '
-                                    f'or fail if there is insufficient scratch space.')
+            if marker_set_id == 'bac120' and mem_total < Config.PPLACER_MIN_RAM_BAC:
+                self.logger.warning(mem_warning.format(req_gb=Config.PPLACER_MIN_RAM_BAC,
+                                                       domain='bacterial',
+                                                       cur_gb=mem_total))
+            elif marker_set_id == 'ar122' and mem_total < Config.PPLACER_MIN_RAM_ARC:
+                self.logger.warning(mem_warning.format(req_gb=Config.PPLACER_MIN_RAM_ARC,
+                                                       domain='archaeal',
+                                                       cur_gb=mem_total))
 
         # rename user MSA file for compatibility with pplacer
         if not user_msa_file.endswith('.fasta'):
@@ -1655,7 +1675,9 @@ def _get_fastani_genome_path(self, fastani_verification, genomes):
                 if leafnode.taxon.label.startswith('GB_') or leafnode.taxon.label.startswith('RS_'):
                     shortleaf = leafnode.taxon.label[3:]
                 ref_path = os.path.join(
-                    Config.FASTANI_GENOMES, shortleaf + Config.FASTANI_GENOMES_EXT)
+                    Config.FASTANI_GENOMES,
+                    self.parse_leaf_to_dir_path(shortleaf),
+                    shortleaf + Config.FASTANI_GENOMES_EXT)
                 if not os.path.isfile(ref_path):
                     raise GTDBTkExit(f'Reference genome missing from FastANI database: {ref_path}')
 

diff --git a/gtdbtk/config/config.py b/gtdbtk/config/config.py
@@ -21,7 +21,7 @@
 If the reference package sub-folders still exist in GTDBTK_DATA_PATH, then there
 is no need to edit the variables below.
 """
-MIN_REF_DATA_VERSION = 'r95'
+MIN_REF_DATA_VERSION = 'r202'
 
 MSA_FOLDER = os.path.join(GENERIC_PATH, "msa/")
 MASK_DIR = os.path.join(GENERIC_PATH, "masks/")
@@ -83,8 +83,8 @@
 RED_MIN_CHILDREN = 2
 
 # Marker information
-BAC120_MARKERS = {"PFAM": ["PF00380.14.hmm", "PF00410.14.hmm", "PF00466.15.hmm",
-                           "PF01025.14.hmm", "PF02576.12.hmm", "PF03726.9.hmm"],
+BAC120_MARKERS = {"PFAM": ["PF00380.20.hmm", "PF00410.20.hmm", "PF00466.21.hmm",
+                           "PF01025.20.hmm", "PF02576.18.hmm", "PF03726.15.hmm"],
                   "TIGRFAM": ["TIGR00006.HMM", "TIGR00019.HMM", "TIGR00020.HMM",
                               "TIGR00029.HMM", "TIGR00043.HMM", "TIGR00054.HMM",
                               "TIGR00059.HMM", "TIGR00061.HMM", "TIGR00064.HMM",
@@ -124,18 +124,18 @@
                               "TIGR03625.HMM", "TIGR03632.HMM", "TIGR03654.HMM",
                               "TIGR03723.HMM", "TIGR03725.HMM", "TIGR03953.HMM"]}
 
-AR122_MARKERS = {"PFAM": ["PF01868.11.hmm", "PF01282.14.hmm", "PF01655.13.hmm",
-                          "PF01092.14.hmm", "PF01000.21.hmm", "PF00368.13.hmm",
-                          "PF00827.12.hmm", "PF01269.12.hmm", "PF00466.15.hmm",
-                          "PF01015.13.hmm", "PF13685.1.hmm", "PF02978.14.hmm",
-                          "PF04919.7.hmm", "PF01984.15.hmm", "PF04104.9.hmm",
-                          "PF00410.14.hmm", "PF01798.13.hmm", "PF01864.12.hmm",
-                          "PF01990.12.hmm", "PF07541.7.hmm", "PF04019.7.hmm",
-                          "PF00900.15.hmm", "PF01090.14.hmm", "PF02006.11.hmm",
-                          "PF01157.13.hmm", "PF01191.14.hmm", "PF01866.12.hmm",
-                          "PF01198.14.hmm", "PF01496.14.hmm", "PF00687.16.hmm",
-                          "PF03874.11.hmm", "PF01194.12.hmm", "PF01200.13.hmm",
-                          "PF13656.1.hmm", "PF01280.15.hmm"],
+AR122_MARKERS = {"PFAM": ["PF01868.17.hmm", "PF01282.20.hmm", "PF01655.19.hmm",
+                          "PF01092.20.hmm", "PF01000.27.hmm", "PF00368.19.hmm",
+                          "PF00827.18.hmm", "PF01269.18.hmm", "PF00466.21.hmm",
+                          "PF01015.19.hmm", "PF13685.7.hmm", "PF02978.20.hmm",
+                          "PF04919.13.hmm", "PF01984.21.hmm", "PF04104.15.hmm",
+                          "PF00410.20.hmm", "PF01798.19.hmm", "PF01864.18.hmm",
+                          "PF01990.18.hmm", "PF07541.13.hmm", "PF04019.13.hmm",
+                          "PF00900.21.hmm", "PF01090.20.hmm", "PF02006.17.hmm",
+                          "PF01157.19.hmm", "PF01191.20.hmm", "PF01866.18.hmm",
+                          "PF01198.20.hmm", "PF01496.20.hmm", "PF00687.22.hmm",
+                          "PF03874.17.hmm", "PF01194.18.hmm", "PF01200.19.hmm",
+                          "PF13656.7.hmm", "PF01280.21.hmm"],
                  "TIGRFAM": ["TIGR00468.HMM", "TIGR01060.HMM", "TIGR03627.HMM",
                              "TIGR01020.HMM", "TIGR02258.HMM", "TIGR00293.HMM",
                              "TIGR00389.HMM", "TIGR01012.HMM", "TIGR00490.HMM",
@@ -196,26 +196,29 @@
 PPLACER_BAC120_REF_PKG = f"gtdb_{VERSION_DATA}_bac120.refpkg"
 PPLACER_AR122_REF_PKG = f"gtdb_{VERSION_DATA}_ar122.refpkg"
 PPLACER_RPS23_REF_PKG = f"gtdb_{VERSION_DATA}_rps23.refpkg"
+PPLACER_MIN_RAM_BAC = 204
+PPLACER_MIN_RAM_ARC = 13
 
 # Fastani configuration
 FASTANI_SPECIES_THRESHOLD = 95.0
 FASTANI_GENOMES = os.path.join(FASTANI_DIR, "database/")
+FASTANI_GENOME_LIST = os.path.join(FASTANI_DIR, "genome_paths.tsv")
 FASTANI_GENOMES_EXT = "_genomic.fna.gz"
 
 # MRCA RED VALUE
 MRCA_RED_BAC120 = os.path.join(RED_DIR, f"gtdbtk_{VERSION_DATA}_bac120.tsv")
 MRCA_RED_AR122 = os.path.join(RED_DIR, f"gtdbtk_{VERSION_DATA}_ar122.tsv")
 
 # Hashing information for validating the reference package.
-REF_HASHES = {PPLACER_DIR: 'f41cfe0284ebaca4485b42e054936190c6a88bd1',
-              MASK_DIR: '63551a43333bc6cbc9abf139ce881847ca19240b',
-              MARKER_DIR: 'a325720422d8348d7a934143cc86112b6c92ac98',
-              RADII_DIR: '1092727925f38a8a2b3f4fb40e3316c0083671f5',
-              MSA_FOLDER: 'cf91d712c733e7e2535a41e6153c12b3c37d1ede',
-              METADATA_DIR: 'e003b4d5d48302e85c536751f663a70447de83d4',
-              TAX_FOLDER: '30c5970b2eaf5df654b2e01bfa39265302c0be89',
-              FASTANI_DIR: '6a3555bb61d9cc3163c26e65772b96b8f58a2d84',
-              RED_DIR: '6f661eef8e172a8a7e78af2a74fe4d079a3f5b0f'}
+REF_HASHES = {PPLACER_DIR: '4d931b5109a240602f55228029b87ee768da8141',
+              MASK_DIR: '36d6ac371d247b2b952523b9798e78908ea323fa',
+              MARKER_DIR: '2ba5ae35fb272462663651d18fd9e523317e48cd',
+              RADII_DIR: '9f9a2e21e27b9049044d04d731795499414a365c',
+              MSA_FOLDER: 'b426865245c39ee9f01b0392fb8f7867a9f76f0a',
+              METADATA_DIR: '7640aed96fdb13707a2b79b746a94335faabd6df',
+              TAX_FOLDER: '4a7a1e4047c088e92dee9740206499cdb7e5beca',
+              FASTANI_DIR: '70439cf088d0fa0fdbb4f47b4a6b47e199912139',
+              RED_DIR: 'ad6a184150e7b6e58547912660a17999fadcfbff'}
 
 # Config values for checking GTDB-Tk on startup.
 GTDBTK_VER_CHECK = True

diff --git a/gtdbtk/config/output.py b/gtdbtk/config/output.py
@@ -8,6 +8,7 @@
 PATH_BAC120_MARKER_SUMMARY = join(DIR_IDENTIFY, '{prefix}.bac120.markers_summary.tsv')
 PATH_AR122_MARKER_SUMMARY = join(DIR_IDENTIFY, '{prefix}.ar122.markers_summary.tsv')
 PATH_TLN_TABLE_SUMMARY = join(DIR_IDENTIFY, '{prefix}.translation_table_summary.tsv')
+PATH_FAILS = join(DIR_IDENTIFY,'{prefix}.failed_genomes.tsv')
 
 # Command: identify -> marker genes
 GENOME_FILE_SUFFIX = "_genomic.fna"
@@ -91,3 +92,4 @@
 
 # General files
 PATH_WARNINGS = '{prefix}.warnings.log'
+
diff --git a/gtdbtk/external/prodigal.py b/gtdbtk/external/prodigal.py
@@ -33,6 +33,7 @@ class Prodigal(object):
 
     def __init__(self,
                  threads,
+                 failed_genomes_file,
                  marker_gene_dir,
                  protein_file_suffix,
                  nt_gene_file_suffix,
@@ -42,6 +43,7 @@ def __init__(self,
 
         self.logger = logging.getLogger('timestamp')
         self.warnings = logging.getLogger('warnings')
+        self.failed_genomes_file = failed_genomes_file
 
         self.threads = threads
 
@@ -182,6 +184,8 @@ def run(self, genomic_files, tln_tables):
         for _ in range(self.threads):
             worker_queue.put(None)
 
+        worker_proc = []
+        writer_proc = None
         try:
             manager = mp.Manager()
             out_dict = manager.dict()
@@ -208,12 +212,12 @@ def run(self, genomic_files, tln_tables):
 
             writer_queue.put(None)
             writer_proc.join()
-        except Exception:
+        except Exception as e:
             for p in worker_proc:
                 p.terminate()
-
-            writer_proc.terminate()
-            raise ProdigalException('An exception was caught while running Prodigal.')
+            if writer_proc:
+                writer_proc.terminate()
+            raise ProdigalException(f'An exception was caught while running Prodigal: {e}')
 
         # Report if any genomes were skipped due to having already been processed.
         if n_skipped.value > 0:
@@ -224,6 +228,7 @@ def run(self, genomic_files, tln_tables):
         # Report on any genomes which failed to have any genes called
         result_dict = dict()
         lq_gids = list()
+        fails = open(self.failed_genomes_file,'w')
         for gid, gid_dict in out_dict.items():
             if os.path.getsize(gid_dict['aa_gene_path']) <= 1:
                 lq_gids.append(gid)
@@ -238,13 +243,17 @@ def run(self, genomic_files, tln_tables):
                                   f'been excluded from analysis due to Prodigal '
                                   f'failing to call any genes:')
 
+
             # If there are few low-quality genomes just output to console.
             if len(lq_gids) > 10:
                 for lq_gid in lq_gids:
                     self.warnings.info(lq_gid)
+                    fails.write(f'{lq_gid}\tno genes were called by Prodigal\n')
             else:
                 for lq_gid in lq_gids:
                     self.logger.warning(f'Skipping: {lq_gid}')
                     self.warnings.info(lq_gid)
+                    fails.write(f'{lq_gid}\tno genes were called by Prodigal\n')
 
+        fails.close()
         return result_dict
diff --git a/gtdbtk/main.py b/gtdbtk/main.py
@@ -712,8 +712,6 @@ def parse_options(self, options):
             options.max_consensus = None
             options.rnd_seed = None
             options.skip_trimming = False
-            options.scratch_dir = None
-            options.recalculate_red = False
 
             self.align(options)