diff --git a/README.md b/README.md index a0122205..8a41c7df 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,8 @@ [![Docker Image Version (latest by date)](https://img.shields.io/docker/v/ecogenomic/gtdbtk?sort=date&color=299bec&label=docker)](https://hub.docker.com/r/ecogenomic/gtdbtk) [![Docker Pulls](https://img.shields.io/docker/pulls/ecogenomic/gtdbtk?color=299bec&label=pulls)](https://hub.docker.com/r/ecogenomic/gtdbtk) -[GTDB-Tk v1.3.0](https://ecogenomics.github.io/GTDBTk/announcements.html) was released on July 17, 2020 along with new reference data for [GTDB R05-RS95](https://gtdb.ecogenomic.org/). Upgrading is recommended. +[GTDB-Tk v1.5.0](https://ecogenomics.github.io/GTDBTk/announcements.html) was released on April 23, 2021 along with new reference data for [GTDB R06-RS202](https://gtdb.ecogenomic.org/). Upgrading is recommended. + Please note v1.5.0+ is not compatible with GTDB R05-RS95. GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy [GTDB](https://gtdb.ecogenomic.org/). It is designed to work with recent advances that allow hundreds or thousands of metagenome-assembled genomes (MAGs) to be obtained directly from environmental samples. It can also be applied to isolate and single-cell genomes. The GTDB-Tk is open source and released under the [GNU General Public License (Version 3)](https://www.gnu.org/licenses/gpl-3.0.en.html). diff --git a/docs/src/changelog.rst b/docs/src/changelog.rst index 1c49d574..5dbd2b58 100644 --- a/docs/src/changelog.rst +++ b/docs/src/changelog.rst @@ -2,6 +2,20 @@ Change log ========== +1.5.0 +----- + +* (`#311 `_) Updated GTDB-Tk to support R202. + See https://ecogenomics.github.io/GTDBTk/installing/index.html#gtdb-tk-reference-data for instructions on downloading R202. 
+ + +1.4.2 +----- + +* (`#311 `_) Fixed --scratch_dir not working in v 1.4.1 for classify_wf +* (`#312 `_) Automatic drop of genome leads to error in downstream modules of classify_wf + + 1.4.1 ----- diff --git a/docs/src/commands/de_novo_wf.rst b/docs/src/commands/de_novo_wf.rst index 674f896e..9d6d0dbf 100644 --- a/docs/src/commands/de_novo_wf.rst +++ b/docs/src/commands/de_novo_wf.rst @@ -68,7 +68,7 @@ Input .. code-block:: bash - gtdbtk de_novo_wf --genome_dir genomes/ --outgroup_taxon p__Nanobacteria --ar122_ms --out_dir de_novo_wf --cpus 3 + gtdbtk de_novo_wf --genome_dir genomes/ --outgroup_taxon p__Undinarchaeota --ar122_ms --out_dir de_novo_wf --cpus 3 gtdbtk de_novo_wf --genome_dir ./genomes --bac120_ms --outgroup_taxon p__Chloroflexota --taxa_filter p__Firmicutes --out_dir de_novo_output diff --git a/docs/src/installing/index.rst b/docs/src/installing/index.rst index 042131ad..53690ee0 100644 --- a/docs/src/installing/index.rst +++ b/docs/src/installing/index.rst @@ -34,11 +34,11 @@ Hardware requirements - Storage - Time * - Archaea - - ~8 GB + - ~13 GB - ~27 GB - ~1 hour / 1,000 genomes @ 64 CPUs * - Bacteria - - ~150 GB + - ~204 GB - ~27 GB - ~1 hour / 1,000 genomes @ 64 CPUs @@ -122,7 +122,8 @@ GTDB-Tk requires ~27G of external data that needs to be downloaded and unarchive .. 
code-block:: bash - wget https://data.ace.uq.edu.au/public/gtdb/data/releases/release95/95.0/auxillary_files/gtdbtk_r95_data.tar.gz + wget https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/gtdbtk_data.tar.gz + wget https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/auxillary_files/gtdbtk_data.tar.gz (or, mirror) tar xvzf gtdbtk_r95_data.tar.gz @@ -136,9 +137,12 @@ Note that different versions of the GTDB release data may not run on all version * - GTDB Release - Minimum version - Maximum version + * - R202 + - 1.5.0 + - N/A * - R95 - 1.3.0 - - N/A + - 1.4.2 * - R89 - 0.3.0 - 0.1.2 diff --git a/gtdbtk/__init__.py b/gtdbtk/__init__.py index 2c3ffe01..70d0f0f5 100644 --- a/gtdbtk/__init__.py +++ b/gtdbtk/__init__.py @@ -29,4 +29,4 @@ __status__ = 'Production' __title__ = 'GTDB-Tk' __url__ = 'https://github.com/Ecogenomics/GTDBTk' -__version__ = '1.4.1' +__version__ = '1.5.0' diff --git a/gtdbtk/ani_rep.py b/gtdbtk/ani_rep.py index 82750183..d6548d21 100644 --- a/gtdbtk/ani_rep.py +++ b/gtdbtk/ani_rep.py @@ -5,8 +5,9 @@ from gtdbtk.biolib_lite.common import canonical_gid from gtdbtk.biolib_lite.execute import check_dependencies from gtdbtk.biolib_lite.taxonomy import Taxonomy -from gtdbtk.config.config import (FASTANI_GENOMES, +from gtdbtk.config.config import (FASTANI_DIR, FASTANI_GENOMES_EXT, + FASTANI_GENOME_LIST, TAXONOMY_FILE, AF_THRESHOLD) from gtdbtk.config.output import DIR_ANI_REP_INT_MASH @@ -53,10 +54,12 @@ def _get_ref_genomes(): Dict[genome_id] = fasta_path """ ref_genomes = dict() - for f_name in os.listdir(FASTANI_GENOMES): - if f_name.endswith(FASTANI_GENOMES_EXT): - accession = f_name.split(FASTANI_GENOMES_EXT)[0] - ref_genomes[accession] = os.path.join(FASTANI_GENOMES, f_name) + with open(FASTANI_GENOME_LIST) as g_path_file: + for line in g_path_file: + (full_name, path) = line.strip().split() + if full_name.endswith(FASTANI_GENOMES_EXT): + accession = full_name.split(FASTANI_GENOMES_EXT)[0] + ref_genomes[accession] = 
os.path.join(FASTANI_DIR, path, full_name) return ref_genomes def run(self, genomes, no_mash, max_d, out_dir, prefix, mash_k, mash_v, mash_s, min_af, mash_db): diff --git a/gtdbtk/classify.py b/gtdbtk/classify.py index b78533fd..73d33113 100644 --- a/gtdbtk/classify.py +++ b/gtdbtk/classify.py @@ -101,6 +101,26 @@ def parse_radius_file(self): results[gid] = float(infos[2]) return results + def parse_leaf_to_dir_path(self, genome_id): + """Convert a genome id to its nested directory path. + e.g. GCA_123456789.0 is converted to GCA/123/456/789 (no trailing slash) + + Parameters + ---------- + genome_id: str + NCBI genome id GCF/GCA_xxxxxxxxx + :return: str + relative directory path for the genome id; GTDBTkExit is raised if genome_id is too short to build it + """ + if len(genome_id) >= 13: # slicing never raises IndexError, so check the length explicitly + genome_path = '/'.join([genome_id[0:3], genome_id[4:7], + genome_id[7:10], genome_id[10:13]]) + return genome_path + else: + logger = logging.getLogger('timestamp') + logger.error('Specified path could not be created for reference genome: ' + genome_id) + raise GTDBTkExit('Specified path could not be created for reference genome: ' + genome_id) + def place_genomes(self, user_msa_file, marker_set_id, @@ -112,21 +132,21 @@ def place_genomes(self, """Place genomes into reference tree using pplacer.""" # Warn if the memory is insufficient + mem_warning = 'pplacer requires ~{req_gb} GB of RAM to fully load the ' \ + '{domain} tree into memory. However, {cur_gb:,} GB was ' \ + 'detected. This may affect pplacer performance, or fail' \ + ' if there is insufficient swap space.' mem_gb = get_memory_gb() if mem_gb is not None: mem_total = mem_gb['MemTotal'] - if marker_set_id == 'bac120' and mem_total < 145: - self.logger.warning(f'pplacer requires ~152 GB of RAM to fully ' - f'load the bacterial tree into memory. ' - f'However, {mem_total:,}GB was detected. 
' - f'This may affect pplacer performance, ' - f'or fail if there is insufficient scratch space.') - elif marker_set_id == 'ar122' and mem_total < 6: - self.logger.warning(f'pplacer requires ~8.2 GB of RAM to fully ' - f'load the archaeal tree into memory. ' - f'However, {mem_total:,}GB was detected. ' - f'This may affect pplacer performance, ' - f'or fail if there is insufficient scratch space.') + if marker_set_id == 'bac120' and mem_total < Config.PPLACER_MIN_RAM_BAC: + self.logger.warning(mem_warning.format(req_gb=Config.PPLACER_MIN_RAM_BAC, + domain='bacterial', + cur_gb=mem_total)) + elif marker_set_id == 'ar122' and mem_total < Config.PPLACER_MIN_RAM_ARC: + self.logger.warning(mem_warning.format(req_gb=Config.PPLACER_MIN_RAM_ARC, + domain='archaeal', + cur_gb=mem_total)) # rename user MSA file for compatibility with pplacer if not user_msa_file.endswith('.fasta'): @@ -1655,7 +1675,9 @@ def _get_fastani_genome_path(self, fastani_verification, genomes): if leafnode.taxon.label.startswith('GB_') or leafnode.taxon.label.startswith('RS_'): shortleaf = leafnode.taxon.label[3:] ref_path = os.path.join( - Config.FASTANI_GENOMES, shortleaf + Config.FASTANI_GENOMES_EXT) + Config.FASTANI_GENOMES, + self.parse_leaf_to_dir_path(shortleaf), + shortleaf + Config.FASTANI_GENOMES_EXT) if not os.path.isfile(ref_path): raise GTDBTkExit(f'Reference genome missing from FastANI database: {ref_path}') diff --git a/gtdbtk/config/config.py b/gtdbtk/config/config.py index 4a5950dc..6ba3fb2d 100644 --- a/gtdbtk/config/config.py +++ b/gtdbtk/config/config.py @@ -21,7 +21,7 @@ If the reference package sub-folders still exist in GTDBTK_DATA_PATH, then there is no need to edit the variables below. 
""" -MIN_REF_DATA_VERSION = 'r95' +MIN_REF_DATA_VERSION = 'r202' MSA_FOLDER = os.path.join(GENERIC_PATH, "msa/") MASK_DIR = os.path.join(GENERIC_PATH, "masks/") @@ -83,8 +83,8 @@ RED_MIN_CHILDREN = 2 # Marker information -BAC120_MARKERS = {"PFAM": ["PF00380.14.hmm", "PF00410.14.hmm", "PF00466.15.hmm", - "PF01025.14.hmm", "PF02576.12.hmm", "PF03726.9.hmm"], +BAC120_MARKERS = {"PFAM": ["PF00380.20.hmm", "PF00410.20.hmm", "PF00466.21.hmm", + "PF01025.20.hmm", "PF02576.18.hmm", "PF03726.15.hmm"], "TIGRFAM": ["TIGR00006.HMM", "TIGR00019.HMM", "TIGR00020.HMM", "TIGR00029.HMM", "TIGR00043.HMM", "TIGR00054.HMM", "TIGR00059.HMM", "TIGR00061.HMM", "TIGR00064.HMM", @@ -124,18 +124,18 @@ "TIGR03625.HMM", "TIGR03632.HMM", "TIGR03654.HMM", "TIGR03723.HMM", "TIGR03725.HMM", "TIGR03953.HMM"]} -AR122_MARKERS = {"PFAM": ["PF01868.11.hmm", "PF01282.14.hmm", "PF01655.13.hmm", - "PF01092.14.hmm", "PF01000.21.hmm", "PF00368.13.hmm", - "PF00827.12.hmm", "PF01269.12.hmm", "PF00466.15.hmm", - "PF01015.13.hmm", "PF13685.1.hmm", "PF02978.14.hmm", - "PF04919.7.hmm", "PF01984.15.hmm", "PF04104.9.hmm", - "PF00410.14.hmm", "PF01798.13.hmm", "PF01864.12.hmm", - "PF01990.12.hmm", "PF07541.7.hmm", "PF04019.7.hmm", - "PF00900.15.hmm", "PF01090.14.hmm", "PF02006.11.hmm", - "PF01157.13.hmm", "PF01191.14.hmm", "PF01866.12.hmm", - "PF01198.14.hmm", "PF01496.14.hmm", "PF00687.16.hmm", - "PF03874.11.hmm", "PF01194.12.hmm", "PF01200.13.hmm", - "PF13656.1.hmm", "PF01280.15.hmm"], +AR122_MARKERS = {"PFAM": ["PF01868.17.hmm", "PF01282.20.hmm", "PF01655.19.hmm", + "PF01092.20.hmm", "PF01000.27.hmm", "PF00368.19.hmm", + "PF00827.18.hmm", "PF01269.18.hmm", "PF00466.21.hmm", + "PF01015.19.hmm", "PF13685.7.hmm", "PF02978.20.hmm", + "PF04919.13.hmm", "PF01984.21.hmm", "PF04104.15.hmm", + "PF00410.20.hmm", "PF01798.19.hmm", "PF01864.18.hmm", + "PF01990.18.hmm", "PF07541.13.hmm", "PF04019.13.hmm", + "PF00900.21.hmm", "PF01090.20.hmm", "PF02006.17.hmm", + "PF01157.19.hmm", "PF01191.20.hmm", "PF01866.18.hmm", + 
"PF01198.20.hmm", "PF01496.20.hmm", "PF00687.22.hmm", + "PF03874.17.hmm", "PF01194.18.hmm", "PF01200.19.hmm", + "PF13656.7.hmm", "PF01280.21.hmm"], "TIGRFAM": ["TIGR00468.HMM", "TIGR01060.HMM", "TIGR03627.HMM", "TIGR01020.HMM", "TIGR02258.HMM", "TIGR00293.HMM", "TIGR00389.HMM", "TIGR01012.HMM", "TIGR00490.HMM", @@ -196,10 +196,13 @@ PPLACER_BAC120_REF_PKG = f"gtdb_{VERSION_DATA}_bac120.refpkg" PPLACER_AR122_REF_PKG = f"gtdb_{VERSION_DATA}_ar122.refpkg" PPLACER_RPS23_REF_PKG = f"gtdb_{VERSION_DATA}_rps23.refpkg" +PPLACER_MIN_RAM_BAC = 204 +PPLACER_MIN_RAM_ARC = 13 # Fastani configuration FASTANI_SPECIES_THRESHOLD = 95.0 FASTANI_GENOMES = os.path.join(FASTANI_DIR, "database/") +FASTANI_GENOME_LIST = os.path.join(FASTANI_DIR, "genome_paths.tsv") FASTANI_GENOMES_EXT = "_genomic.fna.gz" # MRCA RED VALUE @@ -207,15 +210,15 @@ MRCA_RED_AR122 = os.path.join(RED_DIR, f"gtdbtk_{VERSION_DATA}_ar122.tsv") # Hashing information for validating the reference package. -REF_HASHES = {PPLACER_DIR: 'f41cfe0284ebaca4485b42e054936190c6a88bd1', - MASK_DIR: '63551a43333bc6cbc9abf139ce881847ca19240b', - MARKER_DIR: 'a325720422d8348d7a934143cc86112b6c92ac98', - RADII_DIR: '1092727925f38a8a2b3f4fb40e3316c0083671f5', - MSA_FOLDER: 'cf91d712c733e7e2535a41e6153c12b3c37d1ede', - METADATA_DIR: 'e003b4d5d48302e85c536751f663a70447de83d4', - TAX_FOLDER: '30c5970b2eaf5df654b2e01bfa39265302c0be89', - FASTANI_DIR: '6a3555bb61d9cc3163c26e65772b96b8f58a2d84', - RED_DIR: '6f661eef8e172a8a7e78af2a74fe4d079a3f5b0f'} +REF_HASHES = {PPLACER_DIR: '4d931b5109a240602f55228029b87ee768da8141', + MASK_DIR: '36d6ac371d247b2b952523b9798e78908ea323fa', + MARKER_DIR: '2ba5ae35fb272462663651d18fd9e523317e48cd', + RADII_DIR: '9f9a2e21e27b9049044d04d731795499414a365c', + MSA_FOLDER: 'b426865245c39ee9f01b0392fb8f7867a9f76f0a', + METADATA_DIR: '7640aed96fdb13707a2b79b746a94335faabd6df', + TAX_FOLDER: '4a7a1e4047c088e92dee9740206499cdb7e5beca', + FASTANI_DIR: '70439cf088d0fa0fdbb4f47b4a6b47e199912139', + RED_DIR: 
'ad6a184150e7b6e58547912660a17999fadcfbff'} # Config values for checking GTDB-Tk on startup. GTDBTK_VER_CHECK = True diff --git a/gtdbtk/config/output.py b/gtdbtk/config/output.py index 4e76bc0b..7af03915 100644 --- a/gtdbtk/config/output.py +++ b/gtdbtk/config/output.py @@ -8,6 +8,7 @@ PATH_BAC120_MARKER_SUMMARY = join(DIR_IDENTIFY, '{prefix}.bac120.markers_summary.tsv') PATH_AR122_MARKER_SUMMARY = join(DIR_IDENTIFY, '{prefix}.ar122.markers_summary.tsv') PATH_TLN_TABLE_SUMMARY = join(DIR_IDENTIFY, '{prefix}.translation_table_summary.tsv') +PATH_FAILS = join(DIR_IDENTIFY,'{prefix}.failed_genomes.tsv') # Command: identify -> marker genes GENOME_FILE_SUFFIX = "_genomic.fna" @@ -91,3 +92,4 @@ # General files PATH_WARNINGS = '{prefix}.warnings.log' + diff --git a/gtdbtk/external/prodigal.py b/gtdbtk/external/prodigal.py index 381c7e34..9d0f03fb 100644 --- a/gtdbtk/external/prodigal.py +++ b/gtdbtk/external/prodigal.py @@ -33,6 +33,7 @@ class Prodigal(object): def __init__(self, threads, + failed_genomes_file, marker_gene_dir, protein_file_suffix, nt_gene_file_suffix, @@ -42,6 +43,7 @@ def __init__(self, self.logger = logging.getLogger('timestamp') self.warnings = logging.getLogger('warnings') + self.failed_genomes_file = failed_genomes_file self.threads = threads @@ -182,6 +184,8 @@ def run(self, genomic_files, tln_tables): for _ in range(self.threads): worker_queue.put(None) + worker_proc = [] + writer_proc = None try: manager = mp.Manager() out_dict = manager.dict() @@ -208,12 +212,12 @@ def run(self, genomic_files, tln_tables): writer_queue.put(None) writer_proc.join() - except Exception: + except Exception as e: for p in worker_proc: p.terminate() - - writer_proc.terminate() - raise ProdigalException('An exception was caught while running Prodigal.') + if writer_proc: + writer_proc.terminate() + raise ProdigalException(f'An exception was caught while running Prodigal: {e}') # Report if any genomes were skipped due to having already been processed. 
if n_skipped.value > 0: @@ -224,6 +228,7 @@ def run(self, genomic_files, tln_tables): # Report on any genomes which failed to have any genes called result_dict = dict() lq_gids = list() + fails = open(self.failed_genomes_file,'w') for gid, gid_dict in out_dict.items(): if os.path.getsize(gid_dict['aa_gene_path']) <= 1: lq_gids.append(gid) @@ -238,13 +243,17 @@ def run(self, genomic_files, tln_tables): f'been excluded from analysis due to Prodigal ' f'failing to call any genes:') + # If there are few low-quality genomes just output to console. if len(lq_gids) > 10: for lq_gid in lq_gids: self.warnings.info(lq_gid) + fails.write(f'{lq_gid}\tno genes were called by Prodigal\n') else: for lq_gid in lq_gids: self.logger.warning(f'Skipping: {lq_gid}') self.warnings.info(lq_gid) + fails.write(f'{lq_gid}\tno genes were called by Prodigal\n') + fails.close() return result_dict diff --git a/gtdbtk/main.py b/gtdbtk/main.py index 807f1e49..a85e2531 100644 --- a/gtdbtk/main.py +++ b/gtdbtk/main.py @@ -712,8 +712,6 @@ def parse_options(self, options): options.max_consensus = None options.rnd_seed = None options.skip_trimming = False - options.scratch_dir = None - options.recalculate_red = False self.align(options) diff --git a/gtdbtk/markers.py b/gtdbtk/markers.py index c4933a8b..c114e28b 100644 --- a/gtdbtk/markers.py +++ b/gtdbtk/markers.py @@ -55,6 +55,8 @@ def __init__(self, cpus=1, debug=False): self.cpus = cpus self.debug = debug self.marker_gene_dir = None + self.failed_genomes = None + self.genome_file_suffix = GENOME_FILE_SUFFIX self.protein_file_suffix = PROTEIN_FILE_SUFFIX @@ -110,6 +112,8 @@ def _report_identified_marker_genes(self, gene_dict, outdir, prefix, os.path.join(outdir, os.path.basename(PATH_AR122_MARKER_SUMMARY.format(prefix=prefix)))) symlink_f(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix), os.path.join(outdir, os.path.basename(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix)))) + symlink_f(PATH_FAILS.format(prefix=prefix), + os.path.join(outdir, 
os.path.basename(PATH_FAILS.format(prefix=prefix)))) # Write the single copy AR122/BAC120 FASTA files to disk. if write_single_copy_genes: @@ -176,7 +180,9 @@ def identify(self, genomes, tln_tables, out_dir, prefix, force, write_single_cop f'{self.cpus} threads.') self.marker_gene_dir = os.path.join(out_dir, DIR_MARKER_GENE) + self.failed_genomes = os.path.join(out_dir, PATH_FAILS.format(prefix=prefix)) prodigal = Prodigal(self.cpus, + self.failed_genomes, self.marker_gene_dir, self.protein_file_suffix, self.nt_gene_file_suffix, @@ -415,15 +421,24 @@ def align(self, genomes_to_process=None): """Align marker genes in genomes.""" + # read genomes that failed identify steps to skip them + failed_genomes_file = os.path.join(os.path.join(identify_dir,os.path.basename(PATH_FAILS.format(prefix=prefix)))) + if os.path.isfile(failed_genomes_file): + with open(failed_genomes_file) as fgf: + failed_genomes = [row.split()[0] for row in fgf] + else: + failed_genomes = list() + # If the user is re-running this step, check if the identify step is consistent. genomic_files = self._path_to_identify_data(identify_dir, identify_dir != out_dir) if genomes_to_process is not None and len(genomic_files) != len(genomes_to_process): - self.logger.error('{} are not present in the input list of genome to process.'.format( - list(set(genomic_files.keys()) - set(genomes_to_process.keys())))) - raise InconsistentGenomeBatch( - 'You are attempting to run GTDB-Tk on a non-empty directory that contains extra ' - 'genomes not present in your initial identify directory. 
Remove them, or run ' - 'GTDB-Tk on a new directory.') + if set(genomic_files.keys()) - set(genomes_to_process.keys()) != set(failed_genomes): # list.sort() returns None, so compare the genome id sets directly + self.logger.error('{} are not present in the input list of genome to process.'.format( + list(set(genomic_files.keys()) - set(genomes_to_process.keys())))) + raise InconsistentGenomeBatch( + 'You are attempting to run GTDB-Tk on a non-empty directory that contains extra ' + 'genomes not present in your initial identify directory. Remove them, or run ' + 'GTDB-Tk on a new directory.') # If this is being run as a part of classify_wf, copy the required files. if identify_dir != out_dir: @@ -500,18 +515,6 @@ def align(self, # Generate the user MSA. user_msa = align.align_marker_set(cur_genome_files, marker_info_file, copy_number_f, self.cpus) - # self.logger.log(Config.LOG_TASK, f'Aligning {len(cur_genome_files):,} {domain_str} genomes.') - # hmm_aligner = HmmAligner(self.cpus, - # self.pfam_top_hit_suffix, - # self.tigrfam_top_hit_suffix, - # self.protein_file_suffix, - # self.pfam_hmm_dir, - # self.tigrfam_hmms, - # Config.BAC120_MARKERS, - # Config.AR122_MARKERS) - # user_msa = hmm_aligner.align_marker_set(cur_genome_files, - # marker_set_id) - # Write the individual marker alignments to disk if self.debug: self._write_individual_markers( diff --git a/gtdbtk/misc.py b/gtdbtk/misc.py index 28a63174..d3f3d8e6 100644 --- a/gtdbtk/misc.py +++ b/gtdbtk/misc.py @@ -158,7 +158,7 @@ def check_install(self): if user_hash != expected_hash: self.logger.info(" |-- {:16} {}".format( - base_name, colour('HASH MISMATCH', ['bright'], fg='yellow'))) + base_name, colour(f'HASH MISMATCH {user_hash}', ['bright'], fg='yellow'))) ok = False else: self.logger.info(" |-- {:16} {}".format( diff --git a/scripts/create_genome_paths.sh b/scripts/create_genome_paths.sh new file mode 100644 index 00000000..45b3ec42 --- /dev/null +++ b/scripts/create_genome_paths.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +DATAPATH='database' +for f in 
$DATAPATH/*.gz +do + mkdir --parents database/${f:9:3}/${f:13:3}/${f:16:3}/${f:19:3}/ ; mv $f $_ + filef="$(basename -- $f)" + echo "$filef database/${f:9:3}/${f:13:3}/${f:16:3}/${f:19:3}/ " >> genome_paths.tsv +done \ No newline at end of file diff --git a/scripts/restructure_reps_fna_folder.sh b/scripts/restructure_reps_fna_folder.sh new file mode 100644 index 00000000..1aa16810 --- /dev/null +++ b/scripts/restructure_reps_fna_folder.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +for f in $1/*.gz +do + filef="$(basename -- $f)" + mkdir --parents $1/${filef:0:3}/${filef:4:3}/${filef:7:3}/${filef:10:3}/ ; mv $f $_ + echo "$filef $1/${filef:0:3}/${filef:4:3}/${filef:7:3}/${filef:10:3}/" >> genome_paths.tsv + +done \ No newline at end of file diff --git a/tests/data/align_dir_reference/identify/gtdbtk.failed_genomes.tsv b/tests/data/align_dir_reference/identify/gtdbtk.failed_genomes.tsv new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/identify_dir_reference/identify/gtdbtk.failed_genomes.tsv b/tests/data/identify_dir_reference/identify/gtdbtk.failed_genomes.tsv new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_gtdbtk/test_external/test_fastani.py b/tests/test_gtdbtk/test_external/test_fastani.py index 17d3180c..47c6e14d 100644 --- a/tests/test_gtdbtk/test_external/test_fastani.py +++ b/tests/test_gtdbtk/test_external/test_fastani.py @@ -48,12 +48,12 @@ def test_run(self): d_compare = {'a': {'x', 'y'}, 'b': {'x'}, 'c': {'z'}} - d_paths = {'a': os.path.join(self.genome_root, 'GCA_001873845.1_genomic.fna.gz'), - 'b': os.path.join(self.genome_root, 'GCA_002083985.1_genomic.fna.gz'), - 'c': os.path.join(self.genome_root, 'GCF_000185805.1_genomic.fna.gz'), - 'x': os.path.join(self.genome_root, 'GCA_002841105.1_genomic.fna.gz'), - 'y': os.path.join(self.genome_root, 'GCA_000402295.1_genomic.fna.gz'), - 'z': os.path.join(self.genome_root, 'GCA_002011165.1_genomic.fna.gz')} + d_paths = {'a': os.path.join(self.genome_root,'GCA/001/873/845', 
'GCA_001873845.1_genomic.fna.gz'), + 'b': os.path.join(self.genome_root,'GCA/002/083/985', 'GCA_002083985.1_genomic.fna.gz'), + 'c': os.path.join(self.genome_root,'GCF/000/185/805', 'GCF_000185805.1_genomic.fna.gz'), + 'x': os.path.join(self.genome_root,'GCA/002/841/105', 'GCA_002841105.1_genomic.fna.gz'), + 'y': os.path.join(self.genome_root,'GCA/000/402/295', 'GCA_000402295.1_genomic.fna.gz'), + 'z': os.path.join(self.genome_root,'GCA/002/011/165', 'GCA_002011165.1_genomic.fna.gz')} result = fa.run(d_compare, d_paths)