From 39c139d0b26f683e6564bbb4ccc15911f9eb9f6f Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Fri, 30 Aug 2024 15:48:38 +0930 Subject: [PATCH] issue #70 - SnakeMake --- generate_transcript_data/Snakefile | 108 ++++++++++++++++++ .../cdot_transcripts.yaml | 87 ++++++++++++++ generate_transcript_data/cdot_utils.smk | 12 ++ generate_transcript_data/transcripts.smk | 13 +-- 4 files changed, 210 insertions(+), 10 deletions(-) create mode 100644 generate_transcript_data/cdot_transcripts.yaml create mode 100644 generate_transcript_data/cdot_utils.smk diff --git a/generate_transcript_data/Snakefile b/generate_transcript_data/Snakefile index e69de29..a0c6db7 100644 --- a/generate_transcript_data/Snakefile +++ b/generate_transcript_data/Snakefile @@ -0,0 +1,108 @@ +import subprocess +from datetime import datetime + +configfile: os.path.join(workflow.basedir, "cdot_transcripts.yaml") + +cdot_json = os.path.join(workflow.basedir, "cdot_json.py") +cdot_dir = os.path.dirname(workflow.basedir) +cdot_output_raw = subprocess.check_output(f"{cdot_json} --version", shell=True, env={"PYTHONPATH": cdot_dir}) +cdot_data_version = cdot_output_raw.decode().strip() + +def get_cdot_command(wildcards): + url = urls[wildcards.name] + cdot_command = "gff_to_json" if url.endswith(".gff.gz") else "gtf_to_json" + return cdot_command + + +# Name it based on date as it may vary +today = datetime.now().date().isoformat() +gene_info_download_filename = f"Homo_sapiens.gene_info.{today}.gz" +gene_info_json = f"Homo_sapiens.gene-info-{cdot_data_version}.json.gz" + +genome_build_files = [] +for annotation_consortium, builds in config["config"].items(): + for genome_build in builds: + filename = os.path.join(annotation_consortium, genome_build, f"cdot-{cdot_data_version}-{annotation_consortium}.{genome_build}.json.gz") + genome_build_files.append(filename) + +rule all: + input: + gene_info_json, + # genome_build_files, + expand("{annotation_consortium}/{genome_build}/cdot-{cdot_data_version}-{annotation_consortium}.{genome_build}.json.gz", + annotation_consortium=["RefSeq", "Ensembl"], + genome_build=["GRCh37", "GRCh38", "T2T-CHM13v2.0"], + cdot_data_version=[cdot_data_version]) + +rule cdot_build_json: + # Merges all GFFs per build + output: + "{annotation_consortium}/{genome_build}/cdot-{cdot_data_version}-{annotation_consortium}.{genome_build}.json.gz" + input: + lambda wildcards: expand("{annotation_consortium}/{genome_build}/cdot-{cdot_data_version}-{name}.json.gz", name=wildcards.urls) + params: + urls = lambda wildcards: config[wildcards.annotation_consortium][wildcards.genome_build] + shell: + """ + PYTHONPATH={cdot_dir} \ + {cdot_json} \ + combine_builds \ + {input} \ + #--grch37 GRCh37/cdot-${CDOT_DATA_VERSION}.refseq.grch37.json.gz \ + #--grch38 GRCh38/cdot-${CDOT_DATA_VERSION}.refseq.grch38.json.gz \ + --output ${output} + """ + + +rule cdot_gff_json: + # Individual GFF + input: + gene_info_json=gene_info_json, + gff_file="downloads/{name}.gz" + output: + protected("{annotation_consortium}/{genome_build}/cdot-{cdot_data_version}-{name}.json.gz") + params: + url=lambda wildcards: urls[wildcards.name], + cdot_command=get_cdot_command + shell: + """ + PYTHONPATH={cdot_dir} \ + {cdot_json} \ + {params.cdot_command} \ + "{input.gff_file}" \ + --url "{params.url}" \ + --genome-build="{genome_build}" \ + --output "{output}" \ + --gene-info-json="{input.gene_info_json}" + """ + +rule download_gff_files: + output: + # Don't re-download if snakemake script changes + protected("downloads/{name}.gz") + params: + url=lambda wildcards: urls[wildcards.name] + shell: + "curl -o {output} {params.url}" + + +rule process_gene_info_json: + input: + "downloads/gene_info/{gene_info_download_filename}" + output: + protected("downloads/{gene_info_download_filename}") + shell: + """ + PYTHONPATH={cdot_dir} \ + "{workflow.basedir}/cdot_gene_info.py" \ + --gene-info {input} \ + --output {output} \ + --email cdot@cdot.cc + """ + +rule download_gene_info: + output: + protected("downloads/gene_info/{gene_info_download_filename}") + shell: + "curl -o {output} https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/Homo_sapiens.gene_info.gz" + diff --git a/generate_transcript_data/cdot_transcripts.yaml b/generate_transcript_data/cdot_transcripts.yaml new file mode 100644 index 0000000..bfc7827 --- /dev/null +++ b/generate_transcript_data/cdot_transcripts.yaml @@ -0,0 +1,87 @@ +config: + Ensembl: + # For Ensembl - we have to use GTFs as the GFF3s don't have protein versions in them + GRCh37: + #v81 (points to 75) and earlier at GTFs that don't have transcript versions - just skip them + #82 is first GFF3 for GRCh37 + #83 has no data + #84 is 82 again + #86 is 85 again + Homo_sapiens_GRCh37_Ensembl_82: "http://ftp.ensembl.org/pub/grch37/release-82/gtf/homo_sapiens/Homo_sapiens.GRCh37.82.gtf.gz" + Homo_sapiens_GRCh37_Ensembl_85: "http://ftp.ensembl.org/pub/grch37/release-85/gtf/homo_sapiens/Homo_sapiens.GRCh37.85.gtf.gz" + Homo_sapiens_GRCh37_Ensembl_87: "http://ftp.ensembl.org/pub/grch37/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz" + + GRCh38: + Homo_sapiens_GRCh38_Ensembl_81: "http://ftp.ensembl.org/pub/release-81/gtf/homo_sapiens/Homo_sapiens.GRCh38.81.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_82: "http://ftp.ensembl.org/pub/release-82/gtf/homo_sapiens/Homo_sapiens.GRCh38.82.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_83: "http://ftp.ensembl.org/pub/release-83/gtf/homo_sapiens/Homo_sapiens.GRCh38.83.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_84: "http://ftp.ensembl.org/pub/release-84/gtf/homo_sapiens/Homo_sapiens.GRCh38.84.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_85: "http://ftp.ensembl.org/pub/release-85/gtf/homo_sapiens/Homo_sapiens.GRCh38.85.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_86: "http://ftp.ensembl.org/pub/release-86/gtf/homo_sapiens/Homo_sapiens.GRCh38.86.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_87: "http://ftp.ensembl.org/pub/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh38.87.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_88: "http://ftp.ensembl.org/pub/release-88/gtf/homo_sapiens/Homo_sapiens.GRCh38.88.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_89: "http://ftp.ensembl.org/pub/release-89/gtf/homo_sapiens/Homo_sapiens.GRCh38.89.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_90: "http://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens/Homo_sapiens.GRCh38.90.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_91: "http://ftp.ensembl.org/pub/release-91/gtf/homo_sapiens/Homo_sapiens.GRCh38.91.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_92: "http://ftp.ensembl.org/pub/release-92/gtf/homo_sapiens/Homo_sapiens.GRCh38.92.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_93: "http://ftp.ensembl.org/pub/release-93/gtf/homo_sapiens/Homo_sapiens.GRCh38.93.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_94: "http://ftp.ensembl.org/pub/release-94/gtf/homo_sapiens/Homo_sapiens.GRCh38.94.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_95: "http://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_96: "http://ftp.ensembl.org/pub/release-96/gtf/homo_sapiens/Homo_sapiens.GRCh38.96.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_97: "http://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_98: "http://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_99: "http://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens/Homo_sapiens.GRCh38.99.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_100: "http://ftp.ensembl.org/pub/release-100/gtf/homo_sapiens/Homo_sapiens.GRCh38.100.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_101: "http://ftp.ensembl.org/pub/release-101/gtf/homo_sapiens/Homo_sapiens.GRCh38.101.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_102: "http://ftp.ensembl.org/pub/release-102/gtf/homo_sapiens/Homo_sapiens.GRCh38.102.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_103: "http://ftp.ensembl.org/pub/release-103/gtf/homo_sapiens/Homo_sapiens.GRCh38.103.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_104: "http://ftp.ensembl.org/pub/release-104/gtf/homo_sapiens/Homo_sapiens.GRCh38.104.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_105: "http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_106: "http://ftp.ensembl.org/pub/release-106/gtf/homo_sapiens/Homo_sapiens.GRCh38.106.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_107: "http://ftp.ensembl.org/pub/release-107/gtf/homo_sapiens/Homo_sapiens.GRCh38.107.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_108: "http://ftp.ensembl.org/pub/release-108/gtf/homo_sapiens/Homo_sapiens.GRCh38.108.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_109: "http://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_110: "http://ftp.ensembl.org/pub/release-110/gtf/homo_sapiens/Homo_sapiens.GRCh38.110.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_111: "http://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz" + Homo_sapiens_GRCh38_Ensembl_112: "http://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.gtf.gz" + + T2T-CHM13v2.0: + Homo_sapiens_T2T-CHM13v2.0_Ensembl_2022_06: "https://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/GCA_009914755.4/ensembl/geneset/2022_06/Homo_sapiens-GCA_009914755.4-2022_06-genes.gff3.gz" + Homo_sapiens_T2T-CHM13v2.0_Ensembl_2022_07: "https://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/GCA_009914755.4/ensembl/geneset/2022_07/Homo_sapiens-GCA_009914755.4-2022_07-genes.gff3.gz" + + RefSeq: + GRCh37: + Homo_sapiens_GRCh37_RefSeq_p5: "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/BUILD.37.3/GFF/ref_GRCh37.p5_top_level.gff3.gz" + Homo_sapiens_GRCh37_RefSeq_103: "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.103/GFF/ref_GRCh37.p9_top_level.gff3.gz" + Homo_sapiens_GRCh37_RefSeq_104: "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.104/GFF/ref_GRCh37.p10_top_level.gff3.gz" + Homo_sapiens_GRCh37_RefSeq_105.20190906: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20190906/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz" + Homo_sapiens_GRCh37_RefSeq_105.20201022: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20201022/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz" + Homo_sapiens_GRCh37_RefSeq_105.20220307: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20220307/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz" + + GRCh38: + Homo_sapiens_GRCh38_RefSeq_106: "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz" + Homo_sapiens_GRCh38_RefSeq_107: "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.107/GFF/ref_GRCh38.p2_top_level.gff3.gz" + Homo_sapiens_GRCh38_RefSeq_108: "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.108/GFF/ref_GRCh38.p7_top_level.gff3.gz" + Homo_sapiens_GRCh38_RefSeq_109: "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.109/GFF/ref_GRCh38.p12_top_level.gff3.gz" + # The date on this 109 version is 2020-2024 (after the other 109s below), not sure what's going on + Homo_sapiens_GRCh38_RefSeq_109.GCF: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.gff.gz" + Homo_sapiens_GRCh38_RefSeq_109.20190607: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20190607/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" + Homo_sapiens_GRCh38_RefSeq_109.20190905: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20190905/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" + Homo_sapiens_GRCh38_RefSeq_109.20191205: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20191205/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" + Homo_sapiens_GRCh38_RefSeq_109.20200228: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20200228/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" + Homo_sapiens_GRCh38_RefSeq_109.20200522: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20200522/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" + Homo_sapiens_GRCh38_RefSeq_109.20200815: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20200815/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" + Homo_sapiens_GRCh38_RefSeq_109.20201120: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20201120/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" + Homo_sapiens_GRCh38_RefSeq_109.20210226: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20210226/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" + Homo_sapiens_GRCh38_RefSeq_109.20210514: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20210514/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" + Homo_sapiens_GRCh38_RefSeq_109.20211119: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" + Homo_sapiens_GRCh38_RefSeq_110: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz" + Homo_sapiens_GRCh38_RefSeq_RS_2023_03: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2023_03/GCF_000001405.40_GRCh38.p14_genomic.gff.gz" + Homo_sapiens_GRCh38_RefSeq_RS_2023_10: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2023_10/GCF_000001405.40_GRCh38.p14_genomic.gff.gz" + Homo_sapiens_GRCh38_RefSeq_RS_2024_08: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2024_08/GCF_000001405.40_GRCh38.p14_genomic.gff.gz" + + T2T-CHM13v2.0: + Homo_sapiens_T2T-CHM13v2.0_RefSeq_110: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz" + Homo_sapiens_T2T-CHM13v2.0_RefSeq_RS_2023_03: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_009914755.1-RS_2023_03/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz" + Homo_sapiens_T2T-CHM13v2.0_RefSeq_RS_2023_10: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_009914755.1-RS_2023_10/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz" + Homo_sapiens_T2T-CHM13v2.0_RefSeq_RS_2024_08: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_009914755.1-RS_2024_08/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz" diff --git a/generate_transcript_data/cdot_utils.smk b/generate_transcript_data/cdot_utils.smk new file mode 100644 index 0000000..9598eb8 --- /dev/null +++ b/generate_transcript_data/cdot_utils.smk @@ -0,0 +1,12 @@ +import subprocess +from datetime import datetime + +cdot_json = os.path.join(workflow.basedir, "cdot_json.py") +cdot_dir = os.path.dirname(workflow.basedir) +cdot_output_raw = subprocess.check_output(f"{cdot_json} --version", shell=True, env={"PYTHONPATH": cdot_dir}) +cdot_data_version = cdot_output_raw.decode().strip() + +# Name it based on date as it may vary +today = datetime.now().date().isoformat() +gene_info_download_filename = f"Homo_sapiens.gene_info.{today}.gz" +gene_info_json = f"Homo_sapiens.gene-info-{cdot_data_version}.json.gz" diff --git a/generate_transcript_data/transcripts.smk b/generate_transcript_data/transcripts.smk index 6dbd7b3..2b8a718 100644 --- a/generate_transcript_data/transcripts.smk +++ b/generate_transcript_data/transcripts.smk @@ -1,18 +1,11 @@ -import subprocess +include: "cdot_utils.smk" annotation_consortium = config["annotation_consortium"] genome_build = config["genome_build"] urls = config["urls"] cdot_output_dir = os.path.join(annotation_consortium, genome_build) -cdot_json = os.path.join(workflow.basedir, "cdot_json.py") -cdot_dir = os.path.dirname(workflow.basedir) -cdot_output_raw = subprocess.check_output(f"{cdot_json} --version", shell=True, env={"PYTHONPATH": cdot_dir}) -version = cdot_output_raw.decode().strip() -# This needs to be made top level script -gene_info_json = f"Homo_sapiens.gene-info-{version}.json.gz" - -cdot_file_template = "cdot-" + version + "-{name}.json.gz" +cdot_file_template = "cdot-" + cdot_data_version + "-{name}.json.gz" def get_cdot_command(wildcards): url = urls[wildcards.name] @@ -20,7 +13,7 @@ def get_cdot_command(wildcards): return cdot_command -rule all: +rule all_transcripts: input: expand(os.path.join(cdot_output_dir, cdot_file_template), name=urls.keys())