Skip to content

Commit

Permalink
issue #70 - SnakeMake
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Aug 30, 2024
1 parent 1fece6e commit 39c139d
Show file tree
Hide file tree
Showing 4 changed files with 210 additions and 10 deletions.
108 changes: 108 additions & 0 deletions generate_transcript_data/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import subprocess
from datetime import datetime

# Workflow configuration (download URLs per annotation consortium / genome build)
# is read from the YAML file that sits next to this Snakefile.
configfile: os.path.join(workflow.basedir, "cdot_transcripts.yaml")

# cdot_json.py is invoked as an executable; PYTHONPATH points at the parent of
# this directory (presumably so the cdot package resolves - confirm layout).
cdot_json = os.path.join(workflow.basedir, "cdot_json.py")
cdot_dir = os.path.dirname(workflow.basedir)
# Extend the inherited environment rather than replacing it: passing only
# PYTHONPATH drops PATH (and any virtualenv/conda variables), so the version
# probe could run under a different interpreter than the rules below, which
# inherit the full environment. An argument list avoids shell quoting problems
# when the checkout path contains spaces.
cdot_output_raw = subprocess.check_output(
    [cdot_json, "--version"],
    env={**os.environ, "PYTHONPATH": cdot_dir},
)
# The version string is embedded in every output file name.
cdot_data_version = cdot_output_raw.decode().strip()

def get_cdot_command(wildcards):
    """Return the cdot_json.py sub-command used to convert one download.

    The download's URL is looked up in the module-level ``urls`` mapping by
    ``wildcards.name``. GFF3 files are published with either a ``.gff.gz`` or a
    ``.gff3.gz`` suffix, so both select ``gff_to_json``; anything else is
    treated as GTF.

    NOTE(review): the sub-command names are kept as they were; confirm them
    against the cdot_json.py command line.
    """
    url = urls[wildcards.name]
    # str.endswith accepts a tuple, so ".gff3.gz" is no longer misrouted to the
    # GTF converter.
    if url.endswith((".gff.gz", ".gff3.gz")):
        return "gff_to_json"
    return "gtf_to_json"


# Name it based on date as it may vary
today = datetime.now().date().isoformat()
gene_info_download_filename = f"Homo_sapiens.gene_info.{today}.gz"
gene_info_json = f"Homo_sapiens.gene-info-{cdot_data_version}.json.gz"

# urls: flat "download name -> URL" lookup across every consortium and build.
# The rules and get_cdot_command() index it by the {name} wildcard alone, but
# it was never defined in this Snakefile.
# genome_build_files: one merged output path per (consortium, genome build).
urls = {}
genome_build_files = []
for annotation_consortium, builds in config["config"].items():
    for genome_build, build_urls in builds.items():
        urls.update(build_urls)
        filename = os.path.join(
            annotation_consortium,
            genome_build,
            f"cdot-{cdot_data_version}-{annotation_consortium}.{genome_build}.json.gz",
        )
        genome_build_files.append(filename)

rule all:
    # Default target: the processed gene info plus one merged JSON per
    # consortium/genome build. The targets come from genome_build_files (built
    # from the config above) rather than from a hard-coded copy of the
    # consortium and build names, so the YAML file stays the single source.
    input:
        gene_info_json,
        genome_build_files

# The merged file name also matches cdot_gff_json's output pattern (with
# name="<consortium>.<build>"), so state explicitly which rule wins.
ruleorder: cdot_build_json > cdot_gff_json

rule cdot_build_json:
    # Merges all per-release JSON files of one consortium/genome build
    output:
        "{annotation_consortium}/{genome_build}/cdot-{cdot_data_version}-{annotation_consortium}.{genome_build}.json.gz"
    input:
        # One input per download name configured for this consortium/build.
        # Every wildcard in the pattern must be supplied to expand().
        lambda wildcards: expand(
            "{annotation_consortium}/{genome_build}/cdot-{cdot_data_version}-{name}.json.gz",
            annotation_consortium=wildcards.annotation_consortium,
            genome_build=wildcards.genome_build,
            cdot_data_version=wildcards.cdot_data_version,
            name=list(config["config"][wildcards.annotation_consortium][wildcards.genome_build]),
        )
    params:
        # The YAML nests everything under a top-level "config" key
        urls=lambda wildcards: config["config"][wildcards.annotation_consortium][wildcards.genome_build]
    # NOTE(review): the flags previously left commented out inside the command
    # (--grch37 / --grch38 pointing at per-build files) suggest combine_builds
    # expects one file per genome build rather than a list of per-release
    # files; confirm the sub-command against cdot_json.py. They were removed
    # from the shell string because a '#' line inside a continued command
    # ends it early, and Snakemake tries to format "{...}" even in comments.
    shell:
        """
        PYTHONPATH={cdot_dir} \
            {cdot_json} \
            combine_builds \
            {input} \
            --output "{output}"
        """


rule cdot_gff_json:
    # Converts one downloaded GFF/GTF file into a cdot JSON file
    input:
        gene_info_json=gene_info_json,
        gff_file="downloads/{name}.gz"
    output:
        protected("{annotation_consortium}/{genome_build}/cdot-{cdot_data_version}-{name}.json.gz")
    params:
        url=lambda wildcards: urls[wildcards.name],
        cdot_command=get_cdot_command
    # The genome build must come from this job's wildcards: a bare
    # {{genome_build}} resolves to the module-level loop variable, i.e. the
    # last build listed in the config, for every job.
    shell:
        """
        PYTHONPATH={cdot_dir} \
            {cdot_json} \
            {params.cdot_command} \
            "{input.gff_file}" \
            --url "{params.url}" \
            --genome-build="{wildcards.genome_build}" \
            --output "{output}" \
            --gene-info-json="{input.gene_info_json}"
        """

rule download_gff_files:
    # Fetches one annotation file by its configured download name
    output:
        # Don't re-download if snakemake script changes
        protected("downloads/{name}.gz")
    wildcard_constraints:
        # Download names are flat; without this, {name} also matches paths
        # under downloads/gene_info/ and competes with download_gene_info.
        name=r"[^/]+"
    params:
        url=lambda wildcards: urls[wildcards.name]
    # --fail: exit non-zero on HTTP errors instead of saving (and protecting)
    # the error page; --location: follow redirects.
    shell:
        'curl --fail --location -o "{output}" "{params.url}"'


rule process_gene_info_json:
    # Turns the dated gene_info download into the gene info JSON that
    # rule all and cdot_gff_json consume.
    input:
        # Built from the Python variable: inside a plain string,
        # "{gene_info_download_filename}" would be a Snakemake wildcard.
        os.path.join("downloads", "gene_info", gene_info_download_filename)
    output:
        protected(gene_info_json)
    # NOTE(review): the --email value looks redacted in this copy of the file;
    # restore the real address before running.
    shell:
        """
        PYTHONPATH={cdot_dir} \
            "{workflow.basedir}/cdot_gene_info.py" \
            --gene-info "{input}" \
            --output "{output}" \
            --email [email protected]
        """

rule download_gene_info:
    # Fetches the NCBI Homo sapiens gene_info dump.
    output:
        # This string is not an f-string, so {gene_info_download_filename} is
        # a Snakemake wildcard: whatever file name is requested under
        # downloads/gene_info/ is fetched from the same URL.
        protected("downloads/gene_info/{gene_info_download_filename}")
    # --fail: exit non-zero on HTTP errors rather than protecting an error page
    shell:
        'curl --fail --location -o "{output}" https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/Homo_sapiens.gene_info.gz'

87 changes: 87 additions & 0 deletions generate_transcript_data/cdot_transcripts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
config:
Ensembl:
# For Ensembl - we have to use GTFs as the GFF3s don't have protein versions in them
GRCh37:
#v81 (points to 75) and earlier are GTFs that don't have transcript versions - just skip them
#82 is first GFF3 for GRCh37
#83 has no data
#84 is 82 again
#86 is 85 again
Homo_sapiens_GRCh37_Ensembl_82: "http://ftp.ensembl.org/pub/grch37/release-82/gtf/homo_sapiens/Homo_sapiens.GRCh37.82.gtf.gz"
Homo_sapiens_GRCh37_Ensembl_85: "http://ftp.ensembl.org/pub/grch37/release-85/gtf/homo_sapiens/Homo_sapiens.GRCh37.85.gtf.gz"
Homo_sapiens_GRCh37_Ensembl_87: "http://ftp.ensembl.org/pub/grch37/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz"

GRCh38:
Homo_sapiens_GRCh38_Ensembl_81: "http://ftp.ensembl.org/pub/release-81/gtf/homo_sapiens/Homo_sapiens.GRCh38.81.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_82: "http://ftp.ensembl.org/pub/release-82/gtf/homo_sapiens/Homo_sapiens.GRCh38.82.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_83: "http://ftp.ensembl.org/pub/release-83/gtf/homo_sapiens/Homo_sapiens.GRCh38.83.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_84: "http://ftp.ensembl.org/pub/release-84/gtf/homo_sapiens/Homo_sapiens.GRCh38.84.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_85: "http://ftp.ensembl.org/pub/release-85/gtf/homo_sapiens/Homo_sapiens.GRCh38.85.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_86: "http://ftp.ensembl.org/pub/release-86/gtf/homo_sapiens/Homo_sapiens.GRCh38.86.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_87: "http://ftp.ensembl.org/pub/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh38.87.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_88: "http://ftp.ensembl.org/pub/release-88/gtf/homo_sapiens/Homo_sapiens.GRCh38.88.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_89: "http://ftp.ensembl.org/pub/release-89/gtf/homo_sapiens/Homo_sapiens.GRCh38.89.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_90: "http://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens/Homo_sapiens.GRCh38.90.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_91: "http://ftp.ensembl.org/pub/release-91/gtf/homo_sapiens/Homo_sapiens.GRCh38.91.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_92: "http://ftp.ensembl.org/pub/release-92/gtf/homo_sapiens/Homo_sapiens.GRCh38.92.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_93: "http://ftp.ensembl.org/pub/release-93/gtf/homo_sapiens/Homo_sapiens.GRCh38.93.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_94: "http://ftp.ensembl.org/pub/release-94/gtf/homo_sapiens/Homo_sapiens.GRCh38.94.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_95: "http://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_96: "http://ftp.ensembl.org/pub/release-96/gtf/homo_sapiens/Homo_sapiens.GRCh38.96.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_97: "http://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_98: "http://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_99: "http://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens/Homo_sapiens.GRCh38.99.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_100: "http://ftp.ensembl.org/pub/release-100/gtf/homo_sapiens/Homo_sapiens.GRCh38.100.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_101: "http://ftp.ensembl.org/pub/release-101/gtf/homo_sapiens/Homo_sapiens.GRCh38.101.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_102: "http://ftp.ensembl.org/pub/release-102/gtf/homo_sapiens/Homo_sapiens.GRCh38.102.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_103: "http://ftp.ensembl.org/pub/release-103/gtf/homo_sapiens/Homo_sapiens.GRCh38.103.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_104: "http://ftp.ensembl.org/pub/release-104/gtf/homo_sapiens/Homo_sapiens.GRCh38.104.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_105: "http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_106: "http://ftp.ensembl.org/pub/release-106/gtf/homo_sapiens/Homo_sapiens.GRCh38.106.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_107: "http://ftp.ensembl.org/pub/release-107/gtf/homo_sapiens/Homo_sapiens.GRCh38.107.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_108: "http://ftp.ensembl.org/pub/release-108/gtf/homo_sapiens/Homo_sapiens.GRCh38.108.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_109: "http://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_110: "http://ftp.ensembl.org/pub/release-110/gtf/homo_sapiens/Homo_sapiens.GRCh38.110.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_111: "http://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz"
Homo_sapiens_GRCh38_Ensembl_112: "http://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.gtf.gz"

T2T-CHM13v2.0:
Homo_sapiens_T2T-CHM13v2.0_Ensembl_2022_06: "https://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/GCA_009914755.4/ensembl/geneset/2022_06/Homo_sapiens-GCA_009914755.4-2022_06-genes.gff3.gz"
Homo_sapiens_T2T-CHM13v2.0_Ensembl_2022_07: "https://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/GCA_009914755.4/ensembl/geneset/2022_07/Homo_sapiens-GCA_009914755.4-2022_07-genes.gff3.gz"

RefSeq:
GRCh37:
Homo_sapiens_GRCh37_RefSeq_p5: "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/BUILD.37.3/GFF/ref_GRCh37.p5_top_level.gff3.gz"
Homo_sapiens_GRCh37_RefSeq_103: "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.103/GFF/ref_GRCh37.p9_top_level.gff3.gz"
Homo_sapiens_GRCh37_RefSeq_104: "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.104/GFF/ref_GRCh37.p10_top_level.gff3.gz"
Homo_sapiens_GRCh37_RefSeq_105.20190906: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20190906/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz"
Homo_sapiens_GRCh37_RefSeq_105.20201022: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20201022/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz"
Homo_sapiens_GRCh37_RefSeq_105.20220307: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20220307/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz"

GRCh38:
Homo_sapiens_GRCh38_RefSeq_106: "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz"
Homo_sapiens_GRCh38_RefSeq_107: "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.107/GFF/ref_GRCh38.p2_top_level.gff3.gz"
Homo_sapiens_GRCh38_RefSeq_108: "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.108/GFF/ref_GRCh38.p7_top_level.gff3.gz"
Homo_sapiens_GRCh38_RefSeq_109: "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.109/GFF/ref_GRCh38.p12_top_level.gff3.gz"
# The date on this 109 version is 2020-2024 (after the other 109s below), not sure what's going on
Homo_sapiens_GRCh38_RefSeq_109.GCF: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.gff.gz"
Homo_sapiens_GRCh38_RefSeq_109.20190607: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20190607/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
Homo_sapiens_GRCh38_RefSeq_109.20190905: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20190905/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
Homo_sapiens_GRCh38_RefSeq_109.20191205: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20191205/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
Homo_sapiens_GRCh38_RefSeq_109.20200228: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20200228/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
Homo_sapiens_GRCh38_RefSeq_109.20200522: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20200522/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
Homo_sapiens_GRCh38_RefSeq_109.20200815: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20200815/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
Homo_sapiens_GRCh38_RefSeq_109.20201120: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20201120/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
Homo_sapiens_GRCh38_RefSeq_109.20210226: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20210226/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
Homo_sapiens_GRCh38_RefSeq_109.20210514: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20210514/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
Homo_sapiens_GRCh38_RefSeq_109.20211119: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
Homo_sapiens_GRCh38_RefSeq_110: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz"
Homo_sapiens_GRCh38_RefSeq_RS_2023_03: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2023_03/GCF_000001405.40_GRCh38.p14_genomic.gff.gz"
Homo_sapiens_GRCh38_RefSeq_RS_2023_10: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2023_10/GCF_000001405.40_GRCh38.p14_genomic.gff.gz"
Homo_sapiens_GRCh38_RefSeq_RS_2024_08: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2024_08/GCF_000001405.40_GRCh38.p14_genomic.gff.gz"

T2T-CHM13v2.0:
Homo_sapiens_T2T-CHM13v2.0_RefSeq_110: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz"
Homo_sapiens_T2T-CHM13v2.0_RefSeq_RS_2023_03: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_009914755.1-RS_2023_03/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz"
Homo_sapiens_T2T-CHM13v2.0_RefSeq_RS_2023_10: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_009914755.1-RS_2023_10/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz"
Homo_sapiens_T2T-CHM13v2.0_RefSeq_RS_2024_08: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_009914755.1-RS_2024_08/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz"
12 changes: 12 additions & 0 deletions generate_transcript_data/cdot_utils.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import subprocess
from datetime import datetime

# Shared setup for the cdot workflows: locates cdot_json.py, asks it for the
# data version, and derives the gene-info file names from it.
cdot_json = os.path.join(workflow.basedir, "cdot_json.py")
cdot_dir = os.path.dirname(workflow.basedir)
# Extend the inherited environment rather than replacing it: passing only
# PYTHONPATH drops PATH (and any virtualenv/conda variables), so the version
# probe could run under a different interpreter than the rules' shell
# commands. An argument list avoids shell quoting problems with odd paths.
cdot_output_raw = subprocess.check_output(
    [cdot_json, "--version"],
    env={**os.environ, "PYTHONPATH": cdot_dir},
)
cdot_data_version = cdot_output_raw.decode().strip()

# Name it based on date as it may vary
today = datetime.now().date().isoformat()
gene_info_download_filename = f"Homo_sapiens.gene_info.{today}.gz"
gene_info_json = f"Homo_sapiens.gene-info-{cdot_data_version}.json.gz"
13 changes: 3 additions & 10 deletions generate_transcript_data/transcripts.smk
Original file line number Diff line number Diff line change
@@ -1,26 +1,19 @@
import subprocess
include: "cdot_utils.smk"

annotation_consortium = config["annotation_consortium"]
genome_build = config["genome_build"]
urls = config["urls"]
cdot_output_dir = os.path.join(annotation_consortium, genome_build)
cdot_json = os.path.join(workflow.basedir, "cdot_json.py")
cdot_dir = os.path.dirname(workflow.basedir)
cdot_output_raw = subprocess.check_output(f"{cdot_json} --version", shell=True, env={"PYTHONPATH": cdot_dir})
version = cdot_output_raw.decode().strip()

# This needs to be made top level script
gene_info_json = f"Homo_sapiens.gene-info-{version}.json.gz"

cdot_file_template = "cdot-" + version + "-{name}.json.gz"
cdot_file_template = "cdot-" + cdot_data_version + "-{name}.json.gz"

def get_cdot_command(wildcards):
    """Return the cdot_json.py sub-command used to convert one download.

    The download's URL is looked up in the module-level ``urls`` mapping by
    ``wildcards.name``. GFF3 files are published with either a ``.gff.gz`` or a
    ``.gff3.gz`` suffix, so both select ``gff_to_json``; anything else is
    treated as GTF.

    NOTE(review): the sub-command names are kept as they were; confirm them
    against the cdot_json.py command line.
    """
    url = urls[wildcards.name]
    # str.endswith accepts a tuple, so ".gff3.gz" is no longer misrouted to the
    # GTF converter.
    if url.endswith((".gff.gz", ".gff3.gz")):
        return "gff_to_json"
    return "gtf_to_json"


rule all:
rule all_transcripts:
    # Target rule: one cdot JSON per configured download name, written under
    # <annotation_consortium>/<genome_build>/.
    input:
        expand(
            os.path.join(cdot_output_dir, cdot_file_template),
            name=list(urls),
        )

Expand Down

0 comments on commit 39c139d

Please sign in to comment.