Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve UniProtKB downloads #202

Merged
merged 3 commits into from
Jan 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions src/babel_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import subprocess
from ftplib import FTP
from io import BytesIO
import gzip
Expand Down Expand Up @@ -204,6 +205,74 @@ def pull_via_urllib(url: str, in_file_name: str, decompress = True, subpath=None
# return the filename to the caller
return out_file_name


def pull_via_wget(
        url_prefix: str,
        in_file_name: str,
        decompress=True,
        subpath:str=None,
        continue_incomplete:bool=True,
        retries:int=10):
    """
    Download a file using wget. We call wget from the command line, and use command line options to
    request continuing incomplete downloads.

    :param url_prefix: The URL prefix to download.
    :param in_file_name: The filename to download -- this will be concatenated to the URL prefix. This should include
        the compression extension (e.g. `.gz`); we will remove that extension during decompression.
    :param decompress: Whether this is a Gzip file that should be decompressed after download.
    :param subpath: The subdirectory of `babel_download` where this file should be stored.
    :param continue_incomplete: Should wget continue an incomplete download?
    :param retries: The number of retries to attempt. Values of zero or less leave wget's default in place.
    :return: The path of the resulting file: the decompressed file if `decompress` is true, otherwise the
        downloaded file itself.
    :raises RuntimeError: If the file cannot be decompressed (unsupported extension), if wget or gunzip exit
        with a non-zero status, or if the expected output file does not exist afterwards.
    """

    # Prepare download URL and location
    download_dir = get_config()['download_directory']
    url = url_prefix + in_file_name
    if subpath:
        dl_file_name = os.path.join(download_dir, subpath, in_file_name)
    else:
        dl_file_name = os.path.join(download_dir, in_file_name)

    # Work out the final filename *before* downloading, so that an unsupported
    # extension fails immediately instead of after a potentially huge download.
    if decompress:
        if not dl_file_name.endswith('.gz'):
            raise RuntimeError(f"Don't know how to decompress {in_file_name}")
        final_file_name = dl_file_name[:-3]
    else:
        final_file_name = dl_file_name

    # `wget -O` does not create missing directories, so make sure the target exists.
    target_dir = os.path.dirname(dl_file_name)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Prepare wget options.
    wget_command_line = [
        'wget',
        '--progress=bar:force:noscroll',
    ]
    if continue_incomplete:
        wget_command_line.append('--continue')
    if retries > 0:
        wget_command_line.append(f'--tries={retries}')

    # Add URL and output file.
    wget_command_line.append(url)
    wget_command_line.extend(['-O', dl_file_name])

    # Execute wget. Its output is deliberately not captured so that the progress
    # bar stays visible; this means `process.stderr` is always None, so failures
    # are reported by exit code.
    logging.info(f"Downloading {dl_file_name} using wget: {wget_command_line}")
    process = subprocess.run(wget_command_line)
    if process.returncode != 0:
        raise RuntimeError(
            f"Could not execute wget {wget_command_line}: exit code {process.returncode}")

    # Decompress the downloaded file if needed.
    if decompress:
        gunzip_command_line = ['gunzip', dl_file_name]
        process = subprocess.run(gunzip_command_line)
        if process.returncode != 0:
            raise RuntimeError(
                f"Could not execute gunzip {gunzip_command_line}: exit code {process.returncode}")

    # Confirm that the file we promised the caller is actually there.
    if not os.path.isfile(final_file_name):
        if decompress:
            raise RuntimeError(f'Expected uncompressed file {final_file_name} does not exist.')
        raise RuntimeError(f'Expected downloaded file {final_file_name} does not exist.')

    file_size = os.path.getsize(final_file_name)
    if file_size > 0:
        logging.info(f"Downloaded {final_file_name} from {url}, file size {file_size} bytes.")
    else:
        # An empty file is almost certainly a failed download, but earlier
        # behaviour accepted it, so warn rather than raise.
        logging.warning(f"Downloaded {final_file_name} from {url}, but the file is empty.")

    # Return the filename to the caller, mirroring pull_via_urllib.
    return final_file_name


def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None):
"""
:param synonym_list:
Expand Down
7 changes: 0 additions & 7 deletions src/datahandlers/uniprotkb.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from src.babel_utils import pull_via_urllib, make_local_name

def pull_one_uniprotkb(which):
    """
    Fetch a single UniProtKB FASTA archive via `pull_via_urllib`.

    :param which: The name fragment interpolated into `uniprot_{which}.fasta.gz`.
    """
    complete_url = 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/'
    fasta_archive = f'uniprot_{which}.fasta.gz'
    # Stored under the `UniProtKB` subpath of the download directory.
    pull_via_urllib(complete_url, fasta_archive, subpath='UniProtKB')

def readlabels(which):
swissname = make_local_name(f'UniProtKB/uniprot_{which}.fasta')
Expand All @@ -17,11 +15,6 @@ def readlabels(which):
swissprot_labels[uniprotid] = f'{name} ({which})'
return swissprot_labels

def pull_uniprotkb():
    """
    Fetch the UniProtKB ID mapping file and then the `sprot` and `trembl` FASTA archives,
    in that order, each via `pull_via_urllib` into the `UniProtKB` subpath.
    """
    idmapping_url = 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/'
    complete_url = 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/'

    # Build the (URL prefix, filename) pairs up front, keeping the original download order.
    downloads = [(idmapping_url, 'idmapping.dat.gz')]
    for which in ('sprot', 'trembl'):
        downloads.append((complete_url, f'uniprot_{which}.fasta.gz'))

    for url_prefix, archive_name in downloads:
        pull_via_urllib(url_prefix, archive_name, subpath='UniProtKB')

def pull_uniprot_labels(sprotfile,tremblfile,fname):
slabels = readlabels('sprot')
tlabels = readlabels('trembl')
Expand Down
21 changes: 16 additions & 5 deletions src/snakefiles/datacollect.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import src.datahandlers.chebi as chebi
import src.datahandlers.hgncfamily as hgncfamily
import src.datahandlers.pantherfamily as pantherfamily
import src.datahandlers.complexportal as complexportal
from src.babel_utils import pull_via_wget

import src.prefixes as prefixes

Expand Down Expand Up @@ -91,13 +92,23 @@ rule get_mods_labels:

### UniProtKB

rule get_uniprotkb:
rule get_uniprotkb_idmapping:
output:
config['download_directory']+'/UniProtKB/uniprot_sprot.fasta',
config['download_directory']+'/UniProtKB/uniprot_trembl.fasta',
config['download_directory']+'/UniProtKB/idmapping.dat'
idmapping = config['download_directory']+'/UniProtKB/idmapping.dat'
run:
uniprotkb.pull_uniprotkb()
pull_via_wget("https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/", "idmapping.dat.gz", decompress=True, subpath='UniProtKB')

# Download the UniProtKB `sprot` FASTA archive with wget and decompress it.
rule get_uniprotkb_sprot:
    output:
        uniprot_sprot = config['download_directory']+'/UniProtKB/uniprot_sprot.fasta'
    run:
        pull_via_wget(
            "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/",
            "uniprot_sprot.fasta.gz",
            decompress=True,
            subpath='UniProtKB',
        )

# Download the UniProtKB `trembl` FASTA archive with wget and decompress it.
rule get_uniprotkb_trembl:
    output:
        uniprot_trembl = config['download_directory']+'/UniProtKB/uniprot_trembl.fasta'
    run:
        pull_via_wget(
            "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/",
            "uniprot_trembl.fasta.gz",
            decompress=True,
            subpath='UniProtKB',
        )

rule get_uniprotkb_labels:
input:
Expand Down