Skip to content

Commit

Permalink
Merge pull request #202 from TranslatorSRI/improve-uniprotkb-downloads
Browse files Browse the repository at this point in the history
UniProtKB downloads have gotten really slow lately. Rather than relying on the built-in `pull_via_urllib()` method, this PR switches those downloads over to `wget --continue` so that we get progress updates and can resume incomplete downloads. I've written a `pull_via_wget()` method that calls the `wget` command-line tool.
  • Loading branch information
gaurav authored Jan 2, 2024
2 parents ba0f710 + 48991cb commit f7ed8f0
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 12 deletions.
69 changes: 69 additions & 0 deletions src/babel_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import subprocess
from ftplib import FTP
from io import BytesIO
import gzip
Expand Down Expand Up @@ -204,6 +205,74 @@ def pull_via_urllib(url: str, in_file_name: str, decompress = True, subpath=None
# return the filename to the caller
return out_file_name


def pull_via_wget(
        url_prefix: str,
        in_file_name: str,
        decompress=True,
        subpath: str = None,
        continue_incomplete: bool = True,
        retries: int = 10):
    """
    Download a file using wget. We call wget from the command line, and use command line options to
    request continuing incomplete downloads.

    :param url_prefix: The URL prefix to download.
    :param in_file_name: The filename to download -- this will be concatenated to the URL prefix. This should include
        the compression extension (e.g. `.gz`); we will remove that extension during decompression.
    :param decompress: Whether this is a Gzip file that should be decompressed after download.
    :param subpath: The subdirectory of the configured download directory where this file should be stored.
    :param continue_incomplete: Should wget continue an incomplete download?
    :param retries: The number of tries passed to wget as `--tries`. If this is not positive, no `--tries`
        option is passed and wget uses its own default.
    :return: The path of the resulting file: the decompressed file if `decompress` is set, otherwise the
        downloaded file.
    :raises RuntimeError: if decompression is requested for a file that does not end in `.gz`, if wget or gunzip
        exit with a non-zero status, or if the expected file does not exist afterwards.
    """

    # Fail fast: check that we know how to decompress this file *before* spending time downloading it.
    if decompress and not in_file_name.endswith('.gz'):
        raise RuntimeError(f"Don't know how to decompress {in_file_name}")

    # Prepare download URL and location.
    download_dir = get_config()['download_directory']
    url = url_prefix + in_file_name
    if subpath:
        dl_file_name = os.path.join(download_dir, subpath, in_file_name)
    else:
        dl_file_name = os.path.join(download_dir, in_file_name)

    # `wget -O` does not create missing parent directories, so make sure the target directory exists.
    target_dir = os.path.dirname(dl_file_name)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Prepare wget options.
    wget_command_line = [
        'wget',
        '--progress=bar:force:noscroll',
    ]
    if continue_incomplete:
        wget_command_line.append('--continue')
    if retries > 0:
        wget_command_line.append(f'--tries={retries}')

    # Add URL and output file.
    wget_command_line.append(url)
    wget_command_line.extend(['-O', dl_file_name])

    # Execute wget. Its output is deliberately not captured so that the progress bar remains visible;
    # this means process.stderr is not available, so we report the exit code on failure instead.
    logging.info(f"Downloading {dl_file_name} using wget: {wget_command_line}")
    process = subprocess.run(wget_command_line)
    if process.returncode != 0:
        raise RuntimeError(f"Could not execute wget {wget_command_line}: exit code {process.returncode}")

    # Decompress the downloaded file if needed. gunzip replaces `<name>.gz` with `<name>`.
    final_file_name = dl_file_name
    if decompress:
        final_file_name = dl_file_name[:-len('.gz')]
        gunzip_command_line = ['gunzip', dl_file_name]
        process = subprocess.run(gunzip_command_line)
        if process.returncode != 0:
            raise RuntimeError(
                f"Could not execute gunzip {gunzip_command_line}: exit code {process.returncode}")

    # Check the file we are about to hand back to the caller (this also covers decompress=False).
    if not os.path.isfile(final_file_name):
        file_kind = 'uncompressed' if decompress else 'downloaded'
        raise RuntimeError(f'Expected {file_kind} file {final_file_name} does not exist.')

    file_size = os.path.getsize(final_file_name)
    if file_size > 0:
        logging.info(f"Downloaded {final_file_name} from {url}, file size {file_size} bytes.")
    else:
        # Preserve the original best-effort behaviour (no exception), but don't stay silent about it.
        logging.warning(f"Downloaded {final_file_name} from {url}, but the file is empty.")

    # Return the filename to the caller, as pull_via_urllib() does.
    return final_file_name


def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None):
"""
:param synonym_list:
Expand Down
7 changes: 0 additions & 7 deletions src/datahandlers/uniprotkb.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from src.babel_utils import pull_via_urllib, make_local_name

def pull_one_uniprotkb(which):
    """
    Pull a single UniProtKB FASTA file via pull_via_urllib().

    :param which: The name fragment used to build the file name `uniprot_{which}.fasta.gz`.
    """
    complete_url = 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/'
    fasta_file_name = f'uniprot_{which}.fasta.gz'
    pull_via_urllib(complete_url, fasta_file_name, subpath='UniProtKB')

def readlabels(which):
swissname = make_local_name(f'UniProtKB/uniprot_{which}.fasta')
Expand All @@ -17,11 +15,6 @@ def readlabels(which):
swissprot_labels[uniprotid] = f'{name} ({which})'
return swissprot_labels

def pull_uniprotkb():
    """
    Pull the UniProtKB ID mapping file followed by the `sprot` and `trembl` FASTA files,
    each via pull_via_urllib() into the `UniProtKB` subpath.
    """
    idmapping_url = 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/'
    complete_url = 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/'

    pull_via_urllib(idmapping_url, 'idmapping.dat.gz', subpath='UniProtKB')
    for which in ('sprot', 'trembl'):
        fasta_file_name = f'uniprot_{which}.fasta.gz'
        pull_via_urllib(complete_url, fasta_file_name, subpath='UniProtKB')

def pull_uniprot_labels(sprotfile,tremblfile,fname):
slabels = readlabels('sprot')
tlabels = readlabels('trembl')
Expand Down
21 changes: 16 additions & 5 deletions src/snakefiles/datacollect.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import src.datahandlers.chebi as chebi
import src.datahandlers.hgncfamily as hgncfamily
import src.datahandlers.pantherfamily as pantherfamily
import src.datahandlers.complexportal as complexportal
from src.babel_utils import pull_via_wget

import src.prefixes as prefixes

Expand Down Expand Up @@ -91,13 +92,23 @@ rule get_mods_labels:

### UniProtKB

rule get_uniprotkb:
rule get_uniprotkb_idmapping:
output:
config['download_directory']+'/UniProtKB/uniprot_sprot.fasta',
config['download_directory']+'/UniProtKB/uniprot_trembl.fasta',
config['download_directory']+'/UniProtKB/idmapping.dat'
idmapping = config['download_directory']+'/UniProtKB/idmapping.dat'
run:
uniprotkb.pull_uniprotkb()
pull_via_wget("https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/", "idmapping.dat.gz", decompress=True, subpath='UniProtKB')

rule get_uniprotkb_sprot:
    # Download uniprot_sprot.fasta.gz with wget and decompress it into the UniProtKB download directory.
    output:
        uniprot_sprot = config['download_directory']+'/UniProtKB/uniprot_sprot.fasta'
    run:
        pull_via_wget(
            "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/",
            "uniprot_sprot.fasta.gz",
            decompress=True,
            subpath='UniProtKB',
        )

rule get_uniprotkb_trembl:
    # Download uniprot_trembl.fasta.gz with wget and decompress it into the UniProtKB download directory.
    output:
        uniprot_trembl = config['download_directory']+'/UniProtKB/uniprot_trembl.fasta'
    run:
        pull_via_wget(
            "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/",
            "uniprot_trembl.fasta.gz",
            decompress=True,
            subpath='UniProtKB',
        )

rule get_uniprotkb_labels:
input:
Expand Down

0 comments on commit f7ed8f0

Please sign in to comment.