Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to choose download version #121

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 38 additions & 4 deletions ariba/ref_genes_getter.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,15 @@ class Error (Exception): pass


class RefGenesGetter:
def __init__(self, ref_db, genetic_code=11):
def __init__(self, ref_db, genetic_code=11, version=None):
    """Set up a getter for one of the supported reference databases.

    Args:
        ref_db: Name of the database to download. Must be one of the
            names in ``allowed_ref_dbs`` below.
        genetic_code: Genetic code number. It is stored on the instance
            and also assigned to ``pyfastaq.sequences.genetic_code``.
        version: Requested database version, or None. Stored unchanged
            on ``self.version``.

    Raises:
        Error: If ``ref_db`` is not one of the supported names.
    """
    allowed_ref_dbs = {'card', 'argannot', 'plasmidfinder', 'resfinder', 'srst2_argannot', 'vfdb'}
    if ref_db not in allowed_ref_dbs:
        # str() keeps this an Error (not a TypeError) when ref_db is not a
        # string, and the trailing '"' closes the quote opened before it.
        raise Error('Error in RefGenesGetter. ref_db must be one of: ' + str(allowed_ref_dbs) + ', but I got "' + str(ref_db) + '"')
    self.ref_db = ref_db
    self.genetic_code = genetic_code
    self.max_download_attempts = 3
    self.sleep_time = 2
    self.version = version
    # Module-level setting in pyfastaq, so it is shared by every user of
    # pyfastaq.sequences in this process, not just this instance.
    pyfastaq.sequences.genetic_code = self.genetic_code


Expand All @@ -41,6 +42,30 @@ def _download_file(self, url, outfile):
print(' done', flush=True)


def _get_card_versions(self, tmp_file):
    """Return the CARD releases listed on the CARD download page.

    Downloads https://card.mcmaster.ca/download to ``tmp_file``, scans it
    for links to "broad..." tarballs carrying a three-part version number,
    prints the versions found and deletes ``tmp_file``.

    Args:
        tmp_file: Path the download page is written to. The file is
            removed before this method returns or raises from parsing.

    Returns:
        dict mapping a version as a tuple of ints, e.g. ``(1, 0, 9)``, to
        the absolute URL of that version's tarball.

    Raises:
        Error: If no version link is found in the page.
    """
    print('Getting available CARD versions')
    self._download_file('https://card.mcmaster.ca/download', tmp_file)
    # [^"] (rather than .) keeps a match inside a single href attribute, so
    # an unrelated link earlier on the same line cannot be glued onto the URL.
    link_re = re.compile(r'''href="(/download/[^"]*?broad[^"]*?v([0-9]+\.[0-9]+\.[0-9]+)\.tar\.gz)"''')
    versions = {}

    try:
        with open(tmp_file) as f:
            for line in f:
                for path, version_string in link_re.findall(line):
                    # Tuple of ints so that versions sort numerically
                    # (1.0.10 after 1.0.9), not as strings.
                    key = tuple(int(x) for x in version_string.split('.'))
                    versions[key] = 'https://card.mcmaster.ca' + path
    finally:
        # The page is only needed for parsing; remove it even on failure.
        os.unlink(tmp_file)

    if not versions:
        raise Error('Error getting CARD versions. Cannot continue')

    print('Found versions:')

    for key, url in sorted(versions.items()):
        print('.'.join(str(x) for x in key), url, sep='\t')

    return versions


def _get_from_card(self, outprefix):
outprefix = os.path.abspath(outprefix)
Expand All @@ -53,8 +78,17 @@ def _get_from_card(self, outprefix):
except:
raise Error('Error mkdir/chdir ' + tmpdir)

card_version = '1.0.9'
card_tarball_url = 'https://card.mcmaster.ca/download/0/broadstreet-v' + card_version + '.tar.gz'
versions = self._get_card_versions('download.html')
if self.version is not None:
key = tuple([int(x) for x in self.version.split('.')])
if key not in versions:
raise Error('Error! Did not find requested version ' + self.version)
else:
key = sorted(list(versions.keys()))[-1]
self.version = '.'.join([str(x) for x in key])

print('Getting version', self.version)
card_tarball_url = versions[key]
card_tarball = 'card.tar.gz'
print('Working in temporary directory', tmpdir)
print('Downloading data from card:', card_tarball_url, flush=True)
Expand Down Expand Up @@ -149,7 +183,7 @@ def _get_from_card(self, outprefix):
print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
print('If you use this downloaded data, please cite:')
print('"The Comprehensive Antibiotic Resistance Database", McArthur et al 2013, PMID: 23650175')
print('and in your methods say that version', card_version, 'of the database was used')
print('and in your methods say that version', self.version, 'of the database was used')


def _get_from_resfinder(self, outprefix):
Expand Down
6 changes: 5 additions & 1 deletion ariba/tasks/getref.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@


# Entry point of the getref task: builds a RefGenesGetter from the parsed
# options (db, genetic_code, version) and runs it on options.outprefix.
def run(options):
# Diff context: this first construction is the pre-change call, which does
# not pass a version. The assignment that follows replaces it.
getter = ref_genes_getter.RefGenesGetter(options.db, genetic_code=options.genetic_code)
# options.version is handed to the getter unchanged.
getter = ref_genes_getter.RefGenesGetter(
options.db,
genetic_code=options.genetic_code,
version=options.version
)
getter.run(options.outprefix)

1 change: 1 addition & 0 deletions scripts/ariba
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ subparser_getref = subparsers.add_parser(
description='Download reference data from one of a few supported public resources',
)
# Arguments of the getref subcommand.
# --genetic_code: parsed as int, restricted by argparse to 1, 4 or 11; default 11.
subparser_getref.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
# --version: optional string with no default given, so argparse yields None
# when the flag is absent. Per the help text it only affects the card database.
subparser_getref.add_argument('--version', help='Version of reference data to download. If not used, gets the latest version. Only applies to card')
# db: positional, validated by argparse against allowed_dbs (defined outside this view).
subparser_getref.add_argument('db', help='Database to download. Must be one of: ' + ' '.join(allowed_dbs), choices=allowed_dbs, metavar="DB name")
# outprefix: positional prefix for the output filenames.
subparser_getref.add_argument('outprefix', help='Prefix of output filenames')
# Stores ariba.tasks.getref.run as `func` on the parsed namespace; presumably
# the script calls it after parsing -- confirm against the rest of the file.
subparser_getref.set_defaults(func=ariba.tasks.getref.run)
Expand Down