Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vfdb names #87

Merged
merged 5 commits into from
May 18, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ariba/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
'summary_sample',
'tasks',
'versions',
'vfdb_parser',
]

from ariba import *
22 changes: 10 additions & 12 deletions ariba/ref_genes_getter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class Error (Exception): pass
import urllib.request
import time
import json
from ariba import common, card_record
from ariba import common, card_record, vfdb_parser


class RefGenesGetter:
Expand Down Expand Up @@ -244,23 +244,21 @@ def _get_from_vfdb(self, outprefix):

try:
os.mkdir(tmpdir)
os.chdir(tmpdir)
except:
raise Error('Error mkdir/chdir ' + tmpdir)
raise Error('Error mkdir ' + tmpdir)

zipfile = 'VFDB_setA_nt.fas.gz'
zipfile = os.path.join(tmpdir, 'VFDB_setA_nt.fas.gz')
self._download_file('http://www.mgc.ac.cn/VFs/Down/VFDB_setA_nt.fas.gz', zipfile)
common.syscall('gunzip ' + zipfile)
os.chdir(current_dir)
print('Extracted files.')

final_fasta = outprefix + '_VFDB.fa'
shutil.move(tmpdir+'/VFDB_setA_nt.fas',final_fasta)
vparser = vfdb_parser.VfdbParser(zipfile, outprefix)
vparser.run()
shutil.rmtree(tmpdir)
print('Extracted files.')
final_fasta = outprefix + '.presence_absence.fa'
final_tsv = outprefix + '.metadata.tsv'

print('Extracted core DNA sequence dataset. Final file is called', final_fasta, end='\n\n')
print('Extracted core DNA sequence dataset and metadata. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
print('You can use it with ARIBA like this:')
print('ariba prepareref --presabs', final_fasta, 'output_directory\n')
print('ariba prepareref --ref_prefix', outprefix, 'output_directory\n')
print('If you use this downloaded data, please cite:')
print('"VFDB 2016: hierarchical and refined dataset for big data analysis-10 years on",\nChen LH et al 2016, Nucleic Acids Res. 44(Database issue):D694-D697. PMID: 26578559\n')

Expand Down
2 changes: 1 addition & 1 deletion ariba/reference_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ def _remove_bad_genes(self, seqs_dict, log_file):
def sanity_check(self, outprefix):
variants_only_removed = self._remove_bad_genes(self.seq_dicts['variants_only'], outprefix + '.00.check_fasta_variants_only.log')
presence_absence_removed = self._remove_bad_genes(self.seq_dicts['presence_absence'], outprefix + '.00.check_fasta_presence_absence.log')
self._filter_bad_variant_data(outprefix + '.01.check_variants', variants_only_removed, presence_absence_removed)
self._filter_bad_variant_data(outprefix + '.01.check_variants', presence_absence_removed, variants_only_removed)


@classmethod
Expand Down
6 changes: 6 additions & 0 deletions ariba/tests/data/vfdb_parser_test_run.in.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>VF123(gi:1234) (abcD) foobar description1 [abc] [genus1 species1]
AAAA
>VF234(gi:2345) (efgH) spam eggs description2 [abc] [genus2 species2]
CCCC
>seq1 blah
ACGT
6 changes: 6 additions & 0 deletions ariba/tests/data/vfdb_parser_test_run.out.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>abcD.VF123(gi:1234).genus1_species1
AAAA
>efgH.VF234(gi:2345).genus2_species2
CCCC
>seq1 blah
ACGT
2 changes: 2 additions & 0 deletions ariba/tests/data/vfdb_parser_test_run.out.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
abcD.VF123(gi:1234).genus1_species1 . . foobar description1 [abc]
efgH.VF234(gi:2345).genus2_species2 . . spam eggs description2 [abc]
60 changes: 60 additions & 0 deletions ariba/tests/vfdb_parser_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import unittest
import filecmp
import os
from ariba import vfdb_parser

modules_dir = os.path.dirname(os.path.abspath(vfdb_parser.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')


class TestVfdbParser(unittest.TestCase):
def test_fa_header_to_name_pieces(self):
'''test _fa_header_to_name_pieces'''
tests = [
('VF123(gi:1234) (abcD) foobar description [abc] [genus species]', ('VF123(gi:1234)', 'abcD', 'foobar description [abc]', 'genus species')),
('F123(gi:1234) (abcD) foobar description [abc] [genus species]', None), # no V at start
('VF123(gi:1234) (abcD) foobar description [abc]', None), # missing genus species
('VF123(gi:1234) abcD foobar description [abc] [genus species]', None), # brackets missing around abcD
]

for header, expected in tests:
got = vfdb_parser.VfdbParser._fa_header_to_name_pieces(header)
self.assertEqual(expected, got)


def test_fa_header_to_name_and_metadata(self):
'''test _fa_header_to_name_and_metadata'''
headers = [
'VF123(gi:1234) (abcD) foobar description [abc] [genus species]',
'F123(gi:1234) (abcD) foobar description [abc] [genus species]', # no V at start
'VF123(gi:1234) (abcD) foobar description [abc]', # missing genus species
'VF123(gi:1234) abcD foobar description [abc] [genus species]', # brackets missing around abcD
]

expected = [
('abcD.VF123(gi:1234).genus_species', 'foobar description [abc]'),
(headers[1], None),
(headers[2], None),
(headers[3], None),
]

assert len(headers) == len(expected)
for i in range(len(headers)):
got = vfdb_parser.VfdbParser._fa_header_to_name_and_metadata(headers[i])
self.assertEqual(expected[i], got)


def test_run(self):
'''test run'''
infile = os.path.join(data_dir, 'vfdb_parser_test_run.in.fa')
expected_tsv = os.path.join(data_dir, 'vfdb_parser_test_run.out.tsv')
expected_fa = os.path.join(data_dir, 'vfdb_parser_test_run.out.fa')
outprefix = 'tmp.vfdb_parser_test_run'
got_tsv = outprefix + '.metadata.tsv'
got_fa = outprefix + '.presence_absence.fa'
vp = vfdb_parser.VfdbParser(infile, outprefix)
vp.run()
self.assertTrue(filecmp.cmp(expected_tsv, got_tsv, shallow=False))
self.assertTrue(filecmp.cmp(expected_fa, got_fa, shallow=False))
os.unlink(got_tsv)
os.unlink(got_fa)
46 changes: 46 additions & 0 deletions ariba/vfdb_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import re
import pyfastaq

class Error (Exception): pass

name_regex = re.compile(r'^(?P<vfdb_id>V\S+\)) \((?P<name>\S+)\) (?P<description>.*\]) \[(?P<genus_etc>.*)\]$')

class VfdbParser:
def __init__(self, infile, outprefix):
self.infile = infile
self.outprefix = outprefix


@classmethod
def _fa_header_to_name_pieces(cls, fa_header):
m = name_regex.search(fa_header)
if m is None:
return None
else:
return tuple([m.group(x) for x in ['vfdb_id', 'name', 'description', 'genus_etc']])


@staticmethod
def _fa_header_to_name_and_metadata(fa_header):
name_data = VfdbParser._fa_header_to_name_pieces(fa_header)
if name_data is None:
return fa_header, None
else:
vfdb_id, name, description, genus_etc = name_data
return name + '.' + vfdb_id + '.' + genus_etc.replace(' ', '_'), description


def run(self):
file_reader = pyfastaq.sequences.file_reader(self.infile)
fa_out = pyfastaq.utils.open_file_write(self.outprefix + '.presence_absence.fa')
tsv_out = pyfastaq.utils.open_file_write(self.outprefix + '.metadata.tsv')

for seq in file_reader:
seq.id, description = self._fa_header_to_name_and_metadata(seq.id)
if description is not None:
print(seq.id, '.', '.', description, sep='\t', file=tsv_out)
print(seq, file=fa_out)

pyfastaq.utils.close(fa_out)
pyfastaq.utils.close(tsv_out)