Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow fasta only to prepareref #119

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions ariba/ref_preparer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import sys
import os
import pickle
import pyfastaq
from ariba import reference_data

class Error (Exception): pass
Expand All @@ -9,8 +10,9 @@ class Error (Exception): pass
class RefPreparer:
def __init__(self,
fasta_files,
metadata_tsv_files,
extern_progs,
metadata_tsv_files=None,
all_coding=None,
version_report_lines=None,
min_gene_length=6,
max_gene_length=10000,
Expand All @@ -30,7 +32,8 @@ def __init__(self,
self.version_report_lines = version_report_lines

self.fasta_files = fasta_files
self.metadata_tsv_files = metadata_tsv_files
self.metadata_tsv_files = [] if metadata_tsv_files is None else metadata_tsv_files
self.all_coding = all_coding
self.min_gene_length = min_gene_length
self.max_gene_length = max_gene_length
self.genetic_code = genetic_code
Expand All @@ -42,6 +45,15 @@ def __init__(self,
self.verbose = verbose


@classmethod
def _fasta_to_metadata(cls, infile, out_fh, all_coding):
seq_reader = pyfastaq.sequences.file_reader(infile)
coding = '1' if all_coding else '0'

for seq in seq_reader:
print(seq.id, coding, 0, '.', '.', '.', sep='\t', file=out_fh)


def _write_info_file(self, outfile):
with open(outfile, 'w') as fout:
for filename in self.fasta_files:
Expand Down Expand Up @@ -126,6 +138,18 @@ def run(self, outdir):
print(file=f)
print(*self.version_report_lines, sep='\n', file=f)

if self.all_coding is not None:
assert len(self.metadata_tsv_files) == 0
assert self.all_coding in {'yes', 'no'}
self.metadata_tsv_files = [os.path.join(outdir, '00.auto_metadata.tsv')]
f_out = pyfastaq.utils.open_file_write(self.metadata_tsv_files[0])
for fasta_file in self.fasta_files:
RefPreparer._fasta_to_metadata(fasta_file, f_out, self.all_coding=='yes')
pyfastaq.utils.close(f_out)
else:
assert self.all_coding is None
assert len(self.metadata_tsv_files) > 0

self._write_info_file(os.path.join(outdir, '00.info.txt'))

self.refdata = reference_data.ReferenceData(
Expand Down
3 changes: 2 additions & 1 deletion ariba/tasks/prepareref.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ def run(options):

preparer = ref_preparer.RefPreparer(
options.fasta_files,
options.tsv_files,
extern_progs,
metadata_tsv_files=options.tsv_files,
all_coding=options.all_coding,
version_report_lines=version_report_lines,
min_gene_length=options.min_gene_length,
max_gene_length=options.max_gene_length,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
seq1 1 0 . . .
seq2 1 0 . . .
seq3 1 0 . . .
6 changes: 6 additions & 0 deletions ariba/tests/data/ref_preparer_test_fasta_to_metadata.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>seq1
CACTACAT
>seq2
AAAA
>seq3
GGGGG
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
seq1 0 0 . . .
seq2 0 0 . . .
seq3 0 0 . . .
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
gene1 0 0 . . .
gene2 0 0 . . .
gene3 0 0 . . .
gene4.var_only 0 0 . . .
noncoding1 0 0 . . .
noncoding2.var_only 0 0 . . .
noncoding3.var_only 0 0 . . .
noncoding4.var_only 0 0 . . .
cannot_make_into_a_gene 0 0 . . .
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
input fasta file: /home/ubuntu/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.1.fa
input fasta file: /home/ubuntu/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.2.fa
input fasta file: /home/ubuntu/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.3.fa
input tsv file: tmp.ref_preparer_test_run/00.auto_metadata.tsv
genetic_code 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
ARIBA run with this command:
python3 -m unittest prepareref ariba.tests.ref_preparer_test.TestRefPreparer.test_run_all_noncoding
from this directory: /home/ubuntu/sanger-pathogens/ariba


Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
cannot_make_into_a_gene 0 0 . . .
gene1 0 0 . . .
gene2 0 0 . . .
gene3 0 0 . . .
gene4.var_only 0 0 . . .
noncoding1 0 0 . . .
noncoding2.var_only 0 0 . . .
noncoding3.var_only 0 0 . . .
noncoding4.var_only 0 0 . . .
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
>cannot_make_into_a_gene
AAAAAAAAAAAAAAAA
>gene1
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA
>gene2
ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA
>gene3
ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA
>gene4.var_only
ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA
>noncoding1
CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA
>noncoding2.var_only
CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA
>noncoding3.var_only
CTACTGATTATCTACTATCTGCATCGATGCCTGATCTA
>noncoding4.var_only
CAACCACATGCAGTCATGCAACCAACACTCTCATCTAA
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
cluster_1 gene1 gene2
cluster_2 cannot_make_into_a_gene
gene4+ gene3 gene4.var_only
noncoding- noncoding1 noncoding2.var_only noncoding3.var_only
noncoding4 noncoding4.var_only
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
>cannot_make_into_a_gene
AAAAAAAAAAAAAAAA
>gene1
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA
>gene2
ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA
>gene3
ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA
>gene4.var_only
ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA
>noncoding1
CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA
>noncoding2.var_only
CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA
>noncoding3.var_only
CTACTGATTATCTACTATCTGCATCGATGCCTGATCTA
>noncoding4.var_only
CAACCACATGCAGTCATGCAACCAACACTCTCATCTAA
58 changes: 57 additions & 1 deletion ariba/tests/ref_preparer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,24 @@


class TestRefPreparer(unittest.TestCase):
def test_fasta_to_metadata(self):
'''test _fasta_to_metadata'''
infile = os.path.join(data_dir, 'ref_preparer_test_fasta_to_metadata.fa')
tmp_out = 'tmp.test_fasta_to_metadata.tsv'
expected_coding = os.path.join(data_dir, 'ref_preparer_test_fasta_to_metadata.coding.tsv')
expected_noncoding = os.path.join(data_dir, 'ref_preparer_test_fasta_to_metadata.noncoding.tsv')

with open(tmp_out, 'w') as f:
ref_preparer.RefPreparer._fasta_to_metadata(infile, f, True)
self.assertTrue(filecmp.cmp(expected_coding, tmp_out, shallow=False))

with open(tmp_out, 'w') as f:
ref_preparer.RefPreparer._fasta_to_metadata(infile, f, False)
self.assertTrue(filecmp.cmp(expected_noncoding, tmp_out, shallow=False))

os.unlink(tmp_out)


def test_rename_clusters(self):
'''test _rename_clusters'''
clusters_in = {
Expand Down Expand Up @@ -72,7 +90,7 @@ def test_run(self):
]

extern_progs = external_progs.ExternalProgs()
refprep = ref_preparer.RefPreparer(fasta_in, tsv_in, extern_progs, genetic_code=1)
refprep = ref_preparer.RefPreparer(fasta_in, extern_progs, metadata_tsv_files=tsv_in, genetic_code=1)
tmp_out = 'tmp.ref_preparer_test_run'
refprep.run(tmp_out)
expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run.out')
Expand All @@ -96,3 +114,41 @@ def test_run(self):

shutil.rmtree(tmp_out)


def test_run_all_noncoding(self):
'''test run with no metadata input, all sequences are noncoding'''
fasta_in = [
os.path.join(data_dir, 'ref_preparer_test_run.in.1.fa'),
os.path.join(data_dir, 'ref_preparer_test_run.in.2.fa'),
os.path.join(data_dir, 'ref_preparer_test_run.in.3.fa'),
]

extern_progs = external_progs.ExternalProgs()
refprep = ref_preparer.RefPreparer(fasta_in, extern_progs, all_coding='no', genetic_code=1)
tmp_out = 'tmp.ref_preparer_test_run'
refprep.run(tmp_out)
expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run_all_noncoding.out')

test_files = [
'00.auto_metadata.tsv',
'01.filter.check_metadata.tsv',
'01.filter.check_genes.log',
'01.filter.check_metadata.log',
'02.cdhit.all.fa',
'02.cdhit.clusters.tsv',
'02.cdhit.gene.fa',
'02.cdhit.gene.varonly.fa',
'02.cdhit.noncoding.fa',
'02.cdhit.noncoding.varonly.fa',
]

for filename in test_files:
expected = os.path.join(expected_outdir, filename)
got = os.path.join(tmp_out, filename)
self.assertTrue(filecmp.cmp(expected, got, shallow=False))

shutil.rmtree(tmp_out)




6 changes: 4 additions & 2 deletions scripts/ariba
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,13 @@ subparser_prepareref = subparsers.add_parser(
help='Prepare reference data for input to "run"',
usage='ariba prepareref [options] <outdir>',
description='Prepare reference data for running the pipeline with "ariba run"',
epilog='REQUIRED: -f and -m must each be used at least once',
epilog='REQUIRED: -f/--fasta, and also either -m/--metadata or --all_coding must be used',
)
input_group = subparser_prepareref.add_argument_group('input files options')
input_group.add_argument('-f', '--fasta', action='append', dest='fasta_files', required=True, help='REQUIRED. Name of fasta file. Can be used more than once if your sequences are spread over more than on file', metavar='FILENAME')
input_group.add_argument('-m', '--metadata', action='append', dest='tsv_files', required=True, help='REQUIRED. Name of tsv file of metadata about the input sequences. Can be used more than once if your metadata is spread over more than one file', metavar='FILENAME')
meta_group = input_group.add_mutually_exclusive_group(required=True)
meta_group.add_argument('-m', '--metadata', action='append', dest='tsv_files', help='Name of tsv file of metadata about the input sequences. Can be used more than once if your metadata is spread over more than one file. Incompatible with --all_coding', metavar='FILENAME')
meta_group.add_argument('--all_coding', choices=['yes', 'no'], help='Use this if you only have a fasta of presence absence sequences as input, and no metadata. Use "yes" if all sequences are coding, or "no" if they are all non-coding. Incompatible with -m/--metadata')

cdhit_group = subparser_prepareref.add_argument_group('cd-hit options')
cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster". Incompatible with --cdhit_clusters.')
Expand Down