Skip to content

Commit

Permalink
Fix for issue #278 version 2
Browse files Browse the repository at this point in the history
  • Loading branch information
kpepper committed Aug 23, 2019
1 parent af3b047 commit 868c995
Show file tree
Hide file tree
Showing 32 changed files with 232 additions and 19 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# Change Log

[v2.14.3](https://github.com/sanger-pathogens/ariba/tree/v2.14.3) (2019-08-23)
[Full Changelog](https://github.com/sanger-pathogens/ariba/compare/v2.14.2...v2.14.3)

**Fixed bugs:**

- Version 3.0.3 of CARD breaks prepareref [\#278](https://github.com/sanger-pathogens/ariba/issues/278)
- RT 667288: Change docker file Ariba git clone to a copy

[v2.14.2](https://github.com/sanger-pathogens/ariba/tree/v2.14.2) (2019-06-18)
[Full Changelog](https://github.com/sanger-pathogens/ariba/compare/v2.14.1...v2.14.2)

Expand Down
12 changes: 1 addition & 11 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ MAINTAINER [email protected]
# Software version numbers
ARG BOWTIE2_VERSION=2.2.9
ARG SPADES_VERSION=3.13.1
ARG CDHIT_VERSION=4.8.1
ARG ARIBA_TAG=master
ARG ARIBA_BUILD_DIR=/ariba

RUN apt-get -qq update && \
apt-get install --no-install-recommends -y \
build-essential \
cd-hit \
curl \
git \
libbz2-dev \
Expand All @@ -28,16 +28,6 @@ RUN apt-get -qq update && \
wget \
zlib1g-dev

# Install cd-hit
# We build cd-hit "manually" rather than using apt-get - see Ariba GitHub issue 278
RUN git clone https://github.com/weizhongli/cdhit.git \
&& cd cdhit \
&& git checkout V${CDHIT_VERSION} \
&& make MAX_SEQ=10000000 \
&& ln -s -f $PWD/cd-hit-est /usr/bin/cd-hit-est \
&& ln -s -f /usr/bin/cd-hit-est /usr/bin/cdhit-est \
&& cd ..

# Install bowtie
RUN wget -q http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/${BOWTIE2_VERSION}/bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip \
&& unzip bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip \
Expand Down
9 changes: 8 additions & 1 deletion ariba/ref_preparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ def __init__(self,
version_report_lines=None,
min_gene_length=6,
max_gene_length=10000,
min_noncoding_length=6,
max_noncoding_length=20000,
genetic_code=11,
cdhit_min_id=0.9,
cdhit_min_length=0.0,
Expand All @@ -38,6 +40,8 @@ def __init__(self,
self.all_coding = all_coding
self.min_gene_length = min_gene_length
self.max_gene_length = max_gene_length
self.min_noncoding_length = min_noncoding_length
self.max_noncoding_length = max_noncoding_length
self.genetic_code = genetic_code
self.cdhit_min_id = cdhit_min_id
self.cdhit_min_length = cdhit_min_length
Expand Down Expand Up @@ -177,6 +181,8 @@ def run(self, outdir):
self.metadata_tsv_files,
min_gene_length=self.min_gene_length,
max_gene_length=self.max_gene_length,
min_noncoding_length = self.min_noncoding_length,
max_noncoding_length = self.max_noncoding_length,
genetic_code=self.genetic_code,
)

Expand Down Expand Up @@ -213,8 +219,9 @@ def run(self, outdir):
pickle.dump(clusters, f)

if number_of_removed_seqs > 0:
print('WARNING.', number_of_removed_seqs, 'sequence(s) excluded. Please see the log file 01.filter.check_genes.log for details. This will show them:', file=sys.stderr)
print('WARNING.', number_of_removed_seqs, 'sequence(s) excluded. Please see the 01.filter.check_genes.log and 01.filter.check_noncoding.log for details. This will show them:', file=sys.stderr)
print(' grep REMOVE', os.path.join(outdir, '01.filter.check_genes.log'), file=sys.stderr)
print(' cat', os.path.join(outdir, '01.filter.check_noncoding.log'), file=sys.stderr)

if number_of_bad_variants_logged > 0:
print('WARNING. Problem with at least one variant. Problem variants are removed. Please see the file', os.path.join(outdir, '01.filter.check_metadata.log'), 'for details.', file=sys.stderr)
57 changes: 53 additions & 4 deletions ariba/reference_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,17 @@ def __init__(self,
rename_file=None,
min_gene_length=6,
max_gene_length=10000,
min_noncoding_length=6,
max_noncoding_length=20000,
genetic_code=11,
parameters_file=None,
):
self.seq_filenames = {}
self.seq_dicts = {}
self.min_gene_length = min_gene_length
self.max_gene_length = max_gene_length
self.min_noncoding_length = min_noncoding_length
self.max_noncoding_length = max_noncoding_length

self.sequences, self.metadata = ReferenceData._load_input_files_and_check_seq_names(fasta_files, metadata_tsv_files)
if len(self.sequences) == 0:
Expand Down Expand Up @@ -208,7 +212,7 @@ def _filter_bad_variant_data(cls, sequences, all_metadata, out_prefix, removed_s

for sequence_name, metadata_dict in sorted(all_metadata.items()):
if sequence_name in removed_sequences:
print(sequence_name, 'was removed because does not look like a gene, so removing its metadata', file=log_fh)
print(sequence_name, 'was removed because it failed filtering checks, so removing its metadata', file=log_fh)
log_lines += 1
del all_metadata[sequence_name]
continue
Expand Down Expand Up @@ -278,6 +282,16 @@ def _try_to_get_gene_seq(cls, seq, min_length, max_length):
return got[0], 'KEEP\tMade into gene. strand=' + got[1] + ', frame=' + str(got[2])


@classmethod
def _check_noncoding_seq(cls, seq, min_length, max_length):
if len(seq) < min_length:
return False, 'REMOVE\tToo short. Length: ' + str(len(seq))
elif len(seq) > max_length:
return False, 'REMOVE\tToo long. Length: ' + str(len(seq))
else:
return True, None


@classmethod
def _remove_bad_genes(cls, sequences, metadata, log_file, min_gene_length, max_gene_length):
to_remove = set()
Expand Down Expand Up @@ -308,11 +322,46 @@ def _remove_bad_genes(cls, sequences, metadata, log_file, min_gene_length, max_g
return to_remove


@classmethod
def _remove_bad_noncoding_seqs(cls, sequences, metadata, log_file, min_noncoding_length, max_noncoding_length):
to_remove = set()

if len(sequences) == 0:
return to_remove

log_fh = pyfastaq.utils.open_file_write(log_file)

for name in sorted(sequences):
if metadata[name]['seq_type'] != 'n':
continue

valid, message = ReferenceData._check_noncoding_seq(sequences[name], min_noncoding_length, max_noncoding_length)
if not valid:
to_remove.add(name)

if message is not None:
print(name, message, sep='\t', file=log_fh)

pyfastaq.utils.close(log_fh)

for name in to_remove:
sequences.pop(name)

return to_remove

def sanity_check(self, outprefix):
removed_seqs = self._remove_bad_genes(self.sequences, self.metadata, outprefix + '.check_genes.log', self.min_gene_length, self.max_gene_length)
log_lines = ReferenceData._filter_bad_variant_data(self.sequences, self.metadata, outprefix, removed_seqs)
return len(removed_seqs), log_lines

removed_gene_seqs = self._remove_bad_genes(self.sequences,
self.metadata, outprefix + '.check_genes.log',
self.min_gene_length, self.max_gene_length)

removed_noncoding_seqs = self._remove_bad_noncoding_seqs(self.sequences, self.metadata,
outprefix + '.check_noncoding.log', self.min_noncoding_length,
self.max_noncoding_length)

all_removed_seqs = removed_gene_seqs.union(removed_noncoding_seqs)
log_lines = ReferenceData._filter_bad_variant_data(self.sequences, self.metadata, outprefix, all_removed_seqs)
return len(all_removed_seqs), log_lines

@classmethod
def _new_seq_name(cls, name):
Expand Down
2 changes: 2 additions & 0 deletions ariba/tasks/prepareref.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ def run(options):
version_report_lines=version_report_lines,
min_gene_length=options.min_gene_length,
max_gene_length=options.max_gene_length,
min_noncoding_length=options.min_noncoding_length,
max_noncoding_length=options.max_noncoding_length,
genetic_code=options.genetic_code,
cdhit_min_id=options.cdhit_min_id,
cdhit_min_length=options.cdhit_min_length,
Expand Down
17 changes: 17 additions & 0 deletions ariba/tests/data/ref_preparer_test_run.in.4.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
>noncoding1-toolong
CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA
>noncoding2
CTACTGAT
>cannot_make_into_a_gene
AAAAAAAAAAAAAAAA
>noncoding3-tooshort
C
>gene1
ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA
>noncoding4-toolong
CTACTGATCATCTACTATCTG
>noncoding5-tooshort
CTCTC
>gene2
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA

8 changes: 8 additions & 0 deletions ariba/tests/data/ref_preparer_test_run.in.4.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
cannot_make_into_a_gene 1 0 . . .
noncoding1-toolong 0 0 . . .
noncoding2 0 0 C4T . .
noncoding3-tooshort 0 0 C4T . .
noncoding4-toolong 0 0 C4T . .
noncoding5-tooshort 0 0 C4T . .
gene1 1 0 . . .
gene2 1 0 . . .
Original file line number Diff line number Diff line change
@@ -1 +1 @@
cannot_make_into_a_gene was removed because does not look like a gene, so removing its metadata
cannot_make_into_a_gene was removed because it failed filtering checks, so removing its metadata
Empty file.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
input fasta file: /Users/kp11/workspace/applications/Ariba/ariba/ariba/tests/data/ref_preparer_test_run.in.4.fa
input tsv file: /Users/kp11/workspace/applications/Ariba/ariba/ariba/tests/data/ref_preparer_test_run.in.4.tsv
genetic_code 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
noncoding1-toolong noncoding1_toolong
noncoding3-tooshort noncoding3_tooshort
noncoding4-toolong noncoding4_toolong
noncoding5-tooshort noncoding5_tooshort
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
ARIBA run with this command:
setup.py prepareref test
from this directory: /Users/kp11/workspace/applications/Ariba/ariba


Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
cannot_make_into_a_gene REMOVE Does not look like a gene (tried both strands and all reading frames) AAAAAAAAAAAAAAAA
gene1 KEEP Made into gene. strand=+, frame=0
gene2 KEEP Made into gene. strand=+, frame=0
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
cannot_make_into_a_gene was removed because it failed filtering checks, so removing its metadata
noncoding1_toolong was removed because it failed filtering checks, so removing its metadata
noncoding3_tooshort was removed because it failed filtering checks, so removing its metadata
noncoding4_toolong was removed because it failed filtering checks, so removing its metadata
noncoding5_tooshort was removed because it failed filtering checks, so removing its metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
gene1 1 0 . . .
gene2 1 0 . . .
noncoding2 0 0 C4T . .
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
noncoding1_toolong REMOVE Too long. Length: 38
noncoding3_tooshort REMOVE Too short. Length: 1
noncoding4_toolong REMOVE Too long. Length: 21
noncoding5_tooshort REMOVE Too short. Length: 5
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>gene1
ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA
>gene2
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA
>noncoding2
CTACTGAT
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
cluster gene1 gene2
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>gene1
ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA
>gene2
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>noncoding2
CTACTGAT
Empty file.
Empty file.
10 changes: 10 additions & 0 deletions ariba/tests/data/reference_data_remove_bad_noncoding.in.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>noncoding1
AAAA
>noncoding2
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
>noncoding3
CCCCCC
>noncoding4
TTTTTTTTTTTTTTT
>noncoding5
AAAAAAAAAAAA
5 changes: 5 additions & 0 deletions ariba/tests/data/reference_data_remove_bad_noncoding.in.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
noncoding1 0 0 . . .
noncoding2 0 0 . . .
noncoding3 0 0 . . .
noncoding4 0 0 . . .
noncoding5 0 0 . . .
2 changes: 2 additions & 0 deletions ariba/tests/data/reference_data_test_remove_bad_noncoding.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
noncoding1 REMOVE Too short. Length: 4
noncoding2 REMOVE Too long. Length: 133
37 changes: 37 additions & 0 deletions ariba/tests/ref_preparer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def test_run(self):
test_files = [
'01.filter.check_metadata.tsv',
'01.filter.check_genes.log',
'01.filter.check_noncoding.log',
'01.filter.check_metadata.log',
'02.cdhit.all.fa',
'02.cdhit.clusters.tsv',
Expand Down Expand Up @@ -152,6 +153,7 @@ def test_run_all_noncoding(self):
'00.auto_metadata.tsv',
'01.filter.check_metadata.tsv',
'01.filter.check_genes.log',
'01.filter.check_noncoding.log',
'01.filter.check_metadata.log',
'02.cdhit.all.fa',
'02.cdhit.clusters.tsv',
Expand All @@ -168,6 +170,41 @@ def test_run_all_noncoding(self):

common.rmtree(tmp_out)

def test_run_noncoding_checks(self):
'''test run with noncoding sequences that are outside of the allowed size range'''
fasta_in = [
os.path.join(data_dir, 'ref_preparer_test_run.in.4.fa')
]
tsv_in = [
os.path.join(data_dir, 'ref_preparer_test_run.in.4.tsv')
]

extern_progs = external_progs.ExternalProgs()
refprep = ref_preparer.RefPreparer(
fasta_in, extern_progs, min_noncoding_length=6, max_noncoding_length=20,
metadata_tsv_files=tsv_in, genetic_code=1)
tmp_out = 'tmp.ref_preparer_test_run_noncoding_checks'
refprep.run(tmp_out)
expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run_noncoding_checks.out')

test_files = [
'01.filter.check_metadata.tsv',
'01.filter.check_genes.log',
'01.filter.check_noncoding.log',
'01.filter.check_metadata.log',
'02.cdhit.all.fa',
'02.cdhit.clusters.tsv',
'02.cdhit.gene.fa',
'02.cdhit.gene.varonly.fa',
'02.cdhit.noncoding.fa',
'02.cdhit.noncoding.varonly.fa',
]

for filename in test_files:
expected = os.path.join(expected_outdir, filename)
got = os.path.join(tmp_out, filename)
self.assertTrue(filecmp.cmp(expected, got, shallow=False))

common.rmtree(tmp_out)


36 changes: 36 additions & 0 deletions ariba/tests/reference_data_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,19 @@ def test_try_to_get_gene_seq(self):
for seq, got_seq, message in tests:
self.assertEqual((got_seq, message), reference_data.ReferenceData._try_to_get_gene_seq(seq, 6, 99))

def test_check_noncoding_seq(self):
'''Test _check_noncoding_seq'''
tests = [
(pyfastaq.sequences.Fasta('x', 'A' * 3), False, 'REMOVE\tToo short. Length: 3'),
(pyfastaq.sequences.Fasta('x', 'A' * 21), False, 'REMOVE\tToo long. Length: 21'),
(pyfastaq.sequences.Fasta('x', 'A' * 5), True, None),
(pyfastaq.sequences.Fasta('x', 'A' * 4), True, None),
(pyfastaq.sequences.Fasta('x', 'A' * 20), True, None)
]

for seq, valid, message in tests:
self.assertEqual((valid, message), reference_data.ReferenceData._check_noncoding_seq(seq, 4, 20))


def test_remove_bad_genes(self):
'''Test _remove_bad_genes'''
Expand All @@ -287,6 +300,29 @@ def test_remove_bad_genes(self):
os.unlink(tmp_log)


def test_remove_bad_noncoding_seqs(self):
'''Test _remove_bad_noncoding_seqs'''
test_seq_dict = {}
fasta_file = os.path.join(data_dir, 'reference_data_remove_bad_noncoding.in.fa')
metadata_file = os.path.join(data_dir, 'reference_data_remove_bad_noncoding.in.tsv')
metadata = reference_data.ReferenceData._load_all_metadata_tsvs([metadata_file])
pyfastaq.tasks.file_to_dict(fasta_file, test_seq_dict)
tmp_log = 'tmp.test_remove_bad_noncoding.log'
expected_removed = {'noncoding1','noncoding2'}
got_removed = reference_data.ReferenceData._remove_bad_noncoding_seqs(test_seq_dict, metadata, tmp_log,
min_noncoding_length=6, max_noncoding_length=15)
self.assertEqual(expected_removed, got_removed)
expected_dict = {
'noncoding3': pyfastaq.sequences.Fasta('noncoding3', 'CCCCCC'),
'noncoding4': pyfastaq.sequences.Fasta('noncoding4', 'TTTTTTTTTTTTTTT'),
'noncoding5': pyfastaq.sequences.Fasta('noncoding5', 'AAAAAAAAAAAA')
}
self.assertEqual(expected_dict, test_seq_dict)
expected_log = os.path.join(data_dir, 'reference_data_test_remove_bad_noncoding.log')
self.assertTrue(filecmp.cmp(expected_log, tmp_log, shallow=False))
os.unlink(tmp_log)


def test_new_seq_name(self):
'''Test _new_seq_name'''
tests = [
Expand Down
Loading

0 comments on commit 868c995

Please sign in to comment.