Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for issue #278 Version 3.0.3 of CARD breaks prepareref #279

Merged
merged 3 commits into from
Aug 30, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# Change Log

[v2.14.3](https://github.com/sanger-pathogens/ariba/tree/v2.14.3) (2019-08-23)
[Full Changelog](https://github.com/sanger-pathogens/ariba/compare/v2.14.2...v2.14.3)

**Fixed bugs:**

- Version 3.0.3 of CARD breaks prepareref [\#278](https://github.com/sanger-pathogens/ariba/issues/278)
- RT 667288: Change docker file Ariba git clone to a copy

[v2.14.2](https://github.com/sanger-pathogens/ariba/tree/v2.14.2) (2019-06-18)
[Full Changelog](https://github.com/sanger-pathogens/ariba/compare/v2.14.1...v2.14.2)

Expand Down
17 changes: 11 additions & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ MAINTAINER [email protected]
ARG BOWTIE2_VERSION=2.2.9
ARG SPADES_VERSION=3.13.1
ARG ARIBA_TAG=master
ARG ARIBA_BUILD_DIR=/ariba

RUN apt-get -qq update && \
apt-get install --no-install-recommends -y \
Expand All @@ -27,10 +28,12 @@ RUN apt-get -qq update && \
wget \
zlib1g-dev

# Install bowtie
RUN wget -q http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/${BOWTIE2_VERSION}/bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip \
&& unzip bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip \
&& rm -f bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip

# Install SPAdes
RUN wget -q https://github.com/ablab/spades/releases/download/v${SPADES_VERSION}/SPAdes-${SPADES_VERSION}-Linux.tar.gz \
&& tar -zxf SPAdes-${SPADES_VERSION}-Linux.tar.gz \
&& rm -f SPAdes-${SPADES_VERSION}-Linux.tar.gz
Expand All @@ -40,13 +43,15 @@ RUN wget -q https://github.com/ablab/spades/releases/download/v${SPADES_VERSION}
ENV ARIBA_BOWTIE2=$PWD/bowtie2-${BOWTIE2_VERSION}/bowtie2 ARIBA_CDHIT=cdhit-est MPLBACKEND="agg"
ENV PATH=$PATH:$PWD/SPAdes-${SPADES_VERSION}-Linux/bin

RUN cd /usr/local/bin && ln -s /usr/bin/python3 python && cd
RUN ln -s -f /usr/bin/python3 /usr/local/bin/python

RUN git clone https://github.com/sanger-pathogens/ariba.git \
&& cd ariba \
&& git checkout ${ARIBA_TAG} \
&& rm -rf .git \
# Install Ariba
RUN mkdir -p $ARIBA_BUILD_DIR
COPY . $ARIBA_BUILD_DIR
RUN cd $ARIBA_BUILD_DIR \
&& python3 setup.py clean --all \
&& python3 setup.py test \
&& python3 setup.py install
&& python3 setup.py install \
&& rm -rf $ARIBA_BUILD_DIR

CMD ariba
9 changes: 8 additions & 1 deletion ariba/ref_preparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ def __init__(self,
version_report_lines=None,
min_gene_length=6,
max_gene_length=10000,
min_noncoding_length=6,
max_noncoding_length=20000,
genetic_code=11,
cdhit_min_id=0.9,
cdhit_min_length=0.0,
Expand All @@ -38,6 +40,8 @@ def __init__(self,
self.all_coding = all_coding
self.min_gene_length = min_gene_length
self.max_gene_length = max_gene_length
self.min_noncoding_length = min_noncoding_length
self.max_noncoding_length = max_noncoding_length
self.genetic_code = genetic_code
self.cdhit_min_id = cdhit_min_id
self.cdhit_min_length = cdhit_min_length
Expand Down Expand Up @@ -177,6 +181,8 @@ def run(self, outdir):
self.metadata_tsv_files,
min_gene_length=self.min_gene_length,
max_gene_length=self.max_gene_length,
min_noncoding_length = self.min_noncoding_length,
max_noncoding_length = self.max_noncoding_length,
genetic_code=self.genetic_code,
)

Expand Down Expand Up @@ -213,8 +219,9 @@ def run(self, outdir):
pickle.dump(clusters, f)

if number_of_removed_seqs > 0:
print('WARNING.', number_of_removed_seqs, 'sequence(s) excluded. Please see the log file 01.filter.check_genes.log for details. This will show them:', file=sys.stderr)
print('WARNING.', number_of_removed_seqs, 'sequence(s) excluded. Please see the 01.filter.check_genes.log and 01.filter.check_noncoding.log for details. This will show them:', file=sys.stderr)
print(' grep REMOVE', os.path.join(outdir, '01.filter.check_genes.log'), file=sys.stderr)
print(' cat', os.path.join(outdir, '01.filter.check_noncoding.log'), file=sys.stderr)

if number_of_bad_variants_logged > 0:
print('WARNING. Problem with at least one variant. Problem variants are removed. Please see the file', os.path.join(outdir, '01.filter.check_metadata.log'), 'for details.', file=sys.stderr)
57 changes: 53 additions & 4 deletions ariba/reference_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,17 @@ def __init__(self,
rename_file=None,
min_gene_length=6,
max_gene_length=10000,
min_noncoding_length=6,
max_noncoding_length=20000,
genetic_code=11,
parameters_file=None,
):
self.seq_filenames = {}
self.seq_dicts = {}
self.min_gene_length = min_gene_length
self.max_gene_length = max_gene_length
self.min_noncoding_length = min_noncoding_length
self.max_noncoding_length = max_noncoding_length

self.sequences, self.metadata = ReferenceData._load_input_files_and_check_seq_names(fasta_files, metadata_tsv_files)
if len(self.sequences) == 0:
Expand Down Expand Up @@ -208,7 +212,7 @@ def _filter_bad_variant_data(cls, sequences, all_metadata, out_prefix, removed_s

for sequence_name, metadata_dict in sorted(all_metadata.items()):
if sequence_name in removed_sequences:
print(sequence_name, 'was removed because does not look like a gene, so removing its metadata', file=log_fh)
print(sequence_name, 'was removed because it failed filtering checks, so removing its metadata', file=log_fh)
log_lines += 1
del all_metadata[sequence_name]
continue
Expand Down Expand Up @@ -278,6 +282,16 @@ def _try_to_get_gene_seq(cls, seq, min_length, max_length):
return got[0], 'KEEP\tMade into gene. strand=' + got[1] + ', frame=' + str(got[2])


@classmethod
def _check_noncoding_seq(cls, seq, min_length, max_length):
if len(seq) < min_length:
return False, 'REMOVE\tToo short. Length: ' + str(len(seq))
elif len(seq) > max_length:
return False, 'REMOVE\tToo long. Length: ' + str(len(seq))
else:
return True, None


@classmethod
def _remove_bad_genes(cls, sequences, metadata, log_file, min_gene_length, max_gene_length):
to_remove = set()
Expand Down Expand Up @@ -308,11 +322,46 @@ def _remove_bad_genes(cls, sequences, metadata, log_file, min_gene_length, max_g
return to_remove


@classmethod
def _remove_bad_noncoding_seqs(cls, sequences, metadata, log_file, min_noncoding_length, max_noncoding_length):
to_remove = set()

if len(sequences) == 0:
return to_remove

log_fh = pyfastaq.utils.open_file_write(log_file)

for name in sorted(sequences):
if metadata[name]['seq_type'] != 'n':
continue

valid, message = ReferenceData._check_noncoding_seq(sequences[name], min_noncoding_length, max_noncoding_length)
if not valid:
to_remove.add(name)

if message is not None:
print(name, message, sep='\t', file=log_fh)

pyfastaq.utils.close(log_fh)

for name in to_remove:
sequences.pop(name)

return to_remove

def sanity_check(self, outprefix):
removed_seqs = self._remove_bad_genes(self.sequences, self.metadata, outprefix + '.check_genes.log', self.min_gene_length, self.max_gene_length)
log_lines = ReferenceData._filter_bad_variant_data(self.sequences, self.metadata, outprefix, removed_seqs)
return len(removed_seqs), log_lines

removed_gene_seqs = self._remove_bad_genes(self.sequences,
self.metadata, outprefix + '.check_genes.log',
self.min_gene_length, self.max_gene_length)

removed_noncoding_seqs = self._remove_bad_noncoding_seqs(self.sequences, self.metadata,
outprefix + '.check_noncoding.log', self.min_noncoding_length,
self.max_noncoding_length)

all_removed_seqs = removed_gene_seqs.union(removed_noncoding_seqs)
log_lines = ReferenceData._filter_bad_variant_data(self.sequences, self.metadata, outprefix, all_removed_seqs)
return len(all_removed_seqs), log_lines

@classmethod
def _new_seq_name(cls, name):
Expand Down
2 changes: 2 additions & 0 deletions ariba/tasks/prepareref.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ def run(options):
version_report_lines=version_report_lines,
min_gene_length=options.min_gene_length,
max_gene_length=options.max_gene_length,
min_noncoding_length=options.min_noncoding_length,
max_noncoding_length=options.max_noncoding_length,
genetic_code=options.genetic_code,
cdhit_min_id=options.cdhit_min_id,
cdhit_min_length=options.cdhit_min_length,
Expand Down
17 changes: 17 additions & 0 deletions ariba/tests/data/ref_preparer_test_run.in.4.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
>noncoding1-toolong
CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA
>noncoding2
CTACTGAT
>cannot_make_into_a_gene
AAAAAAAAAAAAAAAA
>noncoding3-tooshort
C
>gene1
ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA
>noncoding4-toolong
CTACTGATCATCTACTATCTG
>noncoding5-tooshort
CTCTC
>gene2
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA

8 changes: 8 additions & 0 deletions ariba/tests/data/ref_preparer_test_run.in.4.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
cannot_make_into_a_gene 1 0 . . .
noncoding1-toolong 0 0 . . .
noncoding2 0 0 C4T . .
noncoding3-tooshort 0 0 C4T . .
noncoding4-toolong 0 0 C4T . .
noncoding5-tooshort 0 0 C4T . .
gene1 1 0 . . .
gene2 1 0 . . .
Original file line number Diff line number Diff line change
@@ -1 +1 @@
cannot_make_into_a_gene was removed because does not look like a gene, so removing its metadata
cannot_make_into_a_gene was removed because it failed filtering checks, so removing its metadata
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
input fasta file: /Users/kp11/workspace/applications/Ariba/ariba/ariba/tests/data/ref_preparer_test_run.in.4.fa
input tsv file: /Users/kp11/workspace/applications/Ariba/ariba/ariba/tests/data/ref_preparer_test_run.in.4.tsv
genetic_code 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
noncoding1-toolong noncoding1_toolong
noncoding3-tooshort noncoding3_tooshort
noncoding4-toolong noncoding4_toolong
noncoding5-tooshort noncoding5_tooshort
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
ARIBA run with this command:
setup.py prepareref test
from this directory: /Users/kp11/workspace/applications/Ariba/ariba


Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
cannot_make_into_a_gene REMOVE Does not look like a gene (tried both strands and all reading frames) AAAAAAAAAAAAAAAA
gene1 KEEP Made into gene. strand=+, frame=0
gene2 KEEP Made into gene. strand=+, frame=0
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
cannot_make_into_a_gene was removed because it failed filtering checks, so removing its metadata
noncoding1_toolong was removed because it failed filtering checks, so removing its metadata
noncoding3_tooshort was removed because it failed filtering checks, so removing its metadata
noncoding4_toolong was removed because it failed filtering checks, so removing its metadata
noncoding5_tooshort was removed because it failed filtering checks, so removing its metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
gene1 1 0 . . .
gene2 1 0 . . .
noncoding2 0 0 C4T . .
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
noncoding1_toolong REMOVE Too long. Length: 38
noncoding3_tooshort REMOVE Too short. Length: 1
noncoding4_toolong REMOVE Too long. Length: 21
noncoding5_tooshort REMOVE Too short. Length: 5
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>gene1
ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA
>gene2
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA
>noncoding2
CTACTGAT
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
cluster gene1 gene2
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>gene1
ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA
>gene2
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>noncoding2
CTACTGAT
Empty file.
10 changes: 10 additions & 0 deletions ariba/tests/data/reference_data_remove_bad_noncoding.in.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>noncoding1
AAAA
>noncoding2
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
>noncoding3
CCCCCC
>noncoding4
TTTTTTTTTTTTTTT
>noncoding5
AAAAAAAAAAAA
5 changes: 5 additions & 0 deletions ariba/tests/data/reference_data_remove_bad_noncoding.in.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
noncoding1 0 0 . . .
noncoding2 0 0 . . .
noncoding3 0 0 . . .
noncoding4 0 0 . . .
noncoding5 0 0 . . .
2 changes: 2 additions & 0 deletions ariba/tests/data/reference_data_test_remove_bad_noncoding.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
noncoding1 REMOVE Too short. Length: 4
noncoding2 REMOVE Too long. Length: 133
37 changes: 37 additions & 0 deletions ariba/tests/ref_preparer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def test_run(self):
test_files = [
'01.filter.check_metadata.tsv',
'01.filter.check_genes.log',
'01.filter.check_noncoding.log',
'01.filter.check_metadata.log',
'02.cdhit.all.fa',
'02.cdhit.clusters.tsv',
Expand Down Expand Up @@ -152,6 +153,7 @@ def test_run_all_noncoding(self):
'00.auto_metadata.tsv',
'01.filter.check_metadata.tsv',
'01.filter.check_genes.log',
'01.filter.check_noncoding.log',
'01.filter.check_metadata.log',
'02.cdhit.all.fa',
'02.cdhit.clusters.tsv',
Expand All @@ -168,6 +170,41 @@ def test_run_all_noncoding(self):

common.rmtree(tmp_out)

def test_run_noncoding_checks(self):
'''test run with noncoding sequences that are outside of the allowed size range'''
fasta_in = [
os.path.join(data_dir, 'ref_preparer_test_run.in.4.fa')
]
tsv_in = [
os.path.join(data_dir, 'ref_preparer_test_run.in.4.tsv')
]

extern_progs = external_progs.ExternalProgs()
refprep = ref_preparer.RefPreparer(
fasta_in, extern_progs, min_noncoding_length=6, max_noncoding_length=20,
metadata_tsv_files=tsv_in, genetic_code=1)
tmp_out = 'tmp.ref_preparer_test_run_noncoding_checks'
refprep.run(tmp_out)
expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run_noncoding_checks.out')

test_files = [
'01.filter.check_metadata.tsv',
'01.filter.check_genes.log',
'01.filter.check_noncoding.log',
'01.filter.check_metadata.log',
'02.cdhit.all.fa',
'02.cdhit.clusters.tsv',
'02.cdhit.gene.fa',
'02.cdhit.gene.varonly.fa',
'02.cdhit.noncoding.fa',
'02.cdhit.noncoding.varonly.fa',
]

for filename in test_files:
expected = os.path.join(expected_outdir, filename)
got = os.path.join(tmp_out, filename)
self.assertTrue(filecmp.cmp(expected, got, shallow=False))

common.rmtree(tmp_out)


Loading