Skip to content

Commit

Permalink
Merge pull request #34 from martinghunt/master
Browse files Browse the repository at this point in the history
Various improvements
  • Loading branch information
John Tate committed Sep 10, 2015
2 parents 0ee5c4e + 9ca157b commit d76cb3a
Show file tree
Hide file tree
Showing 10 changed files with 144 additions and 20 deletions.
34 changes: 34 additions & 0 deletions ariba/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def __init__(self,
self.final_assembly_bam = os.path.join(self.root_dir, 'assembly.reads_mapped.bam')
self.final_assembly_read_depths = os.path.join(self.root_dir, 'assembly.reads_mapped.bam.read_depths.gz')
self.final_assembly_vcf = os.path.join(self.root_dir, 'assembly.reads_mapped.bam.vcf')
self.final_assembled_genes_fa = os.path.join(self.root_dir, 'assembly.genes.fa')
self.final_assembly = {}
self.mummer_variants = {}
self.variant_depths = {}
Expand Down Expand Up @@ -525,6 +526,38 @@ def _nucmer_hits_to_gene_cov_per_contig(self):
return cov


@staticmethod
def _nucmer_hits_to_assembled_gene_sequences(nucmer_hits, ref_gene, assembly, outfile):
f = pyfastaq.utils.open_file_write(outfile)

for contig in sorted(nucmer_hits):
for hit in nucmer_hits[contig]:
qry_coords = hit.qry_coords()
fa = assembly[hit.qry_name].subseq(qry_coords.start, qry_coords.end + 1)
if hit.on_same_strand():
strand = '+'
else:
fa.revcomp()
strand = '-'
ref_coords = hit.ref_coords()
fa.id = '.'.join([
ref_gene.id,
str(ref_coords.start + 1),
str(ref_coords.end + 1),
contig,
str(qry_coords.start + 1),
str(qry_coords.end + 1),
strand
])

if hit.hit_length_ref == hit.ref_length:
fa.id += '.complete'

print(fa, file=f)

pyfastaq.utils.close(f)


def _whole_gene_covered_by_nucmer_hits(self):
covered = self._nucmer_hits_to_ref_coords()
pyfastaq.intervals.merge_overlapping_in_list(covered)
Expand Down Expand Up @@ -1046,6 +1079,7 @@ def run(self):
self._update_flag_from_nucmer_file()
self._make_assembly_vcf()
self._get_vcf_variant_counts()
self._nucmer_hits_to_assembled_gene_sequences(self.nucmer_hits, self.gene, self.final_assembly, self.final_assembled_genes_fa)

self._make_report_lines()
self._clean()
43 changes: 35 additions & 8 deletions ariba/clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ def __init__(self,
run_cd_hit=True,
clean=1,
):
self.db_fasta = os.path.abspath(db_fasta)
self.reads_1 = os.path.abspath(reads_1)
self.reads_2 = os.path.abspath(reads_2)
self.outdir = os.path.abspath(outdir)
Expand All @@ -57,12 +56,13 @@ def __init__(self,
self.assembly_kmer = assembly_kmer
self.spades_other = spades_other

self.db_fasta_clustered = os.path.join(self.outdir, 'genes.clustered.fa')
self.db_fasta_clustered = os.path.join(self.outdir, 'input_genes.clustered.fa')
self.cluster_ids = {}
self.bam_prefix = os.path.join(self.outdir, 'map_all_reads')
self.bam = self.bam_prefix + '.bam'
self.report_file_tsv = os.path.join(self.outdir, 'report.tsv')
self.report_file_xls = os.path.join(self.outdir, 'report.xls')
self.catted_assembled_genes_fasta = os.path.join(self.outdir, 'assembled_genes.fa')
self.threads = threads
self.verbose = verbose

Expand Down Expand Up @@ -120,6 +120,10 @@ def __init__(self,
except:
raise Error('Error mkdir ' + d)

self.db_fasta = os.path.join(self.outdir, 'input_genes.not_clustered.fa')
pyfastaq.tasks.to_fasta(db_fasta, self.db_fasta, check_unique=True)


def _run_cdhit(self):
r = cdhit.Runner(
self.db_fasta,
Expand Down Expand Up @@ -357,12 +361,27 @@ def _write_reports(self):
workbook.save(self.report_file_xls)


def _write_catted_assembled_genes_fasta(self):
f = pyfastaq.utils.open_file_write(self.catted_assembled_genes_fasta)

for gene in sorted(self.clusters):
cluster_fasta = self.clusters[gene].final_assembled_genes_fa
if os.path.exists(cluster_fasta):
file_reader = pyfastaq.sequences.file_reader(cluster_fasta)
for seq in file_reader:
print(seq, file=f)

pyfastaq.utils.close(f)


def _clean(self):
to_clean = [
[
],
[
self.bam
self.bam,
self.db_fasta,
self.db_fasta + '.fai',
],
[
self.db_fasta_clustered,
Expand Down Expand Up @@ -400,14 +419,22 @@ def run(self):
print('Finished mapping\n')
print('{:_^79}'.format(' Generating clusters '), flush=True)
self._bam_to_clusters_reads()
self._set_insert_size_data()
if self.verbose:
print('{:_^79}'.format(' Assembling each cluster '), flush=True)
self._init_and_run_clusters()
if len(self.cluster_to_dir) > 0:
self._set_insert_size_data()
if self.verbose:
print('{:_^79}'.format(' Assembling each cluster '), flush=True)
self._init_and_run_clusters()
if self.verbose:
print('Finished assembling clusters\n')
else:
if self.verbose:
print('No reads mapped. Skipping all assemblies', flush=True)
print('WARNING: no reads mapped to reference genes. Therefore no local assemblies will be run', file=sys.stderr)

if self.verbose:
print('Finished assembling clusters\n')
print('{:_^79}'.format(' Writing report files '), flush=True)
self._write_reports()
self._write_catted_assembled_genes_fasta()
if self.verbose:
print('Finished writing report files. Cleaning files', flush=True)
self._clean()
Expand Down
2 changes: 1 addition & 1 deletion ariba/common.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import sys
import subprocess

version = '0.4.1'
version = '0.5.0'

def syscall(cmd, allow_fail=False, verbose=False):
if verbose:
Expand Down
43 changes: 36 additions & 7 deletions ariba/tests/cluster_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def clean_cluster_dir(d, exclude=None):
def file2lines(filename):
f = pyfastaq.utils.open_file_read(filename)
lines = f.readlines()
pyfastaq.utils.close(f)
pyfastaq.utils.close(f)
return lines


Expand Down Expand Up @@ -303,8 +303,8 @@ def test_nucmer_hits_to_percent_identity(self):
]
}
expected = {'scaff1': round((90*10 + 100*34) / (10+34), 2), 'scaff2': 42.42}
c._nucmer_hits_to_percent_identity()
self.assertEqual(expected, c.percent_identities)
c._nucmer_hits_to_percent_identity()
self.assertEqual(expected, c.percent_identities)


def test_nucmer_hits_to_scaff_coords(self):
Expand Down Expand Up @@ -396,12 +396,41 @@ def test_nucmer_hits_to_gene_cov_per_contig(self):
pymummer.alignment.Alignment('\t'.join(hits[2])),
]
}

expected = {'contig1': 85, 'contig2': 11}
self.assertEqual(expected, c._nucmer_hits_to_gene_cov_per_contig())
clean_cluster_dir(cluster_dir)


def test_nucmer_hits_to_assembled_gene_sequences(self):
'''test _nucmer_hits_to_assembled_gene_sequences'''
ref_gene = pyfastaq.sequences.Fasta('ref_gene', 'ATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAG')
assembly = {
'contig1': pyfastaq.sequences.Fasta('contig1', 'CATCTATGCTGCATCGATCACTGACGTATCATCATCAGCGTACTGACGTATTAGTTTGTAATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAGACGTCGTACTACTGTATATGCATCGATCTGAA'),
'contig2': pyfastaq.sequences.Fasta('contig2', 'AGTGATATCCTGCGATCTATAATTTTTTTCGCGGGATCTTGAACGCGACGATGTTCGATAATTCAATGCAAAGGAGCGACCCGCAAGTACACAGGACTGCAAA')
}

hits = [
['1', '147', '61', '207', '147', '147', '100.00', '147', '239', '1', '1', 'ref_gene', 'contig1'],
['18', '120', '103', '1', '103', '103', '100.00', '147', '103', '1', '-1', 'ref_gene', 'contig2']
]
nucmer_hits = {
'contig1': [
pymummer.alignment.Alignment('\t'.join(hits[0])),
],
'contig2': [
pymummer.alignment.Alignment('\t'.join(hits[1])),
]
}

assembly_fasta = os.path.join(data_dir, 'cluster_test_nucmer_hits_to_assembled_gene_sequences.assembly.fa')
tmp_outfile = 'tmp.test_nucmer_hits_to_assembled_gene_sequences.out.fa'
expected_outfile = os.path.join(data_dir, 'cluster_test_nucmer_hits_to_assembled_gene_sequences.expected.out.fa')
cluster.Cluster._nucmer_hits_to_assembled_gene_sequences(nucmer_hits, ref_gene, assembly, tmp_outfile)
self.assertTrue(filecmp.cmp(tmp_outfile, expected_outfile, shallow=False))
os.unlink(tmp_outfile)


def test_whole_gene_covered_by_nucmer_hits(self):
'''test _whole_gene_covered_by_nucmer_hits'''
cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
Expand Down Expand Up @@ -637,7 +666,7 @@ def test_get_assembly_read_depths(self):
( ('ref1', 3), ('C', 'A,G', 42, '21,11,10') ),
( ('ref1', 4), ('C', 'AC', 41, '0,42') )
]

for t in tests:
self.assertEqual(c._get_assembly_read_depths(t[0][0], t[0][1]), t[1])

Expand All @@ -652,7 +681,7 @@ def test_get_samtools_variant_positions(self):
('16__cat_2_M35190.scaffold.1', 179),
('16__cat_2_M35190.scaffold.1', 263),
('16__cat_2_M35190.scaffold.6', 93)
]
]
self.assertEqual(expected, c._get_samtools_variant_positions())


Expand Down Expand Up @@ -764,4 +793,4 @@ def test_make_report_lines_assembly_fail(self):
]
self.assertEqual(expected, c.report_lines)
clean_cluster_dir(cluster_dir)

21 changes: 18 additions & 3 deletions ariba/tests/clusters_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ def test_sam_pair_to_insert(self):
def test_bam_to_clusters_reads(self):
'''test _bam_to_clusters_reads'''
clusters_dir = 'tmp.Cluster.test_bam_to_clusters_reads'
reads1 = os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads.reads_1.fq')
reads2 = os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads.reads_2.fq')
reads1 = os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads.reads_1.fq')
reads2 = os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads.reads_2.fq')
ref = os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads.db.fa')
c = clusters.Clusters(ref, reads1, reads2, clusters_dir)
shutil.copyfile(os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads.bam'), c.bam)
Expand Down Expand Up @@ -110,7 +110,7 @@ def test_set_insert_size_data(self):
10: 1,
}
self.clusters.insert_hist.bin_width=1

self.clusters._set_insert_size_data()
self.assertEqual(self.clusters.insert_size, 5.5)
self.assertEqual(self.clusters.insert_sspace_sd, 0.91)
Expand All @@ -131,3 +131,18 @@ def __init__(self, lines):
self.assertTrue(filecmp.cmp(expected, self.clusters.report_file_tsv, shallow=False))
self.assertTrue(os.path.exists(self.clusters.report_file_xls))


def test_write_catted_assembled_genes_fasta(self):
'''test _write_catted_assembled_genes_fasta'''
class FakeCluster:
def __init__(self, filename):
self.final_assembled_genes_fa = filename

self.clusters.clusters = {
'gene1': FakeCluster(os.path.join(data_dir, 'clusters_test_write_catted_assembled_genes_fasta.in.gene1.fa')),
'gene2': FakeCluster(os.path.join(data_dir, 'clusters_test_write_catted_assembled_genes_fasta.in.gene2.fa')),
}

self.clusters._write_catted_assembled_genes_fasta()
expected = os.path.join(data_dir, 'clusters_test_write_catted_assembled_genes_fasta.expected.out.fa')
self.assertTrue(filecmp.cmp(expected, self.clusters.catted_assembled_genes_fasta, shallow=False))
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
>ref_gene.1.147.contig1.61.207.+.complete
ATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAAT
TATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACT
GCCAGTGGCATCTGTGTAAGCGCTTAG
>ref_gene.18.120.contig2.1.103.-
TTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCG
TTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACT
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>gene1.1
ACGT
>gene1.2
CAT
>gene2
GTGT
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>gene1.1
ACGT
>gene1.2
CAT
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>gene2
GTGT
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

setup(
name='ariba',
version='0.4.1',
version='0.5.0',
description='ARIBA: Antibiotic Resistance Identification By Assembly',
packages = find_packages(),
author='Martin Hunt',
Expand Down

0 comments on commit d76cb3a

Please sign in to comment.