Skip to content

Commit

Permalink
Merge pull request #80 from martinghunt/specify_cdhit_clusters
Browse files Browse the repository at this point in the history
Specify cdhit clusters
  • Loading branch information
martinghunt committed May 10, 2016
2 parents 2c15c69 + bedabc9 commit 436b939
Show file tree
Hide file tree
Showing 22 changed files with 315 additions and 5 deletions.
53 changes: 52 additions & 1 deletion ariba/cdhit.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,58 @@ def fake_run(self):
print(seq, file=f)

pyfastaq.utils.close(f)
clusters = self._rename_clusters(clusters, tmp_fa, self.outfile)
clusters = self._rename_clusters(clusters, tmp_fa, self.outfile, rename_suffix=self.rename_suffix)
shutil.rmtree(tmpdir)
return clusters


@staticmethod
def _load_user_clusters_file(filename):
f = pyfastaq.utils.open_file_read(filename)
seq_to_cluster = {}
for line in f:
data = line.rstrip().split()

for seq_name in data:
if seq_name in seq_to_cluster:
pyfastaq.utils.close(f)
raise Error('Error reading clusters file. The sequence "' + seq_name + '" was found more than once in the file ' + filename)
seq_to_cluster[seq_name] = data[0]

pyfastaq.utils.close(f)
return seq_to_cluster


def run_get_clusters_from_file(self, infile):
'''Instead of running cdhit, gets the clusters info from the input dict.
Dict expected to be key=sequence name, value=name of cluster'''
seq_to_cluster = self._load_user_clusters_file(infile)
cluster_names = set(seq_to_cluster.values())
tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
tmp_fa = os.path.join(tmpdir, 'cdhit.fa')
clusters = {}
seq_reader = pyfastaq.sequences.file_reader(self.infile)
f = pyfastaq.utils.open_file_write(tmp_fa)

for seq in seq_reader:
if seq.id in clusters:
pyfastaq.utils.close(f)
shutil.rmtree(tmpdir)
raise Error('Sequence name "' + seq.id + '" not unique. Cannot continue')

if seq.id not in seq_to_cluster:
raise Error('Error forcing cdhit clustering. Found sequence ' + seq.id + ' in FASTA file, but not in provided clusters info from file ' + infile)

cluster = seq_to_cluster[seq.id]
if cluster not in clusters:
clusters[cluster] = set()

clusters[cluster].add(seq.id)
if seq.id in cluster_names:
print(seq, file=f)

pyfastaq.utils.close(f)
clusters = self._rename_clusters(clusters, tmp_fa, self.outfile, rename_suffix=self.rename_suffix)
shutil.rmtree(tmpdir)
return clusters

Expand Down
5 changes: 4 additions & 1 deletion ariba/ref_preparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def __init__(self,
cdhit_min_id=0.9,
cdhit_min_length=0.9,
run_cdhit=True,
clusters_file=None,
threads=1,
verbose=False,
):
Expand All @@ -42,6 +43,7 @@ def __init__(self,
self.cdhit_min_id = cdhit_min_id
self.cdhit_min_length = cdhit_min_length
self.run_cdhit = run_cdhit
self.clusters_file = clusters_file
self.threads = threads
self.verbose = verbose

Expand Down Expand Up @@ -152,7 +154,8 @@ def run(self, outdir):
length_diff_cutoff=self.cdhit_min_length,
nocluster=not self.run_cdhit,
verbose=self.verbose,
cd_hit_est=self.extern_progs.exe('cdhit')
cd_hit_est=self.extern_progs.exe('cdhit'),
clusters_file=self.clusters_file,
)

if self.verbose:
Expand Down
6 changes: 4 additions & 2 deletions ariba/reference_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ def write_cluster_allocation_file(clusters, outfile):
pyfastaq.utils.close(f_out)


def cluster_with_cdhit(self, inprefix, outprefix, seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.9, nocluster=False, verbose=False, cd_hit_est='cd-hit-est'):
def cluster_with_cdhit(self, inprefix, outprefix, seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.9, nocluster=False, verbose=False, cd_hit_est='cd-hit-est', clusters_file=None):
files_to_cat = []
clusters = {}

Expand All @@ -478,7 +478,9 @@ def cluster_with_cdhit(self, inprefix, outprefix, seq_identity_threshold=0.9, th
rename_suffix = seqs_type[0]
)

if nocluster:
if clusters_file is not None:
new_clusters = cdhit_runner.run_get_clusters_from_file(clusters_file)
elif nocluster:
new_clusters = cdhit_runner.fake_run()
else:
new_clusters = cdhit_runner.run()
Expand Down
8 changes: 7 additions & 1 deletion ariba/tasks/prepareref.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import sys
import argparse
from ariba import ref_preparer, external_progs, versions

Expand All @@ -14,7 +15,8 @@ def run():
input_group.add_argument('--metadata', help='tsv file of metadata about the reference sequences', metavar='FILENAME')

cdhit_group = parser.add_argument_group('cd-hit options')
cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster"')
cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster". Incompatible with --cdhit_clusters.')
cdhit_group.add_argument('--cdhit_clusters', help='File specifying how the sequences should be clustered. Will be used instead of running cdhit. Format is one cluster per line. Sequence names separated by whitespace. First name in line is the cluster representative. Incompatible with --no_cdhit', metavar='FILENAME')
cdhit_group.add_argument('--cdhit_min_id', type=float, help='Sequence identity threshold (cd-hit option -c) [%(default)s]', default=0.9, metavar='FLOAT')
cdhit_group.add_argument('--cdhit_min_length', type=float, help='length difference cutoff (cd-hit option -s) [%(default)s]', default=0.9, metavar='FLOAT')

Expand All @@ -28,6 +30,9 @@ def run():
parser.add_argument('outdir', help='Output directory (must not already exist)')
options = parser.parse_args()

if options.no_cdhit and options.cdhit_clusters is not None:
sys.exit('Cannot use both --no_cdhit and --cdhit_clusters. Neither or exactly one of those options must be used')

extern_progs, version_report_lines = versions.get_all_versions()
if options.verbose:
print(*version_report_lines, sep='\n')
Expand All @@ -46,6 +51,7 @@ def run():
cdhit_min_id=options.cdhit_min_id,
cdhit_min_length=options.cdhit_min_length,
run_cdhit=not options.no_cdhit,
clusters_file=options.cdhit_clusters,
threads=options.threads,
verbose=options.verbose,
)
Expand Down
44 changes: 44 additions & 0 deletions ariba/tests/cdhit_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,47 @@ def test_fake_run_fail(self):
with self.assertRaises(cdhit.Error):
clusters = r.fake_run()


def test_load_user_clusters_file_good_file(self):
'''test _load_user_clusters_file with good input file'''
infile = os.path.join(data_dir, 'cdhit_test_load_user_clusters_file.good')
expected = {
'seq1': 'seq1',
'seq2': 'seq1',
'seq3': 'seq1',
'seq4': 'seq4',
'seq5': 'seq5',
'seq6': 'seq5',
}

got = cdhit.Runner._load_user_clusters_file(infile)
self.assertEqual(expected, got)


def test_load_user_clusters_file_bad_file(self):
'''test _load_user_clusters_file with bad input files'''
infiles = [
os.path.join(data_dir, 'cdhit_test_load_user_clusters_file.bad1'),
os.path.join(data_dir, 'cdhit_test_load_user_clusters_file.bad2'),
os.path.join(data_dir, 'cdhit_test_load_user_clusters_file.bad3')
]
for filename in infiles:
with self.assertRaises(cdhit.Error):
cdhit.Runner._load_user_clusters_file(filename)


def test_run_get_clusters_from_file(self):
'''test run_get_clusters_from_file'''
fa_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict.in.fa')
clusters_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict.in.clusters')
expected_outfile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict.out.fa')
tmpfile = 'tmp.cdhit_test_run_get_clusters_from_dict.out.fa'
r = cdhit.Runner(fa_infile, tmpfile, cd_hit_est=extern_progs.exe('cdhit'))
clusters = r.run_get_clusters_from_file(clusters_infile)
expected_clusters = {
'seq1.x': {'seq1', 'seq2'},
'seq3.x': {'seq3'},
}
self.assertEqual(clusters, expected_clusters)
self.assertTrue(filecmp.cmp(tmpfile, expected_outfile, shallow=False))
os.unlink(tmpfile)
1 change: 1 addition & 0 deletions ariba/tests/data/cdhit_test_load_user_clusters_file.bad1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
seq1 seq1
2 changes: 2 additions & 0 deletions ariba/tests/data/cdhit_test_load_user_clusters_file.bad2
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
seq1 seq2
seq3 seq1
2 changes: 2 additions & 0 deletions ariba/tests/data/cdhit_test_load_user_clusters_file.bad3
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
seq1 seq2
seq3 seq2
3 changes: 3 additions & 0 deletions ariba/tests/data/cdhit_test_load_user_clusters_file.good
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
seq1 seq2 seq3
seq4
seq5 seq6
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
seq1 seq2
seq3
6 changes: 6 additions & 0 deletions ariba/tests/data/cdhit_test_run_get_clusters_from_dict.in.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>seq1
ACGT
>seq2
AAAA
>seq3
CCCC
4 changes: 4 additions & 0 deletions ariba/tests/data/cdhit_test_run_get_clusters_from_dict.out.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>seq1.x
ACGT
>seq3.x
CCCC
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
noncoding1
noncoding2
presence_absence1 presence_absence3 presence_absence4
presence_absence2
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
presence_absence1.p presence_absence1 presence_absence3 presence_absence4
presence_absence2.p presence_absence2
noncoding1.n noncoding1
noncoding2.n noncoding2
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
>presence_absence1.p
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
>presence_absence2.p
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAAATGGCGAGCACCAACATTAGCCAT
ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
>noncoding1.n
GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
ATGCTGGGAAGGGTTTGCTG
>noncoding2.n
GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
ATGCTGGGAAGGGTTTGCTG
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>noncoding1
GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
ATGCTGGGAAGGGTTTGCTG
>noncoding2
GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
ATGCTGGGAAGGGTTTGCTG
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>presence_absence1
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
>presence_absence2
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAAATGGCGAGCACCAACATTAGCCAT
ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
>presence_absence3
ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
>presence_absence4
ATGGCGTGCGATGAATTTGGCATGATTAAACTGATGAACCCGCAGCGCAGCACCTAA
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
presence_absence1.p presence_absence1
presence_absence2.p presence_absence2
presence_absence3.p presence_absence3
presence_absence4.p presence_absence4
noncoding1.n noncoding1
noncoding2.n noncoding2
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
>noncoding1.n
GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
ATGCTGGGAAGGGTTTGCTG
>noncoding2.n
GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
ATGCTGGGAAGGGTTTGCTG
>presence_absence1.p
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
>presence_absence2.p
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAAATGGCGAGCACCAACATTAGCCAT
ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
>presence_absence3.p
ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
>presence_absence4.p
ATGGCGTGCGATGAATTTGGCATGATTAAACTGATGAACCCGCAGCGCAGCACCTAA
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>noncoding1
GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
ATGCTGGGAAGGGTTTGCTG
>noncoding2
GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
ATGCTGGGAAGGGTTTGCTG
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>presence_absence1
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
>presence_absence2
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAAATGGCGAGCACCAACATTAGCCAT
ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
>presence_absence3
ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
>presence_absence4
ATGGCGTGCGATGAATTTGGCATGATTAAACTGATGAACCCGCAGCGCAGCACCTAA
Loading

0 comments on commit 436b939

Please sign in to comment.