Merge pull request #80 from martinghunt/specify_cdhit_clusters

Specify cdhit clusters
sanger-pathogens · May 10, 2016 · 436b939 · 436b939
2 parents 2c15c69 + bedabc9
commit 436b939
Show file tree

Hide file tree

Showing 22 changed files with 315 additions and 5 deletions.
diff --git a/ariba/cdhit.py b/ariba/cdhit.py
@@ -50,7 +50,58 @@ def fake_run(self):
             print(seq, file=f)
 
         pyfastaq.utils.close(f)
-        clusters = self._rename_clusters(clusters, tmp_fa, self.outfile)
+        clusters = self._rename_clusters(clusters, tmp_fa, self.outfile, rename_suffix=self.rename_suffix)
+        shutil.rmtree(tmpdir)
+        return clusters
+
+
+    @staticmethod
+    def _load_user_clusters_file(filename):
+        f = pyfastaq.utils.open_file_read(filename)
+        seq_to_cluster = {}
+        for line in f:
+            data = line.rstrip().split()
+
+            for seq_name in data:
+                if seq_name in seq_to_cluster:
+                    pyfastaq.utils.close(f)
+                    raise Error('Error reading clusters file. The sequence "' + seq_name + '" was found more than once in the file ' + filename)
+                seq_to_cluster[seq_name] = data[0]
+
+        pyfastaq.utils.close(f)
+        return seq_to_cluster
+
+
+    def run_get_clusters_from_file(self, infile):
+        '''Instead of running cdhit, gets the clusters info from the input dict.
+           Dict expected to be key=sequence name, value=name of cluster'''
+        seq_to_cluster = self._load_user_clusters_file(infile)
+        cluster_names = set(seq_to_cluster.values())
+        tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
+        tmp_fa = os.path.join(tmpdir, 'cdhit.fa')
+        clusters = {}
+        seq_reader = pyfastaq.sequences.file_reader(self.infile)
+        f = pyfastaq.utils.open_file_write(tmp_fa)
+
+        for seq in seq_reader:
+            if seq.id in clusters:
+                pyfastaq.utils.close(f)
+                shutil.rmtree(tmpdir)
+                raise Error('Sequence name "' + seq.id + '" not unique. Cannot continue')
+
+            if seq.id not in seq_to_cluster:
+                raise Error('Error forcing cdhit clustering. Found sequence ' + seq.id + ' in FASTA file, but not in provided clusters info from file ' + infile)
+
+            cluster = seq_to_cluster[seq.id]
+            if cluster not in clusters:
+                clusters[cluster] = set()
+
+            clusters[cluster].add(seq.id)
+            if seq.id in cluster_names:
+                print(seq, file=f)
+
+        pyfastaq.utils.close(f)
+        clusters = self._rename_clusters(clusters, tmp_fa, self.outfile, rename_suffix=self.rename_suffix)
         shutil.rmtree(tmpdir)
         return clusters
 

diff --git a/ariba/ref_preparer.py b/ariba/ref_preparer.py
@@ -21,6 +21,7 @@ def __init__(self,
         cdhit_min_id=0.9,
         cdhit_min_length=0.9,
         run_cdhit=True,
+        clusters_file=None,
         threads=1,
         verbose=False,
     ):
@@ -42,6 +43,7 @@ def __init__(self,
         self.cdhit_min_id = cdhit_min_id
         self.cdhit_min_length = cdhit_min_length
         self.run_cdhit = run_cdhit
+        self.clusters_file = clusters_file
         self.threads = threads
         self.verbose = verbose
 
@@ -152,7 +154,8 @@ def run(self, outdir):
             length_diff_cutoff=self.cdhit_min_length,
             nocluster=not self.run_cdhit,
             verbose=self.verbose,
-            cd_hit_est=self.extern_progs.exe('cdhit')
+            cd_hit_est=self.extern_progs.exe('cdhit'),
+            clusters_file=self.clusters_file,
         )
 
         if self.verbose:

diff --git a/ariba/reference_data.py b/ariba/reference_data.py
@@ -459,7 +459,7 @@ def write_cluster_allocation_file(clusters, outfile):
         pyfastaq.utils.close(f_out)
 
 
-    def cluster_with_cdhit(self, inprefix, outprefix, seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.9, nocluster=False, verbose=False, cd_hit_est='cd-hit-est'):
+    def cluster_with_cdhit(self, inprefix, outprefix, seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.9, nocluster=False, verbose=False, cd_hit_est='cd-hit-est', clusters_file=None):
         files_to_cat = []
         clusters = {}
 
@@ -478,7 +478,9 @@ def cluster_with_cdhit(self, inprefix, outprefix, seq_identity_threshold=0.9, th
                   rename_suffix = seqs_type[0]
                 )
 
-                if nocluster:
+                if clusters_file is not None:
+                    new_clusters = cdhit_runner.run_get_clusters_from_file(clusters_file)
+                elif nocluster:
                     new_clusters = cdhit_runner.fake_run()
                 else:
                     new_clusters = cdhit_runner.run()

diff --git a/ariba/tasks/prepareref.py b/ariba/tasks/prepareref.py
@@ -1,3 +1,4 @@
+import sys
 import argparse
 from ariba import ref_preparer, external_progs, versions
 
@@ -14,7 +15,8 @@ def run():
     input_group.add_argument('--metadata', help='tsv file of metadata about the reference sequences', metavar='FILENAME')
 
     cdhit_group = parser.add_argument_group('cd-hit options')
-    cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster"')
+    cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster". Incompatible with --cdhit_clusters.')
+    cdhit_group.add_argument('--cdhit_clusters', help='File specifying how the sequences should be clustered. Will be used instead of running cdhit. Format is one cluster per line. Sequence names separated by whitespace. First name in line is the cluster representative. Incompatible with --no_cdhit', metavar='FILENAME')
     cdhit_group.add_argument('--cdhit_min_id', type=float, help='Sequence identity threshold (cd-hit option -c) [%(default)s]', default=0.9, metavar='FLOAT')
     cdhit_group.add_argument('--cdhit_min_length', type=float, help='length difference cutoff (cd-hit option -s) [%(default)s]', default=0.9, metavar='FLOAT')
 
@@ -28,6 +30,9 @@ def run():
     parser.add_argument('outdir', help='Output directory (must not already exist)')
     options = parser.parse_args()
 
+    if options.no_cdhit and options.cdhit_clusters is not None:
+        sys.exit('Cannot use both --no_cdhit and --cdhit_clusters. Neither or exactly one of those options must be used')
+
     extern_progs, version_report_lines = versions.get_all_versions()
     if options.verbose:
         print(*version_report_lines, sep='\n')
@@ -46,6 +51,7 @@ def run():
         cdhit_min_id=options.cdhit_min_id,
         cdhit_min_length=options.cdhit_min_length,
         run_cdhit=not options.no_cdhit,
+        clusters_file=options.cdhit_clusters,
         threads=options.threads,
         verbose=options.verbose,
     )

diff --git a/ariba/tests/cdhit_test.py b/ariba/tests/cdhit_test.py
@@ -101,3 +101,47 @@ def test_fake_run_fail(self):
         with self.assertRaises(cdhit.Error):
             clusters = r.fake_run()
 
+
+    def test_load_user_clusters_file_good_file(self):
+        '''test _load_user_clusters_file with good input file'''
+        infile = os.path.join(data_dir, 'cdhit_test_load_user_clusters_file.good')
+        expected  = {
+            'seq1': 'seq1',
+            'seq2': 'seq1',
+            'seq3': 'seq1',
+            'seq4': 'seq4',
+            'seq5': 'seq5',
+            'seq6': 'seq5',
+        }
+
+        got = cdhit.Runner._load_user_clusters_file(infile)
+        self.assertEqual(expected, got)
+
+
+    def test_load_user_clusters_file_bad_file(self):
+        '''test _load_user_clusters_file with bad input files'''
+        infiles = [
+            os.path.join(data_dir, 'cdhit_test_load_user_clusters_file.bad1'),
+            os.path.join(data_dir, 'cdhit_test_load_user_clusters_file.bad2'),
+            os.path.join(data_dir, 'cdhit_test_load_user_clusters_file.bad3')
+        ]
+        for filename in infiles:
+            with self.assertRaises(cdhit.Error):
+                cdhit.Runner._load_user_clusters_file(filename)
+
+
+    def test_run_get_clusters_from_file(self):
+        '''test run_get_clusters_from_file'''
+        fa_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict.in.fa')
+        clusters_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict.in.clusters')
+        expected_outfile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict.out.fa')
+        tmpfile = 'tmp.cdhit_test_run_get_clusters_from_dict.out.fa'
+        r = cdhit.Runner(fa_infile, tmpfile, cd_hit_est=extern_progs.exe('cdhit'))
+        clusters = r.run_get_clusters_from_file(clusters_infile)
+        expected_clusters = {
+            'seq1.x': {'seq1', 'seq2'},
+            'seq3.x': {'seq3'},
+        }
+        self.assertEqual(clusters, expected_clusters)
+        self.assertTrue(filecmp.cmp(tmpfile, expected_outfile, shallow=False))
+        os.unlink(tmpfile)
diff --git a/ariba/tests/data/cdhit_test_load_user_clusters_file.bad1 b/ariba/tests/data/cdhit_test_load_user_clusters_file.bad1
@@ -0,0 +1 @@
+seq1 seq1
diff --git a/ariba/tests/data/cdhit_test_load_user_clusters_file.bad2 b/ariba/tests/data/cdhit_test_load_user_clusters_file.bad2
@@ -0,0 +1,2 @@
+seq1 seq2
+seq3 seq1
diff --git a/ariba/tests/data/cdhit_test_load_user_clusters_file.bad3 b/ariba/tests/data/cdhit_test_load_user_clusters_file.bad3
@@ -0,0 +1,2 @@
+seq1 seq2
+seq3 seq2
diff --git a/ariba/tests/data/cdhit_test_load_user_clusters_file.good b/ariba/tests/data/cdhit_test_load_user_clusters_file.good
@@ -0,0 +1,3 @@
+seq1 seq2 seq3
+seq4
+seq5 seq6
diff --git a/ariba/tests/data/cdhit_test_run_get_clusters_from_dict.in.clusters b/ariba/tests/data/cdhit_test_run_get_clusters_from_dict.in.clusters
@@ -0,0 +1,2 @@
+seq1 seq2
+seq3
diff --git a/ariba/tests/data/cdhit_test_run_get_clusters_from_dict.in.fa b/ariba/tests/data/cdhit_test_run_get_clusters_from_dict.in.fa
@@ -0,0 +1,6 @@
+>seq1
+ACGT
+>seq2
+AAAA
+>seq3
+CCCC
diff --git a/ariba/tests/data/cdhit_test_run_get_clusters_from_dict.out.fa b/ariba/tests/data/cdhit_test_run_get_clusters_from_dict.out.fa
@@ -0,0 +1,4 @@
+>seq1.x
+ACGT
+>seq3.x
+CCCC
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.clusters b/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.clusters
@@ -0,0 +1,4 @@
+noncoding1
+noncoding2
+presence_absence1 presence_absence3 presence_absence4
+presence_absence2
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.clusters.tsv b/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.clusters.tsv
@@ -0,0 +1,4 @@
+presence_absence1.p	presence_absence1	presence_absence3	presence_absence4
+presence_absence2.p	presence_absence2
+noncoding1.n	noncoding1
+noncoding2.n	noncoding2
diff --git a/.../data/reference_data_test_cluster_with_cdhit_clusters_in_file.expected_representatives.fa b/.../data/reference_data_test_cluster_with_cdhit_clusters_in_file.expected_representatives.fa
@@ -0,0 +1,16 @@
+>presence_absence1.p
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence2.p
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAAATGGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>noncoding1.n
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
+>noncoding2.n
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.non_coding.fa b/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.non_coding.fa
@@ -0,0 +1,10 @@
+>noncoding1
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
+>noncoding2
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.presence_absence.fa b/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.presence_absence.fa
@@ -0,0 +1,10 @@
+>presence_absence1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence2
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAAATGGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence3
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
+>presence_absence4
+ATGGCGTGCGATGAATTTGGCATGATTAAACTGATGAACCCGCAGCGCAGCACCTAA
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.clusters.tsv b/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.clusters.tsv
@@ -0,0 +1,6 @@
+presence_absence1.p	presence_absence1
+presence_absence2.p	presence_absence2
+presence_absence3.p	presence_absence3
+presence_absence4.p	presence_absence4
+noncoding1.n	noncoding1
+noncoding2.n	noncoding2
diff --git a/...a/tests/data/reference_data_test_cluster_with_cdhit_nocluster.expected_representatives.fa b/...a/tests/data/reference_data_test_cluster_with_cdhit_nocluster.expected_representatives.fa
@@ -0,0 +1,20 @@
+>noncoding1.n
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
+>noncoding2.n
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
+>presence_absence1.p
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence2.p
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAAATGGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence3.p
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
+>presence_absence4.p
+ATGGCGTGCGATGAATTTGGCATGATTAAACTGATGAACCCGCAGCGCAGCACCTAA
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.non_coding.fa b/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.non_coding.fa
@@ -0,0 +1,10 @@
+>noncoding1
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
+>noncoding2
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.presence_absence.fa b/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.presence_absence.fa
@@ -0,0 +1,10 @@
+>presence_absence1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence2
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAAATGGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence3
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
+>presence_absence4
+ATGGCGTGCGATGAATTTGGCATGATTAAACTGATGAACCCGCAGCGCAGCACCTAA
-Original file line number
+Diff line change
@@ -0,0 +1,6 @@
+    >seq1
+    ACGT
+    >seq2
+    AAAA
+    >seq3
+    CCCC