sanger-pathogens · martinghunt · Jul 28, 2016 · Jul 28, 2016 · Jul 28, 2016 · Jul 28, 2016
diff --git a/ariba/clusters.py b/ariba/clusters.py
@@ -360,11 +360,8 @@ def _init_and_run_clusters(self):
         counter = 0
         cluster_list = []
         self.log_files = []
-        cluster_numbers = [int(x) for x in self.cluster_to_dir.keys()]
-        cluster_numbers.sort()
 
-        for cluster_number in cluster_numbers:
-            cluster_name = str(cluster_number)
+        for cluster_name in sorted(self.cluster_to_dir):
             counter += 1
             if self.verbose:
                 print('Constructing cluster ', cluster_name, ' (', counter, ' of ', len(self.cluster_to_dir), ')', sep='')

diff --git a/third_party/fermi-lite-0.1/fml-asm_ariba.cpp → ariba/ext/fml-asm_ariba.cpp b/third_party/fermi-lite-0.1/fml-asm_ariba.cpp → ariba/ext/fml-asm_ariba.cpp
@@ -144,6 +144,7 @@ int assemble(char *readsFile, char *fastaOut, char* logfileOut)
     fml_opt_init(&opt);
     opt.max_cnt = 10000;
     opt.min_asm_ovlp = 15;
+    opt.mag_opt.flag |= MAG_F_AGGRESSIVE;
     std::vector<unsigned short> minCounts;
     minCounts.push_back(4);
     minCounts.push_back(8);

diff --git a/third_party/minimap/minimap_ariba.cpp → ariba/ext/minimap_ariba.cpp b/third_party/minimap/minimap_ariba.cpp → ariba/ext/minimap_ariba.cpp
diff --git a/ariba/ref_preparer.py b/ariba/ref_preparer.py
@@ -53,6 +53,56 @@ def _write_info_file(self, outfile):
             print('genetic_code', self.genetic_code, sep='\t', file=fout)
 
 
+    @staticmethod
+    def _rename_clusters(clusters_in):
+        new_clusters = {}
+        key_count = {}
+        min_prefix_length = 3
+
+        for old_name, name_set in sorted(clusters_in.items()):
+            names_before_dots = {}
+            for name in name_set:
+                if '.' in name:
+                    prefix = name.split('.')[0]
+                    if len(prefix) >= min_prefix_length:
+                        names_before_dots[prefix] = names_before_dots.get(prefix, 0) + 1
+
+            if len(names_before_dots) == 0:
+                new_key = 'cluster'
+            elif len(names_before_dots) == 1:
+                new_key = list(names_before_dots.keys())[0]
+                if sum(list(names_before_dots.values())) < len(name_set):
+                    new_key += '+'
+            else:
+                common_prefix = os.path.commonprefix(list(names_before_dots.keys()))
+                if common_prefix == '' or len(common_prefix) < min_prefix_length:
+                    max_value = max(list(names_before_dots.values()))
+                    possible_keys = [x for x in names_before_dots if names_before_dots[x] == max_value]
+                    possible_keys.sort()
+                    new_key = possible_keys[0] + '+'
+                else:
+                    new_key = common_prefix + '-'
+
+            if new_key in key_count:
+                if new_key in new_clusters:
+                    assert key_count[new_key] == 1
+                    assert new_key + '_1' not in new_clusters
+                    new_clusters[new_key + '_1'] = new_clusters[new_key]
+                    del new_clusters[new_key]
+
+                key_count[new_key] += 1
+                new_name = new_key + '_' + str(key_count[new_key])
+            else:
+                key_count[new_key] = 1
+                new_name = new_key
+
+            assert new_name not in new_clusters
+            new_clusters[new_name] = name_set
+
+
+        return new_clusters
+
+
     def run(self, outdir):
         original_dir = os.getcwd()
 
@@ -101,6 +151,9 @@ def run(self, outdir):
             clusters_file=self.clusters_file,
         )
 
+        clusters = self._rename_clusters(clusters)
+        reference_data.ReferenceData.write_cluster_allocation_file(clusters, cdhit_outprefix + '.clusters.tsv')
+
         if self.verbose:
             print('\nWriting clusters to file.', len(clusters), 'in total', flush=True)
 

diff --git a/ariba/tests/data/assembly_assemble_with_fermilite.expected.fa b/ariba/tests/data/assembly_assemble_with_fermilite.expected.fa
@@ -1,2 +1,2 @@
 >contig.0
-CATCTAGGTTGGACAGCCTTGAACCTCAGCGCATGGTTGGTACTTCGCTAGCCGCATCAGCTGACACTTATTCAGGGCCTAGCAGGCTCCTGCCGTGTCGTAGGCTAAGCAGGTAGCGCACATATTTCTCTGGGTAAGCGTAACCACGTAAGTTGTAAGAGTAGGGATGAGTCCGGACGATATTACCGACTCAAGCATACGCACCGCCTGAGGACGGGTATCCGGAACACCTATACGCCCTAGGGAGACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGACAATGGGAGTAGAAGGTCAACATATAATCGTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTATGTTAACAGGTCTTGTTGGATATCAATTAGAGTTGTAGCTGGTGCCAGAGTTGGACTTAGAGCTACAAACTTTAGCCCTATGATCGCGTCAGAGTTAGTCAAGGGAGGGCCACTCAGGGCATTGGGCCAGGCTAAAGCCGCGAG
+CTCGCGGCTTTAGCCTGGCCCAATGCCCTGAGTGGCCCTCCCTTGACTAACTCTGACGCGATCAGAGGGCTAAAGTTTGTAGCTCTAAGTCCAACTCTGGCACCAGCTACAACTCTAATTGATATCCAACAAGACCTGTTAACATACGATGCGGAGGGACTAGAGTCTCATCGTGCTCTGACGATTATATGTTGACCTTCTACTCCCATTGTCTTTGACGCTTTCTGATGTCAGTCGCCGGAGACCAGCTGTCTCCCTAGGGCGTATAGGTGTTCCGGATACCCGTCCTCAGGCGGTGCGTATGCTTGAGTCGGTAATATCGTCCGGACTCATCCCTACTCTTACAACTTACGTGGTTACGCTTACCCAGAGAAATATGTGCGCTACCTGCTTAGCCTACGACACGGCAGGAGCCTGCTAGGCCCTGAATAAGTGTCAGCTGATGCGGCTAGCGAAGTACCAACCATGCGCTGAGGTTCAAGGCTGTCCAACCTAGATG
diff --git a/ariba/tests/data/assembly_assemble_with_fermilite.expected.log b/ariba/tests/data/assembly_assemble_with_fermilite.expected.log
@@ -1,11 +1,11 @@
 First line
 Fermilite assembly stats:
 Min_count	Contig_number	Mean_length	Longest
-4	3	230.333	428
+4	1	499	499
 8	1	499	499
 12	1	428	428
 16	1	428	428
 20	1	384	384
-25	1	303	303
-30	1	304	304
-Best assembly is from min_count 8
+25	1	384	384
+30	1	373	373
+Best assembly is from min_count 4
diff --git a/ariba/tests/data/assembly_run_fermilite.expected.fa b/ariba/tests/data/assembly_run_fermilite.expected.fa
@@ -1,2 +1,2 @@
 >contig.0
-CATCTAGGTTGGACAGCCTTGAACCTCAGCGCATGGTTGGTACTTCGCTAGCCGCATCAGCTGACACTTATTCAGGGCCTAGCAGGCTCCTGCCGTGTCGTAGGCTAAGCAGGTAGCGCACATATTTCTCTGGGTAAGCGTAACCACGTAAGTTGTAAGAGTAGGGATGAGTCCGGACGATATTACCGACTCAAGCATACGCACCGCCTGAGGACGGGTATCCGGAACACCTATACGCCCTAGGGAGACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGACAATGGGAGTAGAAGGTCAACATATAATCGTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTATGTTAACAGGTCTTGTTGGATATCAATTAGAGTTGTAGCTGGTGCCAGAGTTGGACTTAGAGCTACAAACTTTAGCCCTATGATCGCGTCAGAGTTAGTCAAGGGAGGGCCACTCAGGGCATTGGGCCAGGCTAAAGCCGCGAG
+CTCGCGGCTTTAGCCTGGCCCAATGCCCTGAGTGGCCCTCCCTTGACTAACTCTGACGCGATCAGAGGGCTAAAGTTTGTAGCTCTAAGTCCAACTCTGGCACCAGCTACAACTCTAATTGATATCCAACAAGACCTGTTAACATACGATGCGGAGGGACTAGAGTCTCATCGTGCTCTGACGATTATATGTTGACCTTCTACTCCCATTGTCTTTGACGCTTTCTGATGTCAGTCGCCGGAGACCAGCTGTCTCCCTAGGGCGTATAGGTGTTCCGGATACCCGTCCTCAGGCGGTGCGTATGCTTGAGTCGGTAATATCGTCCGGACTCATCCCTACTCTTACAACTTACGTGGTTACGCTTACCCAGAGAAATATGTGCGCTACCTGCTTAGCCTACGACACGGCAGGAGCCTGCTAGGCCCTGAATAAGTGTCAGCTGATGCGGCTAGCGAAGTACCAACCATGCGCTGAGGTTCAAGGCTGTCCAACCTAGATG
diff --git a/ariba/tests/data/assembly_run_fermilite.expected.log b/ariba/tests/data/assembly_run_fermilite.expected.log
@@ -1,10 +1,10 @@
 Fermilite assembly stats:
 Min_count	Contig_number	Mean_length	Longest
-4	3	230.333	428
+4	1	499	499
 8	1	499	499
 12	1	428	428
 16	1	428	428
 20	1	384	384
-25	1	303	303
-30	1	304	304
-Best assembly is from min_count 8
+25	1	384	384
+30	1	373	373
+Best assembly is from min_count 4
diff --git a/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv b/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv
@@ -1,5 +1,5 @@
-0	noncoding1	noncoding2.var_only	noncoding3.var_only
-1	noncoding4.var_only
-2	gene1	gene2
-3	gene3
-4	gene4.var_only
+cluster_1	gene1	gene2
+cluster_2	gene3
+gene4	gene4.var_only
+noncoding-	noncoding1	noncoding2.var_only	noncoding3.var_only
+noncoding4	noncoding4.var_only
diff --git a/ariba/tests/ref_preparer_test.py b/ariba/tests/ref_preparer_test.py
@@ -9,6 +9,39 @@
 
 
 class TestRefPreparer(unittest.TestCase):
+    def test_rename_clusters(self):
+        '''test _rename_clusters'''
+        clusters_in = {
+           '0': {'no_dot_in_name'},
+           '1': {'another_no_dot_in_name'},
+           '2': {'foo.blah_blah_blah', 'foo.xyz'},
+           '3': {'foo.abc', 'foo.def'},
+           '4': {'pre1.abc', 'pre2.abc'},
+           '5': {'pre1.def', 'pre2.pqr', 'pre2.zxy'},
+           '6': {'prefix1.abc', 'prefix1.def', 'something_else.abc'},
+           '7': {'prefix1.fgh', 'prefix1.ijk', 'something_else_again.abc'},
+           '8': {'xyz.1', 'xyz.2', 'abcdefgh'},
+           '9': {'a.foo', 'a.bar'},
+        }
+
+        expected = {
+            'cluster_1': {'no_dot_in_name'},
+            'cluster_2': {'another_no_dot_in_name'},
+            'foo_1': {'foo.blah_blah_blah', 'foo.xyz'},
+            'foo_2': {'foo.abc', 'foo.def'},
+            'pre-_1': {'pre1.abc', 'pre2.abc'},
+            'pre-_2': {'pre1.def', 'pre2.pqr', 'pre2.zxy'},
+            'prefix1+_1': {'prefix1.abc', 'prefix1.def', 'something_else.abc'},
+            'prefix1+_2': {'prefix1.fgh', 'prefix1.ijk', 'something_else_again.abc'},
+            'xyz+': {'xyz.1', 'xyz.2', 'abcdefgh'},
+            'cluster_3': {'a.foo', 'a.bar'},
+        }
+
+        self.maxDiff = None
+        got = ref_preparer.RefPreparer._rename_clusters(clusters_in)
+        self.assertEqual(expected, got)
+
+
     def test_run(self):
         '''test run'''
         fasta_in = [

diff --git a/scripts/ariba b/scripts/ariba
@@ -69,7 +69,7 @@ input_group.add_argument('-m', '--metadata', action='append', dest='tsv_files',
 
 cdhit_group = subparser_prepareref.add_argument_group('cd-hit options')
 cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster". Incompatible with --cdhit_clusters.')
-cdhit_group.add_argument('--cdhit_clusters', help='File specifying how the sequences should be clustered. Will be used instead of running cdhit. Format is one cluster per line. Sequence names separated by whitespace. First name in line is the cluster representative. Incompatible with --no_cdhit', metavar='FILENAME')
+cdhit_group.add_argument('--cdhit_clusters', help='File specifying how the sequences should be clustered. Will be used instead of running cdhit. Format is one cluster per line. Sequence names separated by whitespace. Incompatible with --no_cdhit', metavar='FILENAME')
 cdhit_group.add_argument('--cdhit_min_id', type=float, help='Sequence identity threshold (cd-hit option -c) [%(default)s]', default=0.9, metavar='FLOAT')
 cdhit_group.add_argument('--cdhit_min_length', type=float, help='length difference cutoff (cd-hit option -s) [%(default)s]', default=0.9, metavar='FLOAT')
 

diff --git a/setup.py b/setup.py
@@ -5,7 +5,6 @@
 from setuptools import setup, find_packages, Extension
 
 minimap_c_files = [
-    'minimap_ariba.cpp',
     'bseq.c',
     'index.c',
     'kthread.c',
@@ -15,14 +14,19 @@
     'sketch.c',
 ]
 
-minimap_c_files = [os.path.join('third_party', 'minimap', x) for x in minimap_c_files]
-minimap_mod = Extension("minimap_ariba", minimap_c_files, extra_link_args=['-lz'])
+minimap_c_files = [os.path.join('third_party', 'minimap-0.2', x) for x in minimap_c_files]
+minimap_c_files.append(os.path.join('ariba', 'ext', 'minimap_ariba.cpp'))
+minimap_mod = Extension(
+    "minimap_ariba",
+    minimap_c_files,
+    extra_link_args=['-lz'],
+    include_dirs=[os.path.join('third_party', 'minimap-0.2')],
+)
 
 fermilite_c_files = [
     'bfc.c',
     'bseq.c',
     'bubble.c',
-    'fml-asm_ariba.cpp',
     'htab.c',
     'ksw.c',
     'kthread.c',
@@ -35,7 +39,13 @@
     'unitig.c'
 ]
 fermilite_c_files = [os.path.join('third_party', 'fermi-lite-0.1', x) for x in fermilite_c_files]
-fermilite_mod = Extension("fermilite_ariba", fermilite_c_files, extra_link_args=['-lz'])
+fermilite_c_files.append(os.path.join('ariba', 'ext', 'fml-asm_ariba.cpp'))
+fermilite_mod = Extension(
+    "fermilite_ariba",
+    fermilite_c_files,
+    extra_link_args=['-lz'],
+    include_dirs=[os.path.join('third_party', 'fermi-lite-0.1')],
+)
 
 
 setup(

diff --git a/third_party/minimap/LICENSE.txt → third_party/minimap-0.2/LICENSE.txt b/third_party/minimap/LICENSE.txt → third_party/minimap-0.2/LICENSE.txt
diff --git a/third_party/minimap/README.md → third_party/minimap-0.2/README.md b/third_party/minimap/README.md → third_party/minimap-0.2/README.md
diff --git a/third_party/minimap/bseq.c → third_party/minimap-0.2/bseq.c b/third_party/minimap/bseq.c → third_party/minimap-0.2/bseq.c
diff --git a/third_party/minimap/bseq.h → third_party/minimap-0.2/bseq.h b/third_party/minimap/bseq.h → third_party/minimap-0.2/bseq.h
diff --git a/third_party/minimap/example.c → third_party/minimap-0.2/example.c b/third_party/minimap/example.c → third_party/minimap-0.2/example.c
diff --git a/third_party/minimap/index.c → third_party/minimap-0.2/index.c b/third_party/minimap/index.c → third_party/minimap-0.2/index.c
diff --git a/third_party/minimap/kdq.h → third_party/minimap-0.2/kdq.h b/third_party/minimap/kdq.h → third_party/minimap-0.2/kdq.h
diff --git a/third_party/minimap/khash.h → third_party/minimap-0.2/khash.h b/third_party/minimap/khash.h → third_party/minimap-0.2/khash.h
diff --git a/third_party/minimap/kseq.h → third_party/minimap-0.2/kseq.h b/third_party/minimap/kseq.h → third_party/minimap-0.2/kseq.h
diff --git a/third_party/minimap/ksort.h → third_party/minimap-0.2/ksort.h b/third_party/minimap/ksort.h → third_party/minimap-0.2/ksort.h
diff --git a/third_party/minimap/kthread.c → third_party/minimap-0.2/kthread.c b/third_party/minimap/kthread.c → third_party/minimap-0.2/kthread.c
diff --git a/third_party/minimap/kvec.h → third_party/minimap-0.2/kvec.h b/third_party/minimap/kvec.h → third_party/minimap-0.2/kvec.h
diff --git a/third_party/minimap/main.c → third_party/minimap-0.2/main.c b/third_party/minimap/main.c → third_party/minimap-0.2/main.c
diff --git a/third_party/minimap/map.c → third_party/minimap-0.2/map.c b/third_party/minimap/map.c → third_party/minimap-0.2/map.c
diff --git a/third_party/minimap/minimap.1 → third_party/minimap-0.2/minimap.1 b/third_party/minimap/minimap.1 → third_party/minimap-0.2/minimap.1
diff --git a/third_party/minimap/minimap.h → third_party/minimap-0.2/minimap.h b/third_party/minimap/minimap.h → third_party/minimap-0.2/minimap.h
diff --git a/third_party/minimap/misc.c → third_party/minimap-0.2/misc.c b/third_party/minimap/misc.c → third_party/minimap-0.2/misc.c
diff --git a/third_party/minimap/sdust.c → third_party/minimap-0.2/sdust.c b/third_party/minimap/sdust.c → third_party/minimap-0.2/sdust.c
diff --git a/third_party/minimap/sdust.h → third_party/minimap-0.2/sdust.h b/third_party/minimap/sdust.h → third_party/minimap-0.2/sdust.h
diff --git a/third_party/minimap/sketch.c → third_party/minimap-0.2/sketch.c b/third_party/minimap/sketch.c → third_party/minimap-0.2/sketch.c