Merge pull request biolab#82 from pavlin-policar/louvain-sorted-clusters

OWLouvain: Sort cluster names by number of instances
VesnaT · Feb 7, 2018 · abdd64e · abdd64e
2 parents 3a3e5b4 + 8ff50b4
commit abdd64e
Show file tree

Hide file tree

Showing 2 changed files with 49 additions and 2 deletions.
diff --git a/orangecontrib/single_cell/tests/test_owlouvain.py b/orangecontrib/single_cell/tests/test_owlouvain.py
@@ -0,0 +1,41 @@
+import numpy as np
+
+from Orange.data import Table, Domain
+from Orange.widgets.tests.base import WidgetTest
+from orangecontrib.single_cell.widgets.owlouvainclustering import \
+    OWLouvainClustering
+
+
+# Deterministic tests
+np.random.seed(42)
+
+
+class TestOWLouvain(WidgetTest):
+    def setUp(self):
+        self.widget = self.create_widget(
+            OWLouvainClustering, stored_settings={'auto_commit': False}
+        )
+
+    def tearDown(self):
+        self.widget.onDeleteWidget()
+        super().tearDown()
+
+    def test_clusters_ordered_by_size(self):
+        """Cluster names should be sorted based on the number of instances."""
+        x1 = np.array([[0, 0]] * 20)
+        x2 = np.array([[1, 0]] * 15)
+        x3 = np.array([[0, 1]] * 10)
+        x4 = np.array([[1, 1]] * 5)
+        data = np.vstack((x1, x2, x3, x4))
+        # Remove any order dependence in data, not that this should affect it
+        np.random.shuffle(data)
+
+        table = Table.from_numpy(domain=Domain.from_numpy(X=data), X=data)
+        self.send_signal(self.widget.Inputs.data, table)
+        self.widget.k_neighbours = 4
+        self.widget.commit(force=True)
+        output = self.get_output(self.widget.Outputs.annotated_data, wait=1000)
+
+        clustering = output.get_column_view('Cluster')[0].astype(int)
+        counts = np.bincount(clustering)
+        np.testing.assert_equal(counts, sorted(counts, reverse=True))
diff --git a/orangecontrib/single_cell/widgets/owlouvainclustering.py b/orangecontrib/single_cell/widgets/owlouvainclustering.py
@@ -292,14 +292,20 @@ def commit(self, force=False):
 
     def _send_data(self):
         domain = self.data.domain
+        # Compute the frequency of each cluster index
+        counts = np.bincount(self.partition)
+        indices = np.argsort(counts)[::-1]
+        index_map = {n: o for n, o in zip(indices, range(len(indices)))}
+        new_partition = list(map(index_map.get, self.partition))
+
         cluster_var = DiscreteVariable(
             get_next_name(domain, 'Cluster'),
-            values=['C%d' % (i + 1) for i, _ in enumerate(np.unique(self.partition))]
+            values=['C%d' % (i + 1) for i, _ in enumerate(np.unique(new_partition))]
         )
 
         new_domain = add_columns(domain, metas=[cluster_var])
         new_table = self.data.transform(new_domain)
-        new_table.get_column_view(cluster_var)[0][:] = self.partition
+        new_table.get_column_view(cluster_var)[0][:] = new_partition
         self.Outputs.annotated_data.send(new_table)
 
         if Graph is not None: