angelolab · ngreenwald · Mar 1, 2024 · Dec 6, 2023 · Dec 12, 2023 · Dec 12, 2023
diff --git a/src/ark/phenotyping/post_cluster_utils.py b/src/ark/phenotyping/post_cluster_utils.py
@@ -1,5 +1,7 @@
 import os
 import pathlib
+import itertools
+
 from typing import List, Union
 
 import matplotlib.pyplot as plt
@@ -167,3 +169,40 @@ def create_mantis_project(
         img_sub_folder="",
         seg_suffix_name=seg_suffix_name,
     )
+
+
+def generate_new_cluster_resolution(cell_table, cluster_col, new_cluster_col, cluster_mapping,
+                                    save_path):
+    """Add a column of broader cell cluster assignments to the cell table (in place) and save it.
+
+    Args:
+        cell_table (pd.DataFrame): cell table with clustered cell populations
+        cluster_col (str): column containing the cell phenotype
+        new_cluster_col (str): new column to create, cannot already exist in the cell table
+        cluster_mapping (dict): dictionary with keys detailing the new cluster names and values
+            listing which cell types to group together, each cell type in exactly one group
+        save_path (str): where to save the new cell table
+
+    Raises:
+        ValueError: if new_cluster_col already exists or cluster_mapping is not valid
+    """
+    # validation checks
+    misc_utils.verify_in_list(cluster_col=[cluster_col], cell_table_columns=cell_table.columns)
+    if new_cluster_col in cell_table.columns:
+        raise ValueError(f"The column {new_cluster_col} already exists in the cell table. "
+                         "Please specify a different name for the new column.")
+    if not all(isinstance(group, list) for group in cluster_mapping.values()):
+        raise ValueError("Please make sure all values of the dictionary specify a list.")
+    cluster_list = list(itertools.chain.from_iterable(cluster_mapping.values()))
+    if len(set(cluster_list)) != len(cluster_list):  # later groups would overwrite earlier ones
+        raise ValueError("Please make sure each cell type is only listed once in the dictionary.")
+    misc_utils.verify_same_elements(specified_cell_clusters=cluster_list,
+                                    cell_clusters_in_table=list(cell_table[cluster_col].unique()))
+
+    # assign each cell to new cluster
+    for new_cluster, pops in cluster_mapping.items():
+        idx = np.isin(cell_table[cluster_col].values, pops)
+        cell_table.loc[idx, new_cluster_col] = new_cluster
+
+    # save updated cell table
+    cell_table.to_csv(save_path, index=False)
diff --git a/templates/4_Post_Clustering.ipynb b/templates/4_Post_Clustering.ipynb
@@ -6,9 +6,10 @@
    "metadata": {},
    "source": [
     "# Post-clustering tasks\n",
-    "This notebook allows the user to inspect and fine-tune the output of the [clustering](https://github.com/angelolab/ark-analysis/blob/main/templates/3_Cluster_Cells.ipynb) notebook. There are two parts of this notebook. \n",
+    "This notebook allows the user to inspect and fine-tune the output of the [clustering](https://github.com/angelolab/ark-analysis/blob/main/templates/3_Cluster_Cells.ipynb) notebook. There are three parts of this notebook. \n",
     "1. Clustering cleanup: If there are clusters that were not properly separated during Pixie cell clustering, this provides the option of defining manual thresholds based on marker intensity to combine or separate specific clusters\n",
-    "2. Marker thresholding: For markers that were not directly used for clustering, but whose expression is important for phenotyping individual cells, this provides the user with visualization to determine accurate thresholds for positive/negative classification"
+    "2. Marker thresholding: For markers that were not directly used for clustering, but whose expression is important for phenotyping individual cells, this provides the user with visualization to determine accurate thresholds for positive/negative classification\n",
+    "3. Generate additional cell cluster column(s) to represent different resolutions of clustering data."
    ]
   },
   {
@@ -29,7 +30,8 @@
     "import pandas as pd\n",
     "\n",
     "from ark.phenotyping.post_cluster_utils import (create_mantis_project,\n",
-    "                                                plot_hist_thresholds)\n",
+    "                                                plot_hist_thresholds, \n",
+    "                                                generate_new_cluster_resolution)\n",
     "from ark.utils import example_dataset"
    ]
   },
@@ -92,7 +94,7 @@
    "id": "6893ee36-47f4-492d-a7ef-70708c77952a",
    "metadata": {},
    "source": [
-    "## 1. Set variables\n",
+    "## Set variables\n",
     "* `testing_fovs`: a list of FOVs to use for evaluating the post-clustering tasks\n",
     "* `base_dir`: the path to all of your imaging data. Should be the same one used for the [cell clustering notebook](https://github.com/angelolab/ark-analysis/blob/main/templates_ark/3_Cluster_Cells.ipynb)\n",
     "* `cell_output_dir`: the path to the directory used for cell clustering\n",
@@ -165,7 +167,7 @@
    "id": "74a4f3b4-b1cb-4f7e-b8eb-a980d80a3f00",
    "metadata": {},
    "source": [
-    "## 2. Clustering Cleanup\n",
+    "## 1. Clustering Cleanup\n",
     "You may have noticed during the cell clustering process that certain populations need to be modfified/adjusted. This part of the notebook allows you to identify proportions of a given cell population to be reassigned to a different one. "
    ]
   },
@@ -324,7 +326,7 @@
    "id": "ded3c307-b2be-42b7-a4eb-91e8837a0c37",
    "metadata": {},
    "source": [
-    "## 3. Marker thresholding\n",
+    "## 2. Marker thresholding\n",
     "For markers like Ki67 or PD1, we generally don't include them in the clustering process. Intead, following clustering, we score each cell as positive or negative for these \"functional markers.\" This requires picking a threshold for each marker to determine positivity and negativity. Looking directly at the histograms can be useful for getting a general range for where the threshold lies, but picking a specific threshold based only on the histogram can be challenging. Instead, what is often most useful is simply looking at images to determine the specific cutoff value."
    ]
   },
@@ -490,6 +492,109 @@
     "# save thresholded cell table\n",
     "cell_table.to_csv(os.path.join(post_cluster_dir, 'cell_table_thresholded.csv'))"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d0b1e5b3-dcb5-4fc3-af66-51645993a03a",
+   "metadata": {},
+   "source": [
+    "## 3. Generate Multiple Cluster Resolutions\n",
+    "In this section, you can create a new column of cluster labels, in order to represent higher-level cell information. You will need to set the following variables:\n",
+    "* `cluster_col`: the column containing the cell clusters you would like to group\n",
+    "* `new_cluster_col`: the name of a new column to be created with the higher-level clusters, cannot be the same as any existing column in the cell table\n",
+    "* `cluster_mapping`: a dictionary detailing the groupings of the original cell clusters, where the keys are the new cluster names and the value lists contain the original cluster names\n",
+    "\n",
+    "You can re-run this section for as many new cluster columns as you would like to generate, using a different `new_cluster_col` name each time."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "12faad12-aca8-4688-be91-d66e3867323c",
+   "metadata": {
+    "tags": [
+     "read_table"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "cell_table = pd.read_csv(os.path.join(post_cluster_dir, 'cell_table_thresholded.csv'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b72e8ad8-48d7-40e1-9298-afcc19b93f23",
+   "metadata": {
+    "tags": [
+     "set_cluster_args"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "cluster_col = 'cell_meta_cluster'\n",
+    "new_cluster_col = 'new_cell_clustering'\n",
+    "\n",
+    "save_path = os.path.join(post_cluster_dir, 'final_cell_table.csv')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d87ca06a-ae08-4dc3-a8de-fd6806b77fbb",
+   "metadata": {},
+   "source": [
+    "The below code will provide the unique cluster names found in the column provided. Each of these must be included in a group in the `cluster_mapping` dictionary below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e8b9d175-3006-4096-a1a7-d7902e8f8139",
+   "metadata": {
+    "tags": [
+     "print_clusters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "print(cell_table[cluster_col].drop_duplicates().reset_index(drop=True))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e5499e56-8f65-45e5-a5f1-5a47e17beabb",
+   "metadata": {},
+   "source": [
+    "Using the listed clusters, define the new cluster groups and then generate a new column in the cell table."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "35c798ea-55bc-4699-b45a-bb22dc31124a",
+   "metadata": {
+    "tags": [
+     "cluster_mapping"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "cluster_mapping = {'AB': ['A', 'B'], 'C': ['C']}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "54f1d61f-d469-453e-8049-047ec99c45d2",
+   "metadata": {
+    "tags": [
+     "generate_new_clusters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "generate_new_cluster_resolution(cell_table, cluster_col, new_cluster_col, cluster_mapping, save_path)"
+   ]
   }
  ],
  "metadata": {
@@ -508,7 +613,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.11.6"
   },
   "vscode": {
    "interpreter": {

diff --git a/tests/phenotyping/post_cluster_utils_test.py b/tests/phenotyping/post_cluster_utils_test.py
@@ -1,12 +1,14 @@
 import os
+import pytest
+import tempfile
 
 import numpy as np
 import pandas as pd
-import pytest
 import skimage.io as io
-from alpineer import image_utils, test_utils
-from ark import settings
 
+from ark import settings
+from test_utils import make_cell_table
+from alpineer import image_utils, test_utils, misc_utils
 from ark.phenotyping import post_cluster_utils
 
 
@@ -96,3 +98,48 @@ def test_create_mantis_project(tmp_path):
         # mask should be non-zero in the same places as original
         seg = io.imread(os.path.join(seg_dir, fov + "_whole_cell_test.tiff"))
         assert np.array_equal(mask > 0, seg > 0)
+
+
+def test_generate_new_cluster_resolution():
+    """Check the new cluster column is created and saved, and that invalid inputs raise."""
+    with tempfile.TemporaryDirectory() as temp_dir:
+        cell_table = make_cell_table(n_cells=20, n_markers=0)
+        cluster_assignments = {'AB': ['A', 'B'], 'C': ['C']}
+        new_path = os.path.join(temp_dir, 'new_table.csv')
+
+        # generate and save a new cell table with new cell cluster resolution
+        post_cluster_utils.generate_new_cluster_resolution(
+            cell_table, cluster_col=settings.CELL_TYPE, new_cluster_col="new_clusters",
+            cluster_mapping=cluster_assignments, save_path=new_path)
+        new_table = pd.read_csv(new_path)
+
+        # check new column exists
+        assert "new_clusters" in new_table.columns
+
+        # check for new cell cluster names
+        assert misc_utils.verify_same_elements(
+            intended_clusters=list(cluster_assignments.keys()),
+            table_clusters=list(np.unique(new_table.new_clusters)))
+
+        # check no cells were dropped
+        assert len(cell_table[settings.CELL_LABEL]) == len(new_table[settings.CELL_LABEL])
+
+        # check error raise when new_cluster_col already exists (column was added in place above)
+        with pytest.raises(ValueError):
+            post_cluster_utils.generate_new_cluster_resolution(
+                cell_table, cluster_col=settings.CELL_TYPE, new_cluster_col="new_clusters",
+                cluster_mapping=cluster_assignments, save_path=new_path)
+
+        # check error raise when cell types missing from assignment dict
+        missing_assignments = {'A': ['A'], 'C': ['C']}
+        with pytest.raises(ValueError):
+            post_cluster_utils.generate_new_cluster_resolution(
+                cell_table, cluster_col=settings.CELL_TYPE, new_cluster_col="new_clusters_bad",
+                cluster_mapping=missing_assignments, save_path=new_path)
+
+        # check error raise when dict value is not list (all cell types are still covered)
+        non_list_assignments = {'A': ['A', 'B'], 'C': 'C'}
+        with pytest.raises(ValueError):
+            post_cluster_utils.generate_new_cluster_resolution(
+                cell_table, cluster_col=settings.CELL_TYPE, new_cluster_col="new_clusters_bad",
+                cluster_mapping=non_list_assignments, save_path=new_path)
diff --git a/tests/utils/notebooks_test.py b/tests/utils/notebooks_test.py
@@ -738,6 +738,28 @@ def test_threshold_list_vars(self):
     def test_cell_table_threshold(self):
         self.tb.execute_cell("cell_table_threshold")
 
+    def test_read_table(self):
+        self.tb.execute_cell("read_table")  # notebook cell that reloads the thresholded table
+
+    def test_set_cluster_args(self):
+        self.tb.execute_cell("set_cluster_args")  # notebook cell setting column names, save_path
+
+    def test_print_clusters(self):
+        self.tb.execute_cell("print_clusters")  # notebook cell printing the unique cluster names
+
+    def test_cluster_mapping(self):
+        mapping = {  # new cluster name -> original cell clusters to group
+            'A': ['CD4T', 'CD8T', 'CD14_monocyte', 'Bcell'],
+            'B': ['other', 'M2_macrophage', 'M1_macrophage', 'APC', 'stroma'],
+            'C': ['immune_other', 'endothelium', 'Myofibroblast', 'tumor_ck17'],
+            'D': ['tumor_ecad']}
+        self.tb.inject(f"""
+            cluster_mapping = {mapping}
+        """, "cluster_mapping")
+
+    def test_generate_new_clusters(self):
+        self.tb.execute_cell("generate_new_clusters")  # cell calling generate_new_cluster_resolution
+
 
 class Test_EZSegmenter:
     """