Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add multiple clustering resolution functionality #1095

Merged
merged 19 commits into from
Mar 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions src/ark/phenotyping/post_cluster_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
import pathlib
import itertools

from typing import List, Union

import matplotlib.pyplot as plt
Expand Down Expand Up @@ -167,3 +169,40 @@ def create_mantis_project(
img_sub_folder="",
seg_suffix_name=seg_suffix_name,
)


def generate_new_cluster_resolution(cell_table, cluster_col, new_cluster_col, cluster_mapping,
                                    save_path):
    """Add a new column of broader cell cluster assignments to the cell table.

    Each cell type found in `cluster_col` is replaced by the name of the group that lists it
    in `cluster_mapping`. The new column is added to `cell_table` in place and the updated
    table is then written to `save_path` without the index.

    Args:
        cell_table (pd.DataFrame): cell table with clustered cell populations
        cluster_col (str): column containing the cell phenotype
        new_cluster_col (str): name of the new column to create, must not already exist
        cluster_mapping (dict): dictionary with keys detailing the new cluster names and values
            listing which cell types to group together
        save_path (str): path of the csv file to save the new cell table to

    Raises:
        ValueError: if `new_cluster_col` already exists in `cell_table`, if any value of
            `cluster_mapping` is not a list, if the cell types listed in `cluster_mapping` do
            not match the ones found in `cluster_col`, or if a cell type is listed under more
            than one new cluster
    """
    # validation checks
    # NOTE(review): verify_in_list is expected to raise when cluster_col is missing — confirm
    misc_utils.verify_in_list(cluster_col=[cluster_col], cell_table_columns=cell_table.columns)
    if new_cluster_col in cell_table.columns:
        raise ValueError(f"The column {new_cluster_col} already exists in the cell table. "
                         f"Please specify a different name for the new column.")

    # a bare string would be split into single characters when the groups are flattened
    if not all(isinstance(group, list) for group in cluster_mapping.values()):
        raise ValueError("Please make sure all values of the dictionary specify a list.")

    cluster_list = list(itertools.chain.from_iterable(cluster_mapping.values()))
    misc_utils.verify_same_elements(
        specified_cell_clusters=cluster_list,
        cell_clusters_in_table=list(cell_table[cluster_col].unique()))

    # invert the mapping (original cell type -> new cluster name), rejecting ambiguous
    # assignments instead of letting the last group silently win
    cluster_lookup = {}
    for new_cluster, pops in cluster_mapping.items():
        for pop in pops:
            assigned_cluster = cluster_lookup.setdefault(pop, new_cluster)
            if assigned_cluster != new_cluster:
                raise ValueError(
                    f"The cell type {pop} is assigned to more than one new cluster "
                    f"({assigned_cluster} and {new_cluster}). Please make sure each cell type "
                    f"is listed in exactly one group.")

    # assign each cell to new cluster
    cell_table[new_cluster_col] = cell_table[cluster_col].map(cluster_lookup)

    # save updated cell table
    cell_table.to_csv(save_path, index=False)
119 changes: 112 additions & 7 deletions templates/4_Post_Clustering.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
"metadata": {},
camisowers marked this conversation as resolved.
Show resolved Hide resolved
camisowers marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

@alex-l-kong alex-l-kong Dec 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a better way to do this other than raw printing everything out? At the very least, I think this should be formatted so it lists a separate cluster name per row.


Reply via ReviewNB

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not really an automated way to do this since the user needs to manually define the cluster grouping dictionary. I'll fix it so it prints like below.

0              CD4T
1              CD8T
2     CD14_monocyte
3             Bcell
4             other
5     M2_macrophage
6      immune_other
7     M1_macrophage
8               APC
9            stroma
10      endothelium
11    Myofibroblast
12       tumor_ck17
13       tumor_ecad
Name: cell_meta_cluster, dtype: object

"source": [
"# Post-clustering tasks\n",
"This notebook allows the user to inspect and fine-tune the output of the [clustering](https://github.com/angelolab/ark-analysis/blob/main/templates/3_Cluster_Cells.ipynb) notebook. There are two parts of this notebook. \n",
"This notebook allows the user to inspect and fine-tune the output of the [clustering](https://github.com/angelolab/ark-analysis/blob/main/templates/3_Cluster_Cells.ipynb) notebook. There are three parts of this notebook. \n",
"1. Clustering cleanup: If there are clusters that were not properly separated during Pixie cell clustering, this provides the option of defining manual thresholds based on marker intensity to combine or separate specific clusters\n",
"2. Marker thresholding: For markers that were not directly used for clustering, but whose expression is important for phenotyping individual cells, this provides the user with visualization to determine accurate thresholds for positive/negative classification"
"2. Marker thresholding: For markers that were not directly used for clustering, but whose expression is important for phenotyping individual cells, this provides the user with visualization to determine accurate thresholds for positive/negative classification\n",
"3. Generate additional cell cluster column(s) to represent different resolutions of clustering data."
]
},
{
Expand All @@ -29,7 +30,8 @@
"import pandas as pd\n",
"\n",
"from ark.phenotyping.post_cluster_utils import (create_mantis_project,\n",
" plot_hist_thresholds)\n",
" plot_hist_thresholds, \n",
" generate_new_cluster_resolution)\n",
"from ark.utils import example_dataset"
]
},
Expand Down Expand Up @@ -92,7 +94,7 @@
"id": "6893ee36-47f4-492d-a7ef-70708c77952a",
"metadata": {},
"source": [
"## 1. Set variables\n",
"## Set variables\n",
"* `testing_fovs`: a list of FOVs to use for evaluating the post-clustering tasks\n",
"* `base_dir`: the path to all of your imaging data. Should be the same one used for the [cell clustering notebook](https://github.com/angelolab/ark-analysis/blob/main/templates/3_Cluster_Cells.ipynb)\n",
"* `cell_output_dir`: the path to the directory used for cell clustering\n",
Expand Down Expand Up @@ -165,7 +167,7 @@
"id": "74a4f3b4-b1cb-4f7e-b8eb-a980d80a3f00",
"metadata": {},
"source": [
"## 2. Clustering Cleanup\n",
"## 1. Clustering Cleanup\n",
"You may have noticed during the cell clustering process that certain populations need to be modified/adjusted. This part of the notebook allows you to identify proportions of a given cell population to be reassigned to a different one. "
]
},
Expand Down Expand Up @@ -324,7 +326,7 @@
"id": "ded3c307-b2be-42b7-a4eb-91e8837a0c37",
"metadata": {},
"source": [
"## 3. Marker thresholding\n",
"## 2. Marker thresholding\n",
"For markers like Ki67 or PD1, we generally don't include them in the clustering process. Instead, following clustering, we score each cell as positive or negative for these \"functional markers.\" This requires picking a threshold for each marker to determine positivity and negativity. Looking directly at the histograms can be useful for getting a general range for where the threshold lies, but picking a specific threshold based only on the histogram can be challenging. Instead, what is often most useful is simply looking at images to determine the specific cutoff value."
]
},
Expand Down Expand Up @@ -490,6 +492,109 @@
"# save thresholded cell table\n",
"cell_table.to_csv(os.path.join(post_cluster_dir, 'cell_table_thresholded.csv'))"
]
},
{
"cell_type": "markdown",
"id": "d0b1e5b3-dcb5-4fc3-af66-51645993a03a",
"metadata": {},
"source": [
"## 3. Generate Multiple Cluster Resolutions\n",
"In this section, you can create a new column of cluster labels, in order to represent higher-level cell information. You will need to set the following variables:\n",
"* `cluster_col`: the column containing the cell clusters you would like to group\n",
"* `new_cluster_col`: the name of a new column to be created with the higher-level (broader) clusters; it cannot be the same as any existing column in the cell table\n",
"* `cluster_mapping`: a dictionary detailing the groupings of the original cell clusters, where the keys are the new cluster names and the value lists contain the original cluster names\n",
"\n",
"You can re-run this section for as many new cluster columns as you would like to generate."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12faad12-aca8-4688-be91-d66e3867323c",
"metadata": {
"tags": [
"read_table"
]
},
"outputs": [],
"source": [
"cell_table = pd.read_csv(os.path.join(post_cluster_dir, 'cell_table_thresholded.csv'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b72e8ad8-48d7-40e1-9298-afcc19b93f23",
"metadata": {
"tags": [
"set_cluster_args"
]
},
"outputs": [],
"source": [
"cluster_col = 'cell_meta_cluster'\n",
"new_cluster_col = 'new_cell_clustering'\n",
"\n",
"save_path = os.path.join(post_cluster_dir, 'final_cell_table.csv')"
]
},
{
"cell_type": "markdown",
"id": "d87ca06a-ae08-4dc3-a8de-fd6806b77fbb",
"metadata": {},
"source": [
"The below code will provide the unique cluster names found in the column provided. Each of these must be included in a group in the `cluster_mapping` dictionary below."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e8b9d175-3006-4096-a1a7-d7902e8f8139",
"metadata": {
"tags": [
"print_clusters"
]
},
"outputs": [],
"source": [
"print(cell_table[cluster_col].drop_duplicates().reset_index(drop=True))"
]
},
{
"cell_type": "markdown",
"id": "e5499e56-8f65-45e5-a5f1-5a47e17beabb",
"metadata": {},
"source": [
"Using the listed clusters, define the new cluster groups and then generate a new column in the cell table."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "35c798ea-55bc-4699-b45a-bb22dc31124a",
"metadata": {
"tags": [
"cluster_mapping"
]
},
"outputs": [],
"source": [
"cluster_mapping = {'AB': ['A', 'B'], 'C': ['C']}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "54f1d61f-d469-453e-8049-047ec99c45d2",
"metadata": {
"tags": [
"generate_new_clusters"
]
},
"outputs": [],
"source": [
"generate_new_cluster_resolution(cell_table, cluster_col, new_cluster_col, cluster_mapping, save_path)"
]
}
],
"metadata": {
Expand All @@ -508,7 +613,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
"version": "3.11.6"
},
"vscode": {
"interpreter": {
Expand Down
53 changes: 50 additions & 3 deletions tests/phenotyping/post_cluster_utils_test.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import os
import pytest
import tempfile

import numpy as np
import pandas as pd
import pytest
import skimage.io as io
from alpineer import image_utils, test_utils
from ark import settings

from ark import settings
from test_utils import make_cell_table
from alpineer import image_utils, test_utils, misc_utils
from ark.phenotyping import post_cluster_utils


Expand Down Expand Up @@ -96,3 +98,48 @@ def test_create_mantis_project(tmp_path):
# mask should be non-zero in the same places as original
seg = io.imread(os.path.join(seg_dir, fov + "_whole_cell_test.tiff"))
assert np.array_equal(mask > 0, seg > 0)


def test_generate_new_cluster_resolution():
    """Check the saved output and the validation errors of generate_new_cluster_resolution."""
    with tempfile.TemporaryDirectory() as temp_dir:
        cell_table = make_cell_table(n_cells=20, n_markers=0)
        cluster_assignments = {'AB': ['A', 'B'], 'C': ['C']}
        new_path = os.path.join(temp_dir, 'new_table.csv')

        # generate and save a new cell table with new cell cluster resolution
        post_cluster_utils.generate_new_cluster_resolution(
            cell_table, cluster_col=settings.CELL_TYPE, new_cluster_col="new_clusters",
            cluster_mapping=cluster_assignments, save_path=new_path)

        new_table = pd.read_csv(new_path)

        # check new column exists
        assert "new_clusters" in new_table.columns

        # check for new cell cluster names
        assert misc_utils.verify_same_elements(
            intended_clusters=list(cluster_assignments.keys()),
            table_clusters=list(np.unique(new_table.new_clusters)))

        # check every cell was placed in the group that lists its original cell type
        for new_cluster, pops in cluster_assignments.items():
            in_group = new_table[settings.CELL_TYPE].isin(pops)
            assert (new_table.loc[in_group, "new_clusters"] == new_cluster).all()

        # check no cells were dropped
        assert len(cell_table[settings.CELL_LABEL]) == len(new_table[settings.CELL_LABEL])

        # check error raise when new_cluster_col already exists
        # (the call above added "new_clusters" to cell_table in place)
        with pytest.raises(ValueError):
            post_cluster_utils.generate_new_cluster_resolution(
                cell_table, cluster_col=settings.CELL_TYPE, new_cluster_col="new_clusters",
                cluster_mapping=cluster_assignments, save_path=new_path)

        # check error raise when cell types missing from assignment dict
        missing_assignments = {'A': ['A'], 'C': ['C']}
        with pytest.raises(ValueError):
            post_cluster_utils.generate_new_cluster_resolution(
                cell_table, cluster_col=settings.CELL_TYPE, new_cluster_col="new_clusters_bad",
                cluster_mapping=missing_assignments, save_path=new_path)

        # check error raise when dict value is not list; every cell type is still covered so
        # that only the list check can be responsible for the error
        non_list_assignments = {'AB': ['A', 'B'], 'C': 'C'}
        with pytest.raises(ValueError):
            post_cluster_utils.generate_new_cluster_resolution(
                cell_table, cluster_col=settings.CELL_TYPE, new_cluster_col="new_clusters_bad",
                cluster_mapping=non_list_assignments, save_path=new_path)
22 changes: 22 additions & 0 deletions tests/utils/notebooks_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -738,6 +738,28 @@ def test_threshold_list_vars(self):
def test_cell_table_threshold(self):
    """Execute the notebook cell tagged `cell_table_threshold`."""
    cell_tag = "cell_table_threshold"
    self.tb.execute_cell(cell_tag)

def test_read_table(self):
    """Execute the notebook cell tagged `read_table`, which loads the thresholded cell table."""
    cell_tag = "read_table"
    self.tb.execute_cell(cell_tag)

def test_set_cluster_args(self):
    """Execute the notebook cell tagged `set_cluster_args`, which sets the column names."""
    cell_tag = "set_cluster_args"
    self.tb.execute_cell(cell_tag)

def test_print_clusters(self):
    """Execute the notebook cell tagged `print_clusters`, which lists the unique clusters."""
    cell_tag = "print_clusters"
    self.tb.execute_cell(cell_tag)

def test_cluster_mapping(self):
    """Inject a cluster grouping dictionary after the notebook cell tagged `cluster_mapping`."""
    # new cluster name -> original cell types grouped under it
    grouping = {
        'A': ['CD4T', 'CD8T', 'CD14_monocyte', 'Bcell'],
        'B': ['other', 'M2_macrophage', 'M1_macrophage', 'APC', 'stroma'],
        'C': ['immune_other', 'endothelium', 'Myofibroblast', 'tumor_ck17'],
        'D': ['tumor_ecad'],
    }
    # the dictionary is rendered into the injected code through its string form
    injected_code = f"\ncluster_mapping = {grouping}\n"
    self.tb.inject(injected_code, "cluster_mapping")

def test_generate_new_clusters(self):
    """Execute the notebook cell tagged `generate_new_clusters`, which builds the new column."""
    cell_tag = "generate_new_clusters"
    self.tb.execute_cell(cell_tag)


class Test_EZSegmenter:
"""
Expand Down
Loading