Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add JUMP analysis figure #55

Merged
merged 9 commits into from
Mar 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
826 changes: 18 additions & 808 deletions 3.evaluate_model/process_jump_phenotype_profiles.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,10 @@
#
# 1. Load in this data from the JUMP-single-cell repo
# 2. Summarize replicate KS test metrics (mean value) and align across cell types and time variables
# 3. Explore the top results per phenotype/treatment_type/model_type
# 3. Explore the top results per phenotype/treatment_type/model_type (Supplementary Table S1)
# 4. Convert it to wide format
#
# This wide format represents a "phenotypic profile", which we can use similarly to an image-based morphology profile.
#
# We also fit a UMAP to this phenotypic profile for downstream visualization.

# In[1]:

Expand All @@ -33,46 +31,6 @@
# In[2]:


def umap_phenotype(
    phenotype_df: pd.DataFrame,
    feature_columns: List[str],
    metadata_columns: List[str],
    n_components: int,
    random_seed: int,
    model_type: str,
) -> pd.DataFrame:
    """
    Fit a UMAP model on a phenotype profile and return the embedding with metadata.

    Parameters:
    - phenotype_df (pd.DataFrame): Phenotype profile holding both the feature and metadata columns.
    - feature_columns (List[str]): Columns of phenotype_df passed to UMAP as input features.
    - metadata_columns (List[str]): Columns of phenotype_df copied unchanged into the output.
    - n_components (int): Number of UMAP dimensions; output columns are named UMAP0 ... UMAP{n_components - 1}.
    - random_seed (int): Value passed to UMAP as `random_state`.
    - model_type (str): Constant written to a 'model_type' column on every output row.

    Returns:
    - umap_embeddings_with_metadata_df (pd.DataFrame): One row per row of phenotype_df (same index),
      containing the metadata columns, the UMAP coordinates, and the 'model_type' column.
    """

    # Initialize UMAP
    umap_fit = umap.UMAP(random_state=random_seed, n_components=n_components)

    # Fit UMAP and convert to pandas DataFrame.
    # The embedding rows come back in the same order as the input rows, so reuse
    # the input index. Without this the embedding gets a fresh 0..n-1 index and the
    # column-wise concat below (which aligns on index labels) would mismatch rows
    # or introduce NaNs whenever phenotype_df does not carry a default RangeIndex.
    embeddings = pd.DataFrame(
        umap_fit.fit_transform(phenotype_df.loc[:, feature_columns]),
        columns=[f"UMAP{x}" for x in range(n_components)],
        index=phenotype_df.index,
    )

    # Combine with metadata and tag every row with the model type
    umap_embeddings_with_metadata_df = pd.concat(
        [phenotype_df.loc[:, metadata_columns], embeddings], axis=1
    ).assign(model_type=model_type)
    return umap_embeddings_with_metadata_df


# In[3]:


# Set file paths
# JUMP phenotype probabilities from AreaShape model
commit = "4225e427fd9da59159de69f53be65c31b4d4644a"
Expand All @@ -86,7 +44,7 @@ def umap_phenotype(
n_top_results_to_explore = 10


# In[4]:
# In[3]:


# Set output files
Expand All @@ -97,12 +55,10 @@ def umap_phenotype(
final_jump_phenotype_file = pathlib.Path(output_dir, "jump_phenotype_profiles.tsv.gz")
shuffled_jump_phenotype_file = pathlib.Path(output_dir, "jump_phenotype_profiles_shuffled.tsv.gz")

jump_umap_file = pathlib.Path(output_dir, "jump_phenotype_profiling_umap.tsv.gz")


# ## Load and process data

# In[5]:
# In[4]:


# Load KS test results and drop uninformative columns
Expand All @@ -115,7 +71,7 @@ def umap_phenotype(
jump_pred_df.head()


# In[6]:
# In[5]:


# Process data to match treatments and scores across cell types
Expand Down Expand Up @@ -163,7 +119,7 @@ def umap_phenotype(
jump_pred_compare_df.head()


# In[7]:
# In[6]:


# Focus on the top results for downstream interpretation
Expand All @@ -182,133 +138,30 @@ def umap_phenotype(

# ## Summarize data

# In[8]:
# In[7]:


# How many unique plates?
jump_pred_df.Metadata_Plate.nunique()


# In[9]:
# In[8]:


# How many different individual treatments?
jump_pred_df.query("Metadata_model_type == 'final'").treatment_type.value_counts()


# In[10]:
# In[9]:


# How many unique treatments per treatment type?
jump_pred_df.groupby("treatment_type").treatment.nunique()


# In[11]:
# In[10]:


# How many treatments with phenotype predictions?
jump_pred_df.query("Metadata_model_type == 'final'").phenotype.value_counts()


# ## Convert data to phenotypic profiles

# In[12]:


# Columns used as the pivot index below; after reset_index() they identify each
# row of the wide phenotypic profile.
metadata_columns = [
"Metadata_Plate",
"treatment",
"treatment_type",
"Cell_type",
"Time",
"Metadata_Well",
"cell_count"
]


# In[13]:


# Build the wide profile for the 'final' model:
# keep rows where Metadata_model_type is 'final', drop p_value, and pivot so that
# each phenotype becomes its own column holding comparison_metric_value.
jump_wide_final_df = (
jump_pred_df
.query("Metadata_model_type == 'final'")
.drop(columns=["p_value"])
.pivot(index=metadata_columns, columns="phenotype", values="comparison_metric_value")
.reset_index()
)

# Write the profile as a tab-separated file without the row index
jump_wide_final_df.to_csv(final_jump_phenotype_file, sep="\t", index=False)

# Report dimensions and preview the first rows
print(jump_wide_final_df.shape)
jump_wide_final_df.head()


# In[14]:


# Same procedure as above, applied to rows where Metadata_model_type is 'shuffled'
jump_wide_shuffled_df = (
jump_pred_df
.query("Metadata_model_type == 'shuffled'")
.drop(columns=["p_value"])
.pivot(index=metadata_columns, columns="phenotype", values="comparison_metric_value")
.reset_index()
)

# Write the shuffled profile as a tab-separated file without the row index
jump_wide_shuffled_df.to_csv(shuffled_jump_phenotype_file, sep="\t", index=False)

# Report dimensions and preview the first rows
print(jump_wide_shuffled_df.shape)
jump_wide_shuffled_df.head()


# ## Apply UMAP to phenotypic profiles

# In[15]:


# Seed and embedding dimensionality passed to both umap_phenotype() calls below
umap_random_seed = 123
umap_n_components = 2

# Every column of the wide 'final' profile that is not a metadata column is
# treated as a UMAP input feature
feature_columns = jump_wide_final_df.drop(columns=metadata_columns).columns.tolist()
print(len(feature_columns))


# In[16]:


# UMAP embedding of the 'final' model profile
umap_with_metadata_df = umap_phenotype(
phenotype_df=jump_wide_final_df,
feature_columns=feature_columns,
metadata_columns=metadata_columns,
n_components=umap_n_components,
random_seed=umap_random_seed,
model_type="final"
)


# In[17]:


# UMAP embedding of the 'shuffled' model profile (a separate fit from the one above).
# NOTE(review): feature_columns was derived from jump_wide_final_df; this assumes the
# shuffled profile has the same phenotype columns -- TODO confirm.
umap_shuffled_with_metadata_df = umap_phenotype(
phenotype_df=jump_wide_shuffled_df,
feature_columns=feature_columns,
metadata_columns=metadata_columns,
n_components=umap_n_components,
random_seed=umap_random_seed,
model_type="shuffled"
)


# In[18]:


# Output file
# Stack the two embeddings row-wise; the model_type column distinguishes them
umap_full_df = pd.concat([
umap_with_metadata_df,
umap_shuffled_with_metadata_df
], axis="rows")

# Write the combined embeddings as a tab-separated file without the row index
umap_full_df.to_csv(jump_umap_file, sep="\t", index=False)

# Report dimensions and preview the first rows
print(umap_full_df.shape)
umap_full_df.head()

Loading