WayScience · gwaybio · Mar 8, 2024 · Feb 11, 2024 · Feb 22, 2024 · Feb 23, 2024
diff --git a/...e_model/jump_phenotype_profiles/jump_compare_cell_types_and_time_across_phenotypes.tsv.gz b/...e_model/jump_phenotype_profiles/jump_compare_cell_types_and_time_across_phenotypes.tsv.gz
diff --git a/3.evaluate_model/jump_phenotype_profiles/jump_phenotype_profiles.tsv.gz b/3.evaluate_model/jump_phenotype_profiles/jump_phenotype_profiles.tsv.gz
diff --git a/3.evaluate_model/jump_phenotype_profiles/jump_phenotype_profiles_shuffled.tsv.gz b/3.evaluate_model/jump_phenotype_profiles/jump_phenotype_profiles_shuffled.tsv.gz
diff --git a/3.evaluate_model/jump_phenotype_profiles/jump_phenotype_profiling_umap.tsv.gz b/3.evaluate_model/jump_phenotype_profiles/jump_phenotype_profiling_umap.tsv.gz
diff --git a/3.evaluate_model/process_jump_phenotype_profiles.ipynb b/3.evaluate_model/process_jump_phenotype_profiles.ipynb
diff --git a/3.evaluate_model/scripts/nbconverted/process_jump_phenotype_profiles.py b/3.evaluate_model/scripts/nbconverted/process_jump_phenotype_profiles.py
@@ -13,12 +13,10 @@
 # 
 # 1. Load in this data from the JUMP-single-cell repo
 # 2. Summarize replicate KS test metrics (mean value) and align across cell types and time variables
-# 3. Explore the top results per phenotype/treatment_type/model_type
+# 3. Explore the top results per phenotype/treatment_type/model_type (Supplementary Table S1)
 # 4. Convert it to wide format
 # 
 # This wide format represents a "phenotypic profile" which we can use similarly as an image-based morphology profile.
-# 
-# We also fit a UMAP to this phenotypic profile for downstream visualization.
 
 # In[1]:
 
@@ -33,46 +31,6 @@
 # In[2]:
 
 
-def umap_phenotype(
-    phenotype_df: pd.DataFrame,
-    feature_columns: List[str],
-    metadata_columns: List[str],
-    n_components: int,
-    random_seed: int,
-    model_type: str
-) -> pd.DataFrame:
-    """
-    Fit a UMAP (Uniform Manifold Approximation and Projection) model on the provided phenotype profile and return a transformed DataFrame with metadata.
-
-    Parameters:
-    - phenotype_df (pd.DataFrame): DataFrame containing the phenotype profile with both feature and metadata columns.
-    - feature_columns (List[str]): List of column names in phenotype_df that represent the features to be used for UMAP embedding.
-    - metadata_columns (List[str]): List of column names in phenotype_df that represent metadata to be retained in the output.
-    - n_components (int): Number of dimensions for the UMAP embedding.
-    - random_seed (int): Random seed for reproducibility of the UMAP model.
-    - model_type (str): Identifier for the model type, to be added as a column in the output DataFrame.
-
-    Returns:
-    - umap_embeddings_with_metadata_df (pd.DataFrame): DataFrame with UMAP embeddings and specified metadata columns, including an additional 'model_type' column.
-    """
-
-    # Initialize UMAP
-    umap_fit = umap.UMAP(random_state=random_seed, n_components=n_components)
-
-    # Fit UMAP and convert to pandas DataFrame
-    embeddings = pd.DataFrame(
-        umap_fit.fit_transform(phenotype_df.loc[:, feature_columns]),
-        columns=[f"UMAP{x}" for x in range(0, n_components)],
-    )
-
-    # Combine with metadata
-    umap_embeddings_with_metadata_df = pd.concat([phenotype_df.loc[:, metadata_columns], embeddings], axis=1).assign(model_type=model_type)
-    return umap_embeddings_with_metadata_df
-
-
-# In[3]:
-
-
 # Set file paths
 # JUMP phenotype probabilities from AreaShape model
 commit = "4225e427fd9da59159de69f53be65c31b4d4644a"
@@ -86,7 +44,7 @@ def umap_phenotype(
 n_top_results_to_explore = 10
 
 
-# In[4]:
+# In[3]:
 
 
 # Set output files
@@ -97,12 +55,10 @@ def umap_phenotype(
 final_jump_phenotype_file = pathlib.Path(output_dir, "jump_phenotype_profiles.tsv.gz")
 shuffled_jump_phenotype_file = pathlib.Path(output_dir, "jump_phenotype_profiles_shuffled.tsv.gz")
 
-jump_umap_file = pathlib.Path(output_dir, "jump_phenotype_profiling_umap.tsv.gz")
-
 
 # ## Load and process data
 
-# In[5]:
+# In[4]:
 
 
 # Load KS test results and drop uninformative columns
@@ -115,7 +71,7 @@ def umap_phenotype(
 jump_pred_df.head()
 
 
-# In[6]:
+# In[5]:
 
 
 # Process data to match treatments and scores across cell types
@@ -163,7 +119,7 @@ def umap_phenotype(
 jump_pred_compare_df.head()
 
 
-# In[7]:
+# In[6]:
 
 
 # Focus on the top results for downstream interpretation
@@ -182,133 +138,30 @@ def umap_phenotype(
 
 # ## Summarize data
 
-# In[8]:
+# In[7]:
 
 
 # How many unique plates?
 jump_pred_df.Metadata_Plate.nunique()
 
 
-# In[9]:
+# In[8]:
 
 
 # How many different individual treatments?
 jump_pred_df.query("Metadata_model_type == 'final'").treatment_type.value_counts()
 
 
-# In[10]:
+# In[9]:
 
 
 # How many unique treatments per treatment type?
 jump_pred_df.groupby("treatment_type").treatment.nunique()
 
 
-# In[11]:
+# In[10]:
 
 
 # How many treatments with phenotype predictions?
 jump_pred_df.query("Metadata_model_type == 'final'").phenotype.value_counts()
 
-
-# ## Convert data to phenotypic profiles
-
-# In[12]:
-
-
-metadata_columns = [
-    "Metadata_Plate",
-    "treatment",
-    "treatment_type",
-    "Cell_type",
-    "Time",
-    "Metadata_Well",
-    "cell_count"
-]
-
-
-# In[13]:
-
-
-jump_wide_final_df = (
-    jump_pred_df
-    .query("Metadata_model_type == 'final'")
-    .drop(columns=["p_value"])
-    .pivot(index=metadata_columns, columns="phenotype", values="comparison_metric_value")
-    .reset_index()
-)
-
-jump_wide_final_df.to_csv(final_jump_phenotype_file, sep="\t", index=False)
-
-print(jump_wide_final_df.shape)
-jump_wide_final_df.head()
-
-
-# In[14]:
-
-
-jump_wide_shuffled_df = (
-    jump_pred_df
-    .query("Metadata_model_type == 'shuffled'")
-    .drop(columns=["p_value"])
-    .pivot(index=metadata_columns, columns="phenotype", values="comparison_metric_value")
-    .reset_index()
-)
-
-jump_wide_shuffled_df.to_csv(shuffled_jump_phenotype_file, sep="\t", index=False)
-
-print(jump_wide_shuffled_df.shape)
-jump_wide_shuffled_df.head()
-
-
-# ## Apply UMAP to phenotypic profiles
-
-# In[15]:
-
-
-umap_random_seed = 123
-umap_n_components = 2
-
-feature_columns = jump_wide_final_df.drop(columns=metadata_columns).columns.tolist()
-print(len(feature_columns))
-
-
-# In[16]:
-
-
-umap_with_metadata_df = umap_phenotype(
-    phenotype_df=jump_wide_final_df,
-    feature_columns=feature_columns,
-    metadata_columns=metadata_columns,
-    n_components=umap_n_components,
-    random_seed=umap_random_seed,
-    model_type="final"
-)
-
-
-# In[17]:
-
-
-umap_shuffled_with_metadata_df = umap_phenotype(
-    phenotype_df=jump_wide_shuffled_df,
-    feature_columns=feature_columns,
-    metadata_columns=metadata_columns,
-    n_components=umap_n_components,
-    random_seed=umap_random_seed,
-    model_type="shuffled"
-)
-
-
-# In[18]:
-
-
-# Output file
-umap_full_df = pd.concat([
-    umap_with_metadata_df,
-    umap_shuffled_with_metadata_df
-], axis="rows")
-
-umap_full_df.to_csv(jump_umap_file, sep="\t", index=False)
-
-print(umap_full_df.shape)
-umap_full_df.head()
-