Get SCM Predictions and LOIO Probabilities #29

Merged

Changes from all commits
8 changes: 4 additions & 4 deletions 3.evaluate_model/README.md
@@ -6,14 +6,14 @@ After training the models in [2.train_model](../2.train_model/), we use these mo
Evaluations are performed for each combination of model type (final, shuffled baseline), feature type (CP, DP, CP_and_DP), and dataset (train, test).

In [get_model_predictions.ipynb](get_model_predictions.ipynb), we derive the predicted and true phenotypic class for each model, feature type, and dataset combination.
These predictions are saved in [predictions](predictions/).
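As a rough sketch, deriving these predictions for one combination might look like the following (the data path and `final__CP.joblib` model name are illustrative, not the exact notebook code; `get_features_data` and `get_X_y_data` are this repo's helpers):

```python
import pathlib
import sys

import pandas as pd
from joblib import load

sys.path.append("../utils")
from split_utils import get_features_data
from train_utils import get_X_y_data

# load the labeled single-cell data and one trained model (hypothetical paths)
labeled_data = get_features_data(pathlib.Path("../0.download_data/data/labeled_data.csv.gz"))
model = load(pathlib.Path("../2.train_model/models/multi_class_models/final__CP.joblib"))

# derive the true and predicted phenotypic class for this combination
X, y_true = get_X_y_data(labeled_data, "CP")
predictions = pd.DataFrame({"True_Label": y_true, "Predicted_Label": model.predict(X)})
```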

In [confusion_matrices.ipynb](confusion_matrices.ipynb), we evaluate these sets of predictions with a confusion matrix to see the true/false positives and negatives (see [sklearn.metrics.confusion_matrix](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html) for more details).
The confusion matrix data are saved to [confusion_matrices](evaluations/confusion_matrices/).
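A minimal example of the underlying sklearn call (the phenotypic class labels here are made up for illustration):

```python
import pandas as pd
from sklearn.metrics import confusion_matrix

# illustrative true and predicted phenotypic classes for a few cells
y_true = ["Anaphase", "Interphase", "Interphase", "Prometaphase"]
y_pred = ["Anaphase", "Interphase", "Prometaphase", "Prometaphase"]

labels = sorted(set(y_true))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=labels),
    index=labels,  # rows are true classes
    columns=labels,  # columns are predicted classes
)
print(cm)
```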

In [F1_scores.ipynb](F1_scores.ipynb), we evaluate each model (final, shuffled baseline) trained with each feature type (CP, DP, CP_and_DP) on each dataset (train, test, etc.) to determine phenotypic and weighted F1 scores.
The F1 score measures the model's precision and recall performance for each phenotypic class (see [sklearn.metrics.f1_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html) for more details).
The F1 score data are saved to [F1_scores](evaluations/F1_scores/).
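For example, with the same illustrative labels as above, both score types come from `sklearn.metrics.f1_score` via its `average` parameter:

```python
from sklearn.metrics import f1_score

y_true = ["Anaphase", "Interphase", "Interphase", "Prometaphase"]
y_pred = ["Anaphase", "Interphase", "Prometaphase", "Prometaphase"]

# phenotypic F1 scores: one score per class
per_class_f1 = f1_score(y_true, y_pred, average=None, labels=sorted(set(y_true)))

# weighted F1 score: per-class scores averaged, weighted by class frequency
weighted_f1 = f1_score(y_true, y_pred, average="weighted")
```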

In [class_PR_curves.ipynb](class_PR_curves.ipynb), we use [sklearn.metrics.precision_recall_curve](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html) to derive the precision-recall curves for each model, feature type, and dataset combination.
These PR curves are created for each label type of the logistic regression model.
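Since the logistic regression is multi-class, each label's curve is derived one-vs-rest from that label's predicted probability column; a minimal sketch with made-up scores:

```python
import numpy as np
from sklearn.metrics import precision_recall_curve

# 1 = cell truly belongs to this phenotypic class, 0 = it does not
y_true = np.array([1, 0, 1, 1, 0, 1])
# the model's predicted probability for this class (one column of predict_proba)
y_score = np.array([0.9, 0.4, 0.65, 0.8, 0.2, 0.55])

precision, recall, thresholds = precision_recall_curve(y_true, y_score)
```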
@@ -31,7 +31,7 @@ The LOIO evaluation procedure is as follows:
- Train a logistic regression model with optimal hyperparameters (`C` and `l1_ratio`) on every cell that is **not** in the specific image.
- Predict probabilities on every cell that **is** in the specific image.

The probabilities are saved to [LOIO_probas](evaluations/LOIO_probas/).
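Condensed, the loop looks roughly like the notebook's implementation (here `labeled_data`, `get_X_y_data`, and the tuned `model` follow the earlier sketch, with the feature type fixed to CP for brevity):

```python
from sklearn.linear_model import LogisticRegression

LOIO_probas = []
for image_path in labeled_data["Metadata_DNA"].unique():
    # every cell from the held-out image is for testing, the rest are for training
    train_cells = labeled_data.loc[labeled_data["Metadata_DNA"] != image_path]
    test_cells = labeled_data.loc[labeled_data["Metadata_DNA"] == image_path]

    X_train, y_train = get_X_y_data(train_cells, "CP")
    X_test, _ = get_X_y_data(test_cells, "CP")

    # refit with the optimal hyperparameters found for the final model
    LOIO_model = LogisticRegression(
        penalty="elasticnet",
        solver="saga",
        max_iter=100,
        n_jobs=-1,
        random_state=0,
        C=model.C,
        l1_ratio=model.l1_ratio,
    ).fit(X_train, y_train)

    LOIO_probas.append(LOIO_model.predict_proba(X_test))
```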

**Notes:**
1) Intermediate `.tsv` data are stored in tidy format, a standardized data structure (see [Tidy Data](https://vita.had.co.nz/papers/tidy-data.pdf) by Hadley Wickham for more details).
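For example, wide per-class probability columns melt into one (cell, label, probability) row per observation:

```python
import pandas as pd

# wide: one probability column per phenotypic class (values are illustrative)
wide = pd.DataFrame(
    {
        "Cell_UUID": ["a", "b"],
        "Anaphase": [0.7, 0.1],
        "Interphase": [0.3, 0.9],
    }
)

# tidy long: one row per (cell, predicted label) pair
tidy = pd.melt(
    wide,
    id_vars=["Cell_UUID"],
    var_name="Predicted_Label",
    value_name="Predicted_Probability",
)
```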
257,581 changes: 257,581 additions & 0 deletions 3.evaluate_model/evaluations/LOIO_probas/compiled_SCM_LOIO_probabilites.tsv

Large diffs are not rendered by default.

447 changes: 436 additions & 11 deletions 3.evaluate_model/get_LOIO_probabilities.ipynb

Large diffs are not rendered by default.

551 changes: 498 additions & 53 deletions 3.evaluate_model/get_model_predictions.ipynb

Large diffs are not rendered by default.

291,925 changes: 291,925 additions & 0 deletions 3.evaluate_model/predictions/compiled_SCM_predictions.tsv

Large diffs are not rendered by default.

34,346 changes: 17,173 additions & 17,173 deletions 3.evaluate_model/predictions/compiled_predictions.tsv

Large diffs are not rendered by default.

179 changes: 162 additions & 17 deletions 3.evaluate_model/scripts/nbconverted/get_LOIO_probabilities.py
@@ -1,26 +1,34 @@
#!/usr/bin/env python
# coding: utf-8

# ### Load Libraries
#

# In[1]:


import pathlib
import warnings
import sys
import itertools

import pandas as pd
from joblib import load

from sklearn.linear_model import LogisticRegression
from sklearn.utils import parallel_backend
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import f1_score

sys.path.append("../utils")
from split_utils import get_features_data
from train_utils import get_X_y_data
from evaluate_utils import get_SCM_model_data


# ### Load/Preview Labeled Data
#

# In[2]:


@@ -41,21 +49,22 @@
print(f"There are {num_images} images to perform LOIO evaluation on per model.")


# ### Get LOIO probabilities
#

# In[4]:


# directory to load the models from
models_dir = pathlib.Path("../2.train_model/models/multi_class_models")

# use a list to keep track of LOIO probabilities in tidy long format for each model combination
compiled_LOIO_wide_data = []

# iterate through each model (final model, shuffled baseline model, etc)
# sorted so final models are loaded before shuffled_baseline
for model_path in sorted(models_dir.iterdir()):
    # only perform LOIO with hyper params from final models so skip shuffled_baseline models
    if "shuffled" in model_path.name:
        continue

Expand All @@ -81,6 +90,8 @@

        # capture convergence warning from sklearn
        # this warning does not affect the model but takes up lots of space in the output
        # this warning must be caught with parallel_backend because the logistic regression model uses parallel_backend
        # (n_jobs=-1 means use all processors)
        with parallel_backend("multiprocessing"):
            with warnings.catch_warnings():
                warnings.filterwarnings(
@@ -121,16 +132,9 @@
        # add wide data to compiled data
        compiled_LOIO_wide_data.append(test_cells_wide_data)


# ### Format and save LOIO probabilities
#

# In[5]:

@@ -166,3 +170,144 @@
# preview tidy long data
compiled_LOIO_tidy_long_data


# ### Get LOIO probabilities (single class models)
#

# In[6]:


# directory to load the models from
models_dir = pathlib.Path("../2.train_model/models/single_class_models")

# use a list to keep track of LOIO probabilities in tidy long format for each model combination
compiled_LOIO_wide_data = []

# define combinations to test over
model_types = [
    "final"
]  # only perform LOIO with hyper params from final models so skip shuffled_baseline models
feature_types = ["CP", "DP", "CP_and_DP"]
phenotypic_classes = labeled_data["Mitocheck_Phenotypic_Class"].unique()

# iterate through each combination of feature_types, evaluation_types, phenotypic_classes
for model_type, feature_type, phenotypic_class in itertools.product(
    model_types, feature_types, phenotypic_classes
):
    single_class_model_path = pathlib.Path(
        f"{models_dir}/{phenotypic_class}_models/{model_type}__{feature_type}.joblib"
    )

    # load the model
    model = load(single_class_model_path)

    print(
        f"Performing LOIO on {phenotypic_class} model for feature type {feature_type} with parameters C: {model.C}, l1_ratio: {model.l1_ratio}"
    )

    # iterate through image paths
    for image_path in labeled_data["Metadata_DNA"].unique():
        # get training and testing cells from image path
        # every cell from the image path is for testing, the rest are for training
        train_cells = labeled_data.loc[labeled_data["Metadata_DNA"] != image_path]
        test_cells = labeled_data.loc[labeled_data["Metadata_DNA"] == image_path]

        # rename negative label and downsample over represented classes
        train_cells = get_SCM_model_data(train_cells, phenotypic_class, "train")
        test_cells = get_SCM_model_data(test_cells, phenotypic_class, "test")

        # get X, y from training and testing cells
        X_train, y_train = get_X_y_data(train_cells, feature_type)
        X_test, y_test = get_X_y_data(test_cells, feature_type)

        # capture convergence warning from sklearn
        # this warning does not affect the model but takes up lots of space in the output
        # this warning must be caught with parallel_backend because the logistic regression model uses parallel_backend
        # (n_jobs=-1 means use all processors)
        with parallel_backend("multiprocessing"):
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore", category=ConvergenceWarning, module="sklearn"
                )

                # fit a logistic regression model on the training X, y
                LOIO_model = LogisticRegression(
                    penalty="elasticnet",
                    solver="saga",
                    max_iter=100,
                    n_jobs=-1,
                    random_state=0,
                    C=model.C,
                    l1_ratio=model.l1_ratio,
                ).fit(X_train, y_train)

        # create metadata dataframe for test cells with model parameters
        metadata_dataframe = pd.concat(
            [
                test_cells["Cell_UUID"],
                test_cells["Metadata_DNA"],
                test_cells["Mitocheck_Phenotypic_Class"],
            ],
            axis=1,
        ).reset_index(drop=True)
        metadata_dataframe["Model_Feature_Type"] = feature_type
        metadata_dataframe["Model_C"] = model.C
        metadata_dataframe["Model_l1_ratio"] = model.l1_ratio
        metadata_dataframe["Model_Phenotypic_Class"] = phenotypic_class

        # predict probabilities for test cells and make these probabilities into a dataframe
        probas = LOIO_model.predict_proba(X_test)
        probas_dataframe = pd.DataFrame(probas, columns=model.classes_)
        # make column names consistent for all single cell models (SCMs)
        # positive label corresponds to that SCM's phenotypic class, negative is all other labels
        probas_dataframe = probas_dataframe.rename(
            columns={
                phenotypic_class: "Positive_Label",
                f"Not {phenotypic_class}": "Negative_Label",
            }
        )

        # combine metadata and probabilities dataframes for test cells to create wide data
        test_cells_wide_data = pd.concat([metadata_dataframe, probas_dataframe], axis=1)

        # add wide data to compiled data
        compiled_LOIO_wide_data.append(test_cells_wide_data)


# ### Format and save LOIO probabilities
#

# In[7]:


# compile list of wide data into one dataframe
compiled_LOIO_wide_data = pd.concat(compiled_LOIO_wide_data).reset_index(drop=True)

# convert wide data to tidy long data and sort by Cell_UUID, Model_Feature_Type, and Model_Phenotypic_Class for pretty formatting
compiled_LOIO_tidy_long_data = (
    pd.melt(
        compiled_LOIO_wide_data,
        id_vars=metadata_dataframe.columns,
        value_vars=probas_dataframe.columns,
        var_name="Predicted_Label",
        value_name="Predicted_Probability",
    )
    .sort_values(["Model_Feature_Type", "Cell_UUID", "Model_Phenotypic_Class"])
    .reset_index(drop=True)
)

# specify results directory
LOIO_probas_dir = pathlib.Path("evaluations/LOIO_probas/")
LOIO_probas_dir.mkdir(parents=True, exist_ok=True)

# define save path
compiled_LOIO_save_path = pathlib.Path(
    f"{LOIO_probas_dir}/compiled_SCM_LOIO_probabilites.tsv"
)

# save data as tsv
compiled_LOIO_tidy_long_data.to_csv(compiled_LOIO_save_path, sep="\t")

# preview tidy long data
compiled_LOIO_tidy_long_data
