No more transformers in perf_data, No more transformers in EpochManager, transforms happen right before it's used. Transformations are un-done before passing into perf_data
ATOMScience-org · Dec 10, 2024 · 18987d3 · 18987d3
1 parent a2520a4
commit 18987d3
Show file tree

Hide file tree

Showing 6 changed files with 204 additions and 268 deletions.
diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py
@@ -379,6 +379,7 @@ def get_featurized_data(self, params=None):
                 if params.prediction_type=='classification':
                     w = w.astype(np.float32)
 
+                self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids)
                 self.dataset = NumpyDataset(features, self.vals, ids=ids, w=w)
                 self.log.info("Using prefeaturized data; number of features = " + str(self.n_features))
                 return
@@ -404,6 +405,7 @@ def get_featurized_data(self, params=None):
         self.log.debug("Number of features: " + str(self.n_features))
 
         # Create the DeepChem dataset       
+        self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids)
         self.dataset = NumpyDataset(features, self.vals, ids=ids, w=w)
         # Checking for minimum number of rows
         if len(self.dataset) < params.min_compound_number:
@@ -681,7 +683,7 @@ def has_all_feature_columns(self, dset_df):
 
     # *************************************************************************************
 
-    def get_subset_responses_and_weights(self, subset, transformers):
+    def get_subset_responses_and_weights(self, subset):
         """Returns a dictionary mapping compound IDs in the given dataset subset to arrays of response values
         and weights.  Used by the perf_data module under k-fold CV.
 
@@ -703,16 +705,33 @@ def get_subset_responses_and_weights(self, subset, transformers):
             else:
                 raise ValueError('Unknown dataset subset type "%s"' % subset)
 
-            y = dc.trans.undo_transforms(dataset.y, transformers)
+            response_vals = dict()
+            dataset_ids = set(dataset.ids)
+            for id, y in zip(self.untransformed_dataset.ids, self.untransformed_dataset.y):
+                if id in dataset_ids:
+                    response_vals[id] = y
+
             w = dataset.w
-            response_vals = dict([(id, y[i,:]) for i, id in enumerate(dataset.ids)])
             weights = dict([(id, w[i,:]) for i, id in enumerate(dataset.ids)])
             self.subset_response_dict[subset] = response_vals
             self.subset_weight_dict[subset] = weights
         return self.subset_response_dict[subset], self.subset_weight_dict[subset]
 
     # *************************************************************************************
 
+    def get_untransformed_responses(self, ids):
+        """Returns a (len(ids), self.untransformed_dataset.y.shape[1]) numpy array whose i-th row holds the
+        untransformed response values for ids[i]. Raises KeyError if an ID is not in self.untransformed_dataset.
+        """
+        untransformed = self.untransformed_dataset
+        response_dict = dict(zip(untransformed.ids, untransformed.y))  # compound ID -> row of response values
+        response_vals = np.zeros((len(ids), untransformed.y.shape[1]))
+        for i, compound_id in enumerate(ids):  # fill rows in the caller's requested order
+            response_vals[i] = response_dict[compound_id]
+        return response_vals
+
+    # *************************************************************************************
+
     def _get_split_key(self):
         """Creates the proper CSV name for a split file
 
@@ -828,6 +847,8 @@ def get_featurized_data(self, dset_df, is_featurized=False):
                                                                                     params, self.contains_responses)
             self.log.warning("Done")
         self.n_features = self.featurization.get_feature_count()
+
+        self.untransformed_dataset= NumpyDataset(features, self.vals, ids=ids)
         self.dataset = NumpyDataset(features, self.vals, ids=ids)
 
     # ****************************************************************************************

diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py
@@ -305,13 +305,6 @@ def load_featurize_data(self, params=None):
         else:
             self.run_mode = ''
 
-        if self.run_mode == 'training':
-            for i, (train, valid) in enumerate(self.data.train_valid_dsets):
-                train = self.model_wrapper.transform_dataset(train)
-                valid = self.model_wrapper.transform_dataset(valid)
-                self.data.train_valid_dsets[i] = (train, valid)
-            self.data.test_dset = self.model_wrapper.transform_dataset(self.data.test_dset)
-
         # ****************************************************************************************
 
     def create_model_metadata(self):
@@ -864,7 +857,7 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
         # Get features for each compound and construct a DeepChem Dataset from them
         self.data.get_featurized_data(dset_df, is_featurized)
         # Transform the features and responses if needed
-        self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset)
+        self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset, fold='final')
 
         # Note that at this point, the dataset may contain fewer rows than the input. Typically this happens because
         # of invalid SMILES strings. Remove any rows from the input dataframe corresponding to SMILES strings that were
@@ -995,7 +988,7 @@ def predict_embedding(self, dset_df, dset_params=None):
         self.data = model_datasets.create_minimal_dataset(self.params, self.featurization)
         self.data.get_featurized_data(dset_df, is_featurized=False)
         # Not sure the following is necessary
-        self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset)
+        self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset, fold='final')
 
         # Get the embeddings as a numpy array
         embeddings = self.model_wrapper.generate_embeddings(self.data.dataset)
@@ -1577,7 +1570,7 @@ def ensemble_predict(model_uuids, collections, dset_df, labels=None, dset_params
             raise Exception("response_cols missing from model params")
         is_featurized = (len(set(pipe.featurization.get_feature_columns()) - set(dset_df.columns.values)) == 0)
         pipe.data.get_featurized_data(dset_df, is_featurized)
-        pipe.data.dataset = pipe.model_wrapper.transform_dataset(pipe.data.dataset)
+        pipe.data.dataset = pipe.model_wrapper.transform_dataset(pipe.data.dataset, fold='final')
 
         # Create a temporary data frame to hold the compound IDs and predictions. The model may not
         # return predictions for all the requested compounds, so we have to outer join the predictions