From d44ee761932cbe9315c3ea10aafadb20d234406c Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Mon, 9 Dec 2024 13:19:34 -0800 Subject: [PATCH] One transformer set for each fold and for train_valid dataset --- atomsci/ddm/pipeline/model_pipeline.py | 2 +- atomsci/ddm/pipeline/model_wrapper.py | 147 +++++++++++++----------- atomsci/ddm/pipeline/perf_data.py | 72 ++++++------ atomsci/ddm/pipeline/transformations.py | 27 +++++ 4 files changed, 144 insertions(+), 104 deletions(-) diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py index d6862709..70dd7f84 100644 --- a/atomsci/ddm/pipeline/model_pipeline.py +++ b/atomsci/ddm/pipeline/model_pipeline.py @@ -301,7 +301,7 @@ def load_featurize_data(self, params=None): # is fitted to the training data only. The transformers are then applied to the training, # validation and test sets separately. if not params.split_only: - self.model_wrapper.create_transformers(self.data) + self.model_wrapper.create_transformers(trans.get_all_training_datasets(self.data)) else: self.run_mode = '' diff --git a/atomsci/ddm/pipeline/model_wrapper.py b/atomsci/ddm/pipeline/model_wrapper.py index b788a43e..a9d7fc35 100644 --- a/atomsci/ddm/pipeline/model_wrapper.py +++ b/atomsci/ddm/pipeline/model_wrapper.py @@ -283,9 +283,9 @@ def __init__(self, params, featurizer, ds_client): self.output_dir = self.params.output_dir self.model_dir = os.path.join(self.output_dir, 'model') os.makedirs(self.model_dir, exist_ok=True) - self.transformers = [] - self.transformers_x = [] - self.transformers_w = [] + self.transformers = {} + self.transformers_x = {} + self.transformers_w = {} # **************************************************************************************** @@ -336,7 +336,7 @@ def _create_output_transformers(self, model_dataset): """ # TODO: Just a warning, we may have response transformers for classification datasets in the future if self.params.prediction_type=='regression' and 
self.params.transformers is True: - self.transformers = [trans.NormalizationTransformerMissingData(transform_y=True, dataset=model_dataset.dataset)] + return [trans.NormalizationTransformerMissingData(transform_y=True, dataset=model_dataset.dataset)] # **************************************************************************************** @@ -351,15 +351,15 @@ def _create_feature_transformers(self, model_dataset): transformers_x: A list of deepchem transformation objects on featurizers, only if conditions are met. """ # Set up transformers for features, if needed - self.transformers_x = trans.create_feature_transformers(self.params, model_dataset) + return trans.create_feature_transformers(self.params, model_dataset) # **************************************************************************************** - def create_transformers(self, model_dataset): + def create_transformers(self, training_datasets): """Initialize transformers for responses, features and weights, and persist them for later. 
Args: - model_dataset: The ModelDataset object that handles the current dataset + training_datasets: Dictionary mapping fold identifiers to training datasets, as returned by transformations.get_all_training_datasets() Side effects: Overwrites the attributes: @@ -372,22 +372,23 @@ def create_transformers(self, model_dataset): params.transformer_key: A string pointing to the dataset key containing the transformer in the datastore, or the path to the transformer """ - self._create_output_transformers(model_dataset) + for k, td in training_datasets.items(): + self.transformers[k] = self._create_output_transformers(td) - self._create_feature_transformers(model_dataset) + self.transformers_x[k] = self._create_feature_transformers(td) - # Set up transformers for weights, if needed - self.transformers_w = trans.create_weight_transformers(self.params, model_dataset) + # Set up transformers for weights, if needed + self.transformers_w[k] = trans.create_weight_transformers(self.params, td) - if len(self.transformers) + len(self.transformers_x) + len(self.transformers_w) > 0: + if len(self.transformers[k]) + len(self.transformers_x[k]) + len(self.transformers_w[k]) > 0: - # Transformers are no longer saved as separate datastore objects; they are included in the model tarball - self.params.transformer_key = os.path.join(self.output_dir, 'transformers.pkl') - with open(self.params.transformer_key, 'wb') as txfmrpkl: - pickle.dump((self.transformers, self.transformers_x, self.transformers_w), txfmrpkl) - self.log.info("Wrote transformers to %s" % self.params.transformer_key) - self.params.transformer_oid = "" - self.params.transformer_bucket = "" + # Transformers are no longer saved as separate datastore objects; they are included in the model tarball + self.params.transformer_key = os.path.join(self.output_dir, f'transformers_{k}.pkl') + with open(self.params.transformer_key, 'wb') as txfmrpkl: + pickle.dump((self.transformers[k], self.transformers_x[k], self.transformers_w[k]), txfmrpkl) + self.log.info("Wrote transformers to %s" % 
self.params.transformer_key) + self.params.transformer_oid = "" + self.params.transformer_bucket = "" # **************************************************************************************** @@ -400,58 +401,69 @@ def reload_transformers(self): # Try local path first to check for transformers unpacked from model tarball if not trans.transformers_needed(self.params): return - local_path = f"{self.output_dir}/transformers.pkl" - if os.path.exists(local_path): - self.log.info(f"Reloading transformers from model tarball {local_path}") - with open(local_path, 'rb') as txfmr: - transformers_tuple = pickle.load(txfmr) - else: - if self.params.transformer_key is not None: - if self.params.save_results: - self.log.info(f"Reloading transformers from datastore key {self.params.transformer_key}") - transformers_tuple = dsf.retrieve_dataset_by_datasetkey( - dataset_key = self.params.transformer_key, - bucket = self.params.transformer_bucket, - client = self.ds_client ) - else: - self.log.info(f"Reloading transformers from file {self.params.transformer_key}") - with open(self.params.transformer_key, 'rb') as txfmr: - transformers_tuple = pickle.load(txfmr) + + for i in trans.get_transformer_keys(self.params): + # for backwards compatibility if this file exists, all folds use the same transformers + local_path = f"{self.output_dir}/transformers.pkl" + if not os.path.exists(local_path): + local_path = f"{self.output_dir}/transformers_{i}.pkl" + + if os.path.exists(local_path): + self.log.info(f"Reloading transformers from model tarball {local_path}") + with open(local_path, 'rb') as txfmr: + transformers_tuple = pickle.load(txfmr) else: - # Shouldn't happen - raise Exception("Transformers needed to reload model, but no transformer_key specified.") + if self.params.transformer_key is not None: + if self.params.save_results: + self.log.info(f"Reloading transformers from datastore key {self.params.transformer_key}") + transformers_tuple = dsf.retrieve_dataset_by_datasetkey( + dataset_key = 
self.params.transformer_key, + bucket = self.params.transformer_bucket, + client = self.ds_client ) + else: + self.log.info(f"Reloading transformers from file {self.params.transformer_key}") + with open(self.params.transformer_key, 'rb') as txfmr: + transformers_tuple = pickle.load(txfmr) + else: + # Shouldn't happen + raise Exception("Transformers needed to reload model, but no transformer_key specified.") - if len(transformers_tuple) == 3: - self.transformers, self.transformers_x, self.transformers_w = transformers_tuple - else: - self.transformers, self.transformers_x = transformers_tuple - self.transformers_w = [] + if len(transformers_tuple) == 3: + ty, tx, tw = transformers_tuple + else: + ty, tx = transformers_tuple + tw = [] + + self.transformers[i] = ty + self.transformers_x[i] = tx + self.transformers_w[i] = tw # **************************************************************************************** - def transform_dataset(self, dataset): + def transform_dataset(self, dataset, fold=0): """Transform the responses and/or features in the given DeepChem dataset using the current transformers. Args: dataset: The DeepChem DiskDataset that contains a dataset + fold (int): Which fold is being transformed. 
Returns: transformed_dataset: The transformed DeepChem DiskDataset """ transformed_dataset = dataset - if len(self.transformers) > 0: + if len(self.transformers[fold]) > 0: self.log.info("Transforming response data") - for transformer in self.transformers: + for transformer in self.transformers[fold]: transformed_dataset = transformer.transform(transformed_dataset) - if len(self.transformers_x) > 0: + if len(self.transformers_x[fold]) > 0: self.log.info("Transforming feature data") - for transformer in self.transformers_x: + for transformer in self.transformers_x[fold]: transformed_dataset = transformer.transform(transformed_dataset) - if len(self.transformers_w) > 0: + if len(self.transformers_w[fold]) > 0: self.log.info("Transforming weights") - for transformer in self.transformers_w: + for transformer in self.transformers_w[fold]: transformed_dataset = transformer.transform(transformed_dataset) return transformed_dataset @@ -486,7 +498,7 @@ def get_train_valid_pred_results(self, perf_data): return perf_data.get_prediction_results() # **************************************************************************************** - def get_test_perf_data(self, model_dir, model_dataset): + def get_test_perf_data(self, model_dir, model_dataset, fold): """Returns the predicted values and metrics for the current test dataset against the version of the model stored in model_dir, as a PerfData object. @@ -506,14 +518,15 @@ def get_test_perf_data(self, model_dir, model_dataset): # We pass transformed=False to indicate that the preds and uncertainties we get from # generate_predictions are already untransformed, so that perf_data.get_prediction_results() # doesn't untransform them again. - if hasattr(self.transformers[0], "ishybrid"): + if hasattr(self.transformers[0][0], "ishybrid"): # indicate that we are training a hybrid model + # ASDF need to know what to pass in as the y transform now that they are fold dependent. 
perf_data = perf.create_perf_data("hybrid", model_dataset, self.transformers, 'test', is_ki=self.params.is_ki, ki_convert_ratio=self.params.ki_convert_ratio, transformed=False) else: perf_data = perf.create_perf_data(self.params.prediction_type, model_dataset, self.transformers, 'test', transformed=False) test_dset = model_dataset.test_dset test_preds, test_stds = self.generate_predictions(test_dset) - _ = perf_data.accumulate_preds(test_preds, test_dset.ids, test_stds) + _ = perf_data.accumulate_preds(test_preds, test_dset.ids, test_stds, fold=fold) return perf_data # **************************************************************************************** @@ -532,7 +545,7 @@ def get_test_pred_results(self, model_dir, model_dataset): return perf_data.get_prediction_results() # **************************************************************************************** - def get_full_dataset_perf_data(self, model_dataset): + def get_full_dataset_perf_data(self, model_dataset, fold): """Returns the predicted values and metrics from the current model for the full current dataset, as a PerfData object. 
@@ -555,7 +568,7 @@ def get_full_dataset_perf_data(self, model_dataset): else: perf_data = perf.create_perf_data(self.params.prediction_type, model_dataset, self.transformers, 'full', transformed=False) full_preds, full_stds = self.generate_predictions(model_dataset.dataset) - _ = perf_data.accumulate_preds(full_preds, model_dataset.dataset.ids, full_stds) + _ = perf_data.accumulate_preds(full_preds, model_dataset.dataset.ids, full_stds, fold) return perf_data # **************************************************************************************** @@ -913,10 +926,10 @@ def train_kfold_cv(self, pipeline): train_pred = self.model.predict(train_dset, []) test_pred = self.model.predict(test_dset, []) - train_perf = train_perf_data.accumulate_preds(train_pred, train_dset.ids) - test_perf = test_perf_data.accumulate_preds(test_pred, test_dset.ids) + train_perf = train_perf_data.accumulate_preds(train_pred, train_dset.ids, fold=k) + test_perf = test_perf_data.accumulate_preds(test_pred, test_dset.ids, fold=k) - valid_perf = em.accumulate(ei, subset='valid', dset=valid_dset) + valid_perf = em.accumulate(ei, subset='valid', dset=valid_dset, fold=k) self.log.info("Fold %d, epoch %d: training %s = %.3f, validation %s = %.3f, test %s = %.3f" % ( k, ei, pipeline.metric_type, train_perf, pipeline.metric_type, valid_perf, pipeline.metric_type, test_perf)) @@ -939,7 +952,7 @@ def train_kfold_cv(self, pipeline): for ei in range(self.best_epoch+1): self.model.fit(fit_dataset, nb_epoch=1, checkpoint_interval=0, restore=False) - train_perf, test_perf = em.update_epoch(ei, train_dset=fit_dataset, test_dset=test_dset) + train_perf, test_perf = em.update_epoch(ei, train_dset=fit_dataset, test_dset=test_dset, fold='train_valid') self.log.info(f"Combined folds: Epoch {ei}, training {pipeline.metric_type} = {train_perf:.3}," + f"test {pipeline.metric_type} = {test_perf:.3}") @@ -999,7 +1012,7 @@ def train_with_early_stopping(self, pipeline): # saved will be the one we created 
intentionally when we reached a new best validation score. self.model.fit(train_dset, nb_epoch=1, checkpoint_interval=0) train_perf, valid_perf, test_perf = em.update_epoch(ei, - train_dset=train_dset, valid_dset=valid_dset, test_dset=test_dset) + train_dset=train_dset, valid_dset=valid_dset, test_dset=test_dset, fold=0) self.log.info("Epoch %d: training %s = %.3f, validation %s = %.3f, test %s = %.3f" % ( ei, pipeline.metric_type, train_perf, pipeline.metric_type, valid_perf, @@ -1455,7 +1468,7 @@ def train(self, pipeline): valid_loss_ep /= (valid_data.n_ki + valid_data.n_bind) train_perf, valid_perf, test_perf = em.update_epoch(ei, - train_dset=train_dset, valid_dset=valid_dset, test_dset=test_dset) + train_dset=train_dset, valid_dset=valid_dset, test_dset=test_dset, fold=0) self.log.info("Epoch %d: training %s = %.3f, training loss = %.3f, validation %s = %.3f, validation loss = %.3f, test %s = %.3f" % ( ei, pipeline.metric_type, train_perf, train_loss_ep, pipeline.metric_type, valid_perf, valid_loss_ep, @@ -1650,13 +1663,13 @@ def train(self, pipeline): self.model.fit(train_dset) train_pred = self.model.predict(train_dset, []) - train_perf = self.train_perf_data.accumulate_preds(train_pred, train_dset.ids) + train_perf = self.train_perf_data.accumulate_preds(train_pred, train_dset.ids, fold=k) valid_pred = self.model.predict(valid_dset, []) - valid_perf = self.valid_perf_data.accumulate_preds(valid_pred, valid_dset.ids) + valid_perf = self.valid_perf_data.accumulate_preds(valid_pred, valid_dset.ids, fold=k) test_pred = self.model.predict(test_dset, []) - test_perf = self.test_perf_data.accumulate_preds(test_pred, test_dset.ids) + test_perf = self.test_perf_data.accumulate_preds(test_pred, test_dset.ids, fold=k) self.log.info("Fold %d: training %s = %.3f, validation %s = %.3f, test %s = %.3f" % ( k, pipeline.metric_type, train_perf, pipeline.metric_type, valid_perf, pipeline.metric_type, test_perf)) @@ -2069,13 +2082,13 @@ def train(self, pipeline): 
self.model.fit(train_dset) train_pred = self.model.predict(train_dset, []) - train_perf = self.train_perf_data.accumulate_preds(train_pred, train_dset.ids) + train_perf = self.train_perf_data.accumulate_preds(train_pred, train_dset.ids, fold=k) valid_pred = self.model.predict(valid_dset, []) - valid_perf = self.valid_perf_data.accumulate_preds(valid_pred, valid_dset.ids) + valid_perf = self.valid_perf_data.accumulate_preds(valid_pred, valid_dset.ids, fold=k) test_pred = self.model.predict(test_dset, []) - test_perf = self.test_perf_data.accumulate_preds(test_pred, test_dset.ids) + test_perf = self.test_perf_data.accumulate_preds(test_pred, test_dset.ids, fold=k) self.log.info("Fold %d: training %s = %.3f, validation %s = %.3f, test %s = %.3f" % ( k, pipeline.metric_type, train_perf, pipeline.metric_type, valid_perf, pipeline.metric_type, test_perf)) diff --git a/atomsci/ddm/pipeline/perf_data.py b/atomsci/ddm/pipeline/perf_data.py index 12b7bcb8..5863ca22 100644 --- a/atomsci/ddm/pipeline/perf_data.py +++ b/atomsci/ddm/pipeline/perf_data.py @@ -132,7 +132,7 @@ def __init__(self, model_dataset, subset): """Initialize any attributes that are common to all PerfData subclasses""" # **************************************************************************************** - def accumulate_preds(self, predicted_vals, ids, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): """Raises: NotImplementedError: The method is implemented by subclasses """ @@ -217,7 +217,7 @@ def __init__(self, model_dataset, subset): self.weights = None # **************************************************************************************** - def accumulate_preds(self, predicted_vals, ids, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): """Raises: NotImplementedError: The method is implemented by subclasses """ @@ -406,7 +406,7 @@ def __init__(self, model_dataset, subset): self.weights = None # 
**************************************************************************************** - def accumulate_preds(self, predicted_vals, ids, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): """Raises: NotImplementedError: The method is implemented by subclasses """ @@ -631,7 +631,7 @@ def __init__(self, model_dataset, subset): self.weights = None # **************************************************************************************** - def accumulate_preds(self, predicted_vals, ids, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): """Raises: NotImplementedError: The method is implemented by subclasses """ @@ -945,7 +945,7 @@ def __init__(self, model_dataset, transformers, subset, transformed=True): # **************************************************************************************** # class KFoldRegressionPerfData - def accumulate_preds(self, predicted_vals, ids, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): """Add training, validation or test set predictions from the current fold to the data structure where we keep track of them. 
@@ -989,7 +989,7 @@ def accumulate_preds(self, predicted_vals, ids, pred_stds=None): self.pred_vals[id] = np.concatenate([self.pred_vals[id], predicted_vals[i,:].reshape((1,-1))], axis=0) self.folds += 1 - pred_vals = dc.trans.undo_transforms(predicted_vals, self.transformers) + pred_vals = dc.trans.undo_transforms(predicted_vals, self.transformers[fold]) real_vals = self.get_real_values(ids) weights = self.get_weights(ids) @@ -1023,15 +1023,15 @@ def get_pred_values(self): ids = sorted(self.pred_vals.keys()) if self.subset in ['train', 'test', 'train_valid']: rawvals = np.concatenate([self.pred_vals[id].mean(axis=0, keepdims=True).reshape((1,-1)) for id in ids]) - vals = dc.trans.undo_transforms(rawvals, self.transformers) + vals = dc.trans.undo_transforms(rawvals, self.transformers[fold]) if self.folds > 1: stds = dc.trans.undo_transforms(np.concatenate([self.pred_vals[id].std(axis=0, keepdims=True).reshape((1,-1)) - for id in ids]), self.transformers) + for id in ids]), self.transformers[fold]) else: stds = None else: rawvals = np.concatenate([self.pred_vals[id].reshape((1,-1)) for id in ids], axis=0) - vals = dc.trans.undo_transforms(rawvals, self.transformers) + vals = dc.trans.undo_transforms(rawvals, self.transformers[fold]) stds = None return (ids, vals, stds) @@ -1053,7 +1053,7 @@ def get_real_values(self, ids=None): if ids is None: ids = sorted(self.pred_vals.keys()) real_vals = np.concatenate([self.real_vals[id].reshape((1,-1)) for id in ids], axis=0) - return dc.trans.undo_transforms(real_vals, self.transformers) + return dc.trans.undo_transforms(real_vals, self.transformers[fold]) # **************************************************************************************** @@ -1210,7 +1210,7 @@ def __init__(self, model_dataset, transformers, subset, predict_probs=True, tran # **************************************************************************************** # class KFoldClassificationPerfData - def accumulate_preds(self, predicted_vals, ids, 
pred_stds=None): + def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): """Add training, validation or test set predictions from the current fold to the data structure where we keep track of them. @@ -1245,7 +1245,7 @@ def accumulate_preds(self, predicted_vals, ids, pred_stds=None): task_real_vals = np.squeeze(real_vals[nzrows,i,:]) task_class_probs = dc.trans.undo_transforms( np.squeeze(class_probs[nzrows,i,:]), - self.transformers) + self.transformers[fold]) scores.append(roc_auc_score(task_real_vals, task_class_probs, average='macro')) else: # For binary classifier, sklearn metrics functions are expecting single array of 1s and 0s for real_vals_list, @@ -1253,7 +1253,7 @@ def accumulate_preds(self, predicted_vals, ids, pred_stds=None): task_real_vals = np.squeeze(real_vals[nzrows,i]) task_class_probs = dc.trans.undo_transforms( np.squeeze(class_probs[nzrows,i,1]), - self.transformers) + self.transformers[fold]) scores.append(roc_auc_score(task_real_vals, task_class_probs)) self.perf_metrics.append(np.array(scores)) return float(np.mean(scores)) @@ -1284,11 +1284,11 @@ def get_pred_values(self): #prob_stds = np.concatenate([dc.trans.undo_transforms(self.pred_vals[id], self.transformers).std(axis=0, keepdims=True) # for id in ids], axis=0) class_probs = dc.trans.undo_transforms(np.concatenate([self.pred_vals[id].mean(axis=0, keepdims=True) - for id in ids], axis=0), self.transformers) + for id in ids], axis=0), self.transformers[fold]) prob_stds = dc.trans.undo_transforms(np.concatenate([self.pred_vals[id].std(axis=0, keepdims=True) - for id in ids], axis=0), self.transformers) + for id in ids], axis=0), self.transformers[fold]) else: - class_probs = np.concatenate([dc.trans.undo_transforms(self.pred_vals[id], self.transformers) for id in ids], axis=0) + class_probs = np.concatenate([dc.trans.undo_transforms(self.pred_vals[id], self.transformers[fold]) for id in ids], axis=0) prob_stds = None pred_classes = np.argmax(class_probs, axis=2) return 
(ids, pred_classes, class_probs, prob_stds) @@ -1450,7 +1450,7 @@ def __init__(self, model_dataset, transformers, subset, transformed=True): # **************************************************************************************** # class SimpleRegressionPerfData - def accumulate_preds(self, predicted_vals, ids, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): """Add training, validation or test set predictions to the data structure where we keep track of them. @@ -1469,7 +1469,7 @@ def accumulate_preds(self, predicted_vals, ids, pred_stds=None): self.pred_vals = self._reshape_preds(predicted_vals) if pred_stds is not None: self.pred_stds = self._reshape_preds(pred_stds) - pred_vals = dc.trans.undo_transforms(self.pred_vals, self.transformers) + pred_vals = dc.trans.undo_transforms(self.pred_vals, self.transformers[fold]) real_vals = self.get_real_values(ids) weights = self.get_weights(ids) scores = [] @@ -1497,15 +1497,15 @@ def get_pred_values(self): stds (np.array): Contains (ncmpds, ntasks) array of prediction standard deviations """ - vals = dc.trans.undo_transforms(self.pred_vals, self.transformers) + vals = dc.trans.undo_transforms(self.pred_vals, self.transformers[fold]) stds = None if self.pred_stds is not None: stds = self.pred_stds - if len(self.transformers) == 1 and (isinstance(self.transformers[0], dc.trans.NormalizationTransformer) or isinstance(self.transformers[0],trans.NormalizationTransformerMissingData)): + if len(self.transformers[fold]) == 1 and (isinstance(self.transformers[fold][0], dc.trans.NormalizationTransformer) or isinstance(self.transformers[fold][0],trans.NormalizationTransformerMissingData)): # Untransform the standard deviations, if we can. This is a bit of a hack, but it works for # NormalizationTransformer, since the standard deviations used to scale the data are # stored in the transformer object. 
- y_stds = self.transformers[0].y_stds.reshape((1,-1,1)) + y_stds = self.transformers[fold][0].y_stds.reshape((1,-1,1)) stds = stds / y_stds return (self.ids, vals, stds) @@ -1523,7 +1523,7 @@ def get_real_values(self, ids=None): np.array: Containing the real dataset response values with transformations undone. """ - return dc.trans.undo_transforms(self.real_vals, self.transformers) + return dc.trans.undo_transforms(self.real_vals, self.transformers[fold]) # **************************************************************************************** @@ -1687,7 +1687,7 @@ def __init__(self, model_dataset, transformers, subset, predict_probs=True, tran # **************************************************************************************** # class SimpleClassificationPerfData - def accumulate_preds(self, predicted_vals, ids, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): """Add training, validation or test set predictions from the current dataset to the data structure where we keep track of them. @@ -1716,7 +1716,7 @@ def accumulate_preds(self, predicted_vals, ids, pred_stds=None): task_real_vals = np.squeeze(real_vals[nzrows,i,:]) task_class_probs = dc.trans.undo_transforms( np.squeeze(class_probs[nzrows,i,:]), - self.transformers) + self.transformers[fold]) scores.append(roc_auc_score(task_real_vals, task_class_probs, average='macro')) else: # For binary classifier, sklearn metrics functions are expecting single array of 1s and 0s for real_vals_list, @@ -1724,7 +1724,7 @@ def accumulate_preds(self, predicted_vals, ids, pred_stds=None): task_real_vals = np.squeeze(real_vals[nzrows,i]) task_class_probs = dc.trans.undo_transforms( np.squeeze(class_probs[nzrows,i,1]), - self.transformers) + self.transformers[fold]) scores.append(roc_auc_score(task_real_vals, task_class_probs)) self.perf_metrics.append(np.array(scores)) return float(np.mean(scores)) @@ -1752,7 +1752,7 @@ class probability estimates. 
prob_stds (np.array): Contains (ncmpds, ntasks, nclasses) array of standard errors for the class probability estimates """ - class_probs = dc.trans.undo_transforms(self.pred_vals, self.transformers) + class_probs = dc.trans.undo_transforms(self.pred_vals, self.transformers[fold]) pred_classes = np.argmax(class_probs, axis=2) prob_stds = self.pred_stds return (self.ids, pred_classes, class_probs, prob_stds) @@ -1907,7 +1907,7 @@ def __init__(self, model_dataset, transformers, subset, is_ki, ki_convert_ratio= # **************************************************************************************** # class SimpleHybridPerfData - def accumulate_preds(self, predicted_vals, ids, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): """Add training, validation or test set predictions to the data structure where we keep track of them. @@ -2013,7 +2013,7 @@ def get_real_values(self, ids=None): np.array: Containing the real dataset response values with transformations undone. """ - return self.transformers[0].untransform(self.real_vals) + return self.transformers[fold][0].untransform(self.real_vals) # **************************************************************************************** @@ -2160,7 +2160,7 @@ def should_stop(self): # **************************************************************************************** # class EpochManager - def update_epoch(self, ei, train_dset=None, valid_dset=None, test_dset=None): + def update_epoch(self, ei, fold, train_dset=None, valid_dset=None, test_dset=None): """Update training state after an epoch This function updates train/valid/test_perf_data. 
Call this function once @@ -2186,15 +2186,15 @@ def update_epoch(self, ei, train_dset=None, valid_dset=None, test_dset=None): This function updates self._should_stop """ - train_perf = self.update(ei, 'train', train_dset) - valid_perf = self.update(ei, 'valid', valid_dset) - test_perf = self.update(ei, 'test', test_dset) + train_perf = self.update(ei, 'train', train_dset, fold) + valid_perf = self.update(ei, 'valid', valid_dset, fold) + test_perf = self.update(ei, 'test', test_dset, fold) return [p for p in [train_perf, valid_perf, test_perf] if p is not None] # **************************************************************************************** # class EpochManager - def accumulate(self, ei, subset, dset): + def accumulate(self, ei, subset, dset, fold): """Accumulate predictions Makes predictions, accumulate predictions and calculate the performance metric. Calls PerfData.accumulate_preds @@ -2211,7 +2211,7 @@ def accumulate(self, ei, subset, dset): float: Performance metric for the given dset. """ pred = self._make_pred(dset) - perf = getattr(self.wrapper, f'{subset}_perf_data')[ei].accumulate_preds(pred, dset.ids) + perf = getattr(self.wrapper, f'{subset}_perf_data')[ei].accumulate_preds(pred, dset.ids, fold) return perf # **************************************************************************************** @@ -2270,7 +2270,7 @@ def update_valid(self, ei): # **************************************************************************************** # class EpochManager - def update(self, ei, subset, dset=None): + def update(self, ei, subset, dset=None, fold=0): """Update training state Updates the training state for a given subset and epoch index with the given dataset. 
@@ -2289,7 +2289,7 @@ def update(self, ei, subset, dset=None): if dset is None: return None - perf = self.accumulate(ei, subset, dset) + perf = self.accumulate(ei, subset, dset, fold) self.compute(ei, subset) if subset == 'valid': diff --git a/atomsci/ddm/pipeline/transformations.py b/atomsci/ddm/pipeline/transformations.py index 5b8ca7a8..e94ec261 100644 --- a/atomsci/ddm/pipeline/transformations.py +++ b/atomsci/ddm/pipeline/transformations.py @@ -138,7 +138,34 @@ def get_transformer_specific_metadata(params): return meta_dict # **************************************************************************************** +def get_transformer_keys(params): + """Makes all transformer keys + There is one set of transformers for each fold and then one transformer + for both validation and training sets. AMPL automatically trains a model + using all validation and training data at the end of the training loop. + """ + if params.split_strategy == 'k_fold_cv': + return [0, 'train_val'] + else: + return list(range(params.num_folds))+['train_val'] + +# **************************************************************************************** +def get_all_training_datasets(model_dataset): + """Returns all 'training' datasets + This takes a model_dataset and returns a dictionary of all + datasets that will need a transformer. The keys will match + what is returned by get_transformer_keys + """ + result = {} + # First, get the training set from all the folds + for i, t in enumerate(model_dataset.train_valid_dsets): + result[i] = t + + # Next, add the dataset that contains all training+validation data + result['train_val'] = model_dataset.combined_training_data() + +# **************************************************************************************** class UMAPTransformer(Transformer): """Dimension reduction transformations using the UMAP algorithm.