From 18987d36ee27cd26b2952b8f4d2f0cc6972c9185 Mon Sep 17 00:00:00 2001
From: "he6@llnl.gov"
Date: Tue, 10 Dec 2024 14:58:38 -0800
Subject: [PATCH] Remove transformers from perf_data and EpochManager;
 transformations are now applied right before each dataset is used, and
 predictions are untransformed before being passed into perf_data
---
 atomsci/ddm/pipeline/model_datasets.py      |  27 +-
 atomsci/ddm/pipeline/model_pipeline.py      |  13 +-
 atomsci/ddm/pipeline/model_wrapper.py       | 130 +++++----
 atomsci/ddm/pipeline/perf_data.py           | 288 +++++++-------------
 atomsci/ddm/pipeline/transformations.py     |  12 +-
 atomsci/ddm/test/unit/test_model_wrapper.py |   2 +-
 6 files changed, 204 insertions(+), 268 deletions(-)

diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py
index 87bbffd9..36f6be3b 100644
--- a/atomsci/ddm/pipeline/model_datasets.py
+++ b/atomsci/ddm/pipeline/model_datasets.py
@@ -379,6 +379,7 @@ def get_featurized_data(self, params=None):
             if params.prediction_type=='classification':
                 w = w.astype(np.float32)

+            self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids)
             self.dataset = NumpyDataset(features, self.vals, ids=ids, w=w)
             self.log.info("Using prefeaturized data; number of features = " + str(self.n_features))
             return
@@ -404,6 +405,7 @@ def get_featurized_data(self, params=None):
         self.log.debug("Number of features: " + str(self.n_features))

         # Create the DeepChem dataset
+        self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids)
         self.dataset = NumpyDataset(features, self.vals, ids=ids, w=w)
         # Checking for minimum number of rows
         if len(self.dataset) < params.min_compound_number:
@@ -681,7 +683,7 @@ def has_all_feature_columns(self, dset_df):

     # *************************************************************************************

-    def get_subset_responses_and_weights(self, subset, transformers):
+    def get_subset_responses_and_weights(self, subset):
         """Returns a dictionary mapping compound IDs in the given dataset subset to arrays of response values
         and weights.  Used by the perf_data module under k-fold CV.
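
The hunks above introduce `untransformed_dataset`, a copy of the featurized data whose response values are never transformed, so raw responses stay available by compound ID. A minimal standalone sketch of the lookup pattern this enables (function and variable names here are illustrative, not the exact `ModelDataset` code):

```python
# Illustrative sketch, not the repo's method: cache raw responses once,
# then fetch them by compound ID so metrics are computed in raw space.
import numpy as np
from deepchem.data import NumpyDataset

def get_untransformed_responses(untransformed_dataset, ids):
    """Return an (ncmpds, ntasks) array of raw responses for the given IDs."""
    response_dict = dict(zip(untransformed_dataset.ids, untransformed_dataset.y))
    response_vals = np.zeros((len(ids), untransformed_dataset.y.shape[1]))
    for i, cid in enumerate(ids):
        response_vals[i] = response_dict[cid]
    return response_vals

# Two compounds, one regression task; IDs may arrive in any order.
dset = NumpyDataset(X=np.zeros((2, 4)), y=np.array([[1.5], [2.5]]), ids=['c1', 'c2'])
print(get_untransformed_responses(dset, ['c2', 'c1']))  # [[2.5] [1.5]]
```
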
@@ -703,9 +705,13 @@ def get_subset_responses_and_weights(self, subset, transformers):
         else:
             raise ValueError('Unknown dataset subset type "%s"' % subset)

-        y = dc.trans.undo_transforms(dataset.y, transformers)
+        response_vals = dict()
+        dataset_ids = set(dataset.ids)
+        for id, y in zip(self.untransformed_dataset.ids, self.untransformed_dataset.y):
+            if id in dataset_ids:
+                response_vals[id] = y
+
         w = dataset.w
-        response_vals = dict([(id, y[i,:]) for i, id in enumerate(dataset.ids)])
         weights = dict([(id, w[i,:]) for i, id in enumerate(dataset.ids)])
         self.subset_response_dict[subset] = response_vals
         self.subset_weight_dict[subset] = weights
@@ -713,6 +719,19 @@

     # *************************************************************************************

+    def get_untransformed_responses(self, ids):
+        """Returns a numpy array of untransformed response values for the given compound IDs
+        """
+        response_vals = np.zeros((len(ids), self.untransformed_dataset.y.shape[1]))
+        response_dict = dict([(id, y) for id, y in zip(self.untransformed_dataset.ids, self.untransformed_dataset.y)])
+
+        for i, id in enumerate(ids):
+            response_vals[i] = response_dict[id]
+
+        return response_vals
+
+    # *************************************************************************************
+
     def _get_split_key(self):
         """Creates the proper CSV name for a split file
@@ -828,6 +847,8 @@ def get_featurized_data(self, dset_df, is_featurized=False):
                                                                             params, self.contains_responses)
         self.log.warning("Done")
         self.n_features = self.featurization.get_feature_count()
+
+        self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids)
         self.dataset = NumpyDataset(features, self.vals, ids=ids)

 # ****************************************************************************************
diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py
index 70dd7f84..8cb3adc9 100644
--- a/atomsci/ddm/pipeline/model_pipeline.py
+++ b/atomsci/ddm/pipeline/model_pipeline.py
@@ -305,13 +305,6 @@ def load_featurize_data(self, params=None):
         else:
             self.run_mode = ''

-        if self.run_mode == 'training':
-            for i, (train, valid) in enumerate(self.data.train_valid_dsets):
-                train = self.model_wrapper.transform_dataset(train)
-                valid = self.model_wrapper.transform_dataset(valid)
-                self.data.train_valid_dsets[i] = (train, valid)
-            self.data.test_dset = self.model_wrapper.transform_dataset(self.data.test_dset)
-
     # ****************************************************************************************

     def create_model_metadata(self):
@@ -864,7 +857,7 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
         # Get features for each compound and construct a DeepChem Dataset from them
         self.data.get_featurized_data(dset_df, is_featurized)
         # Transform the features and responses if needed
-        self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset)
+        self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset, fold='final')

         # Note that at this point, the dataset may contain fewer rows than the input. Typically this happens because
         # of invalid SMILES strings. Remove any rows from the input dataframe corresponding to SMILES strings that were
@@ -995,7 +988,7 @@ def predict_embedding(self, dset_df, dset_params=None):
         self.data = model_datasets.create_minimal_dataset(self.params, self.featurization)
         self.data.get_featurized_data(dset_df, is_featurized=False)
         # Not sure the following is necessary
-        self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset)
+        self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset, fold='final')

         # Get the embeddings as a numpy array
         embeddings = self.model_wrapper.generate_embeddings(self.data.dataset)
@@ -1577,7 +1570,7 @@ def ensemble_predict(model_uuids, collections, dset_df, labels=None, dset_params
             raise Exception("response_cols missing from model params")
         is_featurized = (len(set(pipe.featurization.get_feature_columns()) - set(dset_df.columns.values)) == 0)
         pipe.data.get_featurized_data(dset_df, is_featurized)
-        pipe.data.dataset = pipe.model_wrapper.transform_dataset(pipe.data.dataset)
+        pipe.data.dataset = pipe.model_wrapper.transform_dataset(pipe.data.dataset, fold='final')

         # Create a temporary data frame to hold the compound IDs and predictions. The model may not
         # return predictions for all the requested compounds, so we have to outer join the predictions
diff --git a/atomsci/ddm/pipeline/model_wrapper.py b/atomsci/ddm/pipeline/model_wrapper.py
index 2ff7ccee..521db40c 100644
--- a/atomsci/ddm/pipeline/model_wrapper.py
+++ b/atomsci/ddm/pipeline/model_wrapper.py
@@ -8,6 +8,7 @@

 import joblib
 import deepchem as dc
+import deepchem.trans as dctrans
 import numpy as np
 import tensorflow as tf
 if dc.__version__.startswith('2.1'):
@@ -441,7 +442,7 @@ def reload_transformers(self):

     # ****************************************************************************************

-    def transform_dataset(self, dataset, fold='final'):
+    def transform_dataset(self, dataset, fold):
         """Transform the responses and/or features in the given DeepChem dataset using the current transformers.

         Args:
@@ -514,19 +515,14 @@ def get_test_perf_data(self, model_dir, model_dataset, fold):

         # Create a PerfData object, which knows how to format the prediction results in the structure
         # expected by the model tracker.
-
-        # We pass transformed=False to indicate that the preds and uncertainties we get from
-        # generate_predictions are already untransformed, so that perf_data.get_prediction_results()
-        # doesn't untransform them again.
         if hasattr(self.transformers['final'][0], "ishybrid"):
             # indicate that we are training a hybrid model
-            # ASDF need to know what to pass in as the y transform now that they are fold dependent.
-            perf_data = perf.create_perf_data("hybrid", model_dataset, self.transformers, 'test', is_ki=self.params.is_ki, ki_convert_ratio=self.params.ki_convert_ratio, transformed=False)
+            perf_data = perf.create_perf_data("hybrid", model_dataset, 'test', is_ki=self.params.is_ki, ki_convert_ratio=self.params.ki_convert_ratio)
         else:
-            perf_data = perf.create_perf_data(self.params.prediction_type, model_dataset, self.transformers, 'test', transformed=False)
+            perf_data = perf.create_perf_data(self.params.prediction_type, model_dataset, 'test')
         test_dset = model_dataset.test_dset

         test_preds, test_stds = self.generate_predictions(test_dset)
-        _ = perf_data.accumulate_preds(test_preds, test_dset.ids, test_stds, fold=fold)
+        _ = perf_data.accumulate_preds(test_preds, test_dset.ids, test_stds)
         return perf_data

     # ****************************************************************************************
@@ -558,17 +554,13 @@ def get_full_dataset_perf_data(self, model_dataset, fold):

         # Create a PerfData object, which knows how to format the prediction results in the structure
         # expected by the model tracker.
-
-        # We pass transformed=False to indicate that the preds and uncertainties we get from
-        # generate_predictions are already untransformed, so that perf_data.get_prediction_results()
-        # doesn't untransform them again.
         if hasattr(self.transformers['final'][0], "ishybrid"):
             # indicate that we are training a hybrid model
-            perf_data = perf.create_perf_data("hybrid", model_dataset, self.transformers, 'full', is_ki=self.params.is_ki, ki_convert_ratio=self.params.ki_convert_ratio, transformed=False)
+            perf_data = perf.create_perf_data("hybrid", model_dataset, 'full', is_ki=self.params.is_ki, ki_convert_ratio=self.params.ki_convert_ratio)
         else:
-            perf_data = perf.create_perf_data(self.params.prediction_type, model_dataset, self.transformers, 'full', transformed=False)
+            perf_data = perf.create_perf_data(self.params.prediction_type, model_dataset, 'full')

         full_preds, full_stds = self.generate_predictions(model_dataset.dataset)
-        _ = perf_data.accumulate_preds(full_preds, model_dataset.dataset.ids, full_stds, fold)
+        _ = perf_data.accumulate_preds(full_preds, model_dataset.dataset.ids, full_stds)
         return perf_data

     # ****************************************************************************************
@@ -901,12 +893,9 @@ def train_kfold_cv(self, pipeline):
                 subsets={'train':'train_valid', 'valid':'valid', 'test':'test'},
                 prediction_type=self.params.prediction_type,
                 model_dataset=pipeline.data,
-                production=self.params.production,
-                transformers=self.transformers)
-        em.set_make_pred(lambda x: self.model.predict(x, []))
-        em.on_new_best_valid(lambda : 1+1) # does not need to take any action
+                production=self.params.production)

-        test_dset = pipeline.data.test_dset
+        em.on_new_best_valid(lambda : 1+1) # does not need to take any action

         # Train a separate model for each fold
         models = []

         for ei in LCTimerKFoldIterator(self.params, pipeline, self.log):
             # Create PerfData structures that are only used within loop to compute metrics during initial training
-            train_perf_data = perf.create_perf_data(self.params.prediction_type, pipeline.data, self.transformers, 'train')
-            test_perf_data = perf.create_perf_data(self.params.prediction_type, pipeline.data, self.transformers, 'test')
+            train_perf_data = perf.create_perf_data(self.params.prediction_type, pipeline.data, 'train')
+            test_perf_data = perf.create_perf_data(self.params.prediction_type, pipeline.data, 'test')
             for k in range(num_folds):
                 self.model = models[k]
                 train_dset, valid_dset = pipeline.data.train_valid_dsets[k]
+                train_dset = self.transform_dataset(train_dset, fold=k)
+                valid_dset = self.transform_dataset(valid_dset, fold=k)
+                test_dset = self.transform_dataset(pipeline.data.test_dset, fold=k)

                 # We turn off automatic checkpointing - we only want to save a checkpoints for the final model.
                 self.model.fit(train_dset, nb_epoch=1, checkpoint_interval=0, restore=False)
-                train_pred = self.model.predict(train_dset, [])
-                test_pred = self.model.predict(test_dset, [])
-
-                train_perf = train_perf_data.accumulate_preds(train_pred, train_dset.ids, fold=k)
-                test_perf = test_perf_data.accumulate_preds(test_pred, test_dset.ids, fold=k)
-
-                valid_perf = em.accumulate(ei, subset='valid', dset=valid_dset, fold=k)
+                train_pred = self.model.predict(train_dset, self.transformers[k])
+                test_pred = self.model.predict(test_dset, self.transformers[k])
+
+                train_perf = train_perf_data.accumulate_preds(train_pred, train_dset.ids)
+                test_perf = test_perf_data.accumulate_preds(test_pred, test_dset.ids)
+
+                # Update the make_pred function to use the current fold's transformers
+                def make_pred(x):
+                    return self.model.predict(x, self.transformers[k])
+                em.set_make_pred(make_pred)
+                valid_perf = em.accumulate(ei, subset='valid', dset=valid_dset)
                 self.log.info("Fold %d, epoch %d: training %s = %.3f, validation %s = %.3f, test %s = %.3f" % (
                               k, ei, pipeline.metric_type, train_perf, pipeline.metric_type, valid_perf,
                               pipeline.metric_type, test_perf))
@@ -945,7 +941,12 @@ def train_kfold_cv(self, pipeline):

         # Train a new model for best_epoch epochs on the combined training/validation set. Compute the training and test
         # set metrics at each epoch.
-        fit_dataset = pipeline.data.combined_training_data()
+        fit_dataset = self.transform_dataset(pipeline.data.combined_training_data(), fold='final')
+        test_dset = self.transform_dataset(pipeline.data.test_dset, fold='final')
+        def make_pred(x):
+            return self.model.predict(x, self.transformers['final'])
+        em.set_make_pred(make_pred)
+
         retrain_start = time.time()
         self.model = self.recreate_model()
         self.log.info(f"Best epoch was {self.best_epoch}, retraining with combined training/validation set")
@@ -1000,19 +1001,22 @@ def train_with_early_stopping(self, pipeline):
         em = perf.EpochManager(self,
                 prediction_type=self.params.prediction_type,
                 model_dataset=pipeline.data,
-                production=self.params.production,
-                transformers=self.transformers)
-        em.set_make_pred(lambda x: self.model.predict(x, []))
+                production=self.params.production)
+        def make_pred(dset):
+            return self.model.predict(dset, self.transformers['final'])
+        em.set_make_pred(make_pred)
         em.on_new_best_valid(lambda : self.model.save_checkpoint())

-        test_dset = pipeline.data.test_dset
         train_dset, valid_dset = pipeline.data.train_valid_dsets[0]
+        train_dset = self.transform_dataset(train_dset, 'final')
+        valid_dset = self.transform_dataset(valid_dset, 'final')
+        test_dset = self.transform_dataset(pipeline.data.test_dset, 'final')

         for ei in LCTimerIterator(self.params, pipeline, self.log):
             # Train the model for one epoch. We turn off automatic checkpointing, so the last checkpoint
             # saved will be the one we created intentionally when we reached a new best validation score.
             self.model.fit(train_dset, nb_epoch=1, checkpoint_interval=0)
             train_perf, valid_perf, test_perf = em.update_epoch(ei,
-                    train_dset=train_dset, valid_dset=valid_dset, test_dset=test_dset, fold='final')
+                    train_dset=train_dset, valid_dset=valid_dset, test_dset=test_dset)

             self.log.info("Epoch %d: training %s = %.3f, validation %s = %.3f, test %s = %.3f" % (
                     ei, pipeline.metric_type, train_perf, pipeline.metric_type, valid_perf,
@@ -1447,9 +1451,11 @@ def train(self, pipeline):
                     opt, ei, self.model_dict))

         train_dset, valid_dset = pipeline.data.train_valid_dsets[0]
+        train_dset = self.transform_dataset(train_dset, 'final')
+        valid_dset = self.transform_dataset(valid_dset, 'final')
+        test_dset = self.transform_dataset(pipeline.data.test_dset, 'final')
         if len(pipeline.data.train_valid_dsets) > 1:
             raise Exception("Currently the hybrid model doesn't support K-fold cross validation splitting.")
-        test_dset = pipeline.data.test_dset
         train_data, valid_data = self.train_valid_dsets[0]
         for ei in LCTimerIterator(self.params, pipeline, self.log):
             # Train the model for one epoch. We turn off automatic checkpointing, so the last checkpoint
@@ -1657,25 +1663,27 @@ def train(self, pipeline):
         self.data = pipeline.data
         self.best_epoch = None
-        self.train_perf_data = perf.create_perf_data(self.params.prediction_type, pipeline.data, self.transformers,'train')
-        self.valid_perf_data = perf.create_perf_data(self.params.prediction_type, pipeline.data, self.transformers, 'valid')
-        self.test_perf_data = perf.create_perf_data(self.params.prediction_type, pipeline.data, self.transformers, 'test')
+        self.train_perf_data = perf.create_perf_data(self.params.prediction_type, pipeline.data, 'train')
+        self.valid_perf_data = perf.create_perf_data(self.params.prediction_type, pipeline.data, 'valid')
+        self.test_perf_data = perf.create_perf_data(self.params.prediction_type, pipeline.data, 'test')

-        test_dset = pipeline.data.test_dset
         num_folds = len(pipeline.data.train_valid_dsets)
         for k in range(num_folds):
             train_dset, valid_dset = pipeline.data.train_valid_dsets[k]
+            train_dset = self.transform_dataset(train_dset, fold=k)
+            valid_dset = self.transform_dataset(valid_dset, fold=k)
+            test_dset = self.transform_dataset(pipeline.data.test_dset, fold=k)
             self.model.fit(train_dset)

-            train_pred = self.model.predict(train_dset, [])
-            train_perf = self.train_perf_data.accumulate_preds(train_pred, train_dset.ids, fold=k)
+            train_pred = self.model.predict(train_dset, self.transformers[k])
+            train_perf = self.train_perf_data.accumulate_preds(train_pred, train_dset.ids)

-            valid_pred = self.model.predict(valid_dset, [])
-            valid_perf = self.valid_perf_data.accumulate_preds(valid_pred, valid_dset.ids, fold=k)
+            valid_pred = self.model.predict(valid_dset, self.transformers[k])
+            valid_perf = self.valid_perf_data.accumulate_preds(valid_pred, valid_dset.ids)

-            test_pred = self.model.predict(test_dset, [])
-            test_perf = self.test_perf_data.accumulate_preds(test_pred, test_dset.ids, fold=k)
+            test_pred = self.model.predict(test_dset, self.transformers[k])
+            test_perf = self.test_perf_data.accumulate_preds(test_pred, test_dset.ids)
             self.log.info("Fold %d: training %s = %.3f, validation %s = %.3f, test %s = %.3f" % (
                           k, pipeline.metric_type, train_perf, pipeline.metric_type, valid_perf,
                           pipeline.metric_type, test_perf))
@@ -1692,6 +1700,7 @@ def train(self, pipeline):
         if num_folds > 1:
             # For k-fold CV, retrain on the combined training and validation sets
             fit_dataset = self.data.combined_training_data()
+            fit_dataset = self.transform_dataset(fit_dataset, fold='final')
             self.model.fit(fit_dataset)

         self.model_save()
         # The best model is just the single RF training run.
@@ -1898,7 +1907,7 @@ def generate_predictions(self, dataset):
         pred, std = None, None
         self.log.info("Evaluating current model")

-        pred = self.model.predict(dataset, self.transformers)
+        pred = self.model.predict(dataset, self.transformers['final'])
         ncmpds = pred.shape[0]
         pred = pred.reshape((ncmpds,1,-1))
@@ -1908,7 +1917,7 @@ def generate_predictions(self, dataset):
                 ## s.d. from forest
                 if self.params.transformers and self.transformers is not None:
                     RF_per_tree_pred = [dc.trans.undo_transforms(
-                        tree.predict(dataset.X), self.transformers) for tree in rf_model.estimators_]
+                        tree.predict(dataset.X), self.transformers['final']) for tree in rf_model.estimators_]
                 else:
                     RF_per_tree_pred = [tree.predict(dataset.X) for tree in rf_model.estimators_]
@@ -2076,25 +2085,27 @@ def train(self, pipeline):
         self.data = pipeline.data
         self.best_epoch = None
-        self.train_perf_data = perf.create_perf_data(self.params.prediction_type, pipeline.data, self.transformers,'train')
-        self.valid_perf_data = perf.create_perf_data(self.params.prediction_type, pipeline.data, self.transformers, 'valid')
-        self.test_perf_data = perf.create_perf_data(self.params.prediction_type, pipeline.data, self.transformers, 'test')
+        self.train_perf_data = perf.create_perf_data(self.params.prediction_type, pipeline.data, 'train')
+        self.valid_perf_data = perf.create_perf_data(self.params.prediction_type, pipeline.data, 'valid')
+        self.test_perf_data = perf.create_perf_data(self.params.prediction_type, pipeline.data, 'test')

-        test_dset = pipeline.data.test_dset
         num_folds = len(pipeline.data.train_valid_dsets)
         for k in range(num_folds):
             train_dset, valid_dset = pipeline.data.train_valid_dsets[k]
+            train_dset = self.transform_dataset(train_dset, fold=k)
+            valid_dset = self.transform_dataset(valid_dset, fold=k)
+            test_dset = self.transform_dataset(pipeline.data.test_dset, fold=k)
             self.model.fit(train_dset)

-            train_pred = self.model.predict(train_dset, [])
-            train_perf = self.train_perf_data.accumulate_preds(train_pred, train_dset.ids, fold=k)
+            train_pred = self.model.predict(train_dset, self.transformers[k])
+            train_perf = self.train_perf_data.accumulate_preds(train_pred, train_dset.ids)

-            valid_pred = self.model.predict(valid_dset, [])
-            valid_perf = self.valid_perf_data.accumulate_preds(valid_pred, valid_dset.ids, fold=k)
+            valid_pred = self.model.predict(valid_dset, self.transformers[k])
+            valid_perf = self.valid_perf_data.accumulate_preds(valid_pred, valid_dset.ids)

-            test_pred = self.model.predict(test_dset, [])
-            test_perf = self.test_perf_data.accumulate_preds(test_pred, test_dset.ids, fold=k)
+            test_pred = self.model.predict(test_dset, self.transformers[k])
+            test_perf = self.test_perf_data.accumulate_preds(test_pred, test_dset.ids)
             self.log.info("Fold %d: training %s = %.3f, validation %s = %.3f, test %s = %.3f" % (
                           k, pipeline.metric_type, train_perf, pipeline.metric_type, valid_perf,
                           pipeline.metric_type, test_perf))
@@ -2110,6 +2121,7 @@ def train(self, pipeline):
         if num_folds > 1:
             # For k-fold CV, retrain on the combined training and validation sets
             fit_dataset = self.data.combined_training_data()
+            fit_dataset = self.transform_dataset(fit_dataset, fold='final')
             self.model.fit(fit_dataset)

         self.model_save()
         # The best model is just the single xgb training run.
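
Taken together, the model_wrapper hunks above converge on one calling convention: each dataset is transformed with its fold's transformers immediately before use, and the same transformer list is passed to `model.predict()` so predictions come back untransformed. A hedged sketch of that pattern, using stand-in names rather than the wrapper's actual attributes:

```python
# Sketch under assumed names: transformers is a dict keyed by fold index
# (plus 'final'), each value a list of fitted DeepChem transformers.
def run_fold(model, train_dset, transformers, k):
    # What transform_dataset(train_dset, fold=k) does internally.
    for t in transformers[k]:
        train_dset = t.transform(train_dset)
    model.fit(train_dset)
    # Passing the transformer list makes DeepChem undo the transforms on the
    # way out, so perf_data sees predictions in untransformed space.
    return model.predict(train_dset, transformers[k])
```
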
diff --git a/atomsci/ddm/pipeline/perf_data.py b/atomsci/ddm/pipeline/perf_data.py index 48271d32..6d0590da 100644 --- a/atomsci/ddm/pipeline/perf_data.py +++ b/atomsci/ddm/pipeline/perf_data.py @@ -11,10 +11,6 @@ from sklearn.metrics import accuracy_score, matthews_corrcoef, cohen_kappa_score, log_loss, balanced_accuracy_score from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error -from atomsci.ddm.pipeline import transformations as trans - - - # ****************************************************************************************************************************** def rms_error(y_real, y_pred): """Calculates the root mean squared error. Score function used for model selection. @@ -75,7 +71,7 @@ def negative_predictive_value(y_real, y_pred): binary_class_only = {'npv'} # ****************************************************************************************************************************** -def create_perf_data(prediction_type, model_dataset, transformers, subset, **kwargs): +def create_perf_data(prediction_type, model_dataset, subset, **kwargs): """Factory function that creates the right kind of PerfData object for the given subset, prediction_type (classification or regression) and split strategy (k-fold or train/valid/test). @@ -84,8 +80,6 @@ def create_perf_data(prediction_type, model_dataset, transformers, subset, **kwa model_dataset (ModelDataset): Object representing the full dataset. - transformers (list): A list of transformer objects. - subset (str): Label in ['train', 'valid', 'test', 'full'], indicating the type of subset of dataset for tracking predictions **kwargs: Additional PerfData subclass arguments @@ -104,20 +98,20 @@ def create_perf_data(prediction_type, model_dataset, transformers, subset, **kwa if prediction_type == 'regression': if subset == 'full' or split_strategy == 'train_valid_test': # Called simple because no need to track compound IDs across multiple training folds - return SimpleRegressionPerfData(model_dataset, transformers, subset, **kwargs) + return SimpleRegressionPerfData(model_dataset, subset, **kwargs) elif split_strategy == 'k_fold_cv': - return KFoldRegressionPerfData(model_dataset, transformers, subset, **kwargs) + return KFoldRegressionPerfData(model_dataset, subset, **kwargs) else: raise ValueError('Unknown split_strategy %s' % split_strategy) elif prediction_type == 'classification': if subset == 'full' or split_strategy == 'train_valid_test': - return SimpleClassificationPerfData(model_dataset, transformers, subset, **kwargs) + return SimpleClassificationPerfData(model_dataset, subset, **kwargs) elif split_strategy == 'k_fold_cv': - return KFoldClassificationPerfData(model_dataset, transformers, subset, **kwargs) + return KFoldClassificationPerfData(model_dataset, subset, **kwargs) else: raise ValueError('Unknown split_strategy %s' % split_strategy) elif prediction_type == "hybrid": - return SimpleHybridPerfData(model_dataset, transformers, subset, **kwargs) + return SimpleHybridPerfData(model_dataset, subset, **kwargs) else: raise ValueError('Unknown prediction type %s' % prediction_type) @@ -132,21 +126,21 @@ def __init__(self, model_dataset, subset): """Initialize any attributes that are common to all PerfData subclasses""" # **************************************************************************************** - def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids, pred_stds=None): """Raises: NotImplementedError: The method is 
implemented by subclasses """ raise NotImplementedError # **************************************************************************************** - def get_pred_values(self, fold): + def get_pred_values(self): """Raises: NotImplementedError: The method is implemented by subclasses """ raise NotImplementedError # **************************************************************************************** - def get_real_values(self, fold, ids=None): + def get_real_values(self, ids=None): """Raises: NotImplementedError: The method is implemented by subclasses """ @@ -170,7 +164,7 @@ def compute_perf_metrics(self, per_task=False): raise NotImplementedError # **************************************************************************************** - def get_prediction_results(self, fold): + def get_prediction_results(self): """Raises: NotImplementedError: The method is implemented by subclasses """ @@ -217,14 +211,14 @@ def __init__(self, model_dataset, subset): self.weights = None # **************************************************************************************** - def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids, pred_stds=None): """Raises: NotImplementedError: The method is implemented by subclasses """ raise NotImplementedError # **************************************************************************************** - def get_pred_values(self, fold): + def get_pred_values(self): """Raises: NotImplementedError: The method is implemented by subclasses """ @@ -257,7 +251,7 @@ def model_choice_score(self, score_type='r2'): """ ids, pred_vals, stds = self.get_pred_values() - real_vals = self.get_real_values('train_valid', ids) + real_vals = self.get_real_values(ids=ids) weights = self.get_weights(ids) scores = [] for i in range(self.num_tasks): @@ -274,7 +268,7 @@ def model_choice_score(self, score_type='r2'): # **************************************************************************************** # class RegressionPerfData - def get_prediction_results(self, fold): + def get_prediction_results(self): """Returns a dictionary of performance metrics for a regression model. The dictionary values should contain only primitive Python types, so that it can be easily JSONified. @@ -303,8 +297,8 @@ def get_prediction_results(self, fold): # and then averaging the metrics. If people start asking for SDs of MAE and RMSE scores over folds, # we'll change the code to compute all metrics the same way. 
- (ids, pred_vals, pred_stds) = self.get_pred_values(fold=fold) - real_vals = self.get_real_values(ids, fold=fold) + (ids, pred_vals, pred_stds) = self.get_pred_values() + real_vals = self.get_real_values(ids) weights = self.get_weights(ids) mae_scores = [] rms_scores = [] @@ -406,7 +400,7 @@ def __init__(self, model_dataset, subset): self.weights = None # **************************************************************************************** - def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids, pred_stds=None): """Raises: NotImplementedError: The method is implemented by subclasses """ @@ -442,7 +436,7 @@ def model_choice_score(self, score_type='r2'): """ ids, pred_vals, stds = self.get_pred_values() - real_vals = self.get_real_values(ids) + real_vals = self.get_real_values(ids=ids) weights = self.get_weights(ids) scores = [] @@ -477,7 +471,7 @@ def model_choice_score(self, score_type='r2'): # **************************************************************************************** # class HybridPerfData - def get_prediction_results(self, fold): + def get_prediction_results(self): """Returns a dictionary of performance metrics for a regression model. The dictionary values should contain only primitive Python types, so that it can be easily JSONified. @@ -507,7 +501,7 @@ def get_prediction_results(self, fold): # we'll change the code to compute all metrics the same way. (ids, pred_vals, pred_stds) = self.get_pred_values() - real_vals = self.get_real_values(ids) + real_vals = self.get_real_values(ids=ids) weights = self.get_weights(ids) mae_scores = [] rms_scores = [] @@ -631,7 +625,7 @@ def __init__(self, model_dataset, subset): self.weights = None # **************************************************************************************** - def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids, pred_stds=None): """Raises: NotImplementedError: The method is implemented by subclasses """ @@ -707,7 +701,7 @@ def model_choice_score(self, score_type='roc_auc'): # **************************************************************************************** # class ClassificationPerfData - def get_prediction_results(self, fold): + def get_prediction_results(self): """Returns a dictionary of performance metrics for a classification model. The dictionary values will contain only primitive Python types, so that it can be easily JSONified. 
@@ -722,7 +716,7 @@ def get_prediction_results(self, fold): pred_results = {} (ids, pred_classes, class_probs, prob_stds) = self.get_pred_values() - real_vals = self.get_real_values(ids) + real_vals = self.get_real_values(ids=ids) weights = self.get_weights(ids) if self.num_classes > 2: real_classes = np.argmax(real_vals, axis=2) @@ -882,26 +876,20 @@ class KFoldRegressionPerfData(RegressionPerfData): folds (int): Initialized at zero, flag for determining which k-fold is being assessed - transformers (list of Transformer objects): from input arguments - real_vals (dict): The dictionary containing the origin response column values """ # **************************************************************************************** # class KFoldRegressionPerfData - def __init__(self, model_dataset, transformers, subset, transformed=True): + def __init__(self, model_dataset, subset): """# Initialize any attributes that are common to all KFoldRegressionPerfData subclasses Args: model_dataset (ModelDataset object): contains the dataset and related methods - transformers (list of transformer objects): contains the list of transformers used to transform the dataset - subset (str): Label in ['train', 'valid', 'test', 'full'], indicating the type of subset of dataset for tracking predictions - transformed (bool): True if values to be passed to accumulate preds function are transformed values - Side effects: Sets the following attributes of KFoldRegressionPerfData: subset (str): Label of the type of subset of dataset for tracking predictions @@ -914,8 +902,6 @@ def __init__(self, model_dataset, transformers, subset, transformed=True): folds (int): Initialized at zero, flag for determining which k-fold is being assessed - transformers (list of Transformer objects): from input arguments - real_vals (dict): The dictionary containing the origin response column values """ @@ -932,20 +918,13 @@ def __init__(self, model_dataset, transformers, subset, transformed=True): self.folds = 0 self.perf_metrics = [] self.model_score = None - # Want predictions and real values to be in the same space, either transformed or untransformed - if transformed: - # Predictions passed to accumulate_preds() will be transformed - self.real_vals, self.weights = model_dataset.get_subset_responses_and_weights(self.subset, []) - self.transformers = transformers - else: - # If these were never transformed, transformers will be [], which is fine with undo_transforms - self.real_vals, self.weights = model_dataset.get_subset_responses_and_weights(self.subset, transformers) - self.transformers = [] + # Want predictions and real values to be in the same space, untransformed + self.real_vals, self.weights = model_dataset.get_subset_responses_and_weights(self.subset) # **************************************************************************************** # class KFoldRegressionPerfData - def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids, pred_stds=None): """Add training, validation or test set predictions from the current fold to the data structure where we keep track of them. 
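
The `accumulate_preds` hunks that follow drop `undo_transforms` but keep the per-task scoring loop, which masks out zero-weight (missing-response) compounds before scoring each task. A sketch of that masking pattern, with assumed (ncmpds, ntasks) array shapes:

```python
# Sketch: real_vals, pred_vals and weights are (ncmpds, ntasks) arrays;
# a zero weight marks a missing response for that compound/task pair.
import numpy as np
from sklearn.metrics import r2_score

def masked_task_score(real_vals, pred_vals, weights):
    scores = []
    for i in range(real_vals.shape[1]):
        nzrows = np.where(weights[:, i] != 0)[0]  # rows with real data
        scores.append(r2_score(real_vals[nzrows, i], pred_vals[nzrows, i]))
    return float(np.mean(scores))
```
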
@@ -989,15 +968,13 @@ def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): self.pred_vals[id] = np.concatenate([self.pred_vals[id], predicted_vals[i,:].reshape((1,-1))], axis=0) self.folds += 1 - pred_vals = dc.trans.undo_transforms(predicted_vals, self.transformers[fold]) - - real_vals = self.get_real_values(ids) + real_vals = self.get_real_values(ids=ids) weights = self.get_weights(ids) scores = [] for i in range(self.num_tasks): nzrows = np.where(weights[:,i] != 0)[0] task_real_vals = np.squeeze(real_vals[nzrows,i]) - task_pred_vals = np.squeeze(pred_vals[nzrows,i]) + task_pred_vals = np.squeeze(predicted_vals[nzrows,i]) scores.append(regr_score_func['r2'](task_real_vals, task_pred_vals)) self.perf_metrics.append(np.array(scores)) return float(np.mean(scores)) @@ -1005,8 +982,8 @@ def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): # **************************************************************************************** # class KFoldRegressionPerfData - def get_pred_values(self, fold): - """Returns the predicted values accumulated over training, with any transformations undone. + def get_pred_values(self): + """Returns the predicted values accumulated over training. If self.subset is 'train' or 'test', the function will return averages over the training folds for each compound along with standard deviations when there are predictions from multiple folds. Otherwise, returns a single predicted value for each compound. @@ -1022,38 +999,36 @@ def get_pred_values(self, fold): """ ids = sorted(self.pred_vals.keys()) if self.subset in ['train', 'test', 'train_valid']: - rawvals = np.concatenate([self.pred_vals[id].mean(axis=0, keepdims=True).reshape((1,-1)) for id in ids]) - vals = dc.trans.undo_transforms(rawvals, self.transformers[fold]) + vals = np.concatenate([self.pred_vals[id].mean(axis=0, keepdims=True).reshape((1,-1)) for id in ids]) if self.folds > 1: - stds = dc.trans.undo_transforms(np.concatenate([self.pred_vals[id].std(axis=0, keepdims=True).reshape((1,-1)) - for id in ids]), self.transformers[fold]) + stds = np.concatenate([self.pred_vals[id].std(axis=0, keepdims=True).reshape((1,-1)) + for id in ids]) else: stds = None else: - rawvals = np.concatenate([self.pred_vals[id].reshape((1,-1)) for id in ids], axis=0) - vals = dc.trans.undo_transforms(rawvals, self.transformers[fold]) + vals = np.concatenate([self.pred_vals[id].reshape((1,-1)) for id in ids], axis=0) stds = None return (ids, vals, stds) # **************************************************************************************** # class KFoldRegressionPerfData - def get_real_values(self, fold, ids=None): - """Returns the real dataset response values, with any transformations undone, as an (ncmpds, ntasks) array + def get_real_values(self, ids=None): + """Returns the real dataset response values, as an (ncmpds, ntasks) array in the same ID order as get_pred_values() (unless ids is specified). Args: ids (list of str): Optional list of compound IDs to return values for. Returns: - np.array (ncmpds, ntasks) of the real dataset response values, with any transformations undone, in the same + np.array (ncmpds, ntasks) of the real dataset response values, in the same ID order as get_pred_values(). 
""" if ids is None: ids = sorted(self.pred_vals.keys()) real_vals = np.concatenate([self.real_vals[id].reshape((1,-1)) for id in ids], axis=0) - return dc.trans.undo_transforms(real_vals, self.transformers[fold]) + return real_vals # **************************************************************************************** @@ -1117,7 +1092,6 @@ class KFoldClassificationPerfData(ClassificationPerfData): num_tasks (int): The number of tasks in the dataset pred-vals (dict): The dictionary of prediction results folds (int): Initialized at zero, flag for determining which k-fold is being assessed - transformers (list of Transformer objects): from input arguments real_vals (dict): The dictionary containing the origin response column values class_names (np.array): Assumes the classes are of deepchem index type (e.g. 0,1,2,...) num_classes (int): The number of classes to predict on @@ -1125,21 +1099,17 @@ class KFoldClassificationPerfData(ClassificationPerfData): # **************************************************************************************** # class KFoldClassificationPerfData - def __init__(self, model_dataset, transformers, subset, predict_probs=True, transformed=True): + def __init__(self, model_dataset, subset, predict_probs=True): """Initialize any attributes that are common to all KFoldClassificationPerfData subclasses Args: model_dataset (ModelDataset object): contains the dataset and related methods - transformers (list of transformer objects): contains the list of transformers used to transform the dataset - subset (str): Label in ['train', 'valid', 'test', 'full'], indicating the type of subset of dataset for tracking predictions predict_probs (bool): True if using classifier supports probabilistic predictions, False otherwise - transformed (bool): True if values to be passed to accumulate preds function are transformed values - Raises: ValueError if subset not in ['train','valid','test'], unsupported dataset subset @@ -1157,8 +1127,6 @@ def __init__(self, model_dataset, transformers, subset, predict_probs=True, tran folds (int): Initialized at zero, flag for determining which k-fold is being assessed - transformers (list of Transformer objects): from input arguments - real_vals (dict): The dictionary containing the origin response column values in one-hot encoding class_names (np.array): Assumes the classes are of deepchem index type (e.g. 0,1,2,...) 
@@ -1187,7 +1155,7 @@ def __init__(self, model_dataset, transformers, subset, predict_probs=True, tran
             self.num_classes = len(set(model_dataset.dataset.y.flatten()))

         self.pred_vals = dict([(id, np.empty((0, self.num_tasks, self.num_classes), dtype=np.float32)) for id in dataset.ids])
-        real_vals, self.weights = model_dataset.get_subset_responses_and_weights(self.subset, [])
+        real_vals, self.weights = model_dataset.get_subset_responses_and_weights(self.subset)
         self.real_classes = real_vals
         # Change real_vals to one-hot encoding
         if self.num_classes > 2:
@@ -1201,16 +1169,11 @@ def __init__(self, model_dataset, transformers, subset, predict_probs=True, tran
         self.folds = 0
         self.perf_metrics = []
         self.model_score = None
-        if transformed:
-            # Predictions passed to accumulate_preds() will be transformed
-            self.transformers = transformers
-        else:
-            self.transformers = []

     # ****************************************************************************************
     # class KFoldClassificationPerfData
-    def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None):
+    def accumulate_preds(self, predicted_vals, ids, pred_stds=None):
         """Add training, validation or test set predictions from the current fold to the data structure
         where we keep track of them.

@@ -1234,7 +1197,7 @@ def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None):
         for i, id in enumerate(ids):
             self.pred_vals[id] = np.concatenate([self.pred_vals[id], class_probs[i,:,:].reshape((1,self.num_tasks,-1))], axis=0)
         self.folds += 1
-        real_vals = self.get_real_values(ids)
+        real_vals = self.get_real_values(ids=ids)
         weights = self.get_weights(ids)
         # Break out different predictions for each task, with zero-weight compounds masked out, and compute per-task metrics
         scores = []
@@ -1243,25 +1206,21 @@ def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None):
             if self.num_classes > 2:
                 # If more than 2 classes, real_vals is indicator matrix (one-hot encoded).
                 task_real_vals = np.squeeze(real_vals[nzrows,i,:])
-                task_class_probs = dc.trans.undo_transforms(
-                    np.squeeze(class_probs[nzrows,i,:]),
-                    self.transformers[fold])
+                task_class_probs = np.squeeze(class_probs[nzrows,i,:])
                 scores.append(roc_auc_score(task_real_vals, task_class_probs, average='macro'))
             else:
                 # For binary classifier, sklearn metrics functions are expecting single array of 1s and 0s for real_vals_list,
                 # and class_probs for class 1 only.
                 task_real_vals = np.squeeze(real_vals[nzrows,i])
-                task_class_probs = dc.trans.undo_transforms(
-                    np.squeeze(class_probs[nzrows,i,1]),
-                    self.transformers[fold])
+                task_class_probs = np.squeeze(class_probs[nzrows,i,1])
                 scores.append(roc_auc_score(task_real_vals, task_class_probs))
         self.perf_metrics.append(np.array(scores))
         return float(np.mean(scores))

     # ****************************************************************************************
     # class KFoldClassificationPerfData
-    def get_pred_values(self, fold):
-        """Returns the predicted values accumulated over training, with any transformations undone. If self.subset
+    def get_pred_values(self):
+        """Returns the predicted values accumulated over training. If self.subset
         is 'train', 'train_valid' or 'test', the function will return the means and standard deviations of the class probabilities
         over the training folds for each compound, for each task. Otherwise, returns a single set of predicted
         probabilites for each validation set compound. For all subsets, returns the compound IDs and the most probable
         classes for each task.
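
With the transformer indirection gone, the multiclass branch above scores raw class probabilities directly against one-hot labels. A self-contained example of that call, with made-up numbers:

```python
# roc_auc_score accepts a one-hot indicator matrix for y_true and a
# matching (ncmpds, nclasses) probability matrix for y_score.
import numpy as np
from sklearn.metrics import roc_auc_score

real_vals = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0]])
probs = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1],
                  [0.2, 0.2, 0.6], [0.5, 0.3, 0.2]])
print(roc_auc_score(real_vals, probs, average='macro'))  # 1.0 for this toy data
```
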
@@ -1279,16 +1238,12 @@ def get_pred_values(self, fold): """ ids = sorted(self.pred_vals.keys()) if self.subset in ['train', 'test', 'train_valid']: - #class_probs = np.concatenate([dc.trans.undo_transforms(self.pred_vals[id], self.transformers).mean(axis=0, keepdims=True) - # for id in ids], axis=0) - #prob_stds = np.concatenate([dc.trans.undo_transforms(self.pred_vals[id], self.transformers).std(axis=0, keepdims=True) - # for id in ids], axis=0) - class_probs = dc.trans.undo_transforms(np.concatenate([self.pred_vals[id].mean(axis=0, keepdims=True) - for id in ids], axis=0), self.transformers[fold]) - prob_stds = dc.trans.undo_transforms(np.concatenate([self.pred_vals[id].std(axis=0, keepdims=True) - for id in ids], axis=0), self.transformers[fold]) + class_probs = np.concatenate([self.pred_vals[id].mean(axis=0, keepdims=True) + for id in ids], axis=0) + prob_stds = np.concatenate([self.pred_vals[id].std(axis=0, keepdims=True) + for id in ids], axis=0) else: - class_probs = np.concatenate([dc.trans.undo_transforms(self.pred_vals[id], self.transformers[fold]) for id in ids], axis=0) + class_probs = np.concatenate([self.pred_vals[id] for id in ids], axis=0) prob_stds = None pred_classes = np.argmax(class_probs, axis=2) return (ids, pred_classes, class_probs, prob_stds) @@ -1381,27 +1336,21 @@ class SimpleRegressionPerfData(RegressionPerfData): folds (int): Initialized at zero, flag for determining which k-fold is being assessed - transformers (list of Transformer objects): from input arguments - real_vals (dict): The dictionary containing the origin response column values """ # **************************************************************************************** # class SimpleRegressionPerfData - def __init__(self, model_dataset, transformers, subset, transformed=True): + def __init__(self, model_dataset, subset): """Initialize any attributes that are common to all SimpleRegressionPerfData subclasses Args: model_dataset (ModelDataset object): contains the dataset and related methods - transformers (list of transformer objects): contains the list of transformers used to transform the dataset - subset (str): Label in ['train', 'valid', 'test', 'full'], indicating the type of subset of dataset for tracking predictions - transformed (bool): True if values to be passed to accumulate preds function are transformed values - Raises: ValueError: if subset not in ['train','valid','test','full'], subset not supported @@ -1415,8 +1364,6 @@ def __init__(self, model_dataset, transformers, subset, transformed=True): pred_vals (dict): The dictionary of prediction results - transformers (list of Transformer objects): from input arguments - real_vals (dict): The dictionary containing the origin response column values """ @@ -1439,18 +1386,13 @@ def __init__(self, model_dataset, transformers, subset, transformed=True): self.pred_stds = None self.perf_metrics = [] self.model_score = None - if transformed: - # Predictions passed to accumulate_preds() will be transformed - self.transformers = transformers - self.real_vals = dataset.y - else: - self.real_vals = dc.trans.undo_transforms(dataset.y, transformers) - self.transformers = [] + + self.real_vals = model_dataset.get_untransformed_responses(dataset.ids) # **************************************************************************************** # class SimpleRegressionPerfData - def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids, pred_stds=None): """Add training, validation or test set 
predictions to the data structure where we keep track of them. @@ -1469,8 +1411,8 @@ def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): self.pred_vals = self._reshape_preds(predicted_vals) if pred_stds is not None: self.pred_stds = self._reshape_preds(pred_stds) - pred_vals = dc.trans.undo_transforms(self.pred_vals, self.transformers[fold]) - real_vals = self.get_real_values(ids) + pred_vals = self.pred_vals + real_vals = self.get_real_values(ids=ids) weights = self.get_weights(ids) scores = [] for i in range(self.num_tasks): @@ -1484,8 +1426,8 @@ def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): # **************************************************************************************** # class SimpleRegressionPerfData - def get_pred_values(self, fold): - """Returns the predicted values accumulated over training, with any transformations undone. Returns + def get_pred_values(self): + """Returns the predicted values accumulated over training. Returns a tuple (ids, values, stds), where ids is the list of compound IDs, values is a (ncmpds, ntasks) array of predictions, and stds is always None for this class. @@ -1497,33 +1439,28 @@ def get_pred_values(self, fold): stds (np.array): Contains (ncmpds, ntasks) array of prediction standard deviations """ - vals = dc.trans.undo_transforms(self.pred_vals, self.transformers[fold]) + vals = self.pred_vals stds = None if self.pred_stds is not None: stds = self.pred_stds - if len(self.transformers[fold]) == 1 and (isinstance(self.transformers[fold][0], dc.trans.NormalizationTransformer) or isinstance(self.transformers[fold][0],trans.NormalizationTransformerMissingData)): - # Untransform the standard deviations, if we can. This is a bit of a hack, but it works for - # NormalizationTransformer, since the standard deviations used to scale the data are - # stored in the transformer object. - y_stds = self.transformers[fold][0].y_stds.reshape((1,-1,1)) - stds = stds / y_stds + return (self.ids, vals, stds) # **************************************************************************************** # class SimpleRegressionPerfData - def get_real_values(self, fold, ids=None): - """Returns the real dataset response values, with any transformations undone, as an (ncmpds, ntasks) array + def get_real_values(self, ids=None): + """Returns the real dataset response values, as an (ncmpds, ntasks) array with compounds in the same ID order as in the return from get_pred_values(). Args: ids: Ignored for this class Returns: - np.array: Containing the real dataset response values with transformations undone. + np.array: Containing the real dataset response values. """ - return dc.trans.undo_transforms(self.real_vals, self.transformers[fold]) + return self.real_vals # **************************************************************************************** @@ -1581,8 +1518,6 @@ class SimpleClassificationPerfData(ClassificationPerfData): folds (int): Initialized at zero, flag for determining which k-fold is being assessed - transformers (list of Transformer objects): from input arguments - real_vals (dict): The dictionary containing the origin response column values class_names (np.array): Assumes the classes are of deepchem index type (e.g. 0,1,2,...) 
@@ -1593,21 +1528,17 @@ class SimpleClassificationPerfData(ClassificationPerfData): # **************************************************************************************** # class SimpleClassificationPerfData - def __init__(self, model_dataset, transformers, subset, predict_probs=True, transformed=True): + def __init__(self, model_dataset, subset, predict_probs=True): """Initialize any attributes that are common to all SimpleClassificationPerfData subclasses Args: model_dataset (ModelDataset object): contains the dataset and related methods - transformers (list of transformer objects): contains the list of transformers used to transform the dataset - subset (str): Label in ['train', 'valid', 'test', 'full'], indicating the type of subset of dataset for tracking predictions predict_probs (bool): True if using classifier supports probabilistic predictions, False otherwise - transformed (bool): True if values to be passed to accumulate preds function are transformed values - Raises: ValueError: if subset not in ['train','valid','test','full'], subset not supported @@ -1623,8 +1554,6 @@ def __init__(self, model_dataset, transformers, subset, predict_probs=True, tran pred_vals (dict): The dictionary of prediction results - transformers (list of Transformer objects): from input arguments - real_vals (dict): The dictionary containing the origin response column values num_classes (int): The number of classes to predict on @@ -1661,11 +1590,6 @@ def __init__(self, model_dataset, transformers, subset, predict_probs=True, tran self.ids = dataset.ids self.perf_metrics = [] self.model_score = None - if transformed: - # Predictions passed to accumulate_preds() will be transformed - self.transformers = transformers - else: - self.transformers = [] self.weights = dataset.w # TODO: Everything down to here is same as in SimpleRegressionPerfData.__init__. @@ -1674,20 +1598,20 @@ def __init__(self, model_dataset, transformers, subset, predict_probs=True, tran # DeepChem does not currently support arbitary class names in classification datasets; # enforce class indices (0, 1, 2, ...) here. - self.class_indeces = list(set(model_dataset.dataset.y.flatten())) + self.real_classes = model_dataset.get_untransformed_responses(dataset.ids) + self.class_indeces = list(set(self.real_classes.flatten())) self.num_classes = len(self.class_indeces) - self.real_classes = dataset.y # Convert true values to one-hot encoding if self.num_classes > 2: - self.real_vals = np.concatenate([dc.metrics.to_one_hot(dataset.y[:,j], self.num_classes).reshape(-1,1,self.num_classes) + self.real_vals = np.concatenate([dc.metrics.to_one_hot(self.real_classes[:,j], self.num_classes).reshape(-1,1,self.num_classes) for j in range(self.num_tasks)], axis=1) else: - self.real_vals = dataset.y.reshape((-1,self.num_tasks)) + self.real_vals = self.real_classes.reshape((-1,self.num_tasks)) # **************************************************************************************** # class SimpleClassificationPerfData - def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids, pred_stds=None): """Add training, validation or test set predictions from the current dataset to the data structure where we keep track of them. 
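
SimpleClassificationPerfData now derives class labels from the untransformed responses and one-hot encodes them with DeepChem's helper, as the hunk above shows. A small example of that helper:

```python
# dc.metrics.to_one_hot turns integer class labels into an indicator matrix.
import numpy as np
import deepchem as dc

real_classes = np.array([0, 2, 1, 0])          # one task, three classes
print(dc.metrics.to_one_hot(real_classes, 3))  # shape (4, 3)
```
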
@@ -1705,7 +1629,7 @@ def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None):
         class_probs = self.pred_vals = self._reshape_preds(predicted_vals)
         if pred_stds is not None:
             self.pred_stds = self._reshape_preds(pred_stds)
-        real_vals = self.get_real_values(ids)
+        real_vals = self.get_real_values(ids=ids)
         weights = self.get_weights(ids)
         # Break out different predictions for each task, with zero-weight compounds masked out, and compute per-task metrics
         scores = []
@@ -1714,25 +1638,22 @@ def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None):
             if self.num_classes > 2:
                 # If more than 2 classes, real_vals is indicator matrix (one-hot encoded).
                 task_real_vals = np.squeeze(real_vals[nzrows,i,:])
-                task_class_probs = dc.trans.undo_transforms(
-                    np.squeeze(class_probs[nzrows,i,:]),
-                    self.transformers[fold])
+                task_class_probs = np.squeeze(class_probs[nzrows,i,:])
+
                 scores.append(roc_auc_score(task_real_vals, task_class_probs, average='macro'))
             else:
                 # For binary classifier, sklearn metrics functions are expecting single array of 1s and 0s for real_vals_list,
                 # and class_probs for class 1 only.
                 task_real_vals = np.squeeze(real_vals[nzrows,i])
-                task_class_probs = dc.trans.undo_transforms(
-                    np.squeeze(class_probs[nzrows,i,1]),
-                    self.transformers[fold])
+                task_class_probs = np.squeeze(class_probs[nzrows,i,1])
                 scores.append(roc_auc_score(task_real_vals, task_class_probs))
         self.perf_metrics.append(np.array(scores))
         return float(np.mean(scores))

     # ****************************************************************************************
     # class SimpleClassificationPerfData
-    def get_pred_values(self, fold):
-        """Returns the predicted values accumulated over training, with any transformations undone.
+    def get_pred_values(self):
+        """Returns the predicted values accumulated over training.
         If self.subset is 'train', the function will average class probabilities over the k-1 folds in which each compound
         was part of the training set, and return the most probable class. Otherwise, there should be a single set of
         predicted probabilites for each validation or test set compound. Returns a tuple (ids,
@@ -1752,7 +1673,7 @@ class probability estimates.
prob_stds (np.array): Contains (ncmpds, ntasks, nclasses) array of standard errors for the class probability estimates """ - class_probs = dc.trans.undo_transforms(self.pred_vals, self.transformers[fold]) + class_probs = self.pred_vals pred_classes = np.argmax(class_probs, axis=2) prob_stds = self.pred_stds return (self.ids, pred_classes, class_probs, prob_stds) @@ -1830,22 +1751,18 @@ class SimpleHybridPerfData(HybridPerfData): folds (int): Initialized at zero, flag for determining which k-fold is being assessed - transformers (list of Transformer objects): from input arguments - real_vals (dict): The dictionary containing the origin response column values """ # **************************************************************************************** # class SimpleHybridPerfData - def __init__(self, model_dataset, transformers, subset, is_ki, ki_convert_ratio=None, transformed=True): + def __init__(self, model_dataset, subset, is_ki, ki_convert_ratio=None): """Initialize any attributes that are common to all SimpleRegressionPerfData subclasses Args: model_dataset (ModelDataset object): contains the dataset and related methods - transformers (list of transformer objects): contains the list of transformers used to transform the dataset - subset (str): Label in ['train', 'valid', 'test', 'full'], indicating the type of subset of dataset for tracking predictions @@ -1856,8 +1773,6 @@ def __init__(self, model_dataset, transformers, subset, is_ki, ki_convert_ratio= ratio of concentration and Kd of the radioligand in a competitive binding assay, or the concentration of the substrate and Michaelis constant (Km) of enzymatic inhibition assay. - transformed (bool): True if values to be passed to accumulate preds function are transformed values - Raises: ValueError: if subset not in ['train','valid','test','full'], subset not supported @@ -1871,8 +1786,6 @@ def __init__(self, model_dataset, transformers, subset, is_ki, ki_convert_ratio= pred_vals (dict): The dictionary of prediction results - transformers (list of Transformer objects): from input arguments - real_vals (dict): The dictionary containing the origin response column values """ @@ -1897,17 +1810,11 @@ def __init__(self, model_dataset, transformers, subset, is_ki, ki_convert_ratio= self.model_score = None self.is_ki = is_ki self.ki_convert_ratio = ki_convert_ratio - if transformed: - # Predictions passed to accumulate_preds() will be transformed - self.transformers = transformers - self.real_vals = dataset.y - else: - self.real_vals = transformers[0].untransform(dataset.y) - self.transformers = [] + self.real_vals = model_dataset.get_untransformed_responses(dataset.ids) # **************************************************************************************** # class SimpleHybridPerfData - def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids, pred_stds=None): """Add training, validation or test set predictions to the data structure where we keep track of them. 
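
For context on why `real_vals` can now be stored raw in the hunks above: `dc.trans.undo_transforms` (and, equivalently, passing transformers to `predict()`) inverts each transform in reverse order, so values round-trip back to the original space. A small demonstration with a y-normalization transformer:

```python
# Round-trip sketch: transform y, then undo the transform to recover it.
import numpy as np
import deepchem as dc

dset = dc.data.NumpyDataset(X=np.zeros((3, 1)), y=np.array([[1.0], [2.0], [3.0]]))
norm = dc.trans.NormalizationTransformer(transform_y=True, dataset=dset)
y_scaled = norm.transform(dset).y                  # zero mean, unit variance
print(dc.trans.undo_transforms(y_scaled, [norm]))  # ~[[1.], [2.], [3.]]
```
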
@@ -1926,9 +1833,8 @@ def accumulate_preds(self, predicted_vals, ids, fold, pred_stds=None):
         self.pred_vals = self._reshape_preds(predicted_vals)
         if pred_stds is not None:
             self.pred_stds = self._reshape_preds(pred_stds)
-        # pred_vals = self.transformers[0].untransform(self.pred_vals, isreal=False)
         pred_vals = self.pred_vals
-        real_vals = self.get_real_values(ids)
+        real_vals = self.get_real_values(ids=ids)
         weights = self.get_weights(ids)
         scores = []
         pos_ki = np.where(np.isnan(real_vals[:, 1]))[0]
@@ -1979,8 +1885,8 @@ def _predict_binding(self, activity, conc):
 
     # ****************************************************************************************
     # class SimpleHybridPerfData
-    def get_pred_values(self, fold):
-        """Returns the predicted values accumulated over training, with any transformations undone. Returns
+    def get_pred_values(self):
+        """Returns the predicted values accumulated over training. Returns
         a tuple (ids, values, stds), where ids is the list of compound IDs, values is
         a (ncmpds, ntasks) array of predictions, and stds is always None for this class.
 
@@ -2002,18 +1908,18 @@
 
     # ****************************************************************************************
     # class SimpleHybridPerfData
-    def get_real_values(self, fold, ids=None):
-        """Returns the real dataset response values, with any transformations undone, as an (ncmpds, ntasks) array
+    def get_real_values(self, ids=None):
+        """Returns the real dataset response values as an (ncmpds, ntasks) array
         with compounds in the same ID order as in the return from get_pred_values().
 
         Args:
             ids: Ignored for this class
 
         Returns:
-            np.array: Containing the real dataset response values with transformations undone.
+            np.array: Containing the real dataset response values.
 
         """
-        return self.transformers[fold][0].untransform(self.real_vals)
+        return self.real_vals
 
 # ****************************************************************************************
 
@@ -2160,7 +2066,7 @@ def should_stop(self):
 
     # ****************************************************************************************
     # class EpochManager
-    def update_epoch(self, ei, fold, train_dset=None, valid_dset=None, test_dset=None):
+    def update_epoch(self, ei, train_dset=None, valid_dset=None, test_dset=None):
         """Update training state after an epoch
 
         This function updates train/valid/test_perf_data. Call this function once
@@ -2186,15 +2092,15 @@ def update_epoch(self, ei, fold, train_dset=None, valid_dset=None, test_dset=Non
 
             This function updates self._should_stop
         """
-        train_perf = self.update(ei, 'train', train_dset, fold)
-        valid_perf = self.update(ei, 'valid', valid_dset, fold)
-        test_perf = self.update(ei, 'test', test_dset, fold)
+        train_perf = self.update(ei, 'train', train_dset)
+        valid_perf = self.update(ei, 'valid', valid_dset)
+        test_perf = self.update(ei, 'test', test_dset)
 
         return [p for p in [train_perf, valid_perf, test_perf] if p is not None]
 
     # ****************************************************************************************
     # class EpochManager
-    def accumulate(self, ei, subset, dset, fold):
+    def accumulate(self, ei, subset, dset):
         """Accumulate predictions
 
         Makes predictions, accumulates them, and calculates the performance metric. Calls PerfData.accumulate_preds
@@ -2211,7 +2117,7 @@
 
             float: Performance metric for the given dset.
""" pred = self._make_pred(dset) - perf = getattr(self.wrapper, f'{subset}_perf_data')[ei].accumulate_preds(pred, dset.ids, fold) + perf = getattr(self.wrapper, f'{subset}_perf_data')[ei].accumulate_preds(pred, dset.ids) return perf # **************************************************************************************** @@ -2270,7 +2176,7 @@ def update_valid(self, ei): # **************************************************************************************** # class EpochManager - def update(self, ei, subset, fold, dset=None): + def update(self, ei, subset, dset=None): """Update training state Updates the training state for a given subset and epoch index with the given dataset. @@ -2289,7 +2195,7 @@ def update(self, ei, subset, fold, dset=None): if dset is None: return None - perf = self.accumulate(ei, subset, dset, fold) + perf = self.accumulate(ei, subset, dset) self.compute(ei, subset) if subset == 'valid': diff --git a/atomsci/ddm/pipeline/transformations.py b/atomsci/ddm/pipeline/transformations.py index a9143f1c..2e7c7420 100644 --- a/atomsci/ddm/pipeline/transformations.py +++ b/atomsci/ddm/pipeline/transformations.py @@ -148,7 +148,7 @@ def get_transformer_keys(params): using all validation and training data at the end of the training loop. """ if params.split_strategy != 'k_fold_cv': - return ['final'] + return [0, 'final'] else: return list(range(params.num_folds))+['final'] @@ -157,7 +157,7 @@ def get_blank_transformations(): """Get empty transformations dictionary These keys must always exist, even when there are no transformations """ - return {'final':[]} + return {0:[], 'final':[]} # **************************************************************************************** def get_all_training_datasets(model_dataset): @@ -171,10 +171,14 @@ def get_all_training_datasets(model_dataset): # this dataset is not split into training and validation, use all data result['final'] = model_dataset.dataset elif len(model_dataset.train_valid_dsets)==1: - result['final'] = model_dataset.train_valid_dsets[0] + # there is only one fold, use the training set from that + # for random forests and xgboost models, the final and + # 0th fold are the same if there k-fold is not used + result['final'] = model_dataset.train_valid_dsets[0][0] + result[0] = model_dataset.train_valid_dsets[0][0] else: # First, get the training set from all the folds - for i, t in enumerate(model_dataset.train_valid_dsets): + for i, (t, v) in enumerate(model_dataset.train_valid_dsets): result[i] = t # Next, add the dataset that contains all training+validation data diff --git a/atomsci/ddm/test/unit/test_model_wrapper.py b/atomsci/ddm/test/unit/test_model_wrapper.py index 0b270987..fc43206f 100644 --- a/atomsci/ddm/test/unit/test_model_wrapper.py +++ b/atomsci/ddm/test/unit/test_model_wrapper.py @@ -185,7 +185,7 @@ def test_super_transform_dataset(): mdl = model_wrapper.create_model_wrapper(inp_params, data_obj_ecfp.featurization) mdl.setup_model_dirs() mdl.create_transformers(trans.get_all_training_datasets(data_obj_ecfp)) - dataset = mdl.transform_dataset(data_obj_ecfp.dataset) + dataset = mdl.transform_dataset(data_obj_ecfp.dataset, fold='final') test = [] # checking that the dataset is the correct type