Skip to content

Commit

Permalink
No more transformers in perf_data, No more transformers in EpochManag…
Browse files Browse the repository at this point in the history
…er, transforms happen right before it's used. Transformations are un-done before passing into perf_data
  • Loading branch information
stewarthe6 committed Dec 10, 2024
1 parent a2520a4 commit 18987d3
Show file tree
Hide file tree
Showing 6 changed files with 204 additions and 268 deletions.
27 changes: 24 additions & 3 deletions atomsci/ddm/pipeline/model_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,7 @@ def get_featurized_data(self, params=None):
if params.prediction_type=='classification':
w = w.astype(np.float32)

self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids)
self.dataset = NumpyDataset(features, self.vals, ids=ids, w=w)
self.log.info("Using prefeaturized data; number of features = " + str(self.n_features))
return
Expand All @@ -404,6 +405,7 @@ def get_featurized_data(self, params=None):
self.log.debug("Number of features: " + str(self.n_features))

# Create the DeepChem dataset
self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids)
self.dataset = NumpyDataset(features, self.vals, ids=ids, w=w)
# Checking for minimum number of rows
if len(self.dataset) < params.min_compound_number:
Expand Down Expand Up @@ -681,7 +683,7 @@ def has_all_feature_columns(self, dset_df):

# *************************************************************************************

def get_subset_responses_and_weights(self, subset, transformers):
def get_subset_responses_and_weights(self, subset):
"""Returns a dictionary mapping compound IDs in the given dataset subset to arrays of response values
and weights. Used by the perf_data module under k-fold CV.
Expand All @@ -703,16 +705,33 @@ def get_subset_responses_and_weights(self, subset, transformers):
else:
raise ValueError('Unknown dataset subset type "%s"' % subset)

y = dc.trans.undo_transforms(dataset.y, transformers)
response_vals = dict()
dataset_ids = set(dataset.ids)
for id, y in zip(self.untransformed_dataset.ids, self.untransformed_dataset.y):
if id in dataset_ids:
response_vals[id] = y

w = dataset.w
response_vals = dict([(id, y[i,:]) for i, id in enumerate(dataset.ids)])
weights = dict([(id, w[i,:]) for i, id in enumerate(dataset.ids)])
self.subset_response_dict[subset] = response_vals
self.subset_weight_dict[subset] = weights
return self.subset_response_dict[subset], self.subset_weight_dict[subset]

# *************************************************************************************

def get_untransformed_responses(self, ids):
    """Return the response values stored in self.untransformed_dataset for the given IDs.

    Args:
        ids (sequence): Compound IDs to look up. Each one must be present in
            self.untransformed_dataset.ids.

    Returns:
        np.ndarray: Float array of shape (len(ids), n_cols), where n_cols is
        self.untransformed_dataset.y.shape[1]. Row i holds the response values
        for ids[i], so the output follows the order of `ids`, not the order of
        the dataset.

    Raises:
        KeyError: If any requested ID is not found in self.untransformed_dataset.ids.
    """
    untransformed = self.untransformed_dataset

    # Build the ID -> response-row lookup once so each requested ID is an O(1) fetch.
    # If an ID occurs more than once in the dataset, the last occurrence wins.
    response_by_id = dict(zip(untransformed.ids, untransformed.y))

    # Preallocate so the result keeps a (0, n_cols) shape even when `ids` is empty.
    response_vals = np.zeros((len(ids), untransformed.y.shape[1]))
    for row, compound_id in enumerate(ids):
        response_vals[row] = response_by_id[compound_id]

    return response_vals

# *************************************************************************************

def _get_split_key(self):
"""Creates the proper CSV name for a split file
Expand Down Expand Up @@ -828,6 +847,8 @@ def get_featurized_data(self, dset_df, is_featurized=False):
params, self.contains_responses)
self.log.warning("Done")
self.n_features = self.featurization.get_feature_count()

self.untransformed_dataset= NumpyDataset(features, self.vals, ids=ids)
self.dataset = NumpyDataset(features, self.vals, ids=ids)

# ****************************************************************************************
Expand Down
13 changes: 3 additions & 10 deletions atomsci/ddm/pipeline/model_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,13 +305,6 @@ def load_featurize_data(self, params=None):
else:
self.run_mode = ''

if self.run_mode == 'training':
for i, (train, valid) in enumerate(self.data.train_valid_dsets):
train = self.model_wrapper.transform_dataset(train)
valid = self.model_wrapper.transform_dataset(valid)
self.data.train_valid_dsets[i] = (train, valid)
self.data.test_dset = self.model_wrapper.transform_dataset(self.data.test_dset)

# ****************************************************************************************

def create_model_metadata(self):
Expand Down Expand Up @@ -864,7 +857,7 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
# Get features for each compound and construct a DeepChem Dataset from them
self.data.get_featurized_data(dset_df, is_featurized)
# Transform the features and responses if needed
self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset)
self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset, fold='final')

# Note that at this point, the dataset may contain fewer rows than the input. Typically this happens because
# of invalid SMILES strings. Remove any rows from the input dataframe corresponding to SMILES strings that were
Expand Down Expand Up @@ -995,7 +988,7 @@ def predict_embedding(self, dset_df, dset_params=None):
self.data = model_datasets.create_minimal_dataset(self.params, self.featurization)
self.data.get_featurized_data(dset_df, is_featurized=False)
# Not sure the following is necessary
self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset)
self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset, fold='final')

# Get the embeddings as a numpy array
embeddings = self.model_wrapper.generate_embeddings(self.data.dataset)
Expand Down Expand Up @@ -1577,7 +1570,7 @@ def ensemble_predict(model_uuids, collections, dset_df, labels=None, dset_params
raise Exception("response_cols missing from model params")
is_featurized = (len(set(pipe.featurization.get_feature_columns()) - set(dset_df.columns.values)) == 0)
pipe.data.get_featurized_data(dset_df, is_featurized)
pipe.data.dataset = pipe.model_wrapper.transform_dataset(pipe.data.dataset)
pipe.data.dataset = pipe.model_wrapper.transform_dataset(pipe.data.dataset, fold='final')

# Create a temporary data frame to hold the compound IDs and predictions. The model may not
# return predictions for all the requested compounds, so we have to outer join the predictions
Expand Down
Loading

0 comments on commit 18987d3

Please sign in to comment.