-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 4f421ed
Showing
8 changed files
with
741,871 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,177 @@ | ||
#!/usr/bin/env python3 | ||
''' Defines the functions necessary for calculating the Frechet ChemblNet | ||
Distance (FCD) to evaluate generative models for molecules. | ||
The FCD metric calculates the distance between two distributions of molecules. | ||
Typically, we have summary statistics (mean & covariance matrix) of one | ||
of these distributions, while the 2nd distribution is given by the generative | ||
model. | ||
The FCD is calculated by assuming that X_1 and X_2 are the activations of | ||
the penultimate layer of the CHEMBLNET for generated samples and real world | ||
samples respectively. | ||
''' | ||
|
||
from __future__ import absolute_import, division, print_function | ||
import numpy as np | ||
import os | ||
import gzip, pickle | ||
import tensorflow as tf | ||
from scipy.misc import imread | ||
from scipy import linalg | ||
import pathlib | ||
import urllib | ||
|
||
import keras | ||
import keras.backend as K | ||
from keras.models import load_model | ||
|
||
|
||
def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
    """Numpy implementation of the Frechet Distance.

    The Frechet distance between two multivariate Gaussians
    X_1 ~ N(mu_1, C_1) and X_2 ~ N(mu_2, C_2) is

        d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).

    Stable version by Dougal J. Sutherland.

    Params:
    -- mu1   : The mean of the activations of the penultimate layer of the
               CHEMBLNET (like returned by the function 'get_predictions')
               for generated samples.
    -- mu2   : The mean of the activations of the penultimate layer of the
               CHEMBLNET (like returned by the function 'get_predictions')
               for real samples.
    -- sigma1: The covariance matrix of the activations of the penultimate
               layer of the CHEMBLNET for generated samples.
    -- sigma2: The covariance matrix of the activations of the penultimate
               layer of the CHEMBLNET for real samples.
    -- eps   : Value added to the diagonal of both covariance matrices when
               the square root of their product contains non-finite entries.

    Returns:
    --   : The Frechet Distance.

    Raises:
    -- AssertionError: if the two means or the two covariances differ in shape.
    -- ValueError    : if the matrix square root has a diagonal whose
                       imaginary part exceeds the 1e-3 tolerance.
    """
    # Imported locally: the fallback branch below needs `warnings`, which the
    # module-level import block does not provide.
    import warnings

    mu1 = np.atleast_1d(mu1)
    mu2 = np.atleast_1d(mu2)

    sigma1 = np.atleast_2d(sigma1)
    sigma2 = np.atleast_2d(sigma2)

    assert mu1.shape == mu2.shape, "Training and test mean vectors have different lengths"
    assert sigma1.shape == sigma2.shape, "Training and test covariances have different dimensions"

    diff = mu1 - mu2

    # product might be almost singular
    product = sigma1.dot(sigma2)
    try:
        covmean, _ = linalg.sqrtm(product, disp=False)
    except TypeError:
        # NOTE(review): `disp` is deprecated in recent SciPy releases; if the
        # installed version rejects it, the plain call returns only the matrix.
        covmean = linalg.sqrtm(product)

    if not np.isfinite(covmean).all():
        msg = "fid calculation produces singular product; adding %s to diagonal of cov estimates" % eps
        warnings.warn(msg)
        # Regularise both covariances and retry the square root.
        offset = np.eye(sigma1.shape[0]) * eps
        covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))

    # numerical error might give slight imaginary component
    if np.iscomplexobj(covmean):
        if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
            m = np.max(np.abs(covmean.imag))
            raise ValueError("Imaginary component {}".format(m))
        covmean = covmean.real

    tr_covmean = np.trace(covmean)

    return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean
#------------------------------------------------------------------------------- | ||
|
||
def build_masked_loss(loss_function, mask_value):
    """Wrap a loss so that masked targets are zeroed before it is evaluated.

    Args:
        loss_function: Callable taking (y_true, y_pred) and returning a loss.
        mask_value: Target value that marks an entry as masked.

    Returns:
        function: A loss with the same call signature as `loss_function`.
        Positions where `y_true` equals `mask_value` are multiplied by zero
        in both the targets and the predictions before `loss_function` runs.
    """

    # The inner name is kept as-is: the model loader in this file looks the
    # function up under the key 'masked_loss_function'.
    def masked_loss_function(y_true, y_pred):
        is_labelled = K.not_equal(y_true, mask_value)
        keep = K.cast(is_labelled, K.floatx())
        masked_true = y_true * keep
        masked_pred = y_pred * keep
        return loss_function(masked_true, masked_pred)

    return masked_loss_function
#------------------------------------------------------------------------------- | ||
|
||
def masked_accuracy(y_true, y_pred):
    """Accuracy over the target entries that are not masked.

    An entry counts as correct when `y_true` equals the rounded prediction.
    Entries whose target equals the mask value (0.5) are excluded from the
    denominator; they can never match a rounded prediction, so they do not
    contribute to the numerator either.

    Args:
        y_true: Target tensor; masked entries hold 0.5.
        y_pred: Prediction tensor, rounded before comparison.

    Returns:
        Scalar tensor: number of correct entries divided by the number of
        unmasked entries. A fully masked batch yields 0 rather than NaN.
    """
    mask_value = 0.5
    n_correct = K.sum(K.cast(K.equal(y_true, K.round(y_pred)), K.floatx()))
    n_labelled = K.sum(K.cast(K.not_equal(y_true, mask_value), K.floatx()))
    # Counts are whole numbers, so the epsilon floor only takes effect when
    # every entry is masked (0 / 0 in the original).
    return n_correct / K.maximum(n_labelled, K.epsilon())
#------------------------------------------------------------------------------- | ||
|
||
def get_one_hot(smiles, pad_len=-1):
    """One-hot encode a SMILES string over a fixed 35-symbol alphabet.

    The string is read left to right. A character followed by 'r', 'i' or 'l'
    is read together with it as a two-character symbol (e.g. 'Br', 'Si',
    'Cl'). Symbols outside the alphabet are encoded as 'X'. A '.' end token
    is written after the last encoded symbol; encoding stops at the first
    '.' found after the first symbol.

    Args:
        smiles: SMILES string to encode.
        pad_len: Number of rows in the result. If negative, the result has
            len(smiles) + 1 rows. If positive, at most pad_len - 1 symbols
            are encoded and the end token goes in the following row.

    Returns:
        numpy array of shape (rows, 35); each used row has a single 1.
        Rows after the end token stay all-zero.
    """
    alphabet = ['C', 'N', 'O', 'H', 'F', 'Cl', 'P', 'B', 'Br', 'S', 'I', 'Si',
                '#', '(', ')', '+', '-', '1', '2', '3', '4', '5', '6', '7', '8',
                '=', '[', ']', '@', 'c', 'n', 'o', 's', 'X', '.']
    column_of = {symbol: k for k, symbol in enumerate(alphabet)}
    unknown_col = column_of['X']
    end_col = column_of['.']

    smiles = smiles + '.'
    n_rows = len(smiles) if pad_len < 0 else pad_len
    vec = np.zeros((n_rows, len(alphabet)))

    # Edge cases the loop below cannot handle: no rows at all, or room for
    # nothing but the end token (empty input, or pad_len == 1).
    if n_rows == 0:
        return vec
    if len(smiles) == 1 or n_rows == 1:
        vec[0, end_col] = 1
        return vec

    pos = 0  # read position in `smiles`
    row = 0  # next row of `vec` to fill
    while True:
        # The first symbol is always consumed before the terminator check,
        # so a leading '.' is encoded as a symbol rather than ending the loop.
        if smiles[pos + 1] in ('r', 'i', 'l'):
            symbol = smiles[pos:pos + 2]
            pos += 2
        else:
            symbol = smiles[pos]
            pos += 1
        vec[row, column_of.get(symbol, unknown_col)] = 1
        row += 1
        if smiles[pos] == '.' or (pad_len > 0 and row >= pad_len - 1):
            vec[row, end_col] = 1
            return vec
#------------------------------------------------------------------------------- | ||
|
||
def myGenerator_predict(smilesList, batch_size=128, pad_len=350):
    """Yield batches of encoded SMILES, cycling over the input indefinitely.

    Each SMILES is encoded with `get_one_hot(..., pad_len=pad_len)`; the
    encodings of one batch are stacked into an array and divided by 35.
    After the last batch the generator starts again from the first sample.

    Args:
        smilesList: Indexable collection of SMILES strings.
        batch_size: Maximum number of samples per yielded batch; the last
            batch of a pass may be smaller.
        pad_len: Row count passed to `get_one_hot` for every sample.

    Yields:
        numpy array with one encoded sample per entry along the first axis.
        An empty `smilesList` ends the generator without yielding.
    """
    while True:
        n_samples = len(smilesList)
        if n_samples == 0:
            # Without this guard the outer loop would spin forever and
            # never yield.
            return

        for start in range(0, n_samples, batch_size):
            stop = min(start + batch_size, n_samples)
            encoded = [get_one_hot(smilesList[k], pad_len=pad_len)
                       for k in range(start, stop)]
            yield np.asarray(encoded) / 35
#------------------------------------------------------------------------------- | ||
|
||
def get_predictions(gen_mol, model_path='model_FCD.h5', batch_size=128):
    """Run the saved model on SMILES strings and return its activations.

    The model is loaded from `model_path`, its last two layers are removed
    with `model.pop()`, and the remaining network is evaluated on batches
    produced by `myGenerator_predict`.

    Args:
        gen_mol: Indexable collection of SMILES strings.
        model_path: Path of the saved Keras model file.
        batch_size: Number of molecules per prediction batch.

    Returns:
        The array returned by `model.predict_generator` for all of `gen_mol`.
        NOTE(review): presumably the penultimate-layer activations that the
        module docstring refers to; confirm against the saved architecture.
    """
    # The saved model references these two custom objects by name, so they
    # must be supplied when it is deserialised.
    masked_loss_function = build_masked_loss(K.binary_crossentropy, 0.5)
    print('loading model')
    model = load_model(
        model_path,
        custom_objects={
            'masked_loss_function': masked_loss_function,
            'masked_accuracy': masked_accuracy,
        },
    )
    model.pop()
    model.pop()
    print('calculating activations')
    # One step per batch; the count is an int and uses the same batch size
    # as the generator, so the two cannot drift apart.
    n_steps = int(np.ceil(len(gen_mol) / batch_size))
    gen_mol_act = model.predict_generator(
        myGenerator_predict(gen_mol, batch_size=batch_size),
        steps=n_steps,
    )
    return gen_mol_act
#------------------------------------------------------------------------------- | ||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import numpy as np\n", | ||
"import pandas as pd\n", | ||
"import pickle\n", | ||
"import os\n", | ||
"\n", | ||
"os.environ[\"CUDA_VISIBLE_DEVICES\"]= '1' #set gpu " | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Load Data and Functions" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"/system/apps/biosoft/python-361/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", | ||
" from ._conv import register_converters as _register_converters\n", | ||
"Using TensorFlow backend.\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Load generated molecules\n", | ||
"gen_mol_file = \"generated_smiles/JKUnet.smi\" #input file which contains one generated SMILES per line\n", | ||
"gen_mol = pd.read_csv(gen_mol_file,header=None)[0].values[:5000] #keep only the first 5000 molecules \n", | ||
"\n", | ||
"# Load ChEMBL Stats\n", | ||
"mu_chembl, cov_chembl = pickle.load(open(\"chembl_50k_stats.p\", 'rb')).values() #statistics of random 50,000 chembl molecules, which were not used for training\n", | ||
"\n", | ||
"#Load Auxiliary functions\n", | ||
"exec(open('FCD.py').read()) #frechet distance, keras loss functions, data_generator, CHEMBLNET" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"source": [ | ||
"## Calculation of Frechet CHEMBLNET DISTANCE" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"loading model\n", | ||
"calculating activations\n" | ||
] | ||
}, | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"2.565788716564988" | ||
] | ||
}, | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"#get CHEMBLNET activations of generated molecules \n", | ||
"gen_mol_act = get_predictions(gen_mol)\n", | ||
"FCD = calculate_frechet_distance(mu1=np.mean(gen_mol_act, axis=0), mu2=mu_chembl, sigma1=np.cov(gen_mol_act.T), sigma2=cov_chembl)\n", | ||
"FCD" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.1" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 1 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# Fréchet ChEMBLNet Distance | ||
|
||
The new wave of successful generative models in machine learning has increased | ||
the interest in deep learning driven de novo drug design. However, assessing | ||
the performance of such generative models is notoriously difficult. Metrics that | ||
are typically used to assess the performance of such generative models are the | ||
percentage of chemically valid molecules or the similarity to real molecules in | ||
terms of particular descriptors, such as the partition coefficient (logP) or druglikeness. | ||
However, method comparison is difficult because of the inconsistent use of | ||
evaluation metrics, the necessity for multiple metrics, and the fact that some of | ||
these measures can easily be tricked by simple rule-based systems. We propose a | ||
novel distance measure between two sets of molecules, called the Fréchet ChemblNet | ||
distance (FCD), that can be used as an evaluation metric for generative models. The | ||
FCD is similar to a recently established performance metric for comparing image | ||
generation methods, the Fréchet Inception Distance (FID). Whereas the FID uses | ||
one of the hidden layers of InceptionNet, the FCD utilizes the penultimate layer | ||
of a deep neural network called “ChemblNet”, which was trained to predict drug | ||
activities. Thus, the FCD metric takes into account chemically and biologically | ||
relevant information about molecules, and also measures the diversity of the set | ||
via the distribution of generated molecules. The FCD’s advantage over previous | ||
metrics is that it can detect whether generated molecules a) are diverse and have | ||
b) chemical and c) biological properties similar to those of real molecules. We further provide an | ||
easy-to-use implementation that only requires the SMILES representation of the | ||
generated molecules as input to calculate the FCD. | ||
|
||
|
Binary file not shown.
Oops, something went wrong.