Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
KristinaPreuer committed Mar 22, 2018
0 parents commit 4f421ed
Show file tree
Hide file tree
Showing 8 changed files with 741,871 additions and 0 deletions.
177 changes: 177 additions & 0 deletions FCD.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
#!/usr/bin/env python3
''' Defines the functions necessary for calculating the Frechet ChEMBLNet
Distance (FCD) to evaluate generative models for molecules.
The FCD metric calculates the distance between two distributions of molecules.
Typically, we have summary statistics (mean & covariance matrix) of one
of these distributions, while the 2nd distribution is given by the generative
model.
The FCD is calculated by assuming that X_1 and X_2 are the activations of
the penultimate layer of the CHEMBLNET for generated samples and real world
samples respectively.
'''

from __future__ import absolute_import, division, print_function

import gzip, pickle
import os
import pathlib
import urllib
import warnings

import numpy as np
import tensorflow as tf
from scipy import linalg
from scipy.misc import imread

import keras
import keras.backend as K
from keras.models import load_model


def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
    """Numpy implementation of the Frechet Distance.

    The Frechet distance between two multivariate Gaussians
    X_1 ~ N(mu_1, C_1) and X_2 ~ N(mu_2, C_2) is

        d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).

    Stable version by Dougal J. Sutherland.

    Params:
    -- mu1   : Mean of the penultimate-layer CHEMBLNET activations (as
               returned by 'get_predictions') for generated samples.
    -- sigma1: Covariance matrix of those activations for generated samples.
    -- mu2   : Mean of the penultimate-layer CHEMBLNET activations for
               real samples.
    -- sigma2: Covariance matrix of those activations for real samples.
    -- eps   : Value added to the diagonal of both covariance matrices when
               the matrix square root of their product is not finite.

    Returns:
    --   : The Frechet Distance.

    Raises:
    -- AssertionError: if the means or the covariances differ in shape.
    -- ValueError    : if the matrix square root has a diagonal imaginary
                       part larger than 1e-3.
    """
    mu1 = np.atleast_1d(mu1)
    mu2 = np.atleast_1d(mu2)

    sigma1 = np.atleast_2d(sigma1)
    sigma2 = np.atleast_2d(sigma2)

    assert mu1.shape == mu2.shape, "Training and test mean vectors have different lengths"
    assert sigma1.shape == sigma2.shape, "Training and test covariances have different dimensions"

    diff = mu1 - mu2

    # Product might be almost singular.  The call uses sqrtm's default
    # arguments (the `disp` keyword is deprecated in recent SciPy releases),
    # so it returns only the matrix, exactly like the fallback call below.
    covmean = linalg.sqrtm(sigma1.dot(sigma2))
    if not np.isfinite(covmean).all():
        msg = "fid calculation produces singular product; adding %s to diagonal of cov estimates" % eps
        warnings.warn(msg)
        offset = np.eye(sigma1.shape[0]) * eps
        covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))

    # Numerical error might give a slight imaginary component; tolerate it
    # only when the diagonal (the part that enters the trace) is ~real.
    if np.iscomplexobj(covmean):
        if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
            m = np.max(np.abs(covmean.imag))
            raise ValueError("Imaginary component {}".format(m))
        covmean = covmean.real

    tr_covmean = np.trace(covmean)

    return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean
#-------------------------------------------------------------------------------

def build_masked_loss(loss_function, mask_value):
    """Wrap a loss so that targets equal to ``mask_value`` are masked out.

    Args:
        loss_function: The loss function to mask; called as
            ``loss_function(y_true, y_pred)``.
        mask_value: The target value that marks an entry to be masked.

    Returns:
        function: a loss function that multiplies both targets and
        predictions by a 0/1 mask before delegating to ``loss_function``.
    """

    # The inner name is kept as-is: the model loader in this file registers
    # the wrapper under the key 'masked_loss_function'.
    def masked_loss_function(y_true, y_pred):
        is_labelled = K.not_equal(y_true, mask_value)
        weights = K.cast(is_labelled, K.floatx())
        masked_true = y_true * weights
        masked_pred = y_pred * weights
        return loss_function(masked_true, masked_pred)

    return masked_loss_function
#-------------------------------------------------------------------------------

def masked_accuracy(y_true, y_pred):
    """Accuracy over the entries of ``y_true`` that are not masked.

    Targets equal to ``mask_value`` (0.5) are treated as masked.  The
    numerator counts positions where the target equals the rounded
    prediction; the denominator counts targets different from the mask value.

    Args:
        y_true: Target tensor.
        y_pred: Prediction tensor.

    Returns:
        Backend scalar tensor: matching entries divided by unmasked entries.
        NOTE(review): no guard against a batch with zero unmasked targets,
        in which case the denominator is zero.
    """
    mask_value = 0.5
    n_correct = K.sum(K.cast(K.equal(y_true, K.round(y_pred)), K.floatx()))
    n_labelled = K.sum(K.cast(K.not_equal(y_true, mask_value), K.floatx()))
    return n_correct / n_labelled
#-------------------------------------------------------------------------------

def get_one_hot(smiles, pad_len=-1):
    """One-hot encode a SMILES string over a fixed 35-symbol vocabulary.

    The string is read left to right.  A character followed by 'r', 'i' or
    'l' is read together with it as one two-character symbol (e.g. 'Cl',
    'Br', 'Si').  Symbols outside the vocabulary are encoded as 'X'.  A '.'
    end-of-sequence marker is written after the last encoded symbol;
    encoding stops at the first '.' found after at least one symbol has
    been read, so input after an embedded '.' is not encoded.

    Args:
        smiles: SMILES string to encode.
        pad_len: Number of rows of the result.  If negative, the result has
            ``len(smiles) + 1`` rows.  Otherwise the result has exactly
            ``pad_len`` rows; longer inputs are truncated so that the last
            row holds the '.' marker.

    Returns:
        numpy.ndarray of shape ``(rows, 35)`` with at most one 1 per row;
        unused trailing rows are all zero.
    """
    vocabulary = ['C', 'N', 'O', 'H', 'F', 'Cl', 'P', 'B', 'Br', 'S', 'I', 'Si',
                  '#', '(', ')', '+', '-', '1', '2', '3', '4', '5', '6', '7', '8',
                  '=', '[', ']', '@', 'c', 'n', 'o', 's', 'X', '.']
    column_of = {symbol: col for col, symbol in enumerate(vocabulary)}
    unknown_col = column_of['X']
    end_col = column_of['.']

    smiles = smiles + '.'
    n_rows = len(smiles) if pad_len < 0 else pad_len
    vec = np.zeros((n_rows, len(vocabulary)))

    # Degenerate inputs that previously raised IndexError.
    if n_rows == 0:
        return vec
    if len(smiles) == 1 or n_rows == 1:
        # Empty SMILES, or room for a single row: only the end marker fits.
        vec[0, end_col] = 1
        return vec

    i = 0  # read position in `smiles`
    j = 0  # next row of `vec` to fill
    while True:
        # smiles[i + 1] is always valid here: the appended '.' is the last
        # character and the loop stops as soon as position i reaches a '.'.
        if smiles[i + 1] in ('r', 'i', 'l'):
            sym = smiles[i:i + 2]
            i += 2
        else:
            sym = smiles[i]
            i += 1
        vec[j, column_of.get(sym, unknown_col)] = 1
        j += 1
        if smiles[i] == '.' or (pad_len > 0 and j >= pad_len - 1):
            vec[j, end_col] = 1
            break
    return vec
#-------------------------------------------------------------------------------

def myGenerator_predict(smilesList, batch_size=128, pad_len=350):
    """Yield batches of one-hot encoded SMILES, cycling over the list forever.

    Each SMILES string is encoded with ``get_one_hot(..., pad_len=pad_len)``;
    the encodings of one batch are stacked and divided by 35 before being
    yielded.  After the last batch the generator starts again from the first
    sample.

    Args:
        smilesList: Indexable sequence of SMILES strings.
        batch_size: Maximum number of samples per yielded batch.
        pad_len: Number of rows of every encoded sample.

    Yields:
        numpy.ndarray of shape ``(n, pad_len, 35)`` with ``n <= batch_size``.

    An empty ``smilesList`` ends the generator immediately (previously it
    looped forever without ever yielding).
    """
    while True:
        n_samples = len(smilesList)
        if n_samples == 0:
            # Nothing to encode; stop instead of spinning in `while True`.
            return

        n_batches = int(np.ceil(n_samples / batch_size))
        for batch_no in range(n_batches):
            start = batch_no * batch_size
            stop = min(start + batch_size, n_samples)
            encoded = [get_one_hot(smilesList[k], pad_len=pad_len)
                       for k in range(start, stop)]
            # NOTE(review): the constant 35 equals the vocabulary size used
            # by get_one_hot; presumably this scaling matches the model's
            # training preprocessing -- confirm before changing.
            yield np.asarray(encoded) / 35
#-------------------------------------------------------------------------------

def get_predictions(gen_mol):
    """Compute CHEMBLNET activations for a collection of SMILES strings.

    Loads the Keras model stored in 'model_FCD.h5', removes its last two
    layers with ``model.pop()`` and runs the remaining network over
    ``gen_mol`` in batches of 128.

    Args:
        gen_mol: Sequence of SMILES strings.

    Returns:
        The array returned by ``model.predict_generator`` for the truncated
        model.
    """
    batch_size = 128

    # Custom objects the saved model refers to by name.
    custom_objects = {
        'masked_loss_function': build_masked_loss(K.binary_crossentropy, 0.5),
        'masked_accuracy': masked_accuracy,
    }

    print('loading model')
    model = load_model('model_FCD.h5', custom_objects=custom_objects)

    # Drop the last two layers so that predictions are hidden activations.
    for _ in range(2):
        model.pop()

    print('calculating activations')
    batches = myGenerator_predict(gen_mol, batch_size=batch_size)
    n_steps = np.ceil(len(gen_mol) / batch_size)
    return model.predict_generator(batches, steps=n_steps)
#-------------------------------------------------------------------------------




120 changes: 120 additions & 0 deletions FCD_example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import pickle\n",
"import os\n",
"\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"]= '1' #set gpu "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Data and Functions"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/system/apps/biosoft/python-361/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
" from ._conv import register_converters as _register_converters\n",
"Using TensorFlow backend.\n"
]
}
],
"source": [
"# Load generated molecules\n",
"gen_mol_file = \"generated_smiles/JKUnet.smi\" #input file which contains one generated SMILES per line\n",
"gen_mol = pd.read_csv(gen_mol_file,header=None)[0].values[:5000] #keep the first 5000 molecules (at most) \n",
"\n",
"# Load ChEMBL Stats\n",
"mu_chembl, cov_chembl = pickle.load(open(\"chembl_50k_stats.p\", 'rb')).values() #statistics of random 50,000 chembl molecules, which were not used for training\n",
"\n",
"#Load Auxiliary functions\n",
"exec(open('FCD.py').read()) #frechet distance, keras loss functions, data_generator, CHEMBLNET"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Calculation of Frechet CHEMBLNET DISTANCE"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading model\n",
"calculating activations\n"
]
},
{
"data": {
"text/plain": [
"2.565788716564988"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#get CHEMBLNET activations of generated molecules \n",
"gen_mol_act = get_predictions(gen_mol)\n",
"FCD = calculate_frechet_distance(mu1=np.mean(gen_mol_act, axis=0), mu2=mu_chembl, sigma1=np.cov(gen_mol_act.T), sigma2=cov_chembl)\n",
"FCD"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Fréchet ChEMBLNet Distance

The new wave of successful generative models in machine learning has increased
the interest in deep learning driven de novo drug design. However, assessing
the performance of such generative models is notoriously difficult. Metrics that
are typically used to assess the performance of such generative models are the
percentage of chemically valid molecules or the similarity to real molecules in
terms of particular descriptors, such as the partition coefficient (logP) or druglikeness.
However, method comparison is difficult because of the inconsistent use of
evaluation metrics, the necessity for multiple metrics, and the fact that some of
these measures can easily be tricked by simple rule-based systems. We propose a
novel distance measure between two sets of molecules, called Fréchet ChemblNet
distance (FCD), that can be used as evaluation metric for generative models. The
FCD is similar to a recently established performance metric for comparing image
generation methods, the Fréchet Inception Distance (FID). Whereas the FID uses
one of the hidden layers of InceptionNet, the FCD utilizes the penultimate layer
of a deep neural network called “ChemblNet”, which was trained to predict drug
activities. Thus, the FCD metric takes into account chemically and biologically
relevant information about molecules, and also measures the diversity of the set
via the distribution of generated molecules. The FCD’s advantage over previous
metrics is that it can detect whether generated molecules are a) diverse and have
b) chemical and c) biological properties similar to those of real molecules. We further provide an
easy-to-use implementation that only requires the SMILES representation of the
generated molecules as input to calculate the FCD.


Binary file added chembl_50k_stats.p
Binary file not shown.
Loading

0 comments on commit 4f421ed

Please sign in to comment.