first commit

bioinf-jku · Mar 22, 2018 · 4f421ed · 4f421ed
commit 4f421ed
Show file tree

Hide file tree

Showing 8 changed files with 741,871 additions and 0 deletions.
diff --git a/FCD.py b/FCD.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+''' Defines the functions necessary for calculating the Frechet ChEMBLNet
+Distance (FCD) to evaluate generative models for molecules.
+
+The FCD metric calculates the distance between two distributions of molecules.
+Typically, we have summary statistics (mean & covariance matrix) of one
+of these distributions, while the 2nd distribution is given by the generative 
+model.
+
+The FCD is calculated by assuming that X_1 and X_2 are the activations of
+the penultimate layer of the CHEMBLNET for generated samples and real world
+samples respectively.
+'''
+
+from __future__ import absolute_import, division, print_function
+import numpy as np
+import os
+import gzip, pickle
+import tensorflow as tf
+from scipy.misc import imread
+from scipy import linalg
+import pathlib
+import urllib
+
+import keras
+import keras.backend as K
+from keras.models import load_model
+
+
+def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
+    """Numpy implementation of the Frechet Distance.
+
+    The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
+    and X_2 ~ N(mu_2, C_2) is
+            d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
+
+    Stable version by Dougal J. Sutherland.
+
+    Params:
+    -- mu1:    The mean of the activations of the penultimate layer of the
+               CHEMBLNET (activations as returned by the function
+               'get_predictions') for generated samples.
+    -- sigma1: The covariance matrix of the activations of the penultimate
+               layer of the CHEMBLNET for generated samples.
+    -- mu2:    The mean of the activations of the penultimate layer of the
+               CHEMBLNET for real samples.
+    -- sigma2: The covariance matrix of the activations of the penultimate
+               layer of the CHEMBLNET for real samples.
+    -- eps:    Value added to the diagonal of both covariance matrices when
+               the square root of their product has non-finite entries.
+
+    Returns:
+    --   : The Frechet Distance.
+
+    Raises:
+    --   AssertionError if the two means or the two covariances differ in shape.
+    --   ValueError if the diagonal of the matrix square root has an imaginary
+         part that is not close to zero (atol=1e-3).
+    """
+    # Imported locally: the fallback branch below calls warnings.warn, but the
+    # module itself never imports 'warnings' (this used to be a NameError).
+    import warnings
+
+    mu1 = np.atleast_1d(mu1)
+    mu2 = np.atleast_1d(mu2)
+
+    sigma1 = np.atleast_2d(sigma1)
+    sigma2 = np.atleast_2d(sigma2)
+
+    assert mu1.shape == mu2.shape, "Training and test mean vectors have different lengths"
+    assert sigma1.shape == sigma2.shape, "Training and test covariances have different dimensions"
+
+    diff = mu1 - mu2
+
+    # product might be almost singular
+    covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
+    if not np.isfinite(covmean).all():
+        msg = "fid calculation produces singular product; adding %s to diagonal of cov estimates" % eps
+        warnings.warn(msg)
+        # retry with a small ridge on both covariance estimates
+        offset = np.eye(sigma1.shape[0]) * eps
+        covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
+
+    # numerical error might give slight imaginary component
+    if np.iscomplexobj(covmean):
+        # only the diagonal is checked because only the trace is used below
+        if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
+            m = np.max(np.abs(covmean.imag))
+            raise ValueError("Imaginary component {}".format(m))
+        covmean = covmean.real
+
+    tr_covmean = np.trace(covmean)
+
+    return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean
+#-------------------------------------------------------------------------------
+
+def build_masked_loss(loss_function, mask_value):
+    """Wrap a loss function so that masked target entries are zeroed out.
+
+    Args:
+        loss_function: The loss to wrap; it is called as
+            loss_function(y_true, y_pred)
+        mask_value: The target value that marks an entry as masked
+
+    Returns:
+        function: a loss that multiplies both the targets and the predictions
+            by a 0/1 mask (0 where y_true equals mask_value) and then
+            delegates to loss_function
+    """
+
+    # Keep this inner name: get_predictions hands the loss to load_model under
+    # the key 'masked_loss_function'.
+    def masked_loss_function(y_true, y_pred):
+        keep = K.not_equal(y_true, mask_value)
+        weights = K.cast(keep, K.floatx())
+        masked_true = y_true * weights
+        masked_pred = y_pred * weights
+        return loss_function(masked_true, masked_pred)
+
+    return masked_loss_function
+#-------------------------------------------------------------------------------
+
+def masked_accuracy(y_true, y_pred):
+    """Accuracy computed over the non-masked targets.
+
+    Counts the positions where y_true equals the rounded prediction and
+    divides that count by the number of targets that differ from the mask
+    value 0.5.
+    """
+    mask_value = 0.5
+    hits = K.cast(K.equal(y_true, K.round(y_pred)), K.floatx())
+    valid = K.cast(K.not_equal(y_true, mask_value), K.floatx())
+    return K.sum(hits) / K.sum(valid)
+#-------------------------------------------------------------------------------
+
+def get_one_hot(smiles, pad_len=-1):
+    """One-hot encode a SMILES string symbol by symbol.
+
+    The string is read left to right. A character that is followed by 'r',
+    'i' or 'l' is read together with it as one two-character symbol (e.g.
+    'Br', 'Si', 'Cl'). Symbols that are not in the alphabet are encoded as
+    'X'. A '.' is appended to the input; encoding stops at the first '.'
+    reached after a symbol has been read, and a '.' terminator row is written
+    after the last encoded symbol.
+
+    Args:
+        smiles: The SMILES string to encode.
+        pad_len: Number of rows of the result. Negative (default): one row
+            per character plus one for the terminator. Positive: exactly
+            pad_len rows; the input is truncated so that the terminator still
+            fits into the last row, and unused rows stay all-zero. Zero: an
+            empty array with no rows is returned.
+
+    Returns:
+        2-D numpy array with one row per position and one column per alphabet
+        symbol (35 columns).
+    """
+    alphabet = ['C','N','O', 'H', 'F', 'Cl', 'P', 'B', 'Br', 'S', 'I', 'Si',
+                '#', '(', ')', '+', '-', '1', '2', '3', '4', '5', '6', '7', '8', '=', '[', ']', '@',
+                'c', 'n', 'o', 's', 'X', '.']
+    column = {sym: k for k, sym in enumerate(alphabet)}
+
+    smiles = smiles + '.'
+    n_rows = len(smiles) if pad_len < 0 else pad_len
+    vec = np.zeros((n_rows, len(alphabet)))
+    if n_rows == 0:
+        # pad_len == 0: there is no row to write to (used to raise IndexError)
+        return vec
+
+    truncate = pad_len > 0
+    i = 0
+    j = 0
+    # Nothing to encode for an empty input, and no room for a symbol when only
+    # the terminator row exists (both cases used to raise IndexError).
+    done = len(smiles) == 1 or (truncate and pad_len == 1)
+    while not done:
+        # the appended '.' guarantees that smiles[i + 1] exists here
+        if smiles[i + 1] in ('r', 'i', 'l'):
+            sym = smiles[i:i + 2]
+            i += 2
+        else:
+            sym = smiles[i]
+            i += 1
+        vec[j, column.get(sym, column['X'])] = 1
+        j += 1
+        # stop at the terminator, or when only the last padded row is left
+        done = smiles[i] == '.' or (truncate and j >= pad_len - 1)
+    vec[j, column['.']] = 1
+    return vec
+#-------------------------------------------------------------------------------
+
+def myGenerator_predict(smilesList, batch_size=128, pad_len=350):
+    """Endlessly yield batches of encoded SMILES strings for prediction.
+
+    Walks over smilesList in order in chunks of batch_size (the last chunk
+    may be smaller) and starts over after the last chunk, so the generator
+    never terminates by itself.
+
+    Args:
+        smilesList: Sequence of SMILES strings; must support len() and
+            integer indexing.
+        batch_size: Maximum number of SMILES strings per yielded batch.
+        pad_len: Forwarded to get_one_hot as its pad_len argument.
+
+    Yields:
+        numpy array stacking the get_one_hot encodings of one chunk, divided
+        by 35.
+
+    Raises:
+        ValueError: if smilesList is empty.
+    """
+    N = len(smilesList)
+    if N == 0:
+        # Without this guard the endless loop below would spin forever
+        # without ever yielding a batch.
+        raise ValueError("smilesList must contain at least one SMILES string")
+
+    n_batches = int(np.ceil(N / batch_size))
+    while True:
+        for b in range(n_batches):
+            start = b * batch_size
+            stop = min(start + batch_size, N)
+            batch = [get_one_hot(smilesList[k], pad_len=pad_len)
+                     for k in range(start, stop)]
+            # NOTE(review): 35 equals the number of symbols in get_one_hot's
+            # alphabet; presumably this scaling matches how the model was
+            # trained - confirm before changing it.
+            yield np.asarray(batch) / 35
+#-------------------------------------------------------------------------------
+
+def get_predictions(gen_mol, model_path='model_FCD.h5', batch_size=128):
+    """Compute CHEMBLNET activations for a collection of SMILES strings.
+
+    Loads the Keras model stored at model_path, removes its last two layers
+    and runs the truncated model over gen_mol.
+
+    Args:
+        gen_mol: Sequence of SMILES strings.
+        model_path: Path of the saved model. The default keeps the previously
+            hard-coded 'model_FCD.h5', which is resolved relative to the
+            current working directory.
+        batch_size: Number of molecules per prediction batch (previously
+            fixed at 128).
+
+    Returns:
+        The array returned by model.predict_generator for the truncated model.
+    """
+    masked_loss_function = build_masked_loss(K.binary_crossentropy, 0.5)
+    print('loading model')
+    model = load_model(model_path,
+                       custom_objects={'masked_loss_function': masked_loss_function,
+                                       'masked_accuracy': masked_accuracy})
+    # Drop the last two layers so that predict returns hidden activations
+    # instead of the model's final outputs (presumably the penultimate-layer
+    # activations described in the module docstring).
+    model.pop()
+    model.pop()
+    print('calculating activations')
+    # The generator is endless, so the step count is what limits prediction to
+    # one pass over gen_mol; np.ceil returns a float, hence the int().
+    steps = int(np.ceil(len(gen_mol) / batch_size))
+    gen_mol_act = model.predict_generator(myGenerator_predict(gen_mol, batch_size=batch_size),
+                                          steps=steps)
+    return gen_mol_act
+#-------------------------------------------------------------------------------                    
+
+
+
+
diff --git a/FCD_example.ipynb b/FCD_example.ipynb
@@ -0,0 +1,120 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import pickle\n",
+    "import os\n",
+    "\n",
+    "os.environ[\"CUDA_VISIBLE_DEVICES\"]= '1' #set gpu "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load Data and Functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/system/apps/biosoft/python-361/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
+      "  from ._conv import register_converters as _register_converters\n",
+      "Using TensorFlow backend.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load generated molecules\n",
+    "gen_mol_file = \"generated_smiles/JKUnet.smi\" #input file which contains one generated SMILES per line\n",
+    "gen_mol = pd.read_csv(gen_mol_file,header=None)[0].values[:5000] #take the first 5000 molecules (fewer if the file is shorter) \n",
+    "\n",
+    "# Load ChEMBL Stats\n",
+    "mu_chembl, cov_chembl = pickle.load(open(\"chembl_50k_stats.p\", 'rb')).values() #statistics of random 50,000 chembl molecules, which were not used for training\n",
+    "\n",
+    "#Load Auxiliary functions\n",
+    "exec(open('FCD.py').read()) #frechet distance, keras loss functions, data_generator, CHEMBLNET"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "## Calculation of Frechet CHEMBLNET DISTANCE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "loading model\n",
+      "calculating activations\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "2.565788716564988"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#get CHEMBLNET activations of generated molecules \n",
+    "gen_mol_act = get_predictions(gen_mol)\n",
+    "FCD = calculate_frechet_distance(mu1=np.mean(gen_mol_act, axis=0), mu2=mu_chembl, sigma1=np.cov(gen_mol_act.T), sigma2=cov_chembl)\n",
+    "FCD"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/README.md b/README.md
@@ -0,0 +1,26 @@
+# Fréchet ChEMBLNet Distance
+
+The new wave of successful generative models in machine learning has increased
+the interest in deep learning driven de novo drug design. However, assessing
+the performance of such generative models is notoriously difficult. Metrics that
+are typically used to assess the performance of such generative models are the
+percentage of chemically valid molecules or the similarity to real molecules in
+terms of particular descriptors, such as the partition coefficient (logP) or druglikeness.
+However, method comparison is difficult because of the inconsistent use of
+evaluation metrics, the necessity for multiple metrics, and the fact that some of
+these measures can easily be tricked by simple rule-based systems. We propose a
+novel distance measure between two sets of molecules, called Fréchet ChemblNet
+distance (FCD), that can be used as evaluation metric for generative models. The
+FCD is similar to a recently established performance metric for comparing image
+generation methods, the Fréchet Inception Distance (FID). Whereas the FID uses
+one of the hidden layers of InceptionNet, the FCD utilizes the penultimate layer
+of a deep neural network called “ChemblNet”, which was trained to predict drug
+activities. Thus, the FCD metric takes into account chemically and biologically
+relevant information about molecules, and also measures the diversity of the set
+via the distribution of generated molecules. The FCD’s advantage over previous
+metrics is that it can detect whether generated molecules a) are diverse and have
+b) chemical and c) biological properties similar to those of real molecules. We further provide an
+easy-to-use implementation that only requires the SMILES representation of the
+generated molecules as input to calculate the FCD.
+
+
diff --git a/chembl_50k_stats.p b/chembl_50k_stats.p