diff --git a/.gitignore b/.gitignore index 49796c6..9e3cb5c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,132 @@ -**/__pycache__ -**/.vscode \ No newline at end of file +# adapted from https://github.com/fastai/nbdev + +.vscode +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/.ipynb_checkpoints/ChEMBL-LSTM-predict-FCD-github-checkpoint.ipynb b/.ipynb_checkpoints/ChEMBL-LSTM-predict-FCD-github-checkpoint.ipynb deleted file mode 100644 index c8650c8..0000000 --- a/.ipynb_checkpoints/ChEMBL-LSTM-predict-FCD-github-checkpoint.ipynb +++ /dev/null @@ -1,209 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "# Imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/system/apps/biosoft/python-361/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", - " from ._conv import register_converters as _register_converters\n", - "Using TensorFlow backend.\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import warnings\n", - "import pickle\n", - "\n", - "import keras\n", - "import keras.backend as K\n", - "from keras.models import load_model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Load generated molecules" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/system/apps/biosoft/python-361/lib/python3.6/site-packages/ipykernel_launcher.py:4: UserWarning: Too many generated samples. FCD is calculated for 50,000 randomly drawn generated samples.\n", - " after removing the cwd from sys.path.\n" - ] - } - ], - "source": [ - "gen_mol_file = \"LSTM-generated_molecules.smiles\" #input file which contains one generated SMILES per line\n", - "gen_mol = pd.read_csv(gen_mol_file,header=None)[0].values\n", - "if gen_mol.shape[0]>50000:\n", - " warnings.warn('Too many generated samples. FCD is calculated for 50,000 randomly drawn generated samples.')\n", - " gen_mol = np.random.choice(gen_mol, 50000, replace=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Load input data & set parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "model_file = \"/publicwork/klambaue/LSTMdescriptors/models2/model_FCD.h5\" #chembl network \n", - "chembl_net_act = pickle.load(open(\"chembl_activations.p\", 'rb')) #activations of random 50,000 chembl molecules, which were not used for training\n", - "batch_size = 128 #batch size for generating activations of generated SMILES" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "## Auxiliary functions" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "exec(open('aux.py').read()) #frechet distance, keras loss functions, data_generator" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "# Calculation activations for generated molecules\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "masked_loss_function = build_masked_loss(K.binary_crossentropy,0.5)\n", - "model = load_model(model_file, \n", - " custom_objects={'masked_loss_function': masked_loss_function,'masked_accuracy':masked_accuracy})\n", - "model.pop()\n", - "model.pop()\n", - "\n", - "gen_mol_act = model.predict_generator(myGenerator_predict(gen_mol, one_hot=one_hot, batch_size=batch_size),\n", - " steps= np.ceil(gen_mol.shape[0]/batch_size))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Calculation of Frechet ChEMBL Distance (FCD)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "FCD = calculate_frechet_distance(mu1=np.mean(gen_mol_act,axis=0), mu2=np.mean(chembl_net_act,axis=0), \n", - " sigma1=np.cov(gen_mol_act.T, bias=True), sigma2=np.cov(chembl_net_act.T, bias=True))\n", - "FCD" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/FCD.egg-info/PKG-INFO b/FCD.egg-info/PKG-INFO deleted file mode 100644 index 87e9bfa..0000000 --- a/FCD.egg-info/PKG-INFO +++ /dev/null @@ -1,37 +0,0 @@ -Metadata-Version: 2.1 -Name: FCD -Version: 1.0 -Summary: Fréchet ChEMNet Distance -Home-page: https://github.com/bioinf-jku/FCD -Author: -Author-email: -License: LGPLv3 -Description: # Fréchet ChemNet Distance - - The new wave of successful generative models in machine learning has increased - the interest in deep learning driven de novo drug design. However, assessing - the performance of such generative models is notoriously difficult. Metrics that - are typically used to assess the performance of such generative models are the - percentage of chemically valid molecules or the similarity to real molecules in - terms of particular descriptors, such as the partition coefficient (logP) or druglike- - ness. However, method comparison is difficult because of the inconsistent use of - evaluation metrics, the necessity for multiple metrics, and the fact that some of - these measures can easily be tricked by simple rule-based systems. We propose a - novel distance measure between two sets of molecules, called Fréchet ChemNet - distance (FCD), that can be used as an evaluation metric for generative models. The - FCD is similar to a recently established performance metric for comparing image - generation methods, the Fréchet Inception Distance (FID). Whereas the FID uses - one of the hidden layers of InceptionNet, the FCD utilizes the penultimate layer - of a deep neural network called “ChemNet”, which was trained to predict drug - activities. Thus, the FCD metric takes into account chemically and biologically - relevant information about molecules, and also measures the diversity of the set - via the distribution of generated molecules. The FCD’s advantage over previous - metrics is that it can detect if generated molecules are a) diverse and have similar - b) chemical and c) biological properties as real molecules. We further provide an - easy-to-use implementation that only requires the SMILES representation of the - generated molecules as input to calculate the FCD. - - -Platform: UNKNOWN -Description-Content-Type: text/markdown -Provides-Extra: rdkit diff --git a/FCD.egg-info/SOURCES.txt b/FCD.egg-info/SOURCES.txt deleted file mode 100644 index 220ab9e..0000000 --- a/FCD.egg-info/SOURCES.txt +++ /dev/null @@ -1,11 +0,0 @@ -MANIFEST.in -README.md -setup.py -FCD.egg-info/PKG-INFO -FCD.egg-info/SOURCES.txt -FCD.egg-info/dependency_links.txt -FCD.egg-info/requires.txt -FCD.egg-info/top_level.txt -fcd/ChemNet_v0.13_pretrained.h5 -fcd/FCD.py -fcd/__init__.py \ No newline at end of file diff --git a/FCD.egg-info/dependency_links.txt b/FCD.egg-info/dependency_links.txt deleted file mode 100644 index 8b13789..0000000 --- a/FCD.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/FCD.egg-info/requires.txt b/FCD.egg-info/requires.txt deleted file mode 100644 index d583182..0000000 --- a/FCD.egg-info/requires.txt +++ /dev/null @@ -1,7 +0,0 @@ -keras -numpy -scipy -tensorflow - -[rdkit] -rdkit diff --git a/FCD.egg-info/top_level.txt b/FCD.egg-info/top_level.txt deleted file mode 100644 index ec00c4c..0000000 --- a/FCD.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -fcd diff --git a/build/lib/fcd/ChemNet_v0.13_pretrained.h5 b/build/lib/fcd/ChemNet_v0.13_pretrained.h5 deleted file mode 100644 index e41f2a3..0000000 Binary files a/build/lib/fcd/ChemNet_v0.13_pretrained.h5 and /dev/null differ diff --git a/build/lib/fcd/FCD.py b/build/lib/fcd/FCD.py deleted file mode 100644 index 90729f3..0000000 --- a/build/lib/fcd/FCD.py +++ /dev/null @@ -1,191 +0,0 @@ -#!/usr/bin/env python3 -''' Defines the functions necessary for calculating the Frechet ChemNet -Distance (FCD) to evalulate generative models for molecules. - -The FCD metric calculates the distance between two distributions of molecules. -Typically, we have summary statistics (mean & covariance matrix) of one -of these distributions, while the 2nd distribution is given by the generative -model. - -The FCD is calculated by assuming that X_1 and X_2 are the activations of -the preulitmate layer of the CHEMNET for generated samples and real world -samples respectivly. -''' - -from __future__ import absolute_import, division, print_function -import numpy as np -from multiprocessing import Pool -from rdkit import Chem -import warnings -warnings.filterwarnings('ignore') -import os -import gzip, pickle -import tensorflow as tf -from scipy.misc import imread -from scipy import linalg -import pathlib -import urllib -import keras -import keras.backend as K -from keras.models import load_model - - -def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): - """Numpy implementation of the Frechet Distance. - The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1) - and X_2 ~ N(mu_2, C_2) is - d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)). - - Stable version by Dougal J. Sutherland. - - Params: - -- mu1: The mean of the activations of preultimate layer of the - CHEMNET ( like returned by the function 'get_predictions') - for generated samples. - -- mu2: The mean of the activations of preultimate layer of the - CHEMNET ( like returned by the function 'get_predictions') - for real samples. - -- sigma1: The covariance matrix of the activations of preultimate layer of the - CHEMNET ( like returned by the function 'get_predictions') - for generated samples. - -- sigma2: The covariance matrix of the activations of preultimate layer of the - CHEMNET ( like returned by the function 'get_predictions') - for real samples. - - Returns: - -- : The Frechet Distance. - """ - - mu1 = np.atleast_1d(mu1) - mu2 = np.atleast_1d(mu2) - - sigma1 = np.atleast_2d(sigma1) - sigma2 = np.atleast_2d(sigma2) - - assert mu1.shape == mu2.shape, "Training and test mean vectors have different lengths" - assert sigma1.shape == sigma2.shape, "Training and test covariances have different dimensions" - - diff = mu1 - mu2 - - # product might be almost singular - covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) - if not np.isfinite(covmean).all(): - offset = np.eye(sigma1.shape[0]) * eps - covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) - - # numerical error might give slight imaginary component - if np.iscomplexobj(covmean): - if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): - m = np.max(np.abs(covmean.imag)) - raise ValueError("Imaginary component {}".format(m)) - covmean = covmean.real - - tr_covmean = np.trace(covmean) - - return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean -#------------------------------------------------------------------------------- - -def build_masked_loss(loss_function, mask_value): - """Builds a loss function that masks based on targets - - Args: - loss_function: The loss function to mask - mask_value: The value to mask in the targets - - Returns: - function: a loss function that acts like loss_function with masked inputs - """ - - def masked_loss_function(y_true, y_pred): - mask = K.cast(K.not_equal(y_true, mask_value), K.floatx()) - return loss_function(y_true * mask, y_pred * mask) - - return masked_loss_function -#------------------------------------------------------------------------------- - -def masked_accuracy(y_true, y_pred): - mask_value = 0.5 - a = K.sum(K.cast(K.equal(y_true,K.round(y_pred)),K.floatx())) - c = K.sum(K.cast(K.not_equal(y_true,0.5),K.floatx())) - acc = (a) / c - return acc -#------------------------------------------------------------------------------- - -def get_one_hot(smiles, pad_len=-1): - one_hot = asym = ['C','N','O', 'H', 'F', 'Cl', 'P', 'B', 'Br', 'S', 'I', 'Si', - '#', '(', ')', '+', '-', '1', '2', '3', '4', '5', '6', '7', '8', '=', '[', ']', '@', - 'c', 'n', 'o', 's', 'X', '.'] - smiles = smiles + '.' - if pad_len < 0: - vec = np.zeros((len(smiles), len(one_hot) )) - else: - vec = np.zeros((pad_len, len(one_hot) )) - cont = True - j = 0 - i = 0 - while cont: - if smiles[i+1] in ['r', 'i', 'l']: - sym = smiles[i:i+2] - i += 2 - else: - sym = smiles[i] - i += 1 - if sym in one_hot: - vec[j, one_hot.index(sym)] = 1 - else: - vec[j,one_hot.index('X')] = 1 - j+=1 - if smiles[i] == '.' or j >= (pad_len-1) and pad_len > 0: - vec[j,one_hot.index('.')] = 1 - cont = False - return (vec) -#------------------------------------------------------------------------------- - -def myGenerator_predict(smilesList, batch_size=128, pad_len=350): - while 1: - N = len(smilesList) - nn = pad_len - idxSamples = np.arange(N) - - for j in range(int(np.ceil(N / batch_size))): - idx = idxSamples[j*batch_size : min((j+1)*batch_size,N)] - - x = [] - for i in range(0,len(idx)): - currentSmiles = smilesList[idx[i]] - smiEnc = get_one_hot(currentSmiles, pad_len=nn) - x.append(smiEnc) - - x = np.asarray(x)/35 - yield x -#------------------------------------------------------------------------------- -def load_ref_model(model_file = None): - if model_file==None: - model_file = 'ChemNet_v0.13_pretrained.h5' - masked_loss_function = build_masked_loss(K.binary_crossentropy,0.5) - model = load_model(model_file, - custom_objects={'masked_loss_function':masked_loss_function,'masked_accuracy':masked_accuracy}) - model.pop() - model.pop() - return(model) -#------------------------------------------------------------------------------- -def get_predictions(model, gen_mol): - gen_mol_act = model.predict_generator(myGenerator_predict(gen_mol, batch_size=128), - steps= np.ceil(len(gen_mol)/128)) - return gen_mol_act -#------------------------------------------------------------------------------- -def canonical(smi): - try: - smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi)) - except: - pass - return smi -#------------------------------------------------------------------------------- -def canoncial_smiles(smiles): - pool = Pool(32) - smiles = pool.map(canonical, smiles) - pool.close() - return(smiles) - - - diff --git a/build/lib/fcd/__init__.py b/build/lib/fcd/__init__.py deleted file mode 100644 index f6c6d90..0000000 --- a/build/lib/fcd/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .FCD import calculate_frechet_distance, build_masked_loss, masked_accuracy, get_one_hot, myGenerator_predict, \ - load_ref_model, get_predictions, canonical, canoncial_smiles diff --git a/dist/FCD-1.0-py3-none-any.whl b/dist/FCD-1.0-py3-none-any.whl deleted file mode 100644 index 48621c3..0000000 Binary files a/dist/FCD-1.0-py3-none-any.whl and /dev/null differ diff --git a/dist/FCD-1.0.tar.gz b/dist/FCD-1.0.tar.gz deleted file mode 100644 index d2bef74..0000000 Binary files a/dist/FCD-1.0.tar.gz and /dev/null differ diff --git a/fcd/FCD.py b/fcd/FCD.py index ce2c045..5e3f690 100644 --- a/fcd/FCD.py +++ b/fcd/FCD.py @@ -13,13 +13,17 @@ ''' from __future__ import absolute_import, division, print_function -from keras.models import load_model + +import warnings +from functools import lru_cache +from multiprocessing import Pool + import keras.backend as K -from scipy import linalg import numpy as np -from multiprocessing import Pool +from keras.models import load_model from rdkit import Chem -import warnings +from scipy import linalg + warnings.filterwarnings('ignore') @@ -76,7 +80,6 @@ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): tr_covmean = np.trace(covmean) return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean -# ------------------------------------------------------------------------------- def build_masked_loss(loss_function, mask_value): @@ -95,7 +98,6 @@ def masked_loss_function(y_true, y_pred): return loss_function(y_true * mask, y_pred * mask) return masked_loss_function -# ------------------------------------------------------------------------------- def masked_accuracy(y_true, y_pred): @@ -103,13 +105,13 @@ def masked_accuracy(y_true, y_pred): c = K.sum(K.cast(K.not_equal(y_true, 0.5), K.floatx())) acc = (a) / c return acc -# ------------------------------------------------------------------------------- def get_one_hot(smiles, pad_len=-1): - one_hot = ['C', 'N', 'O', 'H', 'F', 'Cl', 'P', 'B', 'Br', 'S', 'I', 'Si', - '#', '(', ')', '+', '-', '1', '2', '3', '4', '5', '6', '7', '8', '=', '[', ']', '@', - 'c', 'n', 'o', 's', 'X', '.'] + one_hot = [ + 'C', 'N', 'O', 'H', 'F', 'Cl', 'P', 'B', 'Br', 'S', 'I', 'Si', + '#', '(', ')', '+', '-', '1', '2', '3', '4', '5', '6', '7', '8', '=', '[', ']', '@', + 'c', 'n', 'o', 's', 'X', '.'] smiles = smiles + '.' if pad_len < 0: vec = np.zeros((len(smiles), len(one_hot))) @@ -134,7 +136,6 @@ def get_one_hot(smiles, pad_len=-1): vec[j, one_hot.index('.')] = 1 cont = False return (vec) -# ------------------------------------------------------------------------------- def myGenerator_predict(smilesList, batch_size=128, pad_len=350): @@ -154,9 +155,9 @@ def myGenerator_predict(smilesList, batch_size=128, pad_len=350): x = np.asarray(x)/35 yield x -# ------------------------------------------------------------------------------- +@lru_cache(maxsize=1) def load_ref_model(model_file=None): if model_file is None: model_file = 'ChemNet_v0.13_pretrained.h5' @@ -165,15 +166,13 @@ def load_ref_model(model_file=None): custom_objects={'masked_loss_function': masked_loss_function, 'masked_accuracy': masked_accuracy}) model.pop() model.pop() - return(model) -# ------------------------------------------------------------------------------- + return model def get_predictions(model, gen_mol): gen_mol_act = model.predict_generator(myGenerator_predict(gen_mol, batch_size=128), steps=np.ceil(len(gen_mol)/128)) return gen_mol_act -# ------------------------------------------------------------------------------- def canonical(smi): @@ -182,11 +181,8 @@ def canonical(smi): except: pass return smi -# ------------------------------------------------------------------------------- -def canoncial_smiles(smiles): - pool = Pool(32) - smiles = pool.map(canonical, smiles) - pool.close() - return(smiles) +def canoncial_smiles(smiles, njobs=32): + with Pool(njobs) as pool: + return pool.map(canonical, smiles)