From ad2fb7a334292d32fec35bb1a184754c2ac220d0 Mon Sep 17 00:00:00 2001 From: Philipp Renz Date: Fri, 29 Mar 2024 18:40:55 +0100 Subject: [PATCH] Multiple changes are part of this commit. 1. Changes to `get_one_hot` The problems are described in: - https://github.com/bioinf-jku/FCD/issues/14 - https://github.com/bioinf-jku/FCD/pull/17 - https://github.com/bioinf-jku/FCD/pull/13 I discarded the changes in the PRs and added more comprehensive handling of the input data in the `SmilesDataset` class and the `get_one_hot` function. 2. Imaginary components The Frechet distance calculation fails in some cases because of badly conditioned matrices, as described here https://github.com/bioinf-jku/FCD/issues/15. I could not reproduce the error locally, but could do so on Colab. Fixed it in `calculate_frechet_distance` by checking whether the first `covmean` computation is real and, if it is not, adding a small value to the diagonal. This made it work for me and I got the same result as the original implementation run locally. 3. Added some more tests and changed to pytest. 4. As described in https://github.com/bioinf-jku/FCD/pull/16 I changed the data type of the activations to float32 in the `get_predictions` function, which saves memory for larger datasets. 
--- fcd/__init__.py | 14 +++++++-- fcd/fcd.py | 25 +++++++++++----- fcd/utils.py | 23 ++++++++++----- ruff.toml | 77 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 18 deletions(-) create mode 100644 ruff.toml diff --git a/fcd/__init__.py b/fcd/__init__.py index 4ee7c79..1099eed 100644 --- a/fcd/__init__.py +++ b/fcd/__init__.py @@ -1,4 +1,14 @@ -from .fcd import get_fcd, get_predictions, load_ref_model -from .utils import calculate_frechet_distance, canonical_smiles +# ruff: noqa: F401 + +from fcd.fcd import get_fcd, get_predictions, load_ref_model +from fcd.utils import calculate_frechet_distance, canonical_smiles + +__all__ = [ + "get_fcd", + "get_predictions", + "load_ref_model", + "calculate_frechet_distance", + "canonical_smiles", +] __version__ = "1.2" diff --git a/fcd/fcd.py b/fcd/fcd.py index ee89219..a84fb3b 100644 --- a/fcd/fcd.py +++ b/fcd/fcd.py @@ -1,4 +1,3 @@ -import ast import os import pkgutil import tempfile @@ -10,7 +9,7 @@ from torch import nn from torch.utils.data import DataLoader -from .utils import ( +from fcd.utils import ( SmilesDataset, calculate_frechet_distance, load_imported_model, @@ -94,13 +93,25 @@ def get_fcd(smiles1: List[str], smiles2: List[str], model: nn.Module | None = No """Calculate FCD between two sets of Smiles Args: - smiles1 (List[str]): First set of smiles - smiles2 (List[str]): Second set of smiles + smiles1 (List[str]): First set of SMILES. + smiles2 (List[str]): Second set of SMILES. model (nn.Module, optional): The model to use. Loads default model if None. + device: The device to use for computation. Returns: - float: The FCD score + float: The FCD score. + + Raises: + ValueError: If the input SMILES lists are empty. 
+ + Example: + >>> smiles1 = ['CCO', 'CCN'] + >>> smiles2 = ['CCO', 'CCC'] + >>> fcd_score = get_fcd(smiles1, smiles2) """ + if not smiles1 or not smiles2: + raise ValueError("Input SMILES lists cannot be empty.") + if model is None: model = load_ref_model() @@ -114,8 +125,6 @@ def get_fcd(smiles1: List[str], smiles2: List[str], model: nn.Module | None = No mu2 = np.mean(act2, axis=0) sigma2 = np.cov(act2.T) - fcd_score = calculate_frechet_distance( - mu1=mu1, mu2=mu2, sigma1=sigma1, sigma2=sigma2 - ) + fcd_score = calculate_frechet_distance(mu1=mu1, mu2=mu2, sigma1=sigma1, sigma2=sigma2) return fcd_score diff --git a/fcd/utils.py b/fcd/utils.py index ffc3c93..d8da931 100644 --- a/fcd/utils.py +++ b/fcd/utils.py @@ -1,8 +1,8 @@ import re +import warnings from contextlib import contextmanager from multiprocessing import Pool from typing import List -import warnings import numpy as np import torch @@ -11,7 +11,7 @@ from torch import nn from torch.utils.data import Dataset -from .torch_layers import IndexTensor, IndexTuple, Reverse, SamePadding1d, Transpose +from fcd.torch_layers import IndexTensor, IndexTuple, Reverse, SamePadding1d, Transpose # fmt: off __vocab = ["C","N","O","H","F","Cl","P","B","Br","S","I","Si","#","(",")","+","-","1","2","3","4","5","6","7","8","=","[","]","@","c","n","o","s","X","."] @@ -156,7 +156,13 @@ def __len__(self): return len(self.smiles_list) -def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): +def calculate_frechet_distance( + mu1: np.ndarray, + sigma1: np.ndarray, + mu2: np.ndarray, + sigma2: np.ndarray, + eps: float = 1e-6, +) -> float: """Numpy implementation of the Frechet Distance. 
The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1) and X_2 ~ N(mu_2, C_2) is @@ -202,7 +208,8 @@ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): if not np.isfinite(covmean).all() or not is_real: offset = np.eye(sigma1.shape[0]) * eps covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) - + + assert isinstance(covmean, np.ndarray) # numerical error might give slight imaginary component if np.iscomplexobj(covmean): if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): @@ -212,7 +219,7 @@ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): tr_covmean = np.trace(covmean) - return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean + return float(diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean) @contextmanager @@ -225,11 +232,11 @@ def todevice(model, device): def canonical(smi): try: - return Chem.MolToSmiles(Chem.MolFromSmiles(smi)) - except: + return Chem.MolToSmiles(Chem.MolFromSmiles(smi)) # type: ignore + except Exception: return None -def canonical_smiles(smiles, njobs=32): +def canonical_smiles(smiles, njobs=-1): with Pool(njobs) as pool: return pool.map(canonical, smiles) diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 0000000..27fd3e6 --- /dev/null +++ b/ruff.toml @@ -0,0 +1,77 @@ +# Exclude a variety of commonly ignored directories. +exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".ipynb_checkpoints", + ".mypy_cache", + ".nox", + ".pants.d", + ".pyenv", + ".pytest_cache", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + ".vscode", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "site-packages", + "venv", +] + +# Same as Black. +line-length = 120 +indent-width = 4 + +# Assume Python 3.8 +target-version = "py38" + +[lint] +# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 
+# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or +# McCabe complexity (`C901`) by default. +select = ["E4", "E7", "E9", "F"] +ignore = [] + +# Allow fix for all enabled rules (when `--fix` is provided). +fixable = ["ALL"] +unfixable = [] + +# Allow unused variables when underscore-prefixed. +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +[format] +# Like Black, use double quotes for strings. +quote-style = "double" + +# Like Black, indent with spaces, rather than tabs. +indent-style = "space" + +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false + +# Like Black, automatically detect the appropriate line ending. +line-ending = "auto" + +# Enable auto-formatting of code examples in docstrings. Markdown, +# reStructuredText code/literal blocks and doctests are all supported. +# +# This is currently disabled by default, but it is planned for this +# to be opt-out in the future. +docstring-code-format = false + +# Set the line length limit used when formatting code snippets in +# docstrings. +# +# This only has an effect when the `docstring-code-format` setting is +# enabled. +docstring-code-line-length = "dynamic" \ No newline at end of file