Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add compute_drug_likeness function to rdkit_easy #384

Merged
merged 2 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions atomsci/ddm/test/unit/test_compute_drug_likeness.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pytest
import pandas as pd
from rdkit import Chem
from atomsci.ddm.utils.rdkit_easy import compute_drug_likeness

def test_compute_drug_likeness():
    """Check the columns and values produced by compute_drug_likeness.

    Covers three things: every documented output column is present, the
    descriptor values and filter flags for ethanol match known reference
    values, and rows whose molecule is None get missing values instead of
    raising. It also checks that the input frame is left untouched.
    """
    # Create a DataFrame with sample SMILES strings
    data = {
        'smiles': [
            'CCO',  # Ethanol
            'CC(=O)OC1=CC=CC=C1C(=O)O',  # Aspirin
            'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',  # Ibuprofen
            'C1=CC=C(C=C1)C=O',  # Benzaldehyde
            'CC(C)NCC(O)COC1=CC=CC=C1'  # 1-(Isopropylamino)-3-phenoxypropan-2-ol
        ]
    }
    df = pd.DataFrame(data)
    df['mol'] = df['smiles'].apply(Chem.MolFromSmiles)
    input_columns = list(df.columns)

    # Compute drug likeness
    result_df = compute_drug_likeness(df, molecule_column='mol')

    # Check if the expected columns are present in the result DataFrame
    expected_columns = [
        'MolWt', 'LogP', 'NumHDonors', 'NumHAcceptors', 'TPSA', 'NumRotatableBonds',
        'MolarRefractivity', 'QED', 'TotalAtoms', 'Lipinski', 'Ghose', 'Veber'
    ]
    for col in expected_columns:
        assert col in result_df.columns

    # The function documents that it returns a copy; the input must not gain columns.
    assert list(df.columns) == input_columns
    assert len(result_df) == len(df)

    # Check if the values are computed correctly for a known molecule (Ethanol).
    # approx() wraps the reference value so the tolerance is defined by the
    # expected number rather than by whatever the code under test returned.
    ethanol_row = result_df[result_df['smiles'] == 'CCO'].iloc[0]
    assert ethanol_row['MolWt'] == pytest.approx(46.07, rel=0.1)
    assert ethanol_row['LogP'] == pytest.approx(-0.0014, rel=0.1)
    assert ethanol_row['NumHDonors'] == 1
    assert ethanol_row['NumHAcceptors'] == 1
    assert ethanol_row['TPSA'] == pytest.approx(20.23, rel=0.1)
    assert ethanol_row['NumRotatableBonds'] == 0
    assert ethanol_row['MolarRefractivity'] == pytest.approx(12.76, rel=0.1)
    assert ethanol_row['QED'] == pytest.approx(0.41, rel=0.1)
    assert ethanol_row['TotalAtoms'] == 9
    assert ethanol_row['Lipinski']
    assert not ethanol_row['Ghose']
    assert ethanol_row['Veber']

    # A row without a molecule must yield missing values in every added column.
    missing_result = compute_drug_likeness(pd.DataFrame({'mol': [None]}), molecule_column='mol')
    for col in expected_columns:
        assert pd.isna(missing_result[col].iloc[0])
109 changes: 109 additions & 0 deletions atomsci/ddm/utils/rdkit_easy.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@

import pandas as pd
from rdkit import DataStructs
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import QED
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import MolToImage, rdMolDraw2D
Expand Down Expand Up @@ -74,6 +76,113 @@ def calculate_descriptors(df, molecule_column='mol'):
df=df.join(df2, lsuffix='', rsuffix='_rdk')
return df



# Names of the columns added by compute_drug_likeness, in the order they are appended.
_DRUG_LIKENESS_COLUMNS = (
    'MolWt', 'LogP', 'NumHDonors', 'NumHAcceptors', 'TPSA', 'NumRotatableBonds',
    'MolarRefractivity', 'QED', 'TotalAtoms', 'Lipinski', 'Ghose', 'Veber',
)


def _drug_likeness_properties(mol):
    """Compute the drug-likeness descriptors and filter flags for a single RDKit Mol.

    Args:
        mol (rdkit.Chem.Mol): Molecule to characterize. Must not be None.

    Returns:
        dict: One entry per name in _DRUG_LIKENESS_COLUMNS.
    """
    mw = Descriptors.MolWt(mol)
    lp = Descriptors.MolLogP(mol)
    h_donors = Descriptors.NumHDonors(mol)
    h_acceptors = Descriptors.NumHAcceptors(mol)
    tpsa_val = Descriptors.TPSA(mol)
    rot_bonds = Descriptors.NumRotatableBonds(mol)
    mr = Descriptors.MolMR(mol)
    num_atoms = Chem.rdMolDescriptors.CalcNumAtoms(mol)

    return {
        'MolWt': mw,
        'LogP': lp,
        'NumHDonors': h_donors,
        'NumHAcceptors': h_acceptors,
        'TPSA': tpsa_val,
        'NumRotatableBonds': rot_bonds,
        'MolarRefractivity': mr,
        'QED': QED.qed(mol),
        'TotalAtoms': num_atoms,
        # Lipinski's rule of five, applied strictly: all four limits must hold.
        'Lipinski': (mw <= 500 and lp <= 5 and h_donors <= 5 and h_acceptors <= 10),
        # Ghose filter: ranges on weight, LogP, molar refractivity and atom count.
        'Ghose': (160 <= mw <= 480 and -0.4 <= lp <= 5.6 and 40 <= mr <= 130 and 20 <= num_atoms <= 70),
        # Veber's rule: limits on rotatable bonds and polar surface area.
        'Veber': (rot_bonds <= 10 and tpsa_val <= 140),
    }


def compute_drug_likeness(df, molecule_column='mol'):
    """Compute various molecular descriptors and drug-likeness criteria for compounds specified by RDKit Mol objects.

    The descriptors are added to a copy of the input data frame, and are limited to those used to compute the
    Lipinski rule-of-five, Ghose and Veber drug-likeness filters. The QED (quantitative estimate of drug-likeness)
    score is also added, along with columns of booleans indicating whether the various sets of filter criteria
    are met. Rows whose molecule is missing (None or a pandas missing value such as NaN) get None in every
    added column instead of raising.

    Args:
        df (pandas.DataFrame): Input DataFrame containing RDKit Mol objects. It is not modified.
        molecule_column (str): Name of the column in the DataFrame that contains the RDKit Mol objects.
            Default is 'mol'.

    Returns:
        pandas.DataFrame: A copy of the input DataFrame with additional columns for the computed descriptors:
            - MolWt: Molecular weight
            - LogP: Logarithm of the partition coefficient between n-octanol and water
            - NumHDonors: Number of hydrogen bond donors
            - NumHAcceptors: Number of hydrogen bond acceptors
            - TPSA: Topological polar surface area
            - NumRotatableBonds: Number of rotatable bonds
            - MolarRefractivity: Molar refractivity
            - QED: Quantitative estimate of drug-likeness
            - TotalAtoms: Total number of atoms
            - Lipinski: True if MolWt <= 500, LogP <= 5, NumHDonors <= 5 and NumHAcceptors <= 10
            - Ghose: True if 160 <= MolWt <= 480, -0.4 <= LogP <= 5.6, 40 <= MolarRefractivity <= 130
              and 20 <= TotalAtoms <= 70
            - Veber: True if NumRotatableBonds <= 10 and TPSA <= 140

    Raises:
        KeyError: If molecule_column is not a column of df.
    """
    # Work on a copy so the caller's DataFrame is left untouched
    df_copy = df.copy()

    # Shared placeholder for rows without a usable molecule; it is only read, never mutated.
    missing_row = dict.fromkeys(_DRUG_LIKENESS_COLUMNS)

    # pandas represents gaps in object columns as NaN as well as None (e.g. after a
    # merge or reindex), so both are treated as "no molecule" rather than passed to RDKit.
    rows = [
        missing_row if (mol is None or pd.isna(mol)) else _drug_likeness_properties(mol)
        for mol in df_copy[molecule_column]
    ]

    # Assign one plain list per column so pandas infers each column's dtype independently.
    for name in _DRUG_LIKENESS_COLUMNS:
        df_copy[name] = [row[name] for row in rows]

    return df_copy


def cluster_dataframe(df, molecule_column='mol', cluster_column='cluster', cutoff=0.2):
"""Performs Butina clustering on compounds specified by Mol objects in a data frame.

Expand Down
Loading