ATOMScience-org · stewarthe6 · Jan 21, 2025 · Nov 25, 2024 · Nov 27, 2024
diff --git a/atomsci/ddm/test/unit/test_compute_drug_likeness.py b/atomsci/ddm/test/unit/test_compute_drug_likeness.py
@@ -0,0 +1,44 @@
+import pytest
+import pandas as pd
+from rdkit import Chem
+from atomsci.ddm.utils.rdkit_easy import compute_drug_likeness
+
+def test_compute_drug_likeness():
+    # Create a DataFrame with sample SMILES strings
+    data = {
+        'smiles': [
+            'CCO',  # Ethanol
+            'CC(=O)OC1=CC=CC=C1C(=O)O',  # Aspirin
+            'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',  # Ibuprofen
+            'C1=CC=C(C=C1)C=O',  # Benzaldehyde
+            'CC(C)NCC(O)COC1=CC=CC=C1'  # Pseudoephedrine
+        ]
+    }
+    df = pd.DataFrame(data)
+    df['mol'] = df['smiles'].apply(Chem.MolFromSmiles)
+
+    # Compute drug likeness
+    result_df = compute_drug_likeness(df, molecule_column='mol')
+
+    # Check if the expected columns are present in the result DataFrame
+    expected_columns = [
+        'MolWt', 'LogP', 'NumHDonors', 'NumHAcceptors', 'TPSA', 'NumRotatableBonds',
+        'MolarRefractivity', 'QED', 'TotalAtoms', 'Lipinski', 'Ghose', 'Veber'
+    ]
+    for col in expected_columns:
+        assert col in result_df.columns
+
+    # Check if the values are computed correctly for a known molecule (Ethanol)
+    ethanol_row = result_df[result_df['smiles'] == 'CCO'].iloc[0]
+    assert pytest.approx(ethanol_row['MolWt'], 0.1) == 46.07
+    assert pytest.approx(ethanol_row['LogP'], 0.1) == -0.0014
+    assert ethanol_row['NumHDonors'] == 1
+    assert ethanol_row['NumHAcceptors'] == 1
+    assert pytest.approx(ethanol_row['TPSA'], 0.1) == 20.23
+    assert ethanol_row['NumRotatableBonds'] == 0
+    assert pytest.approx(ethanol_row['MolarRefractivity'], 0.1) == 12.76
+    assert pytest.approx(ethanol_row['QED'], 0.1) == 0.41
+    assert ethanol_row['TotalAtoms'] == 9
+    assert ethanol_row['Lipinski'] == True
+    assert ethanol_row['Ghose'] == False
+    assert ethanol_row['Veber'] == True
diff --git a/atomsci/ddm/utils/rdkit_easy.py b/atomsci/ddm/utils/rdkit_easy.py
@@ -11,7 +11,9 @@
 
 import pandas as pd
 from rdkit import DataStructs
+from rdkit import Chem
 from rdkit.Chem import AllChem
+from rdkit.Chem import QED
 from rdkit.Chem import Descriptors
 from rdkit.Chem import PandasTools
 from rdkit.Chem.Draw import MolToImage, rdMolDraw2D
@@ -74,6 +76,113 @@ def calculate_descriptors(df, molecule_column='mol'):
     df=df.join(df2, lsuffix='', rsuffix='_rdk')
     return df
 
+
+
+def compute_drug_likeness(df, molecule_column='mol'):
+    """Compute various molecular descriptors and drug-likeness criteria for compounds specified by RDKit Mol objects.
+    The descriptors are added to the input data frame, and are limited to those used to compute the Lipinski 
+    rule-of-five, Ghose and Veber drug-likeness filters. The QED (qualitative estimate of drug-likeness) score is
+    also added to the data frame, along with columns of booleans indicating whether the various sets of filter
+    criteria are met.
+
+    Args:
+        df (pandas.DataFrame): Input DataFrame containing RDKit Mol objects.
+        molecule_column (str): Name of the column in the DataFrame that contains the RDKit Mol objects. Default is 'mol'.
+    Returns:
+        pandas.DataFrame: A copy of the input DataFrame with additional columns for the computed descriptors:
+            - MolWt: Molecular weight
+            - LogP: Logarithm of the partition coefficient between n-octanol and water
+            - NumHDonors: Number of hydrogen bond donors
+            - NumHAcceptors: Number of hydrogen bond acceptors
+            - TPSA: Topological polar surface area
+            - NumRotatableBonds: Number of rotatable bonds
+            - MolarRefractivity: Molar refractivity
+            - QED: Quantitative estimate of drug-likeness
+            - TotalAtoms: Total number of atoms
+            - Lipinski: Boolean indicating if the molecule meets Lipinski's rule of five criteria
+            - Ghose: Boolean indicating if the molecule meets Ghose filter criteria
+            - Veber: Boolean indicating if the molecule meets Veber's rule criteria
+    """
+    # Create a copy of the input DataFrame
+    df_copy = df.copy()
+
+    # Initialize lists to store the computed descriptors
+    mol_wt = []
+    logp = []
+    num_h_donors = []
+    num_h_acceptors = []
+    tpsa = []
+    num_rotatable_bonds = []
+    molar_refractivity = []
+    qed_scores = []
+    total_atoms = []
+    lipinski_criteria = []
+    ghose_criteria = []
+    veber_criteria = []
+
+    # Iterate over each RDKit Mol object in the DataFrame
+    for mol in df_copy[molecule_column]:
+        if mol is not None:
+            mw = Descriptors.MolWt(mol)
+            lp = Descriptors.MolLogP(mol)
+            h_donors = Descriptors.NumHDonors(mol)
+            h_acceptors = Descriptors.NumHAcceptors(mol)
+            tpsa_val = Descriptors.TPSA(mol)
+            rot_bonds = Descriptors.NumRotatableBonds(mol)
+            mr = Descriptors.MolMR(mol)
+            qed_val = QED.qed(mol)
+            num_atoms = Chem.rdMolDescriptors.CalcNumAtoms(mol)
+
+            mol_wt.append(mw)
+            logp.append(lp)
+            num_h_donors.append(h_donors)
+            num_h_acceptors.append(h_acceptors)
+            tpsa.append(tpsa_val)
+            num_rotatable_bonds.append(rot_bonds)
+            molar_refractivity.append(mr)
+            qed_scores.append(qed_val)
+            total_atoms.append(num_atoms)
+
+            # Check Lipinski's rule of five criteria
+            lipinski = (mw <= 500 and lp <= 5 and h_donors <= 5 and h_acceptors <= 10)
+            lipinski_criteria.append(lipinski)
+            # Check Ghose filter criteria
+            ghose = (160 <= mw <= 480 and -0.4 <= lp <= 5.6 and 40 <= mr <= 130 and 20 <= num_atoms <= 70)
+            ghose_criteria.append(ghose)
+            # Check Veber's rule criteria
+            veber = (rot_bonds <= 10 and tpsa_val <= 140)
+            veber_criteria.append(veber)
+        else:
+            mol_wt.append(None)
+            logp.append(None)
+            num_h_donors.append(None)
+            num_h_acceptors.append(None)
+            tpsa.append(None)
+            num_rotatable_bonds.append(None)
+            molar_refractivity.append(None)
+            qed_scores.append(None)
+            total_atoms.append(None)
+            lipinski_criteria.append(None)
+            ghose_criteria.append(None)
+            veber_criteria.append(None)
+
+    # Add the computed descriptors to the DataFrame
+    df_copy['MolWt'] = mol_wt
+    df_copy['LogP'] = logp
+    df_copy['NumHDonors'] = num_h_donors
+    df_copy['NumHAcceptors'] = num_h_acceptors
+    df_copy['TPSA'] = tpsa
+    df_copy['NumRotatableBonds'] = num_rotatable_bonds
+    df_copy['MolarRefractivity'] = molar_refractivity
+    df_copy['QED'] = qed_scores
+    df_copy['TotalAtoms'] = total_atoms
+    df_copy['Lipinski'] = lipinski_criteria
+    df_copy['Ghose'] = ghose_criteria
+    df_copy['Veber'] = veber_criteria
+
+    return df_copy
+
+
 def cluster_dataframe(df, molecule_column='mol', cluster_column='cluster', cutoff=0.2):
     """Performs Butina clustering on compounds specified by Mol objects in a data frame.