diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 3c64241d109..0eeb43a7f69 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -40,6 +40,8 @@ >>> u.trajectory +>>> u.atoms.convert_to("RDKIT") + Classes @@ -55,15 +57,51 @@ """ import warnings +import re import numpy as np +from ..exceptions import NoDataError +from ..topology.guessers import guess_atom_element from . import memory +from . import base + +try: + from rdkit import Chem +except ImportError: + pass +else: + RDBONDTYPE = { + 'AROMATIC': Chem.BondType.AROMATIC, + 'SINGLE': Chem.BondType.SINGLE, + 'DOUBLE': Chem.BondType.DOUBLE, + 'TRIPLE': Chem.BondType.TRIPLE, + } + RDBONDORDER = { + 1: Chem.BondType.SINGLE, + 1.5: Chem.BondType.AROMATIC, + "ar": Chem.BondType.AROMATIC, + 2: Chem.BondType.DOUBLE, + 3: Chem.BondType.TRIPLE, + } + # add string version of the key for each bond + RDBONDORDER.update({str(key): value for key, value in RDBONDORDER.items()}) + RDATTRIBUTES = { + "altLoc": "AltLoc", + "chainID": "ChainId", + "name": "Name", + "occupancy": "Occupancy", + "resname": "ResidueName", + "resid": "ResidueNumber", + "segindex": "SegmentNumber", + "tempfactor": "TempFactor", + } + PERIODIC_TABLE = Chem.GetPeriodicTable() class RDKitReader(memory.MemoryReader): """Coordinate reader for RDKit. - + .. versionadded:: 2.0.0 """ format = 'RDKIT' @@ -95,10 +133,269 @@ def __init__(self, filename, **kwargs): """ n_atoms = filename.GetNumAtoms() coordinates = np.array([ - conf.GetPositions() for conf in filename.GetConformers()], + conf.GetPositions() for conf in filename.GetConformers()], dtype=np.float32) if coordinates.size == 0: warnings.warn("No coordinates found in the RDKit molecule") - coordinates = np.empty((1,n_atoms,3), dtype=np.float32) + coordinates = np.empty((1, n_atoms, 3), dtype=np.float32) coordinates[:] = np.nan - super(RDKitReader, self).__init__(coordinates, order='fac', **kwargs) \ No newline at end of file + super(RDKitReader, self).__init__(coordinates, order='fac', **kwargs) + + +class RDKitConverter(base.ConverterBase): + """Convert MDAnalysis AtomGroup or Universe to `RDKit `_ :class:`rdkit.Chem.rdchem.Mol`. + + Example + ------- + + .. code-block:: python + + import MDAnalysis as mda + from MDAnalysis.tests.datafiles import PDB_full + u = mda.Universe(PDB_full) + mol = u.select_atoms('resname DMS').convert_to('RDKIT') + + + .. versionadded:: 2.0.0 + """ + + lib = 'RDKIT' + units = {'time': None, 'length': 'Angstrom'} + + def convert(self, obj): + """Write selection at current trajectory frame to :class:`~rdkit.Chem.rdchem.Mol`. + + Parameters + ----------- + obj : AtomGroup or Universe + """ + try: + from rdkit import Chem + except ImportError: + raise ImportError("RDKit is required for the RDKitConverter but " + "it's not installed. Try installing it with \n" + "conda install -c conda-forge rdkit") + try: + # make sure to use atoms (Issue 46) + ag = obj.atoms + except AttributeError: + raise TypeError("No `atoms` attribute in object of type {}, " + "please use a valid AtomGroup or Universe".format( + type(obj))) from None + + try: + elements = ag.elements + except NoDataError: + raise AttributeError( + "The `elements` attribute is required for the RDKitConverter " + "but is not present in this AtomGroup. Please refer to the " + "documentation to guess elements from other attributes or " + "type `help(mda.topology.guessers)`") from None + + other_attrs = {} + for attr in ["bfactors", "charges", "icodes", "segids", "types"]: + if hasattr(ag, attr): + other_attrs[attr] = getattr(ag, attr) + + mol = Chem.RWMol() + atom_mapper = {} + + for i, (atom, element) in enumerate(zip(ag, elements)): + # create atom + rdatom = Chem.Atom(element) + # disable adding H to the molecule + rdatom.SetNoImplicit(True) + # add PDB-like properties + mi = Chem.AtomPDBResidueInfo() + for attr, rdattr in RDATTRIBUTES.items(): + _add_mda_attr_to_rdkit(atom, attr, rdattr, mi) + rdatom.SetMonomerInfo(mi) + # other properties + for attr in other_attrs.keys(): + value = other_attrs[attr][i] + if isinstance(value, np.float): + rdatom.SetDoubleProp("_MDAnalysis_%s" % attr[:-1], + float(value)) + elif isinstance(value, np.int): + rdatom.SetIntProp("_MDAnalysis_%s" % attr[:-1], int(value)) + else: + rdatom.SetProp("_MDAnalysis_%s" % attr[:-1], value) + # add atom + index = mol.AddAtom(rdatom) + # map index in universe to index in mol + atom_mapper[atom.ix] = index + + try: + bonds = ag.bonds + if (len(bonds) == 0) and (ag.n_atoms > 1): + # force guessing bonds + raise NoDataError + except NoDataError: + warnings.warn( + "No `bonds` attribute in this AtomGroup. Guessing bonds based" + "on atoms coordinates") + ag.guess_bonds() + bonds = ag.bonds + + border_atom_indices = [] + for bond in bonds: + try: + bond_indices = [atom_mapper[i] for i in bond.indices] + except KeyError: + # one of the atoms of the bond is not part of the atomgroup + # save the bond atom that is in the atomgroup for later + for i in bond.indices: + if i in atom_mapper.keys(): + border_atom_indices.append(atom_mapper[i]) + break + # skip the rest + continue + try: + bond_type = bond.type.upper() + except AttributeError: + # bond type can be a tuple for PDB files + bond_type = None + bond_type = RDBONDTYPE.get(bond_type, RDBONDORDER.get( + bond.order, Chem.BondType.SINGLE)) + mol.AddBond(*bond_indices, bond_type) + + mol.UpdatePropertyCache(strict=False) + + # infer bond orders and formal charges from the connectivity + _infer_bo_and_charges(mol, border_atom_indices) + + # sanitize + Chem.SanitizeMol(mol) + + return mol + + +def _add_mda_attr_to_rdkit(atom, attr, rdattr, mi): + """Converts an MDAnalysis atom attribute into the RDKit equivalent and + stores it into an RDKit AtomPDBResidueInfo object. + + Parameters + ---------- + + atom : MDAnalysis.core.groups.Atom + The atom to get the attributes from + attr : str + Name of the atom attribute in MDAnalysis in the singular form + rdattr : str + Name of the equivalent attribute in RDKit, as found in the `Set` and + `Get` methods of the `AtomPDBResidueInfo` + mi : rdkit.Chem.rdchem.AtomPDBResidueInfo + MonomerInfo object containing all the relevant atom attributes + """ + try: # get value in MDA atom + value = getattr(atom, attr) + except AttributeError: + pass + else: + if isinstance(value, np.generic): + # convert numpy types to python standard types + value = value.item() + if attr == "name": + # RDKit needs the name to be properly formated for a + # PDB file (1 letter elements start at col 14) + name = re.findall('(\D+|\d+)', value) + if len(name) == 2: + symbol, number = name + else: + symbol, number = name[0], "" + value = "{:>2}".format(symbol) + "{:<2}".format(number) + # set attribute value in RDKit MonomerInfo + getattr(mi, "Set%s" % rdattr)(value) + + +def _infer_bo_and_charges(mol, border_atom_indices): + """Infer bond orders and formal charges from a molecule. + + - Step 1 + Since most MD topology files don't explicitely retain informations on bond + orders or charges, it has to be guessed from the topology. This is done by + looping other each atom and comparing its expected valence to the current + valence, called `delta_v`. If two neighbouring atoms have a common + positive delta_v, the bond between them most likely has a bond order of + 1+delta_v. If an atom doesn't share a delta_v with any of its neighbours, + it likely needs a formal charge of -delta_v. + + - Step 2 + Some atoms can be "mutilated" by a selection (i.e. one of their bonds is cut). The previous step is likely to assign a formal charge to such atoms even if they weren't charged in the original topology. This step converts the resulting charges to radical electrons, or in some cases to higher order bonds. This ensures the atomgroup is not artificially charged because of the previous step. + + Parameters + ---------- + + mol : rdkit.Chem.rdchem.RWMol + The molecule is modified inplace and must have all hydrogens added + + border_atom_indices : list + List of border atoms indices + """ + # Step 1 + for atom in mol.GetAtoms(): + # create delta_v for each possible valence + expected_vs = PERIODIC_TABLE.GetValenceList(atom.GetAtomicNum()) + current_v = atom.GetTotalValence() + delta_vs = [expected_v - current_v for expected_v in expected_vs] + + # if there's only one possible valence state and the correpsonding + # delta_v is negative, it means we can only add a positive charge to + # the atom + if (len(delta_vs) == 1) and (delta_vs[0] < 0): + charge = -delta_vs[0] + atom.SetFormalCharge(charge) + mol.UpdatePropertyCache(strict=False) + else: + neighbors = atom.GetNeighbors() + # check if one of the neighbors has a common delta_v + for i, na in enumerate(neighbors, start=1): + # create delta_v for the neighbor + na_expected_vs = PERIODIC_TABLE.GetValenceList( + na.GetAtomicNum()) + na_current = na.GetTotalValence() + na_delta = [ + na_expected - na_current for na_expected in na_expected_vs] + # smallest common delta_v, else NaN + common_delta = min(set(delta_vs).intersection(na_delta), + default=np.nan) + # common_delta == 0 means we don't need to do anything + if common_delta != 0: + # if they have no delta_v in common + if common_delta is np.nan: + # if it's the last neighbor + if i == len(neighbors): + charge = -delta_vs[0] # negative + atom.SetFormalCharge(charge) + mol.UpdatePropertyCache(strict=False) + # if they both need a supplementary bond + else: + bond = mol.GetBondBetweenAtoms( + atom.GetIdx(), na.GetIdx()) + bond.SetBondType(RDBONDORDER[common_delta+1]) + mol.UpdatePropertyCache(strict=False) + break # out of neighbors loop + + # Step 2 + for i in border_atom_indices: + atom = mol.GetAtomWithIdx(i) + charge = atom.GetFormalCharge() + neighbors = atom.GetNeighbors() + # check if a neighbor atom also bears a charge + for i, na in enumerate(neighbors, 1): + na_charge = na.GetFormalCharge() + if na_charge < 0: + # both atoms have a negative charge + # convert to higher order bond + common_delta = max([charge, na_charge]) + bond = mol.GetBondBetweenAtoms(atom.GetIdx(), na.GetIdx()) + bond.SetBondType(RDBONDORDER[-common_delta+1]) + na.SetFormalCharge(na_charge - common_delta) + atom.SetFormalCharge(0) + atom.SetNumRadicalElectrons(common_delta - charge) + break + elif i == len(neighbors): + # no neighbor shares a negative charge + atom.SetNumRadicalElectrons(-atom.GetFormalCharge()) + atom.SetFormalCharge(0) + mol.UpdatePropertyCache(strict=False) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 7bdb845f95a..3aac8b8c2f4 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -24,24 +24,36 @@ import pytest import MDAnalysis as mda +from MDAnalysis.topology.guessers import guess_atom_element import numpy as np from numpy.testing import (assert_equal, assert_almost_equal) -from MDAnalysisTests.datafiles import mol2_molecule +from MDAnalysisTests.datafiles import mol2_molecule, PDB_full + +from MDAnalysis.coordinates.RDKit import _infer_bo_and_charges + +try: + from rdkit import Chem + from rdkit.Chem import AllChem +except ImportError: + rdkit_installed = False +else: + rdkit_installed = True -Chem = pytest.importorskip("rdkit.Chem") -AllChem = pytest.importorskip("rdkit.Chem.AllChem") def mol2_mol(): return Chem.MolFromMol2File(mol2_molecule, removeHs=False) + def smiles_mol(): - mol = Chem.MolFromSmiles("CCO") + mol = Chem.MolFromSmiles("CN1C=NC2=C1C(=O)N(C(=O)N2C)C") mol = Chem.AddHs(mol) cids = AllChem.EmbedMultipleConfs(mol, numConfs=3) return mol + +@pytest.mark.skipif(rdkit_installed is False, reason="requires RDKit") class TestRDKitReader(object): @pytest.mark.parametrize("rdmol, n_frames", [ (mol2_mol(), 1), @@ -51,7 +63,7 @@ def test_coordinates(self, rdmol, n_frames): universe = mda.Universe(rdmol) assert universe.trajectory.n_frames == n_frames expected = np.array([ - conf.GetPositions() for conf in rdmol.GetConformers()], + conf.GetPositions() for conf in rdmol.GetConformers()], dtype=np.float32) assert_equal(expected, universe.trajectory.coordinate_array) @@ -65,7 +77,7 @@ def test_no_coordinates(self): assert len(w) == 1 assert "No coordinates found" in str( w[-1].message) - expected = np.empty((1,u.atoms.n_atoms,3), dtype=np.float32) + expected = np.empty((1, u.atoms.n_atoms, 3), dtype=np.float32) expected[:] = np.nan assert_equal(u.trajectory.coordinate_array, expected) @@ -73,6 +85,131 @@ def test_compare_mol2reader(self): universe = mda.Universe(mol2_mol()) mol2 = mda.Universe(mol2_molecule) assert universe.trajectory.n_frames == mol2.trajectory.n_frames - assert_equal(universe.trajectory.ts.positions, + assert_equal(universe.trajectory.ts.positions, mol2.trajectory.ts.positions) - + + +@pytest.mark.skipif(rdkit_installed is False, reason="requires RDKit") +class TestRDKitConverter(object): + @pytest.fixture + def pdb(self): + return mda.Universe(PDB_full) + + @pytest.fixture + def mol2(self): + u = mda.Universe(mol2_molecule) + # add elements + elements = np.array([ + guess_atom_element(x).capitalize() for x in u.atoms.types + ], dtype=object) + u.add_TopologyAttr('elements', elements) + return u + + @pytest.mark.parametrize("sel_str", [ + "resid 1", + "resname LYS and name NZ", + "resid 34 and altloc B", + ]) + def test_monomer_info(self, pdb, sel_str): + rdmol = Chem.MolFromPDBFile(PDB_full) + sel = pdb.select_atoms(sel_str) + umol = sel.convert_to("RDKIT") + atom = umol.GetAtomWithIdx(0) + mi = atom.GetMonomerInfo() + + for mda_attr, rd_attr in mda.coordinates.RDKit.RDATTRIBUTES.items(): + if mda_attr == "occupancy": + mda_attr = "occupancie" + elif mda_attr == "segindex": + mda_attr = "segindice" + rd_value = getattr(mi, "Get%s" % rd_attr)() + mda_value = getattr(sel, "%ss" % mda_attr)[0] + if mda_attr == "name": + rd_value = rd_value.strip() + assert rd_value == mda_value + + def test_identical_topology_mol2(self, mol2): + """Check stereochemistry on atoms and bonds (but not yet)""" + rdmol = mol2_mol() + umol = mol2.atoms.convert_to("RDKIT") + assert rdmol.HasSubstructMatch(umol, useChirality=False) + assert umol.HasSubstructMatch(rdmol, useChirality=False) + + def test_identical_topology(self): + rdmol = smiles_mol() + u = mda.Universe(rdmol) + umol = u.atoms.convert_to("RDKIT") + assert rdmol.HasSubstructMatch(umol) and umol.HasSubstructMatch(rdmol) + + def test_raise_requires_elements(self): + u = mda.Universe(mol2_molecule) + with pytest.raises(AttributeError) as e: + u.atoms.convert_to("RDKIT") + assert "`elements` attribute is required for the RDKitConverter" in str( + e.value) + + def test_warn_guess_bonds(self, pdb): + pdb.delete_bonds(pdb.bonds) + ag = pdb.select_atoms("resnum 101 and segid A") + pdb.delete_bonds(ag.bonds) + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered. + warnings.simplefilter("always") + # trigger warning + ag.convert_to("RDKIT") + assert len(w) == 1 + assert "No `bonds` attribute in this AtomGroup" in str( + w[-1].message) + + @pytest.mark.parametrize("idx", [0, 10, 42]) + def test_other_attributes(self, mol2, idx): + mol = mol2.atoms.convert_to("RDKIT") + rdprops = mol.GetAtomWithIdx(idx).GetPropsAsDict() + for prop in ["charge", "segid", "type"]: + rdprop = rdprops["_MDAnalysis_%s" % prop] + mdaprop = getattr(mol2.atoms[idx], prop) + assert rdprop == mdaprop + + +class TestRequiresRDKit(object): + def test_converter_requires_rdkit(self): + if rdkit_installed: + pass + else: + u = mda.Universe(mol2_molecule) + with pytest.raises(ImportError) as e: + u.atoms.convert_to("RDKIT") + assert "RDKit is required for the RDKitConverter" in str( + e.value) + + +@pytest.mark.skipif(rdkit_installed is False, reason="requires RDKit") +class TestRDKitFunctions(object): + @pytest.mark.parametrize("smi, out", [ + ("[H]C([H])([H])[H]", "C"), + ("[C]1(-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C]1(-[H])", "c1ccccc1"), + ("[Cl]-[C](-[H])-[O]", "C(=O)Cl"), + ("[H]-[C](-[O])-[N](-[H])-[H]", "C(=O)N"), + ("[C](-[H])(-[H])-[C](-[H])-[H]", "C=C"), + ("[P](-O)(-O)(-O)-[O]", "P(O)(O)(O)=O"), + ("[N]-[C]-[H]", "N#C"), + ]) + def test_infer_bond_orders(self, smi, out): + molin = Chem.MolFromSmiles(smi, sanitize=False) + molin = _infer_bo_and_charges(molin) + molin = Chem.RemoveHs(molin) + molref = Chem.MolFromSmiles(out) + assert molin.HasSubstructMatch( + molref) and molref.HasSubstructMatch(molin) + + @pytest.mark.parametrize("smi, atom, charge", [ + ("C-[O]", "O", -1), + ("[N]-[C]-[O]", "O", -1), + ("[N](-[H])(-[H])(-[H])-[H]", "N", 1), + ]) + def test_infer_charges(self, smi, atom, charge): + mol = Chem.MolFromSmiles(smi, sanitize=False) + mol = _infer_bo_and_charges(mol) + index = mol.GetSubstructMatch(Chem.MolFromSmarts(atom))[0] + atom = mol.GetAtomWithIdx(index) + assert atom.GetFormalCharge() == charge