Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RdkitConverter with sanitization #2799

Closed
wants to merge 10 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
305 changes: 301 additions & 4 deletions package/MDAnalysis/coordinates/RDKit.py
Original file line number Diff line number Diff line change
@@ -40,6 +40,8 @@
<Universe with 42 atoms>
>>> u.trajectory
<RDKitReader with 10 frames of 42 atoms>
>>> u.atoms.convert_to("RDKIT")
<rdkit.Chem.rdchem.RWMol object at 0x7fcebb958148>


Classes
@@ -55,15 +57,51 @@
"""

import warnings
import re

import numpy as np

from ..exceptions import NoDataError
from ..topology.guessers import guess_atom_element
from . import memory
from . import base

try:
from rdkit import Chem
except ImportError:
pass
else:
RDBONDTYPE = {
'AROMATIC': Chem.BondType.AROMATIC,
'SINGLE': Chem.BondType.SINGLE,
'DOUBLE': Chem.BondType.DOUBLE,
'TRIPLE': Chem.BondType.TRIPLE,
}
RDBONDORDER = {
1: Chem.BondType.SINGLE,
1.5: Chem.BondType.AROMATIC,
"ar": Chem.BondType.AROMATIC,
2: Chem.BondType.DOUBLE,
3: Chem.BondType.TRIPLE,
}
# add string version of the key for each bond
RDBONDORDER.update({str(key): value for key, value in RDBONDORDER.items()})
RDATTRIBUTES = {
"altLoc": "AltLoc",
"chainID": "ChainId",
"name": "Name",
"occupancy": "Occupancy",
"resname": "ResidueName",
"resid": "ResidueNumber",
"segindex": "SegmentNumber",
"tempfactor": "TempFactor",
}
PERIODIC_TABLE = Chem.GetPeriodicTable()


class RDKitReader(memory.MemoryReader):
"""Coordinate reader for RDKit.

.. versionadded:: 2.0.0
"""
format = 'RDKIT'
@@ -95,10 +133,269 @@ def __init__(self, filename, **kwargs):
"""
n_atoms = filename.GetNumAtoms()
coordinates = np.array([
conf.GetPositions() for conf in filename.GetConformers()],
conf.GetPositions() for conf in filename.GetConformers()],
dtype=np.float32)
if coordinates.size == 0:
warnings.warn("No coordinates found in the RDKit molecule")
coordinates = np.empty((1,n_atoms,3), dtype=np.float32)
coordinates = np.empty((1, n_atoms, 3), dtype=np.float32)
coordinates[:] = np.nan
super(RDKitReader, self).__init__(coordinates, order='fac', **kwargs)
super(RDKitReader, self).__init__(coordinates, order='fac', **kwargs)


class RDKitConverter(base.ConverterBase):
"""Convert MDAnalysis AtomGroup or Universe to `RDKit <https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html#rdkit.Chem.rdchem.Mol>`_ :class:`rdkit.Chem.rdchem.Mol`.

Example
-------

.. code-block:: python

import MDAnalysis as mda
from MDAnalysis.tests.datafiles import PDB_full
u = mda.Universe(PDB_full)
mol = u.select_atoms('resname DMS').convert_to('RDKIT')


.. versionadded:: 2.0.0
"""

lib = 'RDKIT'
units = {'time': None, 'length': 'Angstrom'}

def convert(self, obj):
"""Write selection at current trajectory frame to :class:`~rdkit.Chem.rdchem.Mol`.

Parameters
-----------
obj : AtomGroup or Universe
"""
try:
from rdkit import Chem
except ImportError:
raise ImportError("RDKit is required for the RDKitConverter but "
"it's not installed. Try installing it with \n"
"conda install -c conda-forge rdkit")
try:
# make sure to use atoms (Issue 46)
ag = obj.atoms
except AttributeError:
raise TypeError("No `atoms` attribute in object of type {}, "
"please use a valid AtomGroup or Universe".format(
type(obj))) from None

try:
elements = ag.elements
except NoDataError:
raise AttributeError(
"The `elements` attribute is required for the RDKitConverter "
"but is not present in this AtomGroup. Please refer to the "
"documentation to guess elements from other attributes or "
"type `help(mda.topology.guessers)`") from None

other_attrs = {}
for attr in ["bfactors", "charges", "icodes", "segids", "types"]:
if hasattr(ag, attr):
other_attrs[attr] = getattr(ag, attr)

mol = Chem.RWMol()
atom_mapper = {}

for i, (atom, element) in enumerate(zip(ag, elements)):
# create atom
rdatom = Chem.Atom(element)
# disable adding H to the molecule
rdatom.SetNoImplicit(True)
# add PDB-like properties
mi = Chem.AtomPDBResidueInfo()
for attr, rdattr in RDATTRIBUTES.items():
_add_mda_attr_to_rdkit(atom, attr, rdattr, mi)
rdatom.SetMonomerInfo(mi)
# other properties
for attr in other_attrs.keys():
value = other_attrs[attr][i]
if isinstance(value, np.float):
rdatom.SetDoubleProp("_MDAnalysis_%s" % attr[:-1],
float(value))
elif isinstance(value, np.int):
rdatom.SetIntProp("_MDAnalysis_%s" % attr[:-1], int(value))
else:
rdatom.SetProp("_MDAnalysis_%s" % attr[:-1], value)
# add atom
index = mol.AddAtom(rdatom)
# map index in universe to index in mol
atom_mapper[atom.ix] = index

try:
bonds = ag.bonds
if (len(bonds) == 0) and (ag.n_atoms > 1):
# force guessing bonds
raise NoDataError
except NoDataError:
warnings.warn(
"No `bonds` attribute in this AtomGroup. Guessing bonds based"
"on atoms coordinates")
ag.guess_bonds()
bonds = ag.bonds

border_atom_indices = []
for bond in bonds:
try:
bond_indices = [atom_mapper[i] for i in bond.indices]
except KeyError:
# one of the atoms of the bond is not part of the atomgroup
# save the bond atom that is in the atomgroup for later
for i in bond.indices:
if i in atom_mapper.keys():
border_atom_indices.append(atom_mapper[i])
break
# skip the rest
continue
try:
bond_type = bond.type.upper()
except AttributeError:
# bond type can be a tuple for PDB files
bond_type = None
bond_type = RDBONDTYPE.get(bond_type, RDBONDORDER.get(
bond.order, Chem.BondType.SINGLE))
mol.AddBond(*bond_indices, bond_type)

mol.UpdatePropertyCache(strict=False)

# infer bond orders and formal charges from the connectivity
_infer_bo_and_charges(mol, border_atom_indices)

# sanitize
Chem.SanitizeMol(mol)

return mol


def _add_mda_attr_to_rdkit(atom, attr, rdattr, mi):
"""Converts an MDAnalysis atom attribute into the RDKit equivalent and
stores it into an RDKit AtomPDBResidueInfo object.

Parameters
----------

atom : MDAnalysis.core.groups.Atom
The atom to get the attributes from
attr : str
Name of the atom attribute in MDAnalysis in the singular form
rdattr : str
Name of the equivalent attribute in RDKit, as found in the `Set` and
`Get` methods of the `AtomPDBResidueInfo`
mi : rdkit.Chem.rdchem.AtomPDBResidueInfo
MonomerInfo object containing all the relevant atom attributes
"""
try: # get value in MDA atom
value = getattr(atom, attr)
except AttributeError:
pass
else:
if isinstance(value, np.generic):
# convert numpy types to python standard types
value = value.item()
if attr == "name":
# RDKit needs the name to be properly formated for a
# PDB file (1 letter elements start at col 14)
name = re.findall('(\D+|\d+)', value)
if len(name) == 2:
symbol, number = name
else:
symbol, number = name[0], ""
value = "{:>2}".format(symbol) + "{:<2}".format(number)
# set attribute value in RDKit MonomerInfo
getattr(mi, "Set%s" % rdattr)(value)


def _infer_bo_and_charges(mol, border_atom_indices):
"""Infer bond orders and formal charges from a molecule.

- Step 1
Since most MD topology files don't explicitely retain informations on bond
orders or charges, it has to be guessed from the topology. This is done by
looping other each atom and comparing its expected valence to the current
valence, called `delta_v`. If two neighbouring atoms have a common
positive delta_v, the bond between them most likely has a bond order of
1+delta_v. If an atom doesn't share a delta_v with any of its neighbours,
it likely needs a formal charge of -delta_v.

- Step 2
Some atoms can be "mutilated" by a selection (i.e. one of their bonds is cut). The previous step is likely to assign a formal charge to such atoms even if they weren't charged in the original topology. This step converts the resulting charges to radical electrons, or in some cases to higher order bonds. This ensures the atomgroup is not artificially charged because of the previous step.

Parameters
----------

mol : rdkit.Chem.rdchem.RWMol
The molecule is modified inplace and must have all hydrogens added

border_atom_indices : list
List of border atoms indices
"""
# Step 1
for atom in mol.GetAtoms():
# create delta_v for each possible valence
expected_vs = PERIODIC_TABLE.GetValenceList(atom.GetAtomicNum())
current_v = atom.GetTotalValence()
delta_vs = [expected_v - current_v for expected_v in expected_vs]

# if there's only one possible valence state and the correpsonding
# delta_v is negative, it means we can only add a positive charge to
# the atom
if (len(delta_vs) == 1) and (delta_vs[0] < 0):
charge = -delta_vs[0]
atom.SetFormalCharge(charge)
mol.UpdatePropertyCache(strict=False)
else:
neighbors = atom.GetNeighbors()
# check if one of the neighbors has a common delta_v
for i, na in enumerate(neighbors, start=1):
# create delta_v for the neighbor
na_expected_vs = PERIODIC_TABLE.GetValenceList(
na.GetAtomicNum())
na_current = na.GetTotalValence()
na_delta = [
na_expected - na_current for na_expected in na_expected_vs]
# smallest common delta_v, else NaN
common_delta = min(set(delta_vs).intersection(na_delta),
default=np.nan)
# common_delta == 0 means we don't need to do anything
if common_delta != 0:
# if they have no delta_v in common
if common_delta is np.nan:
# if it's the last neighbor
if i == len(neighbors):
charge = -delta_vs[0] # negative
atom.SetFormalCharge(charge)
mol.UpdatePropertyCache(strict=False)
# if they both need a supplementary bond
else:
bond = mol.GetBondBetweenAtoms(
atom.GetIdx(), na.GetIdx())
bond.SetBondType(RDBONDORDER[common_delta+1])
mol.UpdatePropertyCache(strict=False)
break # out of neighbors loop

# Step 2
for i in border_atom_indices:
atom = mol.GetAtomWithIdx(i)
charge = atom.GetFormalCharge()
neighbors = atom.GetNeighbors()
# check if a neighbor atom also bears a charge
for i, na in enumerate(neighbors, 1):
na_charge = na.GetFormalCharge()
if na_charge < 0:
# both atoms have a negative charge
# convert to higher order bond
common_delta = max([charge, na_charge])
bond = mol.GetBondBetweenAtoms(atom.GetIdx(), na.GetIdx())
bond.SetBondType(RDBONDORDER[-common_delta+1])
na.SetFormalCharge(na_charge - common_delta)
atom.SetFormalCharge(0)
atom.SetNumRadicalElectrons(common_delta - charge)
break
elif i == len(neighbors):
# no neighbor shares a negative charge
atom.SetNumRadicalElectrons(-atom.GetFormalCharge())
atom.SetFormalCharge(0)
mol.UpdatePropertyCache(strict=False)
Loading