-
Notifications
You must be signed in to change notification settings - Fork 874
/
Copy pathmolecule_matcher.py
1301 lines (1040 loc) · 46.6 KB
/
molecule_matcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""
This module provides classes to perform fitting of molecule with arbitrary
atom orders.
This module is supposed to perform exact comparisons without the atom order
correspondence prerequisite, while molecule_structure_comparator is supposed
to do rough comparisons with the atom order correspondence prerequisite.
The implementation is based on an excellent python package called `rmsd` that
you can find at https://github.com/charnley/rmsd.
"""
from __future__ import annotations
import abc
import copy
import itertools
import logging
import math
import re
from typing import TYPE_CHECKING
import numpy as np
from monty.dev import requires
from monty.json import MSONable
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cdist
from pymatgen.core.structure import Molecule
try:
from openbabel import openbabel
from pymatgen.io.babel import BabelMolAdaptor
except ImportError:
openbabel = BabelMolAdaptor = None # type: ignore[misc]
if TYPE_CHECKING:
from typing_extensions import Self
__author__ = "Xiaohui Qu, Adam Fekete"
__version__ = "1.0"
__email__ = "[email protected]"
logger = logging.getLogger(__name__)
class AbstractMolAtomMapper(MSONable, abc.ABC):
"""
Abstract molecular atom order mapping class. A mapping will be able to
find the uniform atom order of two molecules that can pair the
geometrically equivalent atoms.
"""
@abc.abstractmethod
def uniform_labels(self, mol1, mol2):
"""
Pair the geometrically equivalent atoms of the molecules.
Args:
mol1: First molecule. OpenBabel OBMol or pymatgen Molecule object.
mol2: Second molecule. OpenBabel OBMol or pymatgen Molecule object.
Returns:
tuple[list1, list2]: if uniform atom order is found. list1 and list2
are for mol1 and mol2, respectively. Their length equal
to the number of atoms. They represents the uniform atom order
of the two molecules. The value of each element is the original
atom index in mol1 or mol2 of the current atom in uniform atom order.
(None, None) if uniform atom is not available.
"""
@abc.abstractmethod
def get_molecule_hash(self, mol):
"""
Defines a hash for molecules. This allows molecules to be grouped
efficiently for comparison.
Args:
mol: The molecule. OpenBabel OBMol or pymatgen Molecule object
Returns:
A hashable object. Examples can be string formulas, etc.
"""
@classmethod
def from_dict(cls, dct: dict) -> Self:
"""
Args:
dct (dict): Dict representation.
Returns:
AbstractMolAtomMapper
"""
for trans_modules in ["molecule_matcher"]:
level = 0 # Python 3.x
mod = __import__(
f"pymatgen.analysis.{trans_modules}",
globals(),
locals(),
[dct["@class"]],
level,
)
if hasattr(mod, dct["@class"]):
class_proxy = getattr(mod, dct["@class"])
return class_proxy.from_dict(dct)
raise ValueError("Invalid Comparator dict")
class IsomorphismMolAtomMapper(AbstractMolAtomMapper):
"""Pair atoms by isomorphism permutations in the OpenBabel::OBAlign class."""
def uniform_labels(self, mol1, mol2):
"""
Pair the geometrically equivalent atoms of the molecules.
Calculate RMSD on all possible isomorphism mappings and return mapping
with the least RMSD.
Args:
mol1: First molecule. OpenBabel OBMol or pymatgen Molecule object.
mol2: Second molecule. OpenBabel OBMol or pymatgen Molecule object.
Returns:
tuple[list1, list2]: if uniform atom order is found. list1 and list2
are for mol1 and mol2, respectively. Their length equal
to the number of atoms. They represents the uniform atom order
of the two molecules. The value of each element is the original
atom index in mol1 or mol2 of the current atom in uniform atom order.
(None, None) if uniform atom is not available.
"""
ob_mol1 = BabelMolAdaptor(mol1).openbabel_mol
ob_mol2 = BabelMolAdaptor(mol2).openbabel_mol
h1 = self.get_molecule_hash(ob_mol1)
h2 = self.get_molecule_hash(ob_mol2)
if h1 != h2:
return None, None
query = openbabel.CompileMoleculeQuery(ob_mol1)
iso_mapper = openbabel.OBIsomorphismMapper.GetInstance(query)
isomorph = openbabel.vvpairUIntUInt()
iso_mapper.MapAll(ob_mol2, isomorph)
sorted_isomorph = [sorted(x, key=lambda morp: morp[0]) for x in isomorph]
label2_list = tuple(tuple(p[1] + 1 for p in x) for x in sorted_isomorph)
vmol1 = ob_mol1
aligner = openbabel.OBAlign(True, False) # includeH=True, symmetry=False # noqa: FBT003
aligner.SetRefMol(vmol1)
least_rmsd = float("Inf")
best_label2 = None
label1 = list(range(1, ob_mol1.NumAtoms() + 1))
# noinspection PyProtectedMember
elements1 = InchiMolAtomMapper._get_elements(vmol1, label1)
for label2 in label2_list:
# noinspection PyProtectedMember
elements2 = InchiMolAtomMapper._get_elements(ob_mol2, label2)
if elements1 != elements2:
continue
vmol2 = openbabel.OBMol()
for idx in label2:
vmol2.AddAtom(ob_mol2.GetAtom(idx))
aligner.SetTargetMol(vmol2)
aligner.Align()
rmsd = aligner.GetRMSD()
if rmsd < least_rmsd:
least_rmsd = rmsd
best_label2 = copy.copy(label2)
return label1, best_label2
def get_molecule_hash(self, mol):
"""Return inchi as molecular hash."""
ob_conv = openbabel.OBConversion()
ob_conv.SetOutFormat("inchi")
ob_conv.AddOption("X", openbabel.OBConversion.OUTOPTIONS, "DoNotAddH")
inchi_text = ob_conv.WriteString(mol)
match = re.search(r"InChI=(?P<inchi>.+)\n", inchi_text)
return match.group("inchi")
def as_dict(self):
"""Get MSONable dict."""
return {
"version": __version__,
"@module": type(self).__module__,
"@class": type(self).__name__,
}
@classmethod
def from_dict(cls, dct: dict) -> Self:
"""
Args:
dct (dict): Dict representation.
Returns:
IsomorphismMolAtomMapper
"""
return cls()
class InchiMolAtomMapper(AbstractMolAtomMapper):
"""Pair atoms by inchi labels."""
def __init__(self, angle_tolerance=10.0):
"""
Args:
angle_tolerance (float): Angle threshold to assume linear molecule. In degrees.
"""
self._angle_tolerance = angle_tolerance
self._assistant_mapper = IsomorphismMolAtomMapper()
def as_dict(self):
"""Get MSONable dict."""
return {
"version": __version__,
"@module": type(self).__module__,
"@class": type(self).__name__,
"angle_tolerance": self._angle_tolerance,
}
@classmethod
def from_dict(cls, dct: dict) -> Self:
"""
Args:
dct (dict): Dict Representation.
Returns:
InchiMolAtomMapper
"""
return cls(angle_tolerance=dct["angle_tolerance"])
@staticmethod
def _inchi_labels(mol):
"""Get the inchi canonical labels of the heavy atoms in the molecule.
Args:
mol: The molecule. OpenBabel OBMol object
Returns:
The label mappings. List of tuple of canonical label,
original label
List of equivalent atoms.
"""
ob_conv = openbabel.OBConversion()
ob_conv.SetOutFormat("inchi")
ob_conv.AddOption("a", openbabel.OBConversion.OUTOPTIONS)
ob_conv.AddOption("X", openbabel.OBConversion.OUTOPTIONS, "DoNotAddH")
inchi_text = ob_conv.WriteString(mol)
match = re.search(
r"InChI=(?P<inchi>.+)\nAuxInfo=.+/N:(?P<labels>[0-9,;]+)/(E:(?P<eq_atoms>[0-9,;\(\)]*)/)?",
inchi_text,
)
inchi = match.group("inchi")
label_text = match.group("labels")
eq_atom_text = match.group("eq_atoms")
heavy_atom_labels = tuple(int(idx) for idx in label_text.replace(";", ",").split(","))
eq_atoms = []
if eq_atom_text is not None:
eq_tokens = re.findall(r"\(((?:[0-9]+,)+[0-9]+)\)", eq_atom_text.replace(";", ","))
eq_atoms = tuple(tuple(int(idx) for idx in t.split(",")) for t in eq_tokens)
return heavy_atom_labels, eq_atoms, inchi
@staticmethod
def _group_centroid(mol, ilabels, group_atoms):
"""
Calculate the centroids of a group atoms indexed by the labels of inchi.
Args:
mol: The molecule. OpenBabel OBMol object
ilabel: inchi label map
Returns:
Centroid. Tuple (x, y, z)
"""
c1x, c1y, c1z = 0.0, 0.0, 0.0
for idx in group_atoms:
orig_idx = ilabels[idx - 1]
oa1 = mol.GetAtom(orig_idx)
c1x += float(oa1.x())
c1y += float(oa1.y())
c1z += float(oa1.z())
n_atoms = len(group_atoms)
c1x /= n_atoms
c1y /= n_atoms
c1z /= n_atoms
return c1x, c1y, c1z
def _virtual_molecule(self, mol, ilabels, eq_atoms):
"""
Create a virtual molecule by unique atoms, the centroids of the
equivalent atoms.
Args:
mol: The molecule. OpenBabel OBMol object
ilabels: inchi label map
eq_atoms: equivalent atom labels
farthest_group_idx: The equivalent atom group index in which
there is the farthest atom to the centroid
Returns:
The virtual molecule
"""
vmol = openbabel.OBMol()
non_unique_atoms = {a for g in eq_atoms for a in g}
all_atoms = set(range(1, len(ilabels) + 1))
unique_atom_labels = sorted(all_atoms - non_unique_atoms)
# try to align molecules using unique atoms
for idx in unique_atom_labels:
orig_idx = ilabels[idx - 1]
oa1 = mol.GetAtom(orig_idx)
a1 = vmol.NewAtom()
a1.SetAtomicNum(oa1.GetAtomicNum())
a1.SetVector(oa1.GetVector())
# try to align using centroids of the equivalent atoms
if vmol.NumAtoms() < 3:
for symm in eq_atoms:
c1x, c1y, c1z = self._group_centroid(mol, ilabels, symm)
min_distance = float("inf")
for idx in range(1, vmol.NumAtoms() + 1):
va = vmol.GetAtom(idx)
distance = math.sqrt((c1x - va.x()) ** 2 + (c1y - va.y()) ** 2 + (c1z - va.z()) ** 2)
min_distance = min(distance, min_distance)
if min_distance > 0.2:
a1 = vmol.NewAtom()
a1.SetAtomicNum(9)
a1.SetVector(c1x, c1y, c1z)
return vmol
@staticmethod
def _align_heavy_atoms(mol1, mol2, vmol1, vmol2, ilabel1, ilabel2, eq_atoms):
"""
Align the label of topologically identical atoms of second molecule
towards first molecule.
Args:
mol1: First molecule. OpenBabel OBMol object
mol2: Second molecule. OpenBabel OBMol object
vmol1: First virtual molecule constructed by centroids. OpenBabel
OBMol object
vmol2: First virtual molecule constructed by centroids. OpenBabel
OBMol object
ilabel1: inchi label map of the first molecule
ilabel2: inchi label map of the second molecule
eq_atoms: equivalent atom labels
Returns:
corrected inchi labels of heavy atoms of the second molecule
"""
n_virtual = vmol1.NumAtoms()
n_heavy = len(ilabel1)
for idx in ilabel2: # add all heavy atoms
a1 = vmol1.NewAtom()
a1.SetAtomicNum(1)
a1.SetVector(0.0, 0.0, 0.0) # useless, just to pair with vmol2
oa2 = mol2.GetAtom(idx)
a2 = vmol2.NewAtom()
a2.SetAtomicNum(1)
# align using the virtual atoms, these atoms are not
# used to align, but match by positions
a2.SetVector(oa2.GetVector())
aligner = openbabel.OBAlign(False, False) # includeH=False, symmetry=False # noqa: FBT003
aligner.SetRefMol(vmol1)
aligner.SetTargetMol(vmol2)
aligner.Align()
aligner.UpdateCoords(vmol2)
canon_mol1 = openbabel.OBMol()
for idx in ilabel1:
oa1 = mol1.GetAtom(idx)
a1 = canon_mol1.NewAtom()
a1.SetAtomicNum(oa1.GetAtomicNum())
a1.SetVector(oa1.GetVector())
aligned_mol2 = openbabel.OBMol()
for idx in range(n_virtual + 1, n_virtual + n_heavy + 1):
oa2 = vmol2.GetAtom(idx)
a2 = aligned_mol2.NewAtom()
a2.SetAtomicNum(oa2.GetAtomicNum())
a2.SetVector(oa2.GetVector())
canon_label2 = list(range(1, n_heavy + 1))
for symm in eq_atoms:
for idx in symm:
canon_label2[idx - 1] = -1
for symm in eq_atoms:
candidates1 = list(symm)
candidates2 = list(symm)
for c2 in candidates2:
distance = 99999.0
canon_idx = candidates1[0]
a2 = aligned_mol2.GetAtom(c2)
for c1 in candidates1:
a1 = canon_mol1.GetAtom(c1)
dist = a1.GetDistance(a2)
if dist < distance:
distance = dist
canon_idx = c1
canon_label2[c2 - 1] = canon_idx
candidates1.remove(canon_idx)
canon_inchi_orig_map2 = list(zip(canon_label2, list(range(1, n_heavy + 1)), ilabel2, strict=True))
canon_inchi_orig_map2.sort(key=lambda m: m[0])
return tuple(x[2] for x in canon_inchi_orig_map2)
@staticmethod
def _align_hydrogen_atoms(mol1, mol2, heavy_indices1, heavy_indices2):
"""
Align the label of topologically identical atoms of second molecule
towards first molecule.
Args:
mol1: First molecule. OpenBabel OBMol object
mol2: Second molecule. OpenBabel OBMol object
heavy_indices1: inchi label map of the first molecule
heavy_indices2: label map of the second molecule
Returns:
corrected label map of all atoms of the second molecule
"""
num_atoms = mol2.NumAtoms()
all_atom = set(range(1, num_atoms + 1))
hydrogen_atoms1 = all_atom - set(heavy_indices1)
hydrogen_atoms2 = all_atom - set(heavy_indices2)
label1 = heavy_indices1 + tuple(hydrogen_atoms1)
label2 = heavy_indices2 + tuple(hydrogen_atoms2)
cmol1 = openbabel.OBMol()
for idx in label1:
oa1 = mol1.GetAtom(idx)
a1 = cmol1.NewAtom()
a1.SetAtomicNum(oa1.GetAtomicNum())
a1.SetVector(oa1.GetVector())
cmol2 = openbabel.OBMol()
for idx in label2:
oa2 = mol2.GetAtom(idx)
a2 = cmol2.NewAtom()
a2.SetAtomicNum(oa2.GetAtomicNum())
a2.SetVector(oa2.GetVector())
aligner = openbabel.OBAlign(False, False) # includeH=False, symmetry=False # noqa: FBT003
aligner.SetRefMol(cmol1)
aligner.SetTargetMol(cmol2)
aligner.Align()
aligner.UpdateCoords(cmol2)
hydrogen_label2 = []
hydrogen_label1 = list(range(len(heavy_indices1) + 1, num_atoms + 1))
for h2 in range(len(heavy_indices2) + 1, num_atoms + 1):
distance = 99999.0
idx = hydrogen_label1[0]
a2 = cmol2.GetAtom(h2)
for h1 in hydrogen_label1:
a1 = cmol1.GetAtom(h1)
dist = a1.GetDistance(a2)
if dist < distance:
distance = dist
idx = h1
hydrogen_label2.append(idx)
hydrogen_label1.remove(idx)
hydrogen_orig_idx2 = label2[len(heavy_indices2) :]
hydrogen_canon_orig_map2 = list(zip(hydrogen_label2, hydrogen_orig_idx2, strict=True))
hydrogen_canon_orig_map2.sort(key=lambda m: m[0])
hydrogen_canon_indices2 = [x[1] for x in hydrogen_canon_orig_map2]
canon_label1 = label1
canon_label2 = heavy_indices2 + tuple(hydrogen_canon_indices2)
return canon_label1, canon_label2
@staticmethod
def _get_elements(mol, label):
"""
The elements of the atoms in the specified order.
Args:
mol: The molecule. OpenBabel OBMol object.
label: The atom indices. List of integers.
Returns:
Elements. List of integers.
"""
return [int(mol.GetAtom(idx).GetAtomicNum()) for idx in label]
def _is_molecule_linear(self, mol):
"""
Is the molecule a linear one.
Args:
mol: The molecule. OpenBabel OBMol object.
Returns:
bool
"""
if mol.NumAtoms() < 3:
return True
a1 = mol.GetAtom(1)
a2 = mol.GetAtom(2)
for idx in range(3, mol.NumAtoms() + 1):
angle = float(mol.GetAtom(idx).GetAngle(a2, a1))
if angle < 0.0:
angle = -angle
if angle > 90.0:
angle = 180.0 - angle
if angle > self._angle_tolerance:
return False
return True
def uniform_labels(self, mol1, mol2):
"""
Args:
mol1 (Molecule): Molecule 1
mol2 (Molecule): Molecule 2.
Returns:
Labels
"""
ob_mol1 = BabelMolAdaptor(mol1).openbabel_mol
ob_mol2 = BabelMolAdaptor(mol2).openbabel_mol
ilabel1, iequal_atom1, inchi1 = self._inchi_labels(ob_mol1)
ilabel2, iequal_atom2, inchi2 = self._inchi_labels(ob_mol2)
if inchi1 != inchi2:
return None, None # Topologically different
if iequal_atom1 != iequal_atom2:
raise RuntimeError("Design Error! Equivalent atoms are inconsistent")
vmol1 = self._virtual_molecule(ob_mol1, ilabel1, iequal_atom1)
vmol2 = self._virtual_molecule(ob_mol2, ilabel2, iequal_atom2)
if vmol1.NumAtoms() != vmol2.NumAtoms():
return None, None
if vmol1.NumAtoms() < 3 or self._is_molecule_linear(vmol1) or self._is_molecule_linear(vmol2):
# using isomorphism for difficult (actually simple) molecules
c_label1, c_label2 = self._assistant_mapper.uniform_labels(mol1, mol2)
else:
heavy_atom_indices2 = self._align_heavy_atoms(
ob_mol1, ob_mol2, vmol1, vmol2, ilabel1, ilabel2, iequal_atom1
)
c_label1, c_label2 = self._align_hydrogen_atoms(ob_mol1, ob_mol2, ilabel1, heavy_atom_indices2)
if c_label1 and c_label2:
elements1 = self._get_elements(ob_mol1, c_label1)
elements2 = self._get_elements(ob_mol2, c_label2)
if elements1 != elements2:
return None, None
return c_label1, c_label2
def get_molecule_hash(self, mol):
"""Return inchi as molecular hash."""
ob_mol = BabelMolAdaptor(mol).openbabel_mol
return self._inchi_labels(ob_mol)[2]
class MoleculeMatcher(MSONable):
"""Match molecules and identify whether molecules are the same."""
@requires(
openbabel,
"BabelMolAdaptor requires openbabel to be installed with Python "
"bindings. Please get it at https://openbabel.org (version >=3.0.0).",
)
def __init__(self, tolerance: float = 0.01, mapper=None) -> None:
"""
Args:
tolerance (float): RMSD difference threshold whether two molecules are
different
mapper (AbstractMolAtomMapper): MolAtomMapper object that is able to map the atoms of two
molecule to uniform order.
"""
self._tolerance = tolerance
self._mapper = mapper or InchiMolAtomMapper()
def fit(self, mol1, mol2):
"""Fit two molecules.
Args:
mol1: First molecule. OpenBabel OBMol or pymatgen Molecule object
mol2: Second molecule. OpenBabel OBMol or pymatgen Molecule object
Returns:
bool: True if two molecules are the same.
"""
return self.get_rmsd(mol1, mol2) < self._tolerance
def get_rmsd(self, mol1, mol2):
"""Get RMSD between two molecule with arbitrary atom order.
Returns:
RMSD if topology of the two molecules are the same
Infinite if the topology is different
"""
label1, label2 = self._mapper.uniform_labels(mol1, mol2)
if label1 is None or label2 is None:
return float("Inf")
return self._calc_rms(mol1, mol2, label1, label2)
@staticmethod
def _calc_rms(mol1, mol2, clabel1, clabel2):
"""
Calculate the RMSD.
Args:
mol1: The first molecule. OpenBabel OBMol or pymatgen Molecule
object
mol2: The second molecule. OpenBabel OBMol or pymatgen Molecule
object
clabel1: The atom indices that can reorder the first molecule to
uniform atom order
clabel1: The atom indices that can reorder the second molecule to
uniform atom order
Returns:
The RMSD.
"""
ob_mol1 = BabelMolAdaptor(mol1).openbabel_mol
ob_mol2 = BabelMolAdaptor(mol2).openbabel_mol
cmol1 = openbabel.OBMol()
for idx in clabel1:
oa1 = ob_mol1.GetAtom(idx)
a1 = cmol1.NewAtom()
a1.SetAtomicNum(oa1.GetAtomicNum())
a1.SetVector(oa1.GetVector())
cmol2 = openbabel.OBMol()
for idx in clabel2:
oa2 = ob_mol2.GetAtom(idx)
a2 = cmol2.NewAtom()
a2.SetAtomicNum(oa2.GetAtomicNum())
a2.SetVector(oa2.GetVector())
aligner = openbabel.OBAlign(True, False) # includeH=True, symmetry=False # noqa: FBT003
aligner.SetRefMol(cmol1)
aligner.SetTargetMol(cmol2)
aligner.Align()
return aligner.GetRMSD()
def group_molecules(self, mol_list):
"""
Group molecules by structural equality.
Args:
mol_list: List of OpenBabel OBMol or pymatgen objects
Returns:
A list of lists of matched molecules
Assumption: if s1=s2 and s2=s3, then s1=s3
This may not be true for small tolerances.
"""
mol_hash = [(idx, self._mapper.get_molecule_hash(mol)) for idx, mol in enumerate(mol_list)]
mol_hash.sort(key=lambda x: x[1])
# Use molecular hash to pre-group molecules.
raw_groups = tuple(tuple(m[0] for m in g) for k, g in itertools.groupby(mol_hash, key=lambda x: x[1]))
group_indices = []
for rg in raw_groups:
mol_eq_test = [
(p[0], p[1], self.fit(mol_list[p[0]], mol_list[p[1]])) for p in itertools.combinations(sorted(rg), 2)
]
mol_eq = {(p[0], p[1]) for p in mol_eq_test if p[2]}
not_alone_mols = set(itertools.chain.from_iterable(mol_eq))
alone_mols = set(rg) - not_alone_mols
group_indices.extend([[m] for m in alone_mols])
while len(not_alone_mols) > 0:
current_group = {not_alone_mols.pop()}
while len(not_alone_mols) > 0:
candidate_pairs = {tuple(sorted(p)) for p in itertools.product(current_group, not_alone_mols)}
mutual_pairs = candidate_pairs & mol_eq
if len(mutual_pairs) == 0:
break
mutual_mols = set(itertools.chain.from_iterable(mutual_pairs))
current_group |= mutual_mols
not_alone_mols -= mutual_mols
group_indices.append(sorted(current_group))
group_indices.sort(key=lambda x: (len(x), -x[0]), reverse=True)
return [[mol_list[idx] for idx in g] for g in group_indices]
def as_dict(self):
"""Get MSONable dict."""
return {
"version": __version__,
"@module": type(self).__module__,
"@class": type(self).__name__,
"tolerance": self._tolerance,
"mapper": self._mapper.as_dict(),
}
@classmethod
def from_dict(cls, dct: dict) -> Self:
"""
Args:
dct (dict): Dict representation.
Returns:
MoleculeMatcher
"""
return cls(
tolerance=dct["tolerance"],
mapper=AbstractMolAtomMapper.from_dict(dct["mapper"]),
)
class KabschMatcher(MSONable):
"""Molecule matcher using Kabsch algorithm.
The Kabsch algorithm capable aligning two molecules by finding the parameters
(translation, rotation) which minimize the root-mean-square-deviation (RMSD) of
two molecules which are topologically (atom types, geometry) similar two each other.
Notes:
When aligning molecules, the atoms of the two molecules **must** be in the same
order for the results to be sensible.
"""
def __init__(self, target: Molecule):
"""Constructor of the matcher object.
Args:
target: a `Molecule` object used as a target during the alignment
"""
self.target = target
def match(self, p: Molecule):
"""Using the Kabsch algorithm the alignment of two molecules (P, Q)
happens in three steps:
- translate the P and Q into their centroid
- compute of the optimal rotation matrix (U) using Kabsch algorithm
- compute the translation (V) and rmsd.
The function returns the rotation matrix (U), translation vector (V),
and RMSD between Q and P', where P' is:
P' = P * U + V
Args:
p: a `Molecule` object what will be matched with the target one.
Returns:
U: Rotation matrix (D,D)
V: Translation vector (D)
RMSD : Root mean squared deviation between P and Q
"""
if self.target.atomic_numbers != p.atomic_numbers:
raise ValueError("The order of the species aren't matching! Please try using PermInvMatcher.")
p_coord, q_coord = p.cart_coords, self.target.cart_coords
# Both sets of coordinates must be translated first, so that their
# centroid coincides with the origin of the coordinate system.
p_trans, q_trans = p_coord.mean(axis=0), q_coord.mean(axis=0)
p_centroid, q_centroid = p_coord - p_trans, q_coord - q_trans
# The optimal rotation matrix U using Kabsch algorithm
U = self.kabsch(p_centroid, q_centroid)
p_prime_centroid = np.dot(p_centroid, U)
rmsd = np.sqrt(np.mean(np.square(p_prime_centroid - q_centroid)))
V = q_trans - np.dot(p_trans, U)
return U, V, rmsd
def fit(self, p: Molecule):
"""Rotate and transform `p` molecule according to the best match.
Args:
p: a `Molecule` object what will be matched with the target one.
Returns:
p_prime: Rotated and translated of the `p` `Molecule` object
rmsd: Root-mean-square-deviation between `p_prime` and the `target`
"""
U, V, rmsd = self.match(p)
# Rotate and translate matrix `p` onto the target molecule.
# P' = P * U + V
p_prime = p.copy()
for site in p_prime:
site.coords = np.dot(site.coords, U) + V
return p_prime, rmsd
@staticmethod
def kabsch(P: np.ndarray, Q: np.ndarray):
"""The Kabsch algorithm is a method for calculating the optimal rotation matrix
that minimizes the root mean squared deviation (RMSD) between two paired sets of points
P and Q, centered around the their centroid.
For more info see:
- https://wikipedia.org/wiki/Kabsch_algorithm and
- https://cnx.org/contents/HV-RsdwL@23/Molecular-Distance-Measures
Args:
P: Nx3 matrix, where N is the number of points.
Q: Nx3 matrix, where N is the number of points.
Returns:
U: 3x3 rotation matrix
"""
# Computation of the cross-covariance matrix
C = np.dot(P.T, Q)
# Computation of the optimal rotation matrix
# using singular value decomposition (SVD).
V, _S, WT = np.linalg.svd(C)
# Getting the sign of the det(V*Wt) to decide whether
det = np.linalg.det(np.dot(V, WT))
# And finally calculating the optimal rotation matrix R
# we need to correct our rotation matrix to ensure a right-handed coordinate system.
return np.dot(np.dot(V, np.diag([1, 1, det])), WT)
class BruteForceOrderMatcher(KabschMatcher):
"""Finding the best match between molecules by selecting molecule order
with the smallest RMSD from all the possible order combinations.
Notes:
When aligning molecules, the atoms of the two molecules **must** have same number
of atoms from the same species.
"""
def match(self, mol: Molecule, ignore_warning: bool = False) -> tuple[np.ndarray, np.ndarray, np.ndarray, float]:
"""Similar as `KabschMatcher.match` but this method also finds the order of
atoms which belongs to the best match.
A `ValueError` will be raised when the total number of possible combinations
become unfeasible (more than a million combination).
Args:
mol: a `Molecule` object what will be matched with the target one.
ignore_warning: ignoring error when the number of combination is too large
Returns:
inds: The indices of atoms
U: 3x3 rotation matrix
V: Translation vector
rmsd: Root mean squared deviation between P and Q
"""
target_mol = self.target
if sorted(mol.atomic_numbers) != sorted(target_mol.atomic_numbers):
raise ValueError("The number of the same species aren't matching!")
_, count = np.unique(mol.atomic_numbers, return_counts=True)
total_permutations = 1
for c in count:
total_permutations *= math.factorial(c)
if not ignore_warning and total_permutations > 1_000_000:
raise ValueError(
f"The number of all possible permutations ({total_permutations}) is not feasible to run this method!"
)
p_coord, q_coord = mol.cart_coords, target_mol.cart_coords
p_atoms, q_atoms = np.array(mol.atomic_numbers), np.array(target_mol.atomic_numbers)
# Both sets of coordinates must be translated first, so that
# their centroid coincides with the origin of the coordinate system.
p_trans, q_trans = p_coord.mean(axis=0), q_coord.mean(axis=0)
p_centroid, q_centroid = p_coord - p_trans, q_coord - q_trans
# Sort the order of the target molecule by the elements
q_inds = np.argsort(q_atoms)
q_centroid = q_centroid[q_inds]
# Initializing return values
rmsd = np.inf
# Generate all permutation grouped/sorted by the elements
p_inds = []
U = np.empty(0)
for p_inds_test in self.permutations(p_atoms):
p_centroid_test = p_centroid[p_inds_test]
U_test = self.kabsch(p_centroid_test, q_centroid)
p_centroid_prime_test = np.dot(p_centroid_test, U_test)
rmsd_test = np.sqrt(np.mean(np.square(p_centroid_prime_test - q_centroid)))
if rmsd_test < rmsd:
p_inds, U, rmsd = p_inds_test, U_test, rmsd_test
# Rotate and translate matrix P unto matrix Q using Kabsch algorithm.
# P' = P * U + V
V = q_trans - np.dot(p_trans, U)
# Using the original order of the indices
indices = p_inds[np.argsort(q_inds)]
return indices, U, V, rmsd
def fit(self, p: Molecule, ignore_warning=False):
"""Order, rotate and transform `p` molecule according to the best match.
A `ValueError` will be raised when the total number of possible combinations
become unfeasible (more than a million combinations).
Args:
p: a `Molecule` object what will be matched with the target one.
ignore_warning: ignoring error when the number of combination is too large
Returns:
p_prime: Rotated and translated of the `p` `Molecule` object
rmsd: Root-mean-square-deviation between `p_prime` and the `target`
"""
inds, U, V, rmsd = self.match(p, ignore_warning=ignore_warning)
p_prime = Molecule.from_sites([p[idx] for idx in inds])
for site in p_prime:
site.coords = np.dot(site.coords, U) + V
return p_prime, rmsd
@staticmethod
def permutations(atoms):
"""Generate all the possible permutations of atom order. To achieve better
performance all the cases where the atoms are different has been ignored.
"""
element_iterators = [itertools.permutations(np.where(atoms == element)[0]) for element in np.unique(atoms)]
for inds in itertools.product(*element_iterators):
yield np.array(list(itertools.chain(*inds)))
class HungarianOrderMatcher(KabschMatcher):
"""Pre-align the molecules based on their principal inertia
axis and then re-orders the input atom list using the Hungarian method.
Notes:
This method cannot guarantee the best match but is very fast.
When aligning molecules, the atoms of the two molecules **must** have same number
of atoms from the same species.
"""
def match(self, p: Molecule):
"""Similar as `KabschMatcher.match` but this method also finds the order of
atoms which belongs to the best match.
Args:
p: a `Molecule` object what will be matched with the target one.
Returns:
inds: The indices of atoms
U: 3x3 rotation matrix
V: Translation vector
rmsd: Root mean squared deviation between P and Q
"""
if sorted(p.atomic_numbers) != sorted(self.target.atomic_numbers):
raise ValueError("The number of the same species aren't matching!")
p_coord, q_coord = p.cart_coords, self.target.cart_coords
p_atoms, q_atoms = (
np.array(p.atomic_numbers),
np.array(self.target.atomic_numbers),
)
p_weights = np.array([site.species.weight for site in p])
q_weights = np.array([site.species.weight for site in self.target])
# Both sets of coordinates must be translated first, so that
# their center of mass with the origin of the coordinate system.
p_trans, q_trans = p.center_of_mass, self.target.center_of_mass
p_centroid, q_centroid = p_coord - p_trans, q_coord - q_trans
# Initializing return values
rmsd = np.inf
# Generate all permutation grouped/sorted by the elements
inds = []
U = np.empty(0)
for p_inds_test in self.permutations(p_atoms, p_centroid, p_weights, q_atoms, q_centroid, q_weights):
p_centroid_test = p_centroid[p_inds_test]
U_test = self.kabsch(p_centroid_test, q_centroid)
p_centroid_prime_test = np.dot(p_centroid_test, U_test)
rmsd_test = np.sqrt(np.mean(np.square(p_centroid_prime_test - q_centroid)))
if rmsd_test < rmsd:
inds, U, rmsd = p_inds_test, U_test, rmsd_test
# Rotate and translate matrix P unto matrix Q using Kabsch algorithm.
# P' = P * U + V
V = q_trans - np.dot(p_trans, U)
return inds, U, V, rmsd
def fit(self, p: Molecule):