-
Notifications
You must be signed in to change notification settings - Fork 31
/
Copy pathQM8Dataset.py
78 lines (63 loc) · 3.83 KB
/
QM8Dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
import numpy as np
from kgcnn.data.qm import QMDataset
from kgcnn.data.download import DownloadDataset
class QM8Dataset(QMDataset, DownloadDataset):
    r"""Store and process QM8 dataset from `MoleculeNet <https://moleculenet.org/>`__ datasets.

    From `Quantum Machine <http://quantum-machine.org/datasets/>`__ :

    Due to its favorable computational efficiency, time-dependent (TD) density functional theory(DFT) enables
    the prediction of electronic spectra in a high-throughput manner across chemical space. Its predictions,
    however, can be quite inaccurate. We resolve this issue with machine learning models trained on deviations of
    reference second-order approximate coupled-cluster (CC2) singles and doubles spectra from TDDFT counterparts,
    or even from DFT gap. We applied this approach to low-lying singlet-singlet vertical electronic spectra of
    over 20000 synthetically feasible small organic molecules with up to eight CONF atoms. The prediction errors
    decay monotonously as a function of training set size. For a training set of 10000 molecules,
    CC2 excitation energies can be reproduced to within ±0.1 eV for the remaining molecules.
    Analysis of our spectral database via chromophore counting suggests that even higher accuracies can be achieved.
    Based on the evidence collected, we discuss open challenges associated with data-driven modeling of
    high-lying spectra and transition intensities.

    .. note::

        We take the pre-processed dataset from `MoleculeNet <https://moleculenet.org/>`_ .

    References:

        (1) L. Ruddigkeit, R. van Deursen, L. C. Blum, J.-L. Reymond, Enumeration of 166 billion organic small
            molecules in the chemical universe database GDB-17, J. Chem. Inf. Model. 52, 2864–2875, 2012.
        (2) R. Ramakrishnan, M. Hartmann, E. Tapavicza, O. A. von Lilienfeld, Electronic Spectra from TDDFT and
            Machine Learning in Chemical Space, J. Chem. Phys. 143 084111, 2015.

    """

    # Keyword arguments forwarded verbatim to ``DownloadDataset.__init__``.
    download_info = {
        "dataset_name": "QM8",
        "data_directory_name": "qm8",
        "download_url": "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/gdb8.tar.gz",
        "download_file_name": 'gdb8.tar.gz',
        "unpack_tar": True,
        "unpack_zip": False,
        "unpack_directory_name": "gdb8"
    }

    def __init__(self, reload: bool = False, verbose: int = 10):
        """Initialize QM8 dataset.

        Runs both base-class initializers, sets the label metadata and file locations, renames the
        unpacked ``qm8.sdf.csv`` to ``qm8.csv`` if needed and finally reads the dataset into memory.

        Args:
            reload (bool): Whether to reload the data and make new dataset. Default is False.
            verbose (int): Print progress or info for processing where 60=silent. Default is 10.
        """
        QMDataset.__init__(self, verbose=verbose, dataset_name="QM8")
        DownloadDataset.__init__(self, **self.download_info, reload=reload, verbose=verbose)

        # NOTE(review): the four PBE0 names appear twice in this list. Presumably the source CSV
        # carries two PBE0 column groups with identical headers - confirm how `read_in_memory`
        # resolves duplicated column names before relying on the second group.
        self.label_names = [
            "E1-CC2", "E2-CC2", "f1-CC2", "f2-CC2", "E1-PBE0", "E2-PBE0", "f1-PBE0",
            "f2-PBE0", "E1-PBE0", "E2-PBE0", "f1-PBE0", "f2-PBE0", "E1-CAM", "E2-CAM", "f1-CAM", "f2-CAM"
        ]
        self.label_units = ["[?]"] * len(self.label_names)
        # Pick always same units for training: identity factor for every label. Sized from
        # `label_names` (instead of a hard-coded 14) so it stays consistent with `label_units`.
        self.label_unit_conversion = np.array([1.0] * len(self.label_names))
        self.dataset_name = "QM8"
        self.require_prepare_data = False
        self.fits_in_memory = True
        self.verbose = verbose
        self.data_directory = os.path.join(self.data_main_dir, self.data_directory_name, self.unpack_directory_name)
        self.file_name = "qm8.csv"

        # The archive ships the table as "qm8.sdf.csv"; move it to the expected file path once.
        # `file_path` is not defined in this class - presumably derived by a base class from
        # `data_directory` and `file_name`.
        if not os.path.exists(self.file_path):
            original_name = os.path.join(self.data_directory, "qm8.sdf.csv")
            if os.path.exists(original_name):
                os.rename(original_name, self.file_path)

        # `require_prepare_data` is set to False above, so this branch is skipped unless a
        # subclass or base-class property changes the flag.
        if self.require_prepare_data:
            self.prepare_data(overwrite=reload)
        if self.fits_in_memory:
            self.read_in_memory(label_column_name=self.label_names)