diff --git a/src/biotite/structure/io/ctab.py b/src/biotite/structure/io/ctab.py index ed253c069..f2dc61982 100644 --- a/src/biotite/structure/io/ctab.py +++ b/src/biotite/structure/io/ctab.py @@ -2,46 +2,20 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -""" -Functions for parsing and writing an :class:`AtomArray` from/to -*MDL* connection tables (Ctab). -""" - __name__ = "biotite.structure.io" __author__ = "Patrick Kunzmann" __all__ = ["read_structure_from_ctab", "write_structure_to_ctab"] import warnings -import numpy as np -from ..error import BadStructureError -from ..atoms import AtomArray, AtomArrayStack -from ..bonds import BondList, BondType - -BOND_TYPE_MAPPING = { - 1: BondType.SINGLE, - 2: BondType.DOUBLE, - 3: BondType.TRIPLE, - 6: BondType.SINGLE, - 7: BondType.DOUBLE, - 8: BondType.ANY, -} -BOND_TYPE_MAPPING_REV = { - BondType.SINGLE: 1, - BondType.DOUBLE: 2, - BondType.TRIPLE: 3, - BondType.AROMATIC_SINGLE: 1, - BondType.AROMATIC_DOUBLE: 2, - BondType.ANY: 8, -} - -CHARGE_MAPPING = {0: 0, 1: 3, 2: 2, 3: 1, 5: -1, 6: -2, 7: -3} -CHARGE_MAPPING_REV = {val: key for key, val in CHARGE_MAPPING.items()} +from ..bonds import BondType def read_structure_from_ctab(ctab_lines): """ Parse a *MDL* connection table (Ctab) to obtain an - :class:`AtomArray`. :footcite:`Dalby1992` + :class:`AtomArray`. :footcite:`Dalby1992`. + + DEPRECATED: Moved to :mod:`biotite.structure.io.mol.ctab`. Parameters ---------- @@ -60,41 +34,9 @@ def read_structure_from_ctab(ctab_lines): .. 
footbibliography:: """ - n_atoms, n_bonds = _get_counts(ctab_lines[0]) - atom_lines = ctab_lines[1 : 1 + n_atoms] - bond_lines = ctab_lines[1 + n_atoms : 1 + n_atoms + n_bonds] - - atoms = AtomArray(n_atoms) - atoms.add_annotation("charge", int) - for i, line in enumerate(atom_lines): - atoms.coord[i, 0] = float(line[0:10]) - atoms.coord[i, 1] = float(line[10:20]) - atoms.coord[i, 2] = float(line[20:30]) - atoms.element[i] = line[31:34].strip().upper() - charge = CHARGE_MAPPING.get(int(line[36:39])) - if charge is None: - warnings.warn( - f"Cannot handle MDL charge type {int(line[36 : 39])}, " - f"0 is used instead" - ) - charge = 0 - atoms.charge[i] = charge - - bond_array = np.zeros((n_bonds, 3), dtype=np.uint32) - for i, line in enumerate(bond_lines): - bond_type = BOND_TYPE_MAPPING.get(int(line[6:9])) - if bond_type is None: - warnings.warn( - f"Cannot handle MDL bond type {int(line[6 : 9])}, " - f"BondType.ANY is used instead" - ) - bond_type = BondType.ANY - bond_array[i, 0] = int(line[0:3]) - 1 - bond_array[i, 1] = int(line[3:6]) - 1 - bond_array[i, 2] = bond_type - atoms.bonds = BondList(n_atoms, bond_array) - - return atoms + warnings.warn("Moved to biotite.structure.io.mol.ctab", DeprecationWarning) + from biotite.structure.io.mol.ctab import read_structure_from_ctab + return read_structure_from_ctab(ctab_lines) def write_structure_to_ctab(atoms, default_bond_type=BondType.ANY): @@ -102,6 +44,8 @@ def write_structure_to_ctab(atoms, default_bond_type=BondType.ANY): Convert an :class:`AtomArray` into a *MDL* connection table (Ctab). :footcite:`Dalby1992` + DEPRECATED: Moved to :mod:`biotite.structure.io.mol.ctab`. + Parameters ---------- atoms : AtomArray @@ -123,44 +67,6 @@ def write_structure_to_ctab(atoms, default_bond_type=BondType.ANY): .. 
footbibliography:: """ - if isinstance(atoms, AtomArrayStack): - raise TypeError( - "An 'AtomArrayStack' was given, " - "but only a single model can be written" - ) - if atoms.bonds is None: - raise BadStructureError("Input AtomArray has no associated BondList") - - try: - charge = atoms.charge - except AttributeError: - charge = np.zeros(atoms.array_length(), dtype=int) - - atom_lines = [ - f"{atoms.coord[i,0]:>10.5f}" - f"{atoms.coord[i,1]:>10.5f}" - f"{atoms.coord[i,2]:>10.5f}" - f" {atoms.element[i]:>3}" - f" {CHARGE_MAPPING_REV.get(charge[i], 0):>3d}" + f"{0:>3d}" * 10 - for i in range(atoms.array_length()) - ] - - default_bond_value = BOND_TYPE_MAPPING_REV[default_bond_type] - - bond_lines = [ - f"{i+1:>3d}{j+1:>3d}" - f"{BOND_TYPE_MAPPING_REV.get(bond_type, default_bond_value):>3d}" - + f"{0:>3d}" * 4 - for i, j, bond_type in atoms.bonds.as_array() - ] - - counts_line = ( - f"{len(atom_lines):>3d}{len(bond_lines):>3d}" - " 0 0 0 0 0 0 0 1 V2000" - ) - - return [counts_line] + atom_lines + bond_lines + ["M END"] - - -def _get_counts(counts_line): - return int(counts_line[0:3]), int(counts_line[3:6]) + warnings.warn("Moved to biotite.structure.io.mol.ctab", DeprecationWarning) + from biotite.structure.io.mol.ctab import write_structure_to_ctab + return write_structure_to_ctab(atoms, default_bond_type) diff --git a/src/biotite/structure/io/mol/convert.py b/src/biotite/structure/io/mol/convert.py index dc6d450b3..32ac1180a 100644 --- a/src/biotite/structure/io/mol/convert.py +++ b/src/biotite/structure/io/mol/convert.py @@ -32,7 +32,8 @@ def get_structure(mol_file): return mol_file.get_structure() -def set_structure(mol_file, atoms, default_bond_type=BondType.ANY): +def set_structure(mol_file, atoms, default_bond_type=BondType.ANY, + version=None): """ Set the :class:`AtomArray` for the MOL file. @@ -46,6 +47,12 @@ def set_structure(mol_file, atoms, default_bond_type=BondType.ANY): array : AtomArray The array to be saved into this file. 
Must have an associated :class:`BondList`. - + version : {"V2000", "V3000"}, optional + The version of the CTAB format. + ``"V2000"`` uses the *Atom* and *Bond* block, while ``"V3000"`` + uses the *Properties* block. + By default, ``"V2000"`` is used unless the number of atoms or + bonds exceed the fixed size columns in the table, in which case + ``"V3000"`` is used. """ - mol_file.set_structure(atoms, default_bond_type) + mol_file.set_structure(atoms, default_bond_type, version) diff --git a/src/biotite/structure/io/mol/ctab.py b/src/biotite/structure/io/mol/ctab.py new file mode 100644 index 000000000..54eef19b4 --- /dev/null +++ b/src/biotite/structure/io/mol/ctab.py @@ -0,0 +1,368 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +""" +Functions for parsing and writing an :class:`AtomArray` from/to +*MDL* connection tables (Ctab). +""" + +__name__ = "biotite.structure.io.mol" +__author__ = "Patrick Kunzmann" +__all__ = ["read_structure_from_ctab", "write_structure_to_ctab"] + +import warnings +import shlex +import numpy as np +from ....file import InvalidFileError +from ...error import BadStructureError +from ...atoms import AtomArray, AtomArrayStack +from ...bonds import BondList, BondType + +BOND_TYPE_MAPPING = { + 1: BondType.SINGLE, + 2: BondType.DOUBLE, + 3: BondType.TRIPLE, + 5: BondType.ANY, + 6: BondType.SINGLE, + 7: BondType.DOUBLE, + 8: BondType.ANY, +} +BOND_TYPE_MAPPING_REV = { + BondType.SINGLE: 1, + BondType.DOUBLE: 2, + BondType.TRIPLE: 3, + BondType.AROMATIC_SINGLE: 1, + BondType.AROMATIC_DOUBLE: 2, + BondType.ANY: 8, +} + +CHARGE_MAPPING = {0: 0, 1: 3, 2: 2, 3: 1, 5: -1, 6: -2, 7: -3} +CHARGE_MAPPING_REV = {val: key for key, val in CHARGE_MAPPING.items()} + +V2000_COMPATIBILITY_LINE = " 0 0 0 0 0 0 0 0 0 0999 V3000" + + +def read_structure_from_ctab(ctab_lines): + """ + Parse a *MDL* connection table (Ctab) to obtain an + 
:class:`AtomArray`. + :footcite:`Dalby1992` + + Parameters + ---------- + ctab_lines : lines of str + The lines containing the *ctab*. + Must begin with the *counts* line and end with the `M END` line + + Returns + ------- + atoms : AtomArray + This :class:`AtomArray` contains the optional ``charge`` + annotation and has an associated :class:`BondList`. + + References + ---------- + + ``V3000`` specification was taken from + ``_. + + .. footbibliography:: + + """ + match _get_version(ctab_lines[0]): + case "V2000": + return _read_structure_from_ctab_v2000(ctab_lines) + case "V3000": + return _read_structure_from_ctab_v3000(ctab_lines) + case "": + raise InvalidFileError("CTAB counts line misses version") + case unkown_version: + raise InvalidFileError(f"Unknown CTAB version '{unkown_version}'") + + +def write_structure_to_ctab(atoms, default_bond_type=BondType.ANY, + version=None): + """ + Convert an :class:`AtomArray` into a + *MDL* connection table (Ctab). + :footcite:`Dalby1992` + + Parameters + ---------- + atoms : AtomArray + The array must have an associated :class:`BondList`. + + Returns + ------- + ctab_lines : lines of str + The lines containing the *ctab*. + The lines begin with the *counts* line and end with the `M END` + .line + default_bond_type : BondType + Bond type fallback in the *Bond block* if a bond has no bond_type + defined in *atoms* array. By default, each bond is treated as + :attr:`BondType.ANY`. + version : {"V2000", "V3000"}, optional + The version of the CTAB format. + ``"V2000"`` uses the *Atom* and *Bond* block, while ``"V3000"`` + uses the *Properties* block. + By default, ``"V2000"`` is used unless the number of atoms or + bonds exceeds 1000, in which case ``"V3000"`` is used. + + References + ---------- + + ``V3000`` specification was taken from + ``_. + + .. 
footbibliography:: + + """ + if isinstance(atoms, AtomArrayStack): + raise TypeError( + "An 'AtomArrayStack' was given, " + "but only a single model can be written" + ) + if atoms.bonds is None: + raise BadStructureError("Input AtomArray has no associated BondList") + if np.isnan(atoms.coord).any(): + raise BadStructureError("Input AtomArray has NaN coordinates") + + match version: + case None: + if _is_v2000_compatible( + atoms.array_length(), atoms.bonds.get_bond_count() + ): + return _write_structure_to_ctab_v2000(atoms, default_bond_type) + else: + return _write_structure_to_ctab_v3000(atoms, default_bond_type) + case "V2000": + if not _is_v2000_compatible( + atoms.array_length(), atoms.bonds.get_bond_count() + ): + raise ValueError( + "The given number of atoms or bonds is too large " + "for V2000 format" + ) + return _write_structure_to_ctab_v2000(atoms, default_bond_type) + case "V3000": + return _write_structure_to_ctab_v3000(atoms, default_bond_type) + case unkown_version: + raise ValueError(f"Unknown CTAB version '{unkown_version}'") + + +def _read_structure_from_ctab_v2000(ctab_lines): + n_atoms, n_bonds = _get_counts_v2000(ctab_lines[0]) + atom_lines = ctab_lines[1 : 1 + n_atoms] + bond_lines = ctab_lines[1 + n_atoms : 1 + n_atoms + n_bonds] + + atoms = AtomArray(n_atoms) + atoms.add_annotation("charge", int) + for i, line in enumerate(atom_lines): + atoms.coord[i, 0] = float(line[0:10]) + atoms.coord[i, 1] = float(line[10:20]) + atoms.coord[i, 2] = float(line[20:30]) + atoms.element[i] = line[31:34].strip().upper() + charge = CHARGE_MAPPING.get(int(line[36:39])) + if charge is None: + warnings.warn( + f"Cannot handle MDL charge type {int(line[36 : 39])}, " + f"0 is used instead" + ) + charge = 0 + atoms.charge[i] = charge + + bond_array = np.zeros((n_bonds, 3), dtype=np.uint32) + for i, line in enumerate(bond_lines): + bond_type = BOND_TYPE_MAPPING.get(int(line[6:9])) + if bond_type is None: + warnings.warn( + f"Cannot handle MDL bond type {int(line[6 
: 9])}, " + f"BondType.ANY is used instead" + ) + bond_type = BondType.ANY + bond_array[i, 0] = int(line[0:3]) - 1 + bond_array[i, 1] = int(line[3:6]) - 1 + bond_array[i, 2] = bond_type + atoms.bonds = BondList(n_atoms, bond_array) + + return atoms + +def _read_structure_from_ctab_v3000(ctab_lines): + v30_lines = [ + line[6:].strip() for line in ctab_lines if line.startswith("M V30") + ] + + atom_lines = _get_block_v3000(v30_lines, "ATOM") + if len(atom_lines) == 0: + raise InvalidFileError("ATOM block is empty") + atoms = AtomArray(len(atom_lines)) + atoms.add_annotation("charge", int) + # The V3000 atom index does not necessarily count from 1 to n, + # but allows arbitrary positive integers + # Hence, a mapping from V3000 atom index to AtomArray index is + # needed to get the correct index for a bond + v30_atom_indices = {} + for i, line in enumerate(atom_lines): + if "'" in line or '"' in line: + columns = shlex.split(line) + else: + columns = line.split() + v30_index = int(columns[0]) + v30_type = columns[1] + if v30_type == "R#": + raise NotImplementedError("Rgroup atoms are not supported") + v30_coord = np.array(columns[2:5], dtype=float) + v30_properties = create_property_dict_v3000(columns[6:]) + + v30_atom_indices[v30_index] = i + atoms.coord[i] = v30_coord + atoms.element[i] = v30_type.upper() + atoms.charge[i] = int(v30_properties.get("CHG", 0)) + + bond_lines = _get_block_v3000(v30_lines, "BOND") + bond_array = np.zeros((len(bond_lines), 3), dtype=np.uint32) + for i, line in enumerate(bond_lines): + columns = line.split() + v30_type = int(columns[1]) + v30_atom_index_1 = int(columns[2]) + v30_atom_index_2 = int(columns[3]) + + bond_type = BOND_TYPE_MAPPING.get(v30_type) + if bond_type is None: + warnings.warn( + f"Cannot handle MDL bond type {v30_type}, " + f"BondType.ANY is used instead" + ) + bond_type = BondType.ANY + bond_array[i, 0] = v30_atom_indices[v30_atom_index_1] + bond_array[i, 1] = v30_atom_indices[v30_atom_index_2] + bond_array[i, 2] = 
bond_type + atoms.bonds = BondList(atoms.array_length(), bond_array) + + return atoms + +def _get_version(counts_line): + return counts_line[33:39].strip() + +def _is_v2000_compatible(n_atoms, n_bonds): + # The format uses a maximum of 3 digits for the atom and bond count + return n_atoms < 1000 and n_bonds < 1000 + +def _get_counts_v2000(counts_line): + return int(counts_line[0:3]), int(counts_line[3:6]) + +def _get_block_v3000(v30_lines, block_name): + block_lines = [] + in_block = False + for line in v30_lines: + if line.startswith(f"BEGIN {block_name}"): + in_block = True + elif line.startswith(f"END {block_name}"): + if in_block: + return block_lines + else: + raise InvalidFileError( + f"Block '{block_name}' ended before it began" + ) + elif in_block: + block_lines.append(line) + return block_lines + +def create_property_dict_v3000(property_strings): + properties = {} + for prop in property_strings: + key, value = prop.split("=") + properties[key] = value + return properties + + +def _write_structure_to_ctab_v2000(atoms, default_bond_type): + try: + charge = atoms.charge + except AttributeError: + charge = np.zeros(atoms.array_length(), dtype=int) + + counts_line = ( + f"{atoms.array_length():>3d}{atoms.bonds.get_bond_count():>3d}" + " 0 0 0 0 0 0 0 1 V2000" + ) + + atom_lines = [ + f"{atoms.coord[i,0]:>10.5f}" + f"{atoms.coord[i,1]:>10.5f}" + f"{atoms.coord[i,2]:>10.5f}" + f" {atoms.element[i].capitalize():>3}" + f" {CHARGE_MAPPING_REV.get(charge[i], 0):>3d}" + f"{0:>3d}" * 10 + for i in range(atoms.array_length()) + ] + + default_bond_value = BOND_TYPE_MAPPING_REV[default_bond_type] + bond_lines = [ + f"{i+1:>3d}{j+1:>3d}" + f"{BOND_TYPE_MAPPING_REV.get(bond_type, default_bond_value):>3d}" + + f"{0:>3d}" * 4 + for i, j, bond_type in atoms.bonds.as_array() + ] + + return [counts_line] + atom_lines + bond_lines + ["M END"] + + +def _write_structure_to_ctab_v3000(atoms, default_bond_type): + try: + charges = atoms.charge + except AttributeError: + charges = 
np.zeros(atoms.array_length(), dtype=int) + + counts_line = ( + f"COUNTS {atoms.array_length()} {atoms.bonds.get_bond_count()} 0 0 0" + ) + + atom_lines = [ + f"{i + 1}" + f" {_quote(atoms.element[i].capitalize())}" + f" {atoms.coord[i,0]:.5f}" + f" {atoms.coord[i,1]:.5f}" + f" {atoms.coord[i,2]:.5f}" + # 'aamap' is unused + f" 0" + f" {_to_property(charges[i])}" + for i in range(atoms.array_length()) + ] + + default_bond_value = BOND_TYPE_MAPPING_REV[default_bond_type] + bond_lines = [ + f"{k + 1}" + f" {BOND_TYPE_MAPPING_REV.get(bond_type, default_bond_value)}" + f" {i + 1}" + f" {j + 1}" + for k, (i, j, bond_type) in enumerate(atoms.bonds.as_array()) + ] + + lines = ( + ["BEGIN CTAB"] + + [counts_line] + + ["BEGIN ATOM"] + + atom_lines + + ["END ATOM"] + + ["BEGIN BOND"] + + bond_lines + + ["END BOND"] + + ["END CTAB"] + ) + # Mark lines as V3000 CTAB + lines = ["M V30 " + line for line in lines] + return [V2000_COMPATIBILITY_LINE] + lines + ["M END"] + +def _to_property(charge): + if charge == 0: + return "" + else: + return f"CHG={charge}" + +def _quote(string): + if " " in string or len(string) == 0: + return f'"{string}"' + else: + return string \ No newline at end of file diff --git a/src/biotite/structure/io/mol/file.py b/src/biotite/structure/io/mol/file.py index 18ae01ea5..feaa44092 100644 --- a/src/biotite/structure/io/mol/file.py +++ b/src/biotite/structure/io/mol/file.py @@ -7,12 +7,8 @@ __all__ = ["MOLFile"] import datetime -from warnings import warn -import numpy as np -from ...atoms import AtomArray from ....file import TextFile, InvalidFileError -from ...error import BadStructureError -from ..ctab import read_structure_from_ctab, write_structure_to_ctab +from .ctab import read_structure_from_ctab, write_structure_to_ctab from ...bonds import BondType @@ -24,7 +20,8 @@ class MOLFile(TextFile): """ This class represents a file in MOL format, that is used to store - structure information for small molecules. 
:footcite:`Dalby1992` + structure information for small molecules. + :footcite:`Dalby1992` Since its use is intended for single small molecules, it stores less atom annotation information than the macromolecular structure @@ -179,7 +176,8 @@ def get_structure(self): return read_structure_from_ctab(ctab_lines) - def set_structure(self, atoms, default_bond_type=BondType.ANY): + def set_structure(self, atoms, default_bond_type=BondType.ANY, + version=None): """ Set the :class:`AtomArray` for the file. @@ -192,13 +190,19 @@ def set_structure(self, atoms, default_bond_type=BondType.ANY): Bond type fallback in the *Bond block* if a bond has no bond_type defined in *atoms* array. By default, each bond is treated as :attr:`BondType.ANY`. + version : {"V2000", "V3000"}, optional + The version of the CTAB format. + ``"V2000"`` uses the *Atom* and *Bond* block, while + ``"V3000"`` uses the *Properties* block. + By default, ``"V2000"`` is used unless the number of atoms + or bonds exceeds 1000, in which case ``"V3000"`` is used. 
""" self.lines = self.lines[:N_HEADER] + write_structure_to_ctab( - atoms, - default_bond_type + atoms, default_bond_type, version ) + def _get_ctab_lines(lines): for i, line in enumerate(lines): if line.startswith("M END"): diff --git a/tests/structure/data/molecules/03F.v3000.sdf b/tests/structure/data/molecules/03F.v3000.sdf new file mode 100644 index 000000000..57bb8287d --- /dev/null +++ b/tests/structure/data/molecules/03F.v3000.sdf @@ -0,0 +1,285 @@ +03F + RDKit 3D + + 0 0 0 0 0 0 0 0 0 0999 V3000 +M V30 BEGIN CTAB +M V30 COUNTS 130 130 0 0 1 +M V30 BEGIN ATOM +M V30 1 C -3.597000 3.707000 0.561000 0 +M V30 2 C -3.174000 2.348000 -0.002000 0 CFG=2 +M V30 3 N -1.783000 2.418000 -0.455000 0 +M V30 4 C -4.077000 1.981000 -1.181000 0 CFG=2 +M V30 5 O -5.442000 1.997000 -0.759000 0 +M V30 6 C -3.721000 0.603000 -1.677000 0 +M V30 7 C -4.645000 -0.322000 -1.761000 0 +M V30 8 C -4.288000 -1.700000 -2.257000 0 +M V30 9 C -4.718000 -2.741000 -1.221000 0 +M V30 10 C -4.356000 -4.140000 -1.724000 0 +M V30 11 C -4.786000 -5.181000 -0.689000 0 +M V30 12 C -4.424000 -6.580000 -1.192000 0 +M V30 13 C -4.853000 -7.621000 -0.157000 0 +M V30 14 C -4.492000 -9.020000 -0.660000 0 +M V30 15 C -4.921000 -10.061000 0.375000 0 +M V30 16 C -4.559000 -11.460000 -0.128000 0 +M V30 17 C -4.989000 -12.502000 0.908000 0 +M V30 18 C -4.627000 -13.901000 0.405000 0 +M V30 19 C -5.057000 -14.942000 1.440000 0 +M V30 20 C -4.695000 -16.341000 0.937000 0 +M V30 21 C -3.983000 5.987000 -0.069000 0 CFG=1 +M V30 22 O -3.600000 4.676000 -0.489000 0 +M V30 23 C -3.833000 6.963000 -1.239000 0 CFG=2 +M V30 24 O -2.461000 7.025000 -1.636000 0 +M V30 25 C -4.301000 8.353000 -0.797000 0 CFG=1 +M V30 26 O -4.231000 9.255000 -1.904000 0 +M V30 27 C -5.747000 8.259000 -0.301000 0 CFG=2 +M V30 28 O -6.171000 9.536000 0.181000 0 +M V30 29 O -7.331000 6.221000 2.427000 0 +M V30 30 C -5.824000 7.233000 0.833000 0 CFG=1 +M V30 31 C -7.275000 7.089000 1.294000 0 +M V30 32 O -5.345000 5.971000 0.364000 0 +M V30 
33 C -0.778000 2.223000 0.422000 0 +M V30 34 O -1.026000 1.989000 1.586000 0 +M V30 35 C 0.653000 2.295000 -0.044000 0 +M V30 36 C 1.589000 2.038000 1.139000 0 +M V30 37 C 3.042000 2.112000 0.666000 0 +M V30 38 C 3.977000 1.855000 1.849000 0 +M V30 39 C 5.430000 1.928000 1.376000 0 +M V30 40 C 6.365000 1.671000 2.559000 0 +M V30 41 C 7.819000 1.745000 2.086000 0 +M V30 42 C 8.740000 1.492000 3.251000 0 +M V30 43 C 9.674000 0.578000 3.161000 0 +M V30 44 C 9.929000 -0.113000 1.846000 0 +M V30 45 C 11.411000 0.004000 1.487000 0 +M V30 46 C 11.670000 -0.697000 0.152000 0 +M V30 47 C 13.152000 -0.581000 -0.207000 0 +M V30 48 C 13.410000 -1.282000 -1.542000 0 +M V30 49 C 14.893000 -1.165000 -1.901000 0 +M V30 50 C 15.151000 -1.866000 -3.236000 0 +M V30 51 C 16.634000 -1.749000 -3.595000 0 +M V30 52 H -6.143000 10.236000 -0.485000 0 +M V30 53 H -3.266000 1.589000 0.776000 0 +M V30 54 H -3.343000 9.357000 -2.272000 0 +M V30 55 H -2.096000 6.177000 -1.923000 0 +M V30 56 H -3.937000 2.704000 -1.985000 0 +M V30 57 H -2.704000 0.378000 -1.962000 0 +M V30 58 H -4.937000 -6.771000 -2.134000 0 +M V30 59 H -3.347000 -6.643000 -1.346000 0 +M V30 60 H -4.597000 3.629000 0.987000 0 +M V30 61 H -4.340000 -7.430000 0.785000 0 +M V30 62 H -2.895000 4.014000 1.336000 0 +M V30 63 H -5.931000 -7.558000 -0.003000 0 +M V30 64 H -3.344000 6.306000 0.755000 0 +M V30 65 H -5.005000 -9.211000 -1.602000 0 +M V30 66 H -3.414000 -9.084000 -0.814000 0 +M V30 67 H 16.818000 -2.249000 -4.546000 0 +M V30 68 H -4.440000 6.622000 -2.077000 0 +M V30 69 H -4.407000 -9.870000 1.318000 0 +M V30 70 H -5.998000 -9.998000 0.529000 0 +M V30 71 H -3.662000 8.715000 0.008000 0 +M V30 72 H -5.073000 -11.652000 -1.070000 0 +M V30 73 H -3.482000 -11.524000 -0.282000 0 +M V30 74 H -6.394000 7.946000 -1.121000 0 +M V30 75 H -4.475000 -12.311000 1.850000 0 +M V30 76 H -5.661000 -0.097000 -1.475000 0 +M V30 77 H -6.066000 -12.438000 1.062000 0 +M V30 78 H -8.224000 6.083000 2.774000 0 +M V30 79 H -5.208000 7.568000 
1.668000 0 +M V30 80 H -4.802000 -1.891000 -3.199000 0 +M V30 81 H -5.141000 -14.092000 -0.538000 0 +M V30 82 H -3.211000 -1.763000 -2.411000 0 +M V30 83 H -3.550000 -13.964000 0.251000 0 +M V30 84 H -4.204000 -2.550000 -0.279000 0 +M V30 85 H -4.543000 -14.751000 2.382000 0 +M V30 86 H -5.795000 -2.677000 -1.067000 0 +M V30 87 H -6.134000 -14.878000 1.594000 0 +M V30 88 H -4.870000 -4.331000 -2.667000 0 +M V30 89 H -5.001000 -17.083000 1.674000 0 +M V30 90 H -3.279000 -4.203000 -1.878000 0 +M V30 91 H -5.208000 -16.532000 -0.005000 0 +M V30 92 H -3.618000 -16.404000 0.783000 0 +M V30 93 H -4.272000 -4.990000 0.253000 0 +M V30 94 H -5.863000 -5.118000 -0.535000 0 +M V30 95 H -7.874000 6.670000 0.485000 0 +M V30 96 H -7.669000 8.068000 1.566000 0 +M V30 97 H 0.824000 1.541000 -0.812000 0 +M V30 98 H 0.852000 3.284000 -0.456000 0 +M V30 99 H 1.418000 2.793000 1.907000 0 +M V30 100 H 1.390000 1.049000 1.551000 0 +M V30 101 H 3.212000 1.357000 -0.102000 0 +M V30 102 H 3.240000 3.101000 0.254000 0 +M V30 103 H 3.807000 2.609000 2.617000 0 +M V30 104 H 3.779000 0.865000 2.261000 0 +M V30 105 H 5.601000 1.174000 0.608000 0 +M V30 106 H 5.629000 2.918000 0.964000 0 +M V30 107 H 6.195000 2.426000 3.327000 0 +M V30 108 H 6.167000 0.682000 2.971000 0 +M V30 109 H 7.989000 0.990000 1.317000 0 +M V30 110 H 8.017000 2.734000 1.673000 0 +M V30 111 H 8.630000 2.064000 4.160000 0 +M V30 112 H 10.262000 0.319000 4.030000 0 +M V30 113 H 9.657000 -1.165000 1.930000 0 +M V30 114 H 9.328000 0.356000 1.067000 0 +M V30 115 H 11.683000 1.056000 1.403000 0 +M V30 116 H 12.012000 -0.465000 2.266000 0 +M V30 117 H 11.398000 -1.750000 0.236000 0 +M V30 118 H 11.069000 -0.228000 -0.627000 0 +M V30 119 H 13.424000 0.472000 -0.291000 0 +M V30 120 H -1.585000 2.605000 -1.386000 0 +M V30 121 H 13.753000 -1.050000 0.572000 0 +M V30 122 H 13.139000 -2.334000 -1.458000 0 +M V30 123 H 12.810000 -0.812000 -2.321000 0 +M V30 124 H -5.623000 1.437000 0.008000 0 +M V30 125 H 15.165000 -0.113000 -1.985000 0 
+M V30 126 H 15.494000 -1.634000 -1.122000 0 +M V30 127 H 14.880000 -2.919000 -3.152000 0 +M V30 128 H 14.551000 -1.397000 -4.015000 0 +M V30 129 H 16.906000 -0.697000 -3.679000 0 +M V30 130 H 17.235000 -2.219000 -2.816000 0 +M V30 END ATOM +M V30 BEGIN BOND +M V30 1 1 1 62 +M V30 2 1 1 2 +M V30 3 1 1 60 +M V30 4 1 1 22 +M V30 5 1 2 4 +M V30 6 1 2 53 CFG=1 +M V30 7 1 2 3 +M V30 8 1 3 33 +M V30 9 1 3 120 +M V30 10 1 4 5 +M V30 11 1 4 6 +M V30 12 1 4 56 CFG=3 +M V30 13 1 5 124 +M V30 14 1 6 57 +M V30 15 2 6 7 +M V30 16 1 7 8 +M V30 17 1 7 76 +M V30 18 1 8 82 +M V30 19 1 8 9 +M V30 20 1 8 80 +M V30 21 1 9 84 +M V30 22 1 9 10 +M V30 23 1 9 86 +M V30 24 1 10 90 +M V30 25 1 10 11 +M V30 26 1 10 88 +M V30 27 1 11 93 +M V30 28 1 11 12 +M V30 29 1 11 94 +M V30 30 1 12 58 +M V30 31 1 12 13 +M V30 32 1 12 59 +M V30 33 1 13 63 +M V30 34 1 13 14 +M V30 35 1 13 61 +M V30 36 1 14 66 +M V30 37 1 14 65 +M V30 38 1 14 15 +M V30 39 1 15 16 +M V30 40 1 15 69 +M V30 41 1 15 70 +M V30 42 1 16 72 +M V30 43 1 16 73 +M V30 44 1 16 17 +M V30 45 1 17 18 +M V30 46 1 17 77 +M V30 47 1 17 75 +M V30 48 1 18 81 +M V30 49 1 18 83 +M V30 50 1 18 19 +M V30 51 1 19 85 +M V30 52 1 19 20 +M V30 53 1 19 87 +M V30 54 1 20 91 +M V30 55 1 20 92 +M V30 56 1 20 89 +M V30 57 1 21 22 +M V30 58 1 21 32 +M V30 59 1 21 64 CFG=1 +M V30 60 1 21 23 +M V30 61 1 23 68 CFG=3 +M V30 62 1 23 24 +M V30 63 1 23 25 +M V30 64 1 24 55 +M V30 65 1 25 27 +M V30 66 1 25 71 CFG=1 +M V30 67 1 25 26 +M V30 68 1 26 54 +M V30 69 1 27 30 +M V30 70 1 27 74 CFG=3 +M V30 71 1 27 28 +M V30 72 1 28 52 +M V30 73 1 29 78 +M V30 74 1 29 31 +M V30 75 1 30 32 +M V30 76 1 30 31 +M V30 77 1 30 79 CFG=1 +M V30 78 1 31 95 +M V30 79 1 31 96 +M V30 80 2 33 34 +M V30 81 1 33 35 +M V30 82 1 35 36 +M V30 83 1 35 97 +M V30 84 1 35 98 +M V30 85 1 36 100 +M V30 86 1 36 99 +M V30 87 1 36 37 +M V30 88 1 37 38 +M V30 89 1 37 101 +M V30 90 1 37 102 +M V30 91 1 38 104 +M V30 92 1 38 103 +M V30 93 1 38 39 +M V30 94 1 39 40 +M V30 95 1 39 105 +M V30 96 1 39 106 
+M V30 97 1 40 107 +M V30 98 1 40 108 +M V30 99 1 40 41 +M V30 100 1 41 42 +M V30 101 1 41 110 +M V30 102 1 41 109 +M V30 103 1 42 111 +M V30 104 2 42 43 +M V30 105 1 43 112 +M V30 106 1 43 44 +M V30 107 1 44 113 +M V30 108 1 44 114 +M V30 109 1 44 45 +M V30 110 1 45 115 +M V30 111 1 45 116 +M V30 112 1 45 46 +M V30 113 1 46 118 +M V30 114 1 46 117 +M V30 115 1 46 47 +M V30 116 1 47 119 +M V30 117 1 47 121 +M V30 118 1 47 48 +M V30 119 1 48 123 +M V30 120 1 48 122 +M V30 121 1 48 49 +M V30 122 1 49 125 +M V30 123 1 49 126 +M V30 124 1 49 50 +M V30 125 1 50 128 +M V30 126 1 50 127 +M V30 127 1 50 51 +M V30 128 1 51 130 +M V30 129 1 51 67 +M V30 130 1 51 129 +M V30 END BOND +M V30 END CTAB +M END +> (1) +CCCCCCCCCCCCC/C=C/[C@H]([C@H](CO[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O)NC(=O)CCCCCCC/C=C\CCCCCCCC)O + +> (1) +InChI=1S/C42H79NO8/c1-3-5-7-9-11-13-15-17-18-20-22-24-26-28-30-32-38(46)43-35(34-50-42-41(49)40(48)39(47)37(33-44)51-42)36(45)31-29-27-25-23-21-19-16-14-12-10-8-6-4-2/h17-18,29,31,35-37,39-42,44-45,47-49H,3-16,19-28,30,32-34H2,1-2H3,(H,43,46)/b18-17-,31-29+/t35-,36+,37+,39+,40-,41+,42+/m0/s1 + +> (1) +MVGFIPNJBNBHNC-HVFXMTMESA-N + +> (1) +C42H79NO8 + +$$$$ diff --git a/tests/structure/data/molecules/CYN.v3000.sdf b/tests/structure/data/molecules/CYN.v3000.sdf new file mode 100644 index 000000000..d7f507ae8 --- /dev/null +++ b/tests/structure/data/molecules/CYN.v3000.sdf @@ -0,0 +1,16 @@ +CYN - Ideal conformer + RDKit 3D + + 0 0 0 0 0 0 0 0 0 0999 V3000 +M V30 BEGIN CTAB +M V30 COUNTS 2 1 0 0 0 +M V30 BEGIN ATOM +M V30 1 C 0.000000 0.000000 -0.611000 0 CHG=-1 VAL=3 +M V30 2 N 0.000000 0.000000 0.524000 0 +M V30 END ATOM +M V30 BEGIN BOND +M V30 1 3 1 2 +M V30 END BOND +M V30 END CTAB +M END +$$$$ diff --git a/tests/structure/data/molecules/HWB.v3000.sdf b/tests/structure/data/molecules/HWB.v3000.sdf new file mode 100644 index 000000000..705c068a1 --- /dev/null +++ b/tests/structure/data/molecules/HWB.v3000.sdf @@ -0,0 +1,79 @@ +HWB - Ideal conformer + 
RDKit 3D + + 0 0 0 0 0 0 0 0 0 0999 V3000 +M V30 BEGIN CTAB +M V30 COUNTS 32 34 0 0 0 +M V30 BEGIN ATOM +M V30 1 O -4.652000 -2.732000 0.069000 0 +M V30 2 C -3.892000 -1.607000 0.030000 0 +M V30 3 C -4.512000 -0.363000 -0.094000 0 +M V30 4 C -3.768000 0.784000 -0.135000 0 +M V30 5 O -4.379000 1.990000 -0.254000 0 +M V30 6 C -2.366000 0.706000 -0.052000 0 +M V30 7 C -1.559000 1.862000 -0.089000 0 +M V30 8 C -0.202000 1.643000 0.003000 0 +M V30 9 O 0.679000 2.678000 -0.021000 0 +M V30 10 C 0.242000 0.313000 0.122000 0 +M V30 11 O -0.515000 -0.630000 0.147000 0 CHG=1 VAL=3 +M V30 12 C -1.728000 -0.554000 0.074000 0 +M V30 13 C -2.520000 -1.711000 0.118000 0 +M V30 14 C 1.696000 0.058000 0.221000 0 +M V30 15 C 2.314000 0.013000 1.471000 0 +M V30 16 C 3.670000 -0.226000 1.562000 0 +M V30 17 C 4.422000 -0.420000 0.412000 0 +M V30 18 O 5.757000 -0.654000 0.507000 0 +M V30 19 C 3.810000 -0.375000 -0.839000 0 +M V30 20 O 4.549000 -0.566000 -1.965000 0 +M V30 21 C 2.451000 -0.143000 -0.936000 0 +M V30 22 H -4.823000 -3.121000 -0.800000 0 +M V30 23 H -5.589000 -0.305000 -0.157000 0 +M V30 24 H -4.604000 2.402000 0.591000 0 +M V30 25 H -1.977000 2.854000 -0.183000 0 +M V30 26 H 0.885000 3.039000 0.852000 0 +M V30 27 H -2.056000 -2.681000 0.213000 0 +M V30 28 H 1.732000 0.164000 2.368000 0 +M V30 29 H 4.147000 -0.260000 2.530000 0 +M V30 30 H 5.987000 -1.591000 0.579000 0 +M V30 31 H 4.608000 -1.490000 -2.245000 0 +M V30 32 H 1.976000 -0.109000 -1.905000 0 +M V30 END ATOM +M V30 BEGIN BOND +M V30 1 1 9 8 +M V30 2 2 8 7 +M V30 3 1 8 10 +M V30 4 1 7 6 +M V30 5 1 5 4 +M V30 6 1 20 19 +M V30 7 2 21 19 +M V30 8 1 21 14 +M V30 9 1 10 14 +M V30 10 2 10 11 +M V30 11 2 6 4 +M V30 12 1 6 12 +M V30 13 1 19 17 +M V30 14 1 4 3 +M V30 15 2 14 15 +M V30 16 1 11 12 +M V30 17 1 17 18 +M V30 18 2 17 16 +M V30 19 2 12 13 +M V30 20 1 15 16 +M V30 21 2 3 2 +M V30 22 1 13 2 +M V30 23 1 2 1 +M V30 24 1 1 22 +M V30 25 1 3 23 +M V30 26 1 5 24 +M V30 27 1 7 25 +M V30 28 1 9 26 +M V30 29 1 13 27 +M V30 
30 1 15 28 +M V30 31 1 16 29 +M V30 32 1 18 30 +M V30 33 1 20 31 +M V30 34 1 21 32 +M V30 END BOND +M V30 END CTAB +M END +$$$$ diff --git a/tests/structure/data/molecules/README.rst b/tests/structure/data/molecules/README.rst index b68cf3ebc..42cbe22d7 100644 --- a/tests/structure/data/molecules/README.rst +++ b/tests/structure/data/molecules/README.rst @@ -4,4 +4,7 @@ Test structures CYN: Caynide - Contains negatively charged atom and triple bond HWB: Cyanidin - Contains positively charged atom TYR: Tyrosine - common amino acid -03F: Large molecule - More than 100 bonds -> requires 3 characters in CTAB header \ No newline at end of file +03F: Large molecule - More than 100 bonds -> requires 3 characters in CTAB header + +The `.v3000.sdf` variants are the same structures but in V3000 format. +They were created with the `create_v3000_sdf.py` script. \ No newline at end of file diff --git a/tests/structure/data/molecules/TYR.v3000.sdf b/tests/structure/data/molecules/TYR.v3000.sdf new file mode 100644 index 000000000..61f343c6d --- /dev/null +++ b/tests/structure/data/molecules/TYR.v3000.sdf @@ -0,0 +1,61 @@ +TYR - Ideal conformer + RDKit 3D + + 0 0 0 0 0 0 0 0 0 0999 V3000 +M V30 BEGIN CTAB +M V30 COUNTS 24 24 0 0 0 +M V30 BEGIN ATOM +M V30 1 N 1.320000 0.952000 1.428000 0 +M V30 2 C -0.018000 0.429000 1.734000 0 CFG=2 +M V30 3 C -0.103000 0.094000 3.201000 0 +M V30 4 O 0.886000 -0.254000 3.799000 0 +M V30 5 C -0.274000 -0.831000 0.907000 0 +M V30 6 C -0.189000 -0.496000 -0.559000 0 +M V30 7 C 1.022000 -0.589000 -1.219000 0 +M V30 8 C -1.324000 -0.102000 -1.244000 0 +M V30 9 C 1.103000 -0.282000 -2.563000 0 +M V30 10 C -1.247000 0.210000 -2.587000 0 +M V30 11 C -0.032000 0.118000 -3.252000 0 +M V30 12 O 0.044000 0.420000 -4.574000 0 +M V30 13 O -1.279000 0.184000 3.842000 0 +M V30 14 H 1.977000 0.225000 1.669000 0 +M V30 15 H 1.365000 1.063000 0.426000 0 +M V30 16 H -0.767000 1.183000 1.489000 0 +M V30 17 H 0.473000 -1.585000 1.152000 0 +M V30 18 H -1.268000 
-1.219000 1.134000 0 +M V30 19 H 1.905000 -0.902000 -0.683000 0 +M V30 20 H -2.269000 -0.031000 -0.727000 0 +M V30 21 H 2.049000 -0.354000 -3.078000 0 +M V30 22 H -2.132000 0.523000 -3.121000 0 +M V30 23 H -0.123000 -0.399000 -5.059000 0 +M V30 24 H -1.333000 -0.030000 4.784000 0 +M V30 END ATOM +M V30 BEGIN BOND +M V30 1 1 1 2 +M V30 2 1 1 14 +M V30 3 1 1 15 +M V30 4 1 2 3 +M V30 5 1 2 5 +M V30 6 1 2 16 CFG=3 +M V30 7 2 3 4 +M V30 8 1 3 13 +M V30 9 1 5 6 +M V30 10 1 5 17 +M V30 11 1 5 18 +M V30 12 2 6 7 +M V30 13 1 6 8 +M V30 14 1 7 9 +M V30 15 1 7 19 +M V30 16 2 8 10 +M V30 17 1 8 20 +M V30 18 2 9 11 +M V30 19 1 9 21 +M V30 20 1 10 11 +M V30 21 1 10 22 +M V30 22 1 11 12 +M V30 23 1 12 23 +M V30 24 1 13 24 +M V30 END BOND +M V30 END CTAB +M END +$$$$ diff --git a/tests/structure/data/molecules/create_v3000_sdf.py b/tests/structure/data/molecules/create_v3000_sdf.py new file mode 100644 index 000000000..dc313722f --- /dev/null +++ b/tests/structure/data/molecules/create_v3000_sdf.py @@ -0,0 +1,14 @@ +from pathlib import Path +from rdkit import Chem + +SCRIPT_PATH = Path(__file__).parent + +for sdf_path in SCRIPT_PATH.glob("*.sdf"): + if "v3000" in str(sdf_path): + continue + supplier = Chem.SDMolSupplier(sdf_path, removeHs=False) + writer = Chem.SDWriter(sdf_path.with_suffix(".v3000.sdf")) + writer.SetForceV3000(True) + for molecule in supplier: + writer.write(molecule) + writer.close() \ No newline at end of file diff --git a/tests/structure/test_mol.py b/tests/structure/test_mol.py index d4337c1b0..6fbd3b211 100644 --- a/tests/structure/test_mol.py +++ b/tests/structure/test_mol.py @@ -5,17 +5,39 @@ import datetime import glob import itertools -from os.path import join, split, splitext +from os.path import join, splitext from tempfile import TemporaryFile import numpy as np import pytest +import biotite.structure as struc import biotite.structure.io.mol as mol import biotite.structure.io.pdbx as pdbx from biotite.structure.bonds import BondType -from 
biotite.structure.io.ctab import BOND_TYPE_MAPPING_REV +from biotite.structure.io.mol.ctab import BOND_TYPE_MAPPING_REV from ..util import data_dir +def list_v2000_sdf_files(): + return [ + path for path + in glob.glob(join(data_dir("structure"), "molecules", "*.sdf")) + if not "v3000" in path + ] + +def list_v3000_sdf_files(): + return glob.glob(join(data_dir("structure"), "molecules", "*v3000.sdf")) + + +def toy_atom_array(n_atoms): + atoms = struc.AtomArray(n_atoms) + atoms.coord[:] = 1.0 + atoms.element[:] = "H" + atoms.add_annotation("charge", dtype=int) + atoms.charge[:] = 0 + atoms.bonds = struc.BondList(n_atoms) + return atoms + + def test_header_conversion(): """ Write known example data to the header of a MOL file and expect @@ -29,7 +51,6 @@ def test_header_conversion(): mol_file = mol.MOLFile() mol_file.set_header(*ref_header) - print(mol_file) temp = TemporaryFile("w+") mol_file.write(temp) @@ -42,13 +63,14 @@ def test_header_conversion(): @pytest.mark.parametrize( - "path, omit_charge", + "path, version, omit_charge", itertools.product( - glob.glob(join(data_dir("structure"), "molecules", "*.sdf")), + list_v2000_sdf_files(), + ["V2000", "V3000"], [False, True] ) ) -def test_structure_conversion(path, omit_charge): +def test_structure_conversion(path, version, omit_charge): """ After reading a MOL file, writing the structure back to a new file and reading it again should give the same structure. 
@pytest.mark.parametrize("atom_type", ["", " ", "A ", " A"])
def test_quoted_atom_types(atom_type):
    """
    Check if V3000 MOL files can handle atom types (aka elements) with
    empty strings or whitespaces.
    """
    ref_atoms = toy_atom_array(1)
    ref_atoms.element[0] = atom_type
    mol_file = mol.MOLFile()
    mol_file.set_structure(ref_atoms, version="V3000")

    # The context manager closes the temporary file,
    # even if writing or parsing fails
    with TemporaryFile("w+") as temp:
        mol_file.write(temp)
        # Rewind to read the written content from the start
        temp.seek(0)
        mol_file = mol.MOLFile.read(temp)
        test_atoms = mol_file.get_structure()

    assert test_atoms.element[0] == atom_type
    # Also check if the rest of the structure was parsed correctly
    assert test_atoms == ref_atoms


def test_large_structure():
    """
    Check if MOL files automatically switch to V3000 format if the
    number of atoms exceeds the fixed size columns in the table.
    """
    ref_atoms = toy_atom_array(1000)
    mol_file = mol.MOLFile()
    # Let the MOL file automatically switch to V3000 format
    mol_file.set_structure(ref_atoms, version=None)

    # The context manager closes the temporary file,
    # even if writing or parsing fails
    with TemporaryFile("w+") as temp:
        mol_file.write(temp)
        # Rewind to read the written content from the start
        temp.seek(0)
        mol_file = mol.MOLFile.read(temp)
        test_atoms = mol_file.get_structure()
        file_content = str(mol_file)

    # Check if file is written in V3000 format
    assert "V3000" in file_content
    assert test_atoms == ref_atoms