From 911ea225656aa599b9662b61c5ad13c7f96ddc96 Mon Sep 17 00:00:00 2001 From: Richard Gowers Date: Thu, 25 Jan 2018 16:18:55 +0000 Subject: [PATCH 1/4] added reading of record types --- package/CHANGELOG | 6 +++++- package/MDAnalysis/core/topologyattrs.py | 14 ++++++++++++++ package/MDAnalysis/topology/PDBParser.py | 11 ++++++++--- package/MDAnalysis/topology/PDBQTParser.py | 8 ++++++++ package/MDAnalysis/topology/PQRParser.py | 8 ++++++++ testsuite/MDAnalysisTests/topology/test_pdb.py | 16 ++++++++++++++-- testsuite/MDAnalysisTests/topology/test_pdbqt.py | 2 +- testsuite/MDAnalysisTests/topology/test_pqr.py | 14 +++++++++++++- 8 files changed, 71 insertions(+), 8 deletions(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index be9f47913c3..9cf3abc8dd2 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -13,10 +13,14 @@ The rules for this file: * release numbers follow "Semantic Versioning" http://semver.org ------------------------------------------------------------------------------ -mm/dd/18 +mm/dd/18 richardjgowers * 0.17.1 +Enhancements + * Added reading of record types (ATOM/HETATM) for PDB, PDBQT and PQR formats + (Issue #1753) + Fixes diff --git a/package/MDAnalysis/core/topologyattrs.py b/package/MDAnalysis/core/topologyattrs.py index 80a5ceba178..cd4e0526a4d 100644 --- a/package/MDAnalysis/core/topologyattrs.py +++ b/package/MDAnalysis/core/topologyattrs.py @@ -651,6 +651,20 @@ def _gen_initial_values(na, nr, ns): return np.zeros(na) +class RecordTypes(AtomAttr): + """For PDB-like formats, indicates if ATOM or HETATM + + Defaults to 'ATOM' + """ + attrname = 'record_types' + singular = 'record_type' + per_object = 'atom' + + @staticmethod + def _gen_initial_values(na, nr, ns): + return np.array(['ATOM'] * na, dtype=object) + + class ChainIDs(AtomAttr): """ChainID per atom diff --git a/package/MDAnalysis/topology/PDBParser.py b/package/MDAnalysis/topology/PDBParser.py index 93afbf1ec55..d71be5a0123 100644 --- a/package/MDAnalysis/topology/PDBParser.py +++ b/package/MDAnalysis/topology/PDBParser.py @@ -1,4 +1,3 @@ - # -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*- # vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 # @@ -74,6 +73,7 @@ ICodes, Masses, Occupancies, + RecordTypes, Resids, Resnames, Resnums, @@ -96,6 +96,7 @@ class PDBParser(TopologyReaderBase): - chainids - bfactors - occupancies + - record_types (ATOM/HETATM) - resids - resnames - segids @@ -108,8 +109,10 @@ class PDBParser(TopologyReaderBase): :class:`MDAnalysis.coordinates.PDB.PDBReader` .. versionadded:: 0.8 + .. versionchanged:: 0.17.1 + Added parsing of Record types """ - format = ['PDB','ENT'] + format = ['PDB', 'ENT'] def parse(self, **kwargs): """Parse atom information from PDB file @@ -134,6 +137,7 @@ def _parseatoms(self): """Create the initial Topology object""" resid_prev = 0 # resid looping hack + record_types = [] serials = [] names = [] altlocs = [] @@ -160,6 +164,7 @@ def _parseatoms(self): if not line.startswith(('ATOM', 'HETATM')): continue + record_types.append(line[:6].strip()) try: serial = int(line[6:11]) except ValueError: @@ -217,6 +222,7 @@ def _parseatoms(self): (names, Atomnames, object), (altlocs, AltLocs, object), (chainids, ChainIDs, object), + (record_types, RecordTypes, object), (serials, Atomids, np.int32), (tempfactors, Tempfactors, np.float32), (occupancies, Occupancies, np.float32), @@ -343,4 +349,3 @@ def _parse_conect(conect): bond_atoms = (int(conect[11 + i * 5: 16 + i * 5]) for i in range(n_bond_atoms)) return atom_id, bond_atoms - diff --git a/package/MDAnalysis/topology/PDBQTParser.py b/package/MDAnalysis/topology/PDBQTParser.py index 81e590383db..609a8871600 100644 --- a/package/MDAnalysis/topology/PDBQTParser.py +++ b/package/MDAnalysis/topology/PDBQTParser.py @@ -70,6 +70,7 @@ Charges, Masses, Occupancies, + RecordTypes, Resids, Resnums, Resnames, @@ -89,6 +90,7 @@ class PDBQTParser(TopologyReaderBase): - resnames - chainIDs (becomes segid) - resids + - record_types (ATOM/HETATM) - icodes - occupancies - tempfactors @@ -97,6 +99,9 @@ class PDBQTParser(TopologyReaderBase): Guesses the following: - elements - masses + + .. versionchanged:: 0.17.1 + Added parsing of Record types """ format = 'PDBQT' @@ -107,6 +112,7 @@ def parse(self, **kwargs): ------- MDAnalysis Topology object """ + record_types = [] serials = [] names = [] altlocs = [] @@ -124,6 +130,7 @@ def parse(self, **kwargs): line = line.strip() if not line.startswith(('ATOM', 'HETATM')): continue + record_types.append(line[:6].strip()) serials.append(int(line[6:11])) names.append(line[12:16].strip()) altlocs.append(line[16:17].strip()) @@ -142,6 +149,7 @@ def parse(self, **kwargs): attrs = [] for attrlist, Attr, dtype in ( + (record_types, RecordTypes, object), (serials, Atomids, np.int32), (names, Atomnames, object), (altlocs, AltLocs, object), diff --git a/package/MDAnalysis/topology/PQRParser.py b/package/MDAnalysis/topology/PQRParser.py index 56df77ae398..29b8bc22646 100644 --- a/package/MDAnalysis/topology/PQRParser.py +++ b/package/MDAnalysis/topology/PQRParser.py @@ -60,6 +60,7 @@ ICodes, Masses, Radii, + RecordTypes, Resids, Resnums, Resnames, @@ -77,6 +78,7 @@ class PQRParser(TopologyReaderBase): - Atomnames - Charges - Radii + - RecordTypes (ATOM/HETATM) - Resids - Resnames - Segids @@ -90,6 +92,9 @@ class PQRParser(TopologyReaderBase): 'SYSTEM' as the new segid). .. versionchanged:: 0.16.1 Now reads insertion codes and splits into new residues around these + .. versionchanged:: 0.17.1 + Added parsing of Record types + """ format = 'PQR' @@ -100,6 +105,7 @@ def parse(self, **kwargs): ------- A MDAnalysis Topology object """ + record_types = [] serials = [] names = [] resnames = [] @@ -131,6 +137,7 @@ def parse(self, **kwargs): else: icode = '' + record_types.append(recordName) serials.append(serial) names.append(name) resnames.append(resName) @@ -151,6 +158,7 @@ def parse(self, **kwargs): attrs.append(Charges(np.array(charges, dtype=np.float32))) attrs.append(Atomtypes(atomtypes, guessed=True)) attrs.append(Masses(masses, guessed=True)) + attrs.append(RecordTypes(np.array(record_types, dtype=object))) attrs.append(Radii(np.array(radii, dtype=np.float32))) resids = np.array(resids, dtype=np.int32) diff --git a/testsuite/MDAnalysisTests/topology/test_pdb.py b/testsuite/MDAnalysisTests/topology/test_pdb.py index 460d9175a8a..92f75e44827 100644 --- a/testsuite/MDAnalysisTests/topology/test_pdb.py +++ b/testsuite/MDAnalysisTests/topology/test_pdb.py @@ -28,6 +28,7 @@ from MDAnalysisTests.topology.base import ParserBase from MDAnalysisTests.datafiles import ( PDB, + PDB_HOLE, PDB_small, PDB_conect, PDB_conect2TER, @@ -44,7 +45,7 @@ class TestPDBParser(ParserBase): """This one has neither chainids or segids""" parser = mda.topology.PDBParser.PDBParser ref_filename = PDB - expected_attrs = ['ids', 'names', 'resids', 'resnames'] + expected_attrs = ['ids', 'names', 'record_types', 'resids', 'resnames'] guessed_attrs = ['types', 'masses'] expected_n_atoms = 47681 expected_n_residues = 11302 @@ -55,7 +56,8 @@ class TestPDBParserSegids(ParserBase): """Has segids""" parser = mda.topology.PDBParser.PDBParser ref_filename = PDB_small - expected_attrs = ['ids', 'names', 'resids', 'resnames', 'segids'] + expected_attrs = ['ids', 'names', 'record_types', 'resids', 'resnames', + 'segids'] guessed_attrs = ['types', 'masses'] expected_n_atoms = 3341 expected_n_residues = 214 @@ -148,3 +150,13 @@ def test_sameresid_diffresname(): for i, (resid, resname) in enumerate(zip(resids, resnames)): assert top.resids.values[i] == resid assert top.resnames.values[i] == resname + + +def test_PDB_record_types(): + u = mda.Universe(PDB_HOLE) + + assert u.atoms[0].record_type == 'ATOM' + assert u.atoms[132].record_type == 'HETATM' + + assert_equal(u.atoms[10:20].record_types, 'ATOM') + assert_equal(u.atoms[271:].record_types, 'HETATM') diff --git a/testsuite/MDAnalysisTests/topology/test_pdbqt.py b/testsuite/MDAnalysisTests/topology/test_pdbqt.py index e4a2f868a55..e6329f8815b 100644 --- a/testsuite/MDAnalysisTests/topology/test_pdbqt.py +++ b/testsuite/MDAnalysisTests/topology/test_pdbqt.py @@ -34,7 +34,7 @@ class TestPDBQT(ParserBase): ref_filename = PDBQT_input expected_attrs = [ 'ids', 'names', 'charges', 'types', 'altLocs', 'resids', 'resnames', - 'segids' + 'segids', 'record_types' ] guessed_attrs = ['masses'] expected_n_atoms = 1805 diff --git a/testsuite/MDAnalysisTests/topology/test_pqr.py b/testsuite/MDAnalysisTests/topology/test_pqr.py index dd81a5bcac5..0af40131372 100644 --- a/testsuite/MDAnalysisTests/topology/test_pqr.py +++ b/testsuite/MDAnalysisTests/topology/test_pqr.py @@ -21,6 +21,8 @@ # from __future__ import absolute_import +from numpy.testing import assert_equal + import MDAnalysis as mda from MDAnalysisTests.topology.base import ParserBase @@ -33,7 +35,7 @@ class TestPQRParser(ParserBase): parser = mda.topology.PQRParser.PQRParser ref_filename = PQR - expected_attrs = ['ids', 'names', 'charges', 'radii', + expected_attrs = ['ids', 'names', 'charges', 'radii', 'record_types', 'resids', 'resnames', 'icodes', 'segids'] guessed_attrs = ['masses', 'types'] @@ -55,3 +57,13 @@ class TestPQRParser2(TestPQRParser): ref_filename = PQR_icodes expected_n_atoms = 5313 expected_n_residues = 474 + + +def test_record_types(): + u = mda.Universe(PQR_icodes) + + assert u.atoms[4052].record_type == 'ATOM' + assert u.atoms[4053].record_type == 'HETATM' + + assert_equal(u.atoms[:10].record_types, 'ATOM') + assert_equal(u.atoms[4060:4070].record_types, 'HETATM') From 93aefc7d5e0d1d9969d25c8b7a9c09e38288e718 Mon Sep 17 00:00:00 2001 From: Richard Gowers Date: Thu, 25 Jan 2018 16:21:47 +0000 Subject: [PATCH 2/4] added test for default record_types --- testsuite/MDAnalysisTests/core/test_topologyattrs.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/testsuite/MDAnalysisTests/core/test_topologyattrs.py b/testsuite/MDAnalysisTests/core/test_topologyattrs.py index 620ca1efd57..9660634286a 100644 --- a/testsuite/MDAnalysisTests/core/test_topologyattrs.py +++ b/testsuite/MDAnalysisTests/core/test_topologyattrs.py @@ -488,3 +488,12 @@ def test_no_deprecation(self, universe, instruction): """ with no_deprecated_call(): exec(instruction) #pylint: disable=W0122 + + +def test_record_types_default(): + u = make_Universe() + + u.add_TopologyAttr('record_type') + + assert u.atoms[0].record_type == 'ATOM' + assert_equal(u.atoms[:10].record_types, 'ATOM') From 5265b628efdbfeb157cc2ecb070ea6b513762ebe Mon Sep 17 00:00:00 2001 From: Richard Gowers Date: Mon, 29 Jan 2018 14:04:17 +0000 Subject: [PATCH 3/4] encoding of RecordTypes --- package/MDAnalysis/core/topologyattrs.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/package/MDAnalysis/core/topologyattrs.py b/package/MDAnalysis/core/topologyattrs.py index cd4e0526a4d..4eb2fda14cb 100644 --- a/package/MDAnalysis/core/topologyattrs.py +++ b/package/MDAnalysis/core/topologyattrs.py @@ -656,14 +656,26 @@ class RecordTypes(AtomAttr): Defaults to 'ATOM' """ + # internally encodes {True: 'ATOM', False: 'HETATM'} attrname = 'record_types' singular = 'record_type' per_object = 'atom' + def __init__(self, values, guessed=False): + self.values = np.where(values == 'ATOM', True, False) + self._guessed = guessed + @staticmethod def _gen_initial_values(na, nr, ns): return np.array(['ATOM'] * na, dtype=object) + def get_atoms(self, atomgroup): + return np.where(self.values[atomgroup.ix], 'ATOM', 'HETATM') + + @_check_length + def set_atoms(self, atomgroup, values): + self.values[atomgroup.ix] = np.where(values == 'ATOM', True, False) + class ChainIDs(AtomAttr): """ChainID per atom From 4de452f528b14572f4e30d7e84bf59e63f8838e4 Mon Sep 17 00:00:00 2001 From: Richard Gowers Date: Mon, 29 Jan 2018 14:41:24 +0000 Subject: [PATCH 4/4] fixed ResidueGroup and SegmentGroup RecordType access --- package/MDAnalysis/core/topologyattrs.py | 6 ++ .../core/test_topologyattrs.py | 56 +++++++++++++++++-- 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/package/MDAnalysis/core/topologyattrs.py b/package/MDAnalysis/core/topologyattrs.py index 4eb2fda14cb..82b4cc78b80 100644 --- a/package/MDAnalysis/core/topologyattrs.py +++ b/package/MDAnalysis/core/topologyattrs.py @@ -676,6 +676,12 @@ def get_atoms(self, atomgroup): def set_atoms(self, atomgroup, values): self.values[atomgroup.ix] = np.where(values == 'ATOM', True, False) + def get_residues(self, rg): + return [self.get_atoms(r.atoms) for r in rg] + + def get_segments(self, sg): + return [self.get_atoms(s.atoms) for s in sg] + class ChainIDs(AtomAttr): """ChainID per atom diff --git a/testsuite/MDAnalysisTests/core/test_topologyattrs.py b/testsuite/MDAnalysisTests/core/test_topologyattrs.py index 9660634286a..fbaa341021e 100644 --- a/testsuite/MDAnalysisTests/core/test_topologyattrs.py +++ b/testsuite/MDAnalysisTests/core/test_topologyattrs.py @@ -490,10 +490,56 @@ def test_no_deprecation(self, universe, instruction): exec(instruction) #pylint: disable=W0122 -def test_record_types_default(): - u = make_Universe() +class TestRecordTypes(object): + def test_record_types_default(self): + u = make_Universe() - u.add_TopologyAttr('record_type') + u.add_TopologyAttr('record_type') - assert u.atoms[0].record_type == 'ATOM' - assert_equal(u.atoms[:10].record_types, 'ATOM') + assert u.atoms[0].record_type == 'ATOM' + assert_equal(u.atoms[:10].record_types, 'ATOM') + + @pytest.fixture() + def rectype_uni(self): + # standard 125/25/5 universe + u = make_Universe() + u.add_TopologyAttr('record_type') + # first 25 atoms are ATOM (first 5 residues, first segment) + # 25 to 50th are HETATM (res 5:10, second segment) + # all after are ATOM + u.atoms[:25].record_types = 'ATOM' + u.atoms[25:50].record_types = 'HETATM' + u.atoms[50:].record_types = 'ATOM' + + return u + + def test_encoding(self, rectype_uni): + ag = rectype_uni.atoms[:10] + + ag[0].record_type = 'ATOM' + ag[1:4].record_types = 'HETATM' + + assert ag[0].record_type == 'ATOM' + assert ag[1].record_type == 'HETATM' + + def test_residue_record_types(self, rectype_uni): + rt = rectype_uni.residues.record_types + + assert isinstance(rt, list) + assert len(rt) == 25 + + # check return type explicitly + # some versions of numpy allow bool to str comparison + assert not rt[0].dtype == bool + assert (rt[0] == 'ATOM').all() + assert (rt[5] == 'HETATM').all() + + def test_segment_record_types(self, rectype_uni): + rt = rectype_uni.segments.record_types + + assert isinstance(rt, list) + assert len(rt) == 5 + + assert not rt[0].dtype == bool + assert (rt[0] == 'ATOM').all() + assert (rt[1] == 'HETATM').all()