Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added reading of record types #1762

Merged
merged 4 commits into from
Jan 29, 2018
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion package/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,14 @@ The rules for this file:
* release numbers follow "Semantic Versioning" http://semver.org

------------------------------------------------------------------------------
mm/dd/18
mm/dd/18 richardjgowers

* 0.17.1

Enhancements
* Added reading of record types (ATOM/HETATM) for PDB, PDBQT and PQR formats
(Issue #1753)

Fixes


Expand Down
14 changes: 14 additions & 0 deletions package/MDAnalysis/core/topologyattrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,20 @@ def _gen_initial_values(na, nr, ns):
return np.zeros(na)


class RecordTypes(AtomAttr):
"""For PDB-like formats, indicates if ATOM or HETATM

Defaults to 'ATOM'
"""
attrname = 'record_types'
singular = 'record_type'
per_object = 'atom'

@staticmethod
def _gen_initial_values(na, nr, ns):
return np.array(['ATOM'] * na, dtype=object)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems (memory)-wasteful to store a string with each atom. For 1M atoms, that can be an additional len("HETATM") * 1e6 / (1024*1024) = 5.7 MiB. Could we instead use a small integer, such as 1 = ATOM, 2 = HETATM, 0 = no idea, and then get the string back on the fly (e.g., as a dict look-up)?

Or is this more complicated than the memory consumption?

I'm just conscientious of the fact that we will be using the additional memory for any PDB file (especially simulation systems), where no-one cares about the atom records per se.



class ChainIDs(AtomAttr):
"""ChainID per atom

Expand Down
11 changes: 8 additions & 3 deletions package/MDAnalysis/topology/PDBParser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
Expand Down Expand Up @@ -74,6 +73,7 @@
ICodes,
Masses,
Occupancies,
RecordTypes,
Resids,
Resnames,
Resnums,
Expand All @@ -96,6 +96,7 @@ class PDBParser(TopologyReaderBase):
- chainids
- bfactors
- occupancies
- record_types (ATOM/HETATM)
- resids
- resnames
- segids
Expand All @@ -108,8 +109,10 @@ class PDBParser(TopologyReaderBase):
:class:`MDAnalysis.coordinates.PDB.PDBReader`

.. versionadded:: 0.8
.. versionchanged:: 0.17.1
Added parsing of Record types
"""
format = ['PDB','ENT']
format = ['PDB', 'ENT']

def parse(self, **kwargs):
"""Parse atom information from PDB file
Expand All @@ -134,6 +137,7 @@ def _parseatoms(self):
"""Create the initial Topology object"""
resid_prev = 0 # resid looping hack

record_types = []
serials = []
names = []
altlocs = []
Expand All @@ -160,6 +164,7 @@ def _parseatoms(self):
if not line.startswith(('ATOM', 'HETATM')):
continue

record_types.append(line[:6].strip())
try:
serial = int(line[6:11])
except ValueError:
Expand Down Expand Up @@ -217,6 +222,7 @@ def _parseatoms(self):
(names, Atomnames, object),
(altlocs, AltLocs, object),
(chainids, ChainIDs, object),
(record_types, RecordTypes, object),
(serials, Atomids, np.int32),
(tempfactors, Tempfactors, np.float32),
(occupancies, Occupancies, np.float32),
Expand Down Expand Up @@ -343,4 +349,3 @@ def _parse_conect(conect):
bond_atoms = (int(conect[11 + i * 5: 16 + i * 5]) for i in
range(n_bond_atoms))
return atom_id, bond_atoms

8 changes: 8 additions & 0 deletions package/MDAnalysis/topology/PDBQTParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
Charges,
Masses,
Occupancies,
RecordTypes,
Resids,
Resnums,
Resnames,
Expand All @@ -89,6 +90,7 @@ class PDBQTParser(TopologyReaderBase):
- resnames
- chainIDs (becomes segid)
- resids
- record_types (ATOM/HETATM)
- icodes
- occupancies
- tempfactors
Expand All @@ -97,6 +99,9 @@ class PDBQTParser(TopologyReaderBase):
Guesses the following:
- elements
- masses

.. versionchanged:: 0.17.1
Added parsing of Record types
"""
format = 'PDBQT'

Expand All @@ -107,6 +112,7 @@ def parse(self, **kwargs):
-------
MDAnalysis Topology object
"""
record_types = []
serials = []
names = []
altlocs = []
Expand All @@ -124,6 +130,7 @@ def parse(self, **kwargs):
line = line.strip()
if not line.startswith(('ATOM', 'HETATM')):
continue
record_types.append(line[:6].strip())
serials.append(int(line[6:11]))
names.append(line[12:16].strip())
altlocs.append(line[16:17].strip())
Expand All @@ -142,6 +149,7 @@ def parse(self, **kwargs):

attrs = []
for attrlist, Attr, dtype in (
(record_types, RecordTypes, object),
(serials, Atomids, np.int32),
(names, Atomnames, object),
(altlocs, AltLocs, object),
Expand Down
8 changes: 8 additions & 0 deletions package/MDAnalysis/topology/PQRParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
ICodes,
Masses,
Radii,
RecordTypes,
Resids,
Resnums,
Resnames,
Expand All @@ -77,6 +78,7 @@ class PQRParser(TopologyReaderBase):
- Atomnames
- Charges
- Radii
- RecordTypes (ATOM/HETATM)
- Resids
- Resnames
- Segids
Expand All @@ -90,6 +92,9 @@ class PQRParser(TopologyReaderBase):
'SYSTEM' as the new segid).
.. versionchanged:: 0.16.1
Now reads insertion codes and splits into new residues around these
.. versionchanged:: 0.17.1
Added parsing of Record types

"""
format = 'PQR'

Expand All @@ -100,6 +105,7 @@ def parse(self, **kwargs):
-------
A MDAnalysis Topology object
"""
record_types = []
serials = []
names = []
resnames = []
Expand Down Expand Up @@ -131,6 +137,7 @@ def parse(self, **kwargs):
else:
icode = ''

record_types.append(recordName)
serials.append(serial)
names.append(name)
resnames.append(resName)
Expand All @@ -151,6 +158,7 @@ def parse(self, **kwargs):
attrs.append(Charges(np.array(charges, dtype=np.float32)))
attrs.append(Atomtypes(atomtypes, guessed=True))
attrs.append(Masses(masses, guessed=True))
attrs.append(RecordTypes(np.array(record_types, dtype=object)))
attrs.append(Radii(np.array(radii, dtype=np.float32)))

resids = np.array(resids, dtype=np.int32)
Expand Down
9 changes: 9 additions & 0 deletions testsuite/MDAnalysisTests/core/test_topologyattrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,3 +488,12 @@ def test_no_deprecation(self, universe, instruction):
"""
with no_deprecated_call():
exec(instruction) #pylint: disable=W0122


def test_record_types_default():
u = make_Universe()

u.add_TopologyAttr('record_type')

assert u.atoms[0].record_type == 'ATOM'
assert_equal(u.atoms[:10].record_types, 'ATOM')
16 changes: 14 additions & 2 deletions testsuite/MDAnalysisTests/topology/test_pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from MDAnalysisTests.topology.base import ParserBase
from MDAnalysisTests.datafiles import (
PDB,
PDB_HOLE,
PDB_small,
PDB_conect,
PDB_conect2TER,
Expand All @@ -44,7 +45,7 @@ class TestPDBParser(ParserBase):
"""This one has neither chainids or segids"""
parser = mda.topology.PDBParser.PDBParser
ref_filename = PDB
expected_attrs = ['ids', 'names', 'resids', 'resnames']
expected_attrs = ['ids', 'names', 'record_types', 'resids', 'resnames']
guessed_attrs = ['types', 'masses']
expected_n_atoms = 47681
expected_n_residues = 11302
Expand All @@ -55,7 +56,8 @@ class TestPDBParserSegids(ParserBase):
"""Has segids"""
parser = mda.topology.PDBParser.PDBParser
ref_filename = PDB_small
expected_attrs = ['ids', 'names', 'resids', 'resnames', 'segids']
expected_attrs = ['ids', 'names', 'record_types', 'resids', 'resnames',
'segids']
guessed_attrs = ['types', 'masses']
expected_n_atoms = 3341
expected_n_residues = 214
Expand Down Expand Up @@ -148,3 +150,13 @@ def test_sameresid_diffresname():
for i, (resid, resname) in enumerate(zip(resids, resnames)):
assert top.resids.values[i] == resid
assert top.resnames.values[i] == resname


def test_PDB_record_types():
u = mda.Universe(PDB_HOLE)

assert u.atoms[0].record_type == 'ATOM'
assert u.atoms[132].record_type == 'HETATM'

assert_equal(u.atoms[10:20].record_types, 'ATOM')
assert_equal(u.atoms[271:].record_types, 'HETATM')
2 changes: 1 addition & 1 deletion testsuite/MDAnalysisTests/topology/test_pdbqt.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class TestPDBQT(ParserBase):
ref_filename = PDBQT_input
expected_attrs = [
'ids', 'names', 'charges', 'types', 'altLocs', 'resids', 'resnames',
'segids'
'segids', 'record_types'
]
guessed_attrs = ['masses']
expected_n_atoms = 1805
Expand Down
14 changes: 13 additions & 1 deletion testsuite/MDAnalysisTests/topology/test_pqr.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
#
from __future__ import absolute_import

from numpy.testing import assert_equal

import MDAnalysis as mda

from MDAnalysisTests.topology.base import ParserBase
Expand All @@ -33,7 +35,7 @@
class TestPQRParser(ParserBase):
parser = mda.topology.PQRParser.PQRParser
ref_filename = PQR
expected_attrs = ['ids', 'names', 'charges', 'radii',
expected_attrs = ['ids', 'names', 'charges', 'radii', 'record_types',
'resids', 'resnames', 'icodes',
'segids']
guessed_attrs = ['masses', 'types']
Expand All @@ -55,3 +57,13 @@ class TestPQRParser2(TestPQRParser):
ref_filename = PQR_icodes
expected_n_atoms = 5313
expected_n_residues = 474


def test_record_types():
u = mda.Universe(PQR_icodes)

assert u.atoms[4052].record_type == 'ATOM'
assert u.atoms[4053].record_type == 'HETATM'

assert_equal(u.atoms[:10].record_types, 'ATOM')
assert_equal(u.atoms[4060:4070].record_types, 'HETATM')