Skip to content

Commit

Permalink
Merge pull request #5 from padix-key/main
Browse files Browse the repository at this point in the history
Fix #1
  • Loading branch information
padix-key authored Jan 24, 2023
2 parents 941dc11 + 9ac38b4 commit ed91932
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 136 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,5 @@ repository = "https://github.com/biotite-dev/fastpdb"


[build-system]
requires = ["maturin>=0.11,<0.12"]
requires = ["maturin>=0.14,<0.15"]
build-backend = "maturin"
142 changes: 25 additions & 117 deletions python-src/fastpdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,137 +3,45 @@
__all__ = ["PDBFile"]
__version__ = "1.0.1"

import os
import numpy as np
import biotite
from biotite.file import is_text
import biotite.structure as struc
from biotite.structure.io.pdb import PDBFile as BiotitePDBFile
from .fastpdb import PDBFile as RustPDBFile


class PDBFile(biotite.TextFile):
r"""
This class represents a PDB file.
This class only provides support for reading/writing the pure atom
information (``ATOM``, ``HETATM``, ``MODEL`` and ``ENDMDL``
records).
``TER`` records cannot be written.
See also
--------
PDBxFile
Examples
--------
Load a ``\\*.pdb`` file, modify the structure and save the new
structure into a new file:
>>> import os.path
>>> file = PDBFile.read(os.path.join(path_to_structures, "1l2y.pdb"))
>>> array_stack = file.get_structure()
>>> array_stack_mod = rotate(array_stack, [1,2,3])
>>> file = PDBFile()
>>> file.set_structure(array_stack_mod)
>>> file.write(os.path.join(path_to_directory, "1l2y_mod.pdb"))
"""
class PDBFile(BiotitePDBFile):

def __init__(self):
super().__init__()
self._pdb_file = RustPDBFile([])

@classmethod
def read(cls, file):
file = super().read(file)
# Pad lines with whitespace if lines are shorter
# than the required 80 characters
file.lines = [line.ljust(80) for line in file.lines]
file._pdb_file = RustPDBFile(file.lines)
return file
@staticmethod
def read(file):
pdb_file = PDBFile()
if isinstance(file, str):
pdb_file._pdb_file = RustPDBFile.read(file)
elif isinstance(file, bytes):
pdb_file._pdb_file = RustPDBFile.read(file.decode("utf-8"))
elif isinstance(file, os.PathLike):
pdb_file._pdb_file = RustPDBFile.read(str(file))
else:
if not is_text(file):
raise TypeError("A file opened in 'text' mode is required")
pdb_file._pdb_file = RustPDBFile(file.read().splitlines())

# Synchronize with PDB file representation in Rust
pdb_file.lines = pdb_file._pdb_file.lines
return pdb_file

def get_model_count(self):
"""
Get the number of models contained in the PDB file.
Returns
-------
model_count : int
The number of models.
"""
return self._pdb_file.get_model_count()

def get_remark(self, number):
return self._pdb_file.parse_remark(int(number))

def get_coord(self, model=None):
"""
Get only the coordinates of the PDB file.
Parameters
----------
model : int, optional
If this parameter is given, the function will return a
2D coordinate array from the atoms corresponding to the
given model number (starting at 1).
Negative values are used to index models starting from the
last model insted of the first model.
If this parameter is omitted, an 2D coordinate array
containing all models will be returned, even if
the structure contains only one model.
Returns
-------
coord : ndarray, shape=(m,n,3) or shape=(n,2), dtype=float
The coordinates read from the ``ATOM`` and ``HETATM``
records of the file.
Notes
-----
Note that :func:`get_coord()` may output more coordinates than
the atom array (stack) from the corresponding
:func:`get_structure()` call has.
The reason for this is, that :func:`get_structure()` filters
*altloc* IDs, while `get_coord()` does not.
Examples
--------
Read an :class:`AtomArrayStack` from multiple PDB files, where
each PDB file contains the same atoms but different positions.
This is an efficient approach when a trajectory is spread into
multiple PDB files, as done e.g. by the *Rosetta* modeling
software.
For the purpose of this example, the PDB files are created from
an existing :class:`AtomArrayStack`.
>>> import os.path
>>> from tempfile import gettempdir
>>> file_names = []
>>> for i in range(atom_array_stack.stack_depth()):
... pdb_file = PDBFile()
... pdb_file.set_structure(atom_array_stack[i])
... file_name = os.path.join(gettempdir(), f"model_{i+1}.pdb")
... pdb_file.write(file_name)
... file_names.append(file_name)
>>> print(file_names)
['...model_1.pdb', '...model_2.pdb', ..., '...model_38.pdb']
Now the PDB files are used to create an :class:`AtomArrayStack`,
where each model represents a different model.
Construct a new :class:`AtomArrayStack` with annotations taken
from one of the created files used as template and coordinates
from all of the PDB files.
>>> template_file = PDBFile.read(file_names[0])
>>> template = template_file.get_structure()
>>> coord = []
>>> for i, file_name in enumerate(file_names):
... pdb_file = PDBFile.read(file_name)
... coord.append(pdb_file.get_coord(model=1))
>>> new_stack = from_template(template, np.array(coord))
The newly created :class:`AtomArrayStack` should now be equal to
the :class:`AtomArrayStack` the PDB files were created from.
>>> print(np.allclose(new_stack.coord, atom_array_stack.coord))
True
"""
if model is None:
coord = self._pdb_file.parse_coord_multi_model()
else:
Expand Down Expand Up @@ -409,5 +317,5 @@ def set_structure(self, atoms):
bonds.astype(np.int32, copy=False), atom_id
)


# Synchronize with PDB file representation in Rust
self.lines = self._pdb_file.lines
82 changes: 64 additions & 18 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! Low-level PDB file parsing and writing.
use std::fs;
use std::str::FromStr;
use std::convert::TryInto;
use std::collections::HashMap;
Expand All @@ -9,6 +10,8 @@ use pyo3::prelude::*;
use pyo3::exceptions;
use numpy::PyArray;


pyo3::import_exception!(biotite, InvalidFileError);
// Label as a separate module to indicate that this exception comes
// from biotite
mod biotite {
Expand Down Expand Up @@ -44,7 +47,20 @@ impl PDBFile {
/// An empty `Vec` represents and empty PDB file.
#[new]
fn new(lines: Vec<String>) -> Self {
PDBFile { lines }
//let ljust_lines = lines.iter().map(|line| format!("{:<80}", line)).collect();
PDBFile { lines: lines }
}


/// Read a [`PDBFile`] from a file.
/// The file is indicated by its file path as `String`.
#[staticmethod]
fn read(file_path: &str) -> PyResult<Self> {
let contents = fs::read_to_string(file_path).map_err(
|_| exceptions::PyOSError::new_err(format!("'{}' cannot be read", file_path))
)?;
let lines = contents.lines().map(|line| format!("{:<80}", line)).collect();
Ok(PDBFile { lines: lines })
}


Expand All @@ -54,6 +70,26 @@ impl PDBFile {
}


/// Parse the given `REMARK` record of the PDB file to obtain its content as strings
fn parse_remark(&self, number: i64) -> PyResult<Option<Vec<String>>> {
const CONTENT_START_COLUMN: usize = 11;

if number < 0 || number > 999 {
return Err(exceptions::PyValueError::new_err("The number must be in range 0-999"));
}
let remark_string = format!("REMARK {:>3}", number);
let mut remark_lines: Vec<String> = self.lines.iter()
.filter(|line| line.starts_with(&remark_string))
.map(|line| line[CONTENT_START_COLUMN..].to_owned())
.collect();
match remark_lines.len() {
0 => Ok(None),
// Remove first empty line
_ => { remark_lines.remove(0); Ok(Some(remark_lines)) }
}
}


/// Parse the `CRYST1` record of the PDB file to obtain the unit cell lengths
/// and angles (in degrees).
fn parse_box(&self) -> PyResult<Option<(f32, f32, f32, f32, f32, f32)>> {
Expand Down Expand Up @@ -81,7 +117,7 @@ impl PDBFile {
fn parse_coord_single_model(&self, model: isize) -> PyResult<Py<PyArray<f32, Ix2>>> {
let array = self.parse_coord(Some(model))?;
Python::with_gil(|py| {
match array{
match array {
CoordArray::Single(array) => Ok(PyArray::from_array(py, &array).to_owned()),
CoordArray::Multi(_) => panic!("No multi-model coordinates should be returned"),
}
Expand All @@ -94,7 +130,7 @@ impl PDBFile {
fn parse_coord_multi_model(&self) -> PyResult<Py<PyArray<f32, Ix3>>> {
let array = self.parse_coord(None)?;
Python::with_gil(|py| {
match array{
match array {
CoordArray::Single(_) => panic!("No single-model coordinates should be returned"),
CoordArray::Multi(array) => Ok(PyArray::from_array(py, &array).to_owned()),
}
Expand Down Expand Up @@ -208,22 +244,20 @@ impl PDBFile {
occupancy[atom_i] = parse_number(&line[54..60])?;
}
if include_charge {
let mut charge_raw = line[78..80].chars();
let raw_number = charge_raw.next().unwrap();
let raw_sign = charge_raw.next().unwrap();
let number;
if raw_number == ' ' {
number = 0;
}
else {
number = raw_number.to_digit(10).ok_or_else( ||
biotite::InvalidFileError::new_err(format!(
"'{}' cannot be parsed into a number", raw_number
))
)?;
let charge_raw = line[78..80].as_bytes();
// Allow both "X+" and "+X"
charge[atom_i] = match charge_raw[0] {
b' ' => 0,
b'+' => parse_digit(&charge_raw[1])?,
b'-' => -parse_digit(&charge_raw[1])?,
_ => match charge_raw[1] {
b'+' => parse_digit(&charge_raw[0])?,
b'-' => -parse_digit(&charge_raw[0])?,
_ => Err(biotite::InvalidFileError::new_err(format!(
"'{}' is not a valid charge identifier", &line[78..80]
)))?,
}
}
let sign = if raw_sign == '-' { -1 } else { 1 };
charge[atom_i] = number as i64 * sign;
}
}

Expand Down Expand Up @@ -626,6 +660,18 @@ fn parse_number<T: FromStr>(string: &str) -> PyResult<T> {
)
}

/// Parse a byte into a digit.
/// Returns a ``PyErr`` if the parsing fails.
#[inline(always)]
fn parse_digit(digit: &u8) -> PyResult<i64> {
(*digit as char).to_digit(10).map_or_else(
|| Err(biotite::InvalidFileError::new_err(format!(
"'{}' cannot be parsed into a number", digit)
)),
|v| Ok(v as i64)
)
}


/// If a given `id` exceeds `max_id` the returned ID restarts counting at 1.
/// Otherwise, `id` is returned.
Expand Down
22 changes: 22 additions & 0 deletions tests/test_fastpdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,15 @@
TEST_STRUCTURES = glob.glob(join(DATA_PATH, "*.pdb"))


def test_get_remark():
ref_file = pdb.PDBFile.read(join(DATA_PATH, "1aki.pdb"))

test_file = fastpdb.PDBFile.read(join(DATA_PATH, "1aki.pdb"))

for remark in np.arange(0, 1000):
assert test_file.get_remark(remark) == ref_file.get_remark(remark)


@pytest.mark.parametrize(
"path", TEST_STRUCTURES
)
Expand Down Expand Up @@ -157,3 +166,16 @@ def test_set_structure(path, model, altloc, extra_fields, include_bonds):


assert test_file_content.getvalue() == ref_file_content.getvalue()


def test_get_assembly():
"""
Effectively, this test checks whether inheritance works properly,
as `get_assembly()` is not explicitly implemented in
`fastpdb.PDBFile`.
"""
ref_file = pdb.PDBFile.read(join(DATA_PATH, "1aki.pdb"))

test_file = fastpdb.PDBFile.read(join(DATA_PATH, "1aki.pdb"))

assert test_file.get_assembly() == ref_file.get_assembly()

0 comments on commit ed91932

Please sign in to comment.