Merge pull request #5 from padix-key/main

Fix #1
biotite-dev · Jan 24, 2023 · ed91932 · ed91932
2 parents 941dc11 + 9ac38b4
commit ed91932
Show file tree

Hide file tree

Showing 4 changed files with 112 additions and 136 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -33,5 +33,5 @@ repository = "https://github.com/biotite-dev/fastpdb"
 
 
 [build-system]
-requires = ["maturin>=0.11,<0.12"]
+requires = ["maturin>=0.14,<0.15"]
 build-backend = "maturin"
diff --git a/python-src/fastpdb/__init__.py b/python-src/fastpdb/__init__.py
@@ -3,137 +3,45 @@
 __all__ = ["PDBFile"]
 __version__ = "1.0.1"
 
+import os
 import numpy as np
-import biotite
+from biotite.file import is_text
 import biotite.structure as struc
+from biotite.structure.io.pdb import PDBFile as BiotitePDBFile
 from .fastpdb import PDBFile as RustPDBFile
 
 
-class PDBFile(biotite.TextFile):
-    r"""
-    This class represents a PDB file.
-    
-    This class only provides support for reading/writing the pure atom
-    information (``ATOM``, ``HETATM``, ``MODEL`` and ``ENDMDL``
-    records).
-    ``TER`` records cannot be written.
-    
-    See also
-    --------
-    PDBxFile
-    
-    Examples
-    --------
-    Load a ``\\*.pdb`` file, modify the structure and save the new
-    structure into a new file:
-    
-    >>> import os.path
-    >>> file = PDBFile.read(os.path.join(path_to_structures, "1l2y.pdb"))
-    >>> array_stack = file.get_structure()
-    >>> array_stack_mod = rotate(array_stack, [1,2,3])
-    >>> file = PDBFile()
-    >>> file.set_structure(array_stack_mod)
-    >>> file.write(os.path.join(path_to_directory, "1l2y_mod.pdb"))
-    """
+class PDBFile(BiotitePDBFile):
 
     def __init__(self):
         super().__init__()
         self._pdb_file = RustPDBFile([])
 
-    @classmethod
-    def read(cls, file):
-        file = super().read(file)
-        # Pad lines with whitespace if lines are shorter
-        # than the required 80 characters
-        file.lines = [line.ljust(80) for line in file.lines]
-        file._pdb_file = RustPDBFile(file.lines)
-        return file
+    @staticmethod
+    def read(file):
+        pdb_file = PDBFile()
+        if isinstance(file, str):
+            pdb_file._pdb_file = RustPDBFile.read(file)
+        elif isinstance(file, bytes):
+            pdb_file._pdb_file = RustPDBFile.read(file.decode("utf-8"))
+        elif isinstance(file, os.PathLike):
+            pdb_file._pdb_file = RustPDBFile.read(str(file))
+        else:
+            if not is_text(file):
+                raise TypeError("A file opened in 'text' mode is required")
+            pdb_file._pdb_file = RustPDBFile(file.read().splitlines())
+
+        # Synchronize with PDB file representation in Rust            
+        pdb_file.lines = pdb_file._pdb_file.lines
+        return pdb_file
 
     def get_model_count(self):
-        """
-        Get the number of models contained in the PDB file.
-
-        Returns
-        -------
-        model_count : int
-            The number of models.
-        """
         return self._pdb_file.get_model_count()
+
+    def get_remark(self, number):
+        return self._pdb_file.parse_remark(int(number))
 
     def get_coord(self, model=None):
-        """
-        Get only the coordinates of the PDB file.
-        
-        Parameters
-        ----------
-        model : int, optional
-            If this parameter is given, the function will return a
-            2D coordinate array from the atoms corresponding to the
-            given model number (starting at 1).
-            Negative values are used to index models starting from the
-            last model insted of the first model.
-            If this parameter is omitted, an 2D coordinate array
-            containing all models will be returned, even if
-            the structure contains only one model.
-        
-        Returns
-        -------
-        coord : ndarray, shape=(m,n,3) or shape=(n,2), dtype=float
-            The coordinates read from the ``ATOM`` and ``HETATM``
-            records of the file.
-        
-        Notes
-        -----
-        Note that :func:`get_coord()` may output more coordinates than
-        the atom array (stack) from the corresponding
-        :func:`get_structure()` call has.
-        The reason for this is, that :func:`get_structure()` filters
-        *altloc* IDs, while `get_coord()` does not.
-        
-        Examples
-        --------
-        Read an :class:`AtomArrayStack` from multiple PDB files, where
-        each PDB file contains the same atoms but different positions.
-        This is an efficient approach when a trajectory is spread into
-        multiple PDB files, as done e.g. by the *Rosetta* modeling
-        software. 
-
-        For the purpose of this example, the PDB files are created from
-        an existing :class:`AtomArrayStack`.
-        
-        >>> import os.path
-        >>> from tempfile import gettempdir
-        >>> file_names = []
-        >>> for i in range(atom_array_stack.stack_depth()):
-        ...     pdb_file = PDBFile()
-        ...     pdb_file.set_structure(atom_array_stack[i])
-        ...     file_name = os.path.join(gettempdir(), f"model_{i+1}.pdb")
-        ...     pdb_file.write(file_name)
-        ...     file_names.append(file_name)
-        >>> print(file_names)
-        ['...model_1.pdb', '...model_2.pdb', ..., '...model_38.pdb']
-
-        Now the PDB files are used to create an :class:`AtomArrayStack`,
-        where each model represents a different model.
-
-        Construct a new :class:`AtomArrayStack` with annotations taken
-        from one of the created files used as template and coordinates
-        from all of the PDB files.
-
-        >>> template_file = PDBFile.read(file_names[0])
-        >>> template = template_file.get_structure()
-        >>> coord = []
-        >>> for i, file_name in enumerate(file_names):
-        ...     pdb_file = PDBFile.read(file_name)
-        ...     coord.append(pdb_file.get_coord(model=1))
-        >>> new_stack = from_template(template, np.array(coord))
-
-        The newly created :class:`AtomArrayStack` should now be equal to
-        the :class:`AtomArrayStack` the PDB files were created from.
-
-        >>> print(np.allclose(new_stack.coord, atom_array_stack.coord))
-        True
-        """
         if model is None:
             coord = self._pdb_file.parse_coord_multi_model()
         else:
@@ -409,5 +317,5 @@ def set_structure(self, atoms):
                 bonds.astype(np.int32, copy=False), atom_id
             )
 
-
+        # Synchronize with PDB file representation in Rust            
         self.lines = self._pdb_file.lines
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,5 +1,6 @@
 //! Low-level PDB file parsing and writing.
 
+use std::fs;
 use std::str::FromStr;
 use std::convert::TryInto;
 use std::collections::HashMap;
@@ -9,6 +10,8 @@ use pyo3::prelude::*;
 use pyo3::exceptions;
 use numpy::PyArray;
 
+
+pyo3::import_exception!(biotite, InvalidFileError);
 // Label as a separate module to indicate that this exception comes
 // from biotite
 mod biotite {
@@ -44,7 +47,20 @@ impl PDBFile {
     /// An empty `Vec` represents and empty PDB file.
     #[new]
     fn new(lines: Vec<String>) -> Self {
-        PDBFile { lines }
+        //let ljust_lines = lines.iter().map(|line| format!("{:<80}", line)).collect();
+        PDBFile { lines: lines }
+    }
+
+
+    /// Read a [`PDBFile`] from a file.
+    /// The file is indicated by its file path as `String`.
+    #[staticmethod]
+    fn read(file_path: &str) -> PyResult<Self> {
+        let contents = fs::read_to_string(file_path).map_err(
+            |_| exceptions::PyOSError::new_err(format!("'{}' cannot be read", file_path))
+        )?;
+        let lines = contents.lines().map(|line| format!("{:<80}", line)).collect();
+        Ok(PDBFile { lines: lines })
     }
 
 
@@ -54,6 +70,26 @@ impl PDBFile {
     }
 
 
+    /// Parse the given `REMARK` record of the PDB file to obtain its content as strings
+    fn parse_remark(&self, number: i64) -> PyResult<Option<Vec<String>>> {
+        const CONTENT_START_COLUMN: usize = 11;
+
+        if number < 0 || number > 999 {
+            return Err(exceptions::PyValueError::new_err("The number must be in range 0-999"));
+        }
+        let remark_string = format!("REMARK {:>3}", number);
+        let mut remark_lines: Vec<String> = self.lines.iter()
+                           .filter(|line| line.starts_with(&remark_string))
+                           .map(|line| line[CONTENT_START_COLUMN..].to_owned())
+                           .collect();
+        match remark_lines.len() {
+            0 => Ok(None),
+            // Remove first empty line
+            _ => { remark_lines.remove(0); Ok(Some(remark_lines)) }
+        }
+    }
+
+
     /// Parse the `CRYST1` record of the PDB file to obtain the unit cell lengths
     /// and angles (in degrees).
     fn parse_box(&self) -> PyResult<Option<(f32, f32, f32, f32, f32, f32)>> {
@@ -81,7 +117,7 @@ impl PDBFile {
     fn parse_coord_single_model(&self, model: isize) -> PyResult<Py<PyArray<f32, Ix2>>> {
         let array = self.parse_coord(Some(model))?;
         Python::with_gil(|py| {
-            match array{
+            match array {
                 CoordArray::Single(array) => Ok(PyArray::from_array(py, &array).to_owned()),
                 CoordArray::Multi(_) => panic!("No multi-model coordinates should be returned"),
             }
@@ -94,7 +130,7 @@ impl PDBFile {
     fn parse_coord_multi_model(&self) -> PyResult<Py<PyArray<f32, Ix3>>> {
         let array = self.parse_coord(None)?;
         Python::with_gil(|py| {
-            match array{
+            match array {
                 CoordArray::Single(_) => panic!("No single-model coordinates should be returned"),
                 CoordArray::Multi(array) => Ok(PyArray::from_array(py, &array).to_owned()),
             }
@@ -208,22 +244,20 @@ impl PDBFile {
                 occupancy[atom_i] = parse_number(&line[54..60])?;
             }
             if include_charge {
-                let mut charge_raw = line[78..80].chars();
-                let raw_number = charge_raw.next().unwrap();
-                let raw_sign = charge_raw.next().unwrap();
-                let number;
-                if raw_number == ' ' {
-                    number = 0;
-                }
-                else {
-                    number = raw_number.to_digit(10).ok_or_else( ||
-                        biotite::InvalidFileError::new_err(format!(
-                            "'{}' cannot be parsed into a number", raw_number
-                        ))
-                    )?;
+                let charge_raw = line[78..80].as_bytes();
+                // Allow both "X+" and "+X"
+                charge[atom_i] = match charge_raw[0] {
+                    b' ' => 0,
+                    b'+' => parse_digit(&charge_raw[1])?,
+                    b'-' => -parse_digit(&charge_raw[1])?,
+                    _ => match charge_raw[1] {
+                        b'+' => parse_digit(&charge_raw[0])?,
+                        b'-' => -parse_digit(&charge_raw[0])?,
+                        _ => Err(biotite::InvalidFileError::new_err(format!(
+                            "'{}' is not a valid charge identifier", &line[78..80]
+                        )))?,
+                    }
                 }
-                let sign = if raw_sign == '-' { -1 } else { 1 };
-                charge[atom_i] = number as i64 * sign;
             }
         }
 
@@ -626,6 +660,18 @@ fn parse_number<T: FromStr>(string: &str) -> PyResult<T> {
     )
 }
 
+/// Parse a byte into a digit.
+/// Returns a ``PyErr`` if the parsing fails.
+#[inline(always)]
+fn parse_digit(digit: &u8) -> PyResult<i64> {
+    (*digit as char).to_digit(10).map_or_else(
+        || Err(biotite::InvalidFileError::new_err(format!(
+            "'{}' cannot be parsed into a number", digit)
+        )),
+        |v| Ok(v as i64)
+    )
+}
+
 
 /// If a given `id` exceeds `max_id` the returned ID restarts counting at 1.
 /// Otherwise, `id` is returned.

diff --git a/tests/test_fastpdb.py b/tests/test_fastpdb.py
@@ -21,6 +21,15 @@
 TEST_STRUCTURES = glob.glob(join(DATA_PATH, "*.pdb"))
 
 
+def test_get_remark():
+    ref_file = pdb.PDBFile.read(join(DATA_PATH, "1aki.pdb"))
+
+    test_file = fastpdb.PDBFile.read(join(DATA_PATH, "1aki.pdb"))
+
+    for remark in np.arange(0, 1000):
+        assert test_file.get_remark(remark) == ref_file.get_remark(remark)
+
+
 @pytest.mark.parametrize(
     "path", TEST_STRUCTURES
 )
@@ -157,3 +166,16 @@ def test_set_structure(path, model, altloc, extra_fields, include_bonds):
 
 
     assert test_file_content.getvalue() == ref_file_content.getvalue()
+
+
+def test_get_assembly():
+    """
+    Effectively, this test checks whether inheritance works properly,
+    as `get_assembly()` is not explicitly implemented in
+    `fastpdb.PDBFile`.
+    """
+    ref_file = pdb.PDBFile.read(join(DATA_PATH, "1aki.pdb"))
+
+    test_file = fastpdb.PDBFile.read(join(DATA_PATH, "1aki.pdb"))
+
+    assert test_file.get_assembly() == ref_file.get_assembly()