teamtomo · alisterburt · Dec 10, 2024 · Nov 20, 2024
diff --git a/README.md b/README.md
@@ -6,8 +6,8 @@
 [![CI](https://github.com/teamtomo/mmdf/actions/workflows/ci.yml/badge.svg)](https://github.com/teamtomo/mmdf/actions/workflows/ci.yml)
 [![codecov](https://codecov.io/gh/teamtomo/mmdf/branch/main/graph/badge.svg)](https://codecov.io/gh/teamtomo/mmdf)
 
-**M**acro**M**olecular **D**ata**F**rames (`mmdf`) is a small package for reading macromolecular structure files
-(.pdb/.mmCIF) into pandas dataframes.
+**M**acro**M**olecular **D**ata**F**rames (`mmdf`) is a small package for reading and writing macromolecular structure files
+(.pdb/.mmCIF) using pandas dataframes.
 
 The heavy lifting of reading structure files is performed by [gemmi](https://gemmi.readthedocs.io/en/latest/).
 
@@ -16,6 +16,7 @@ The heavy lifting of reading structure files is performed by [gemmi](https://gem
 ```ipython
 import mmdf
 
+# Read a PDBx/mmCIF file into a dataframe
 df = mmdf.read('4v6x.cif')
 df.head()
 Out[3]: 
@@ -26,6 +27,11 @@ Out[3]:
 3     1    Az     ASN           3  ... -53.007      0       1.0         10.0
 4     1    Az     ASN           3  ... -54.239      0       1.0         10.0
 [5 rows x 13 columns]
+
+# Other dataframe manipulation...
+
+# Write dataframe to a PDBx/mmCIF file
+mmdf.write('4v6x_new.cif', df)
 ```
 
 ## Changelog

diff --git a/src/mmdf/__init__.py b/src/mmdf/__init__.py
@@ -1,5 +1,5 @@
 """Macromolecular structures as pandas dataframes."""
 
-from .functions import read
+from .functions import read, write
 
-__all__ = ["read"]
+__all__ = ["read", "write"]
diff --git a/src/mmdf/_gemmi_utils.py b/src/mmdf/_gemmi_utils.py
@@ -1,28 +1,30 @@
 import gemmi
 import pandas as pd
 
+HEADINGS = [
+    "model",
+    "chain",
+    "residue",
+    "residue_id",
+    "atom",
+    "element",
+    "atomic_number",
+    "atomic_weight",
+    "covalent_radius",
+    "van_der_waals_radius",
+    "heteroatom_flag",
+    "x",
+    "y",
+    "z",
+    "charge",
+    "occupancy",
+    "b_isotropic",
+]
+
 
 def structure_to_df(structure: gemmi.Structure) -> pd.DataFrame:
     dfs = []
-    headings = [
-        "model",
-        "chain",
-        "residue",
-        "residue_id",
-        "atom",
-        "element",
-        "atomic_number",
-        "atomic_weight",
-        "covalent_radius",
-        "van_der_waals_radius",
-        "heteroatom_flag",
-        "x",
-        "y",
-        "z",
-        "charge",
-        "occupancy",
-        "b_isotropic",
-    ]
+
     for model in structure:
         data = [
             (
@@ -46,6 +48,53 @@ def structure_to_df(structure: gemmi.Structure) -> pd.DataFrame:
             )
             for cra in model.all()
         ]
-        df = pd.DataFrame(data, columns=headings)
+        df = pd.DataFrame(data, columns=HEADINGS)
         dfs.append(df)
     return pd.concat(dfs)
+
+
+def df_to_structure(df: pd.DataFrame, structure_name: str = "") -> gemmi.Structure:
+    """Helper function to convert a DataFrame to a gemmi.Structure."""
+    # Check for the required columns
+    required_columns = set(HEADINGS)
+    if not required_columns.issubset(df.columns):
+        missing = required_columns - set(df.columns)
+        raise ValueError(f"Missing required columns: {missing}")
+
+    structure = gemmi.Structure()
+    structure.name = structure_name
+
+    # Nested iteration over model, chain, residue, and atom to populate the
+    # Structure object
+    for _, model_df in df.groupby("model"):
+        model = gemmi.Model(model_df["model"].iloc[0])
+
+        for _, chain_df in model_df.groupby("chain"):
+            chain = gemmi.Chain(chain_df["chain"].iloc[0])
+
+            for _, residue_df in chain_df.groupby("residue_id"):
+                residue = gemmi.Residue()
+                residue.name = residue_df["residue"].iloc[0]
+                residue.seqid.num = residue_df["residue_id"].iloc[0]
+                residue.het_flag = residue_df["heteroatom_flag"].iloc[0]
+
+                for _, atom_row in residue_df.iterrows():
+                    atom = gemmi.Atom()
+                    atom.name = atom_row["atom"]
+                    atom.element = gemmi.Element(atom_row["element"])
+                    atom.pos = gemmi.Position(
+                        atom_row["x"], atom_row["y"], atom_row["z"]
+                    )
+                    atom.charge = atom_row["charge"]
+                    atom.occ = atom_row["occupancy"]
+                    atom.b_iso = atom_row["b_isotropic"]
+
+                    residue.add_atom(atom)
+
+                chain.add_residue(residue)
+
+            model.add_chain(chain)
+
+        structure.add_model(model)
+
+    return structure
diff --git a/src/mmdf/functions.py b/src/mmdf/functions.py
@@ -1,13 +1,43 @@
 """functions provided by the package."""
+
 import os
 
 import gemmi
 import pandas as pd
 
-from ._gemmi_utils import structure_to_df
+from ._gemmi_utils import df_to_structure, structure_to_df
 
 
 def read(filename: os.PathLike) -> pd.DataFrame:
     """Read a macromolecular structure file into a pandas DataFrame."""
     structure = gemmi.read_structure(str(filename))
     return structure_to_df(structure)
+
+
+def write(
+    filename: os.PathLike,
+    df: pd.DataFrame,
+    structure_name: str = "",
+    pdb_write_options: gemmi.PdbWriteOptions = None,
+) -> None:
+    """Write a pandas DataFrame to a macromolecular structure file.
+
+    Parameters
+    ----------
+    filename (os.PathLike): The file to write the DataFrame to.
+    df (pd.DataFrame): The reference DataFrame containing the structure data.
+    structure_name (str): Optional name to assign to the structure. Defaults to
+        an empty string.
+    pdb_write_options (gemmi.PdbWriteOptions): Optional PdbWriteOptions object
+        to control the output format. See the gemmi documentation for more
+        details. Defaults to None.
+
+    Returns
+    -------
+    None
+    """
+    if pdb_write_options is None:
+        pdb_write_options = gemmi.PdbWriteOptions()
+
+    structure = df_to_structure(df)
+    structure.write_pdb(str(filename), pdb_write_options)
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -11,3 +11,8 @@ def test_data_directory() -> Path:
 @pytest.fixture
 def test_pdb_file(test_data_directory) -> Path:
     return test_data_directory / "4v6x.cif"
+
+
+@pytest.fixture
+def test_output_file(test_data_directory) -> Path:
+    return test_data_directory / "test_output.pdb"
diff --git a/tests/test_functions.py b/tests/test_functions.py
@@ -12,3 +12,24 @@ def test_read(test_pdb_file):
         warnings.simplefilter("ignore", DeprecationWarning)
         df = mmdf.read(test_pdb_file)
     assert isinstance(df, pd.DataFrame)
+
+
+def test_write(test_pdb_file, test_output_file):
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", DeprecationWarning)
+        df = mmdf.read(test_pdb_file)
+        mmdf.write(test_output_file, df)
+
+        assert test_output_file.exists()
+
+        df2 = mmdf.read(test_output_file)
+
+        assert isinstance(df2, pd.DataFrame)
+
+    # Sort the dataframes by the chain column to ensure comparison
+    df = df.sort_values(by="chain", kind="mergesort").reset_index(drop=True)
+    df2 = df2.sort_values(by="chain", kind="mergesort").reset_index(drop=True)
+
+    assert df.equals(df2)
+
+    test_output_file.unlink()