Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add functionality to export a dataframe into a .pdb file #8

Merged
merged 1 commit into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
[![CI](https://github.com/teamtomo/mmdf/actions/workflows/ci.yml/badge.svg)](https://github.com/teamtomo/mmdf/actions/workflows/ci.yml)
[![codecov](https://codecov.io/gh/teamtomo/mmdf/branch/main/graph/badge.svg)](https://codecov.io/gh/teamtomo/mmdf)

**M**acro**M**olecular **D**ata**F**rames (`mmdf`) is a small package for reading macromolecular structure files
(.pdb/.mmCIF) into pandas dataframes.
**M**acro**M**olecular **D**ata**F**rames (`mmdf`) is a small package for reading and writing macromolecular structure files
(.pdb/.mmCIF) using pandas dataframes.

The heavy lifting of reading structure files is performed by [gemmi](https://gemmi.readthedocs.io/en/latest/).

Expand All @@ -16,6 +16,7 @@ The heavy lifting of reading structure files is performed by [gemmi](https://gem
```ipython
import mmdf

# Read a PDBx/mmCIF file into a dataframe
df = mmdf.read('4v6x.cif')
df.head()
Out[3]:
Expand All @@ -26,6 +27,11 @@ Out[3]:
3 1 Az ASN 3 ... -53.007 0 1.0 10.0
4 1 Az ASN 3 ... -54.239 0 1.0 10.0
[5 rows x 13 columns]

# Other dataframe manipulation...

# Write dataframe to a PDBx/mmCIF file
mmdf.write('4v6x_new.cif', df)
```

## Changelog
Expand Down
4 changes: 2 additions & 2 deletions src/mmdf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Macromolecular structures as pandas dataframes."""

from .functions import read
from .functions import read, write

__all__ = ["read"]
__all__ = ["read", "write"]
89 changes: 69 additions & 20 deletions src/mmdf/_gemmi_utils.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,30 @@
import gemmi
import pandas as pd

HEADINGS = [
"model",
"chain",
"residue",
"residue_id",
"atom",
"element",
"atomic_number",
"atomic_weight",
"covalent_radius",
"van_der_waals_radius",
"heteroatom_flag",
"x",
"y",
"z",
"charge",
"occupancy",
"b_isotropic",
]


def structure_to_df(structure: gemmi.Structure) -> pd.DataFrame:
dfs = []
headings = [
"model",
"chain",
"residue",
"residue_id",
"atom",
"element",
"atomic_number",
"atomic_weight",
"covalent_radius",
"van_der_waals_radius",
"heteroatom_flag",
"x",
"y",
"z",
"charge",
"occupancy",
"b_isotropic",
]

for model in structure:
data = [
(
Expand All @@ -46,6 +48,53 @@ def structure_to_df(structure: gemmi.Structure) -> pd.DataFrame:
)
for cra in model.all()
]
df = pd.DataFrame(data, columns=headings)
df = pd.DataFrame(data, columns=HEADINGS)
dfs.append(df)
return pd.concat(dfs)


def df_to_structure(df: pd.DataFrame, structure_name: str = "") -> gemmi.Structure:
"""Helper function to convert a DataFrame to a gemmi.Structure."""
# Check for the required columns
required_columns = set(HEADINGS)
if not required_columns.issubset(df.columns):
missing = required_columns - set(df.columns)
raise ValueError(f"Missing required columns: {missing}")

structure = gemmi.Structure()
structure.name = structure_name

# Nested iteration over model, chain, residue, and atom to populate the
# Structure object
for _, model_df in df.groupby("model"):
model = gemmi.Model(model_df["model"].iloc[0])

for _, chain_df in model_df.groupby("chain"):
chain = gemmi.Chain(chain_df["chain"].iloc[0])

for _, residue_df in chain_df.groupby("residue_id"):
residue = gemmi.Residue()
residue.name = residue_df["residue"].iloc[0]
residue.seqid.num = residue_df["residue_id"].iloc[0]
residue.het_flag = residue_df["heteroatom_flag"].iloc[0]

for _, atom_row in residue_df.iterrows():
atom = gemmi.Atom()
atom.name = atom_row["atom"]
atom.element = gemmi.Element(atom_row["element"])
atom.pos = gemmi.Position(
atom_row["x"], atom_row["y"], atom_row["z"]
)
atom.charge = atom_row["charge"]
atom.occ = atom_row["occupancy"]
atom.b_iso = atom_row["b_isotropic"]

residue.add_atom(atom)

chain.add_residue(residue)

model.add_chain(chain)

structure.add_model(model)

return structure
32 changes: 31 additions & 1 deletion src/mmdf/functions.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,43 @@
"""functions provided by the package."""

import os

import gemmi
import pandas as pd

from ._gemmi_utils import structure_to_df
from ._gemmi_utils import df_to_structure, structure_to_df


def read(filename: os.PathLike) -> pd.DataFrame:
"""Read a macromolecular structure file into a pandas DataFrame."""
structure = gemmi.read_structure(str(filename))
return structure_to_df(structure)


def write(
filename: os.PathLike,
df: pd.DataFrame,
structure_name: str = "",
pdb_write_options: gemmi.PdbWriteOptions = None,
) -> None:
"""Write a pandas DataFrame to a macromolecular structure file.

Parameters
----------
filename (os.PathLike): The file to write the DataFrame to.
df (pd.DataFrame): The reference DataFrame containing the structure data.
structure_name (str): Optional name to assign to the structure. Defaults to
an empty string.
pdb_write_options (gemmi.PdbWriteOptions): Optional PdbWriteOptions object
to control the output format. See the gemmi documentation for more
details. Defaults to None.

Returns
-------
None
"""
if pdb_write_options is None:
pdb_write_options = gemmi.PdbWriteOptions()

structure = df_to_structure(df)
structure.write_pdb(str(filename), pdb_write_options)
5 changes: 5 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,8 @@ def test_data_directory() -> Path:
@pytest.fixture
def test_pdb_file(test_data_directory) -> Path:
return test_data_directory / "4v6x.cif"


@pytest.fixture
def test_output_file(test_data_directory) -> Path:
return test_data_directory / "test_output.pdb"
21 changes: 21 additions & 0 deletions tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,24 @@ def test_read(test_pdb_file):
warnings.simplefilter("ignore", DeprecationWarning)
df = mmdf.read(test_pdb_file)
assert isinstance(df, pd.DataFrame)


def test_write(test_pdb_file, test_output_file):
with warnings.catch_warnings():
warnings.simplefilter("ignore", DeprecationWarning)
df = mmdf.read(test_pdb_file)
mmdf.write(test_output_file, df)

assert test_output_file.exists()

df2 = mmdf.read(test_output_file)

assert isinstance(df2, pd.DataFrame)

# Sort the dataframes by the chain column to ensure comparison
df = df.sort_values(by="chain", kind="mergesort").reset_index(drop=True)
df2 = df2.sort_values(by="chain", kind="mergesort").reset_index(drop=True)

assert df.equals(df2)

test_output_file.unlink()