Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support BinaryCIF file format #531

Merged
merged 14 commits into from
Mar 1, 2024
38 changes: 38 additions & 0 deletions doc/apidoc.json
Original file line number Diff line number Diff line change
Expand Up @@ -342,5 +342,43 @@
"dot_bracket_from_structure",
"base_pairs_from_dot_bracket"
]
},

"biotite.structure.io.pdbx" : {
"High-level functionality": [
"get_sequence",
"get_model_count",
"get_structure",
"set_structure",
"get_component",
"set_component",
"list_assemblies",
"get_assembly"

],
"CIF format" : [
"CIFFile",
"CIFBlock",
"CIFCategory",
"CIFColumn",
"CIFData"
],
"BinaryCIF format" : [
"BinaryCIFFile",
"BinaryCIFBlock",
"BinaryCIFCategory",
"BinaryCIFColumn",
"BinaryCIFData"
],
"BinaryCIF encodings" : [
"ByteArrayEncoding",
"FixedPointEncoding",
"IntervalQuantizationEncoding",
"RunLengthEncoding",
"DeltaEncoding",
"IntegerPackingEncoding",
"StringArrayEncoding",
"TypeCode"
]
}
}
11 changes: 4 additions & 7 deletions doc/examples/scripts/sequence/residue_coevolution.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,18 +49,15 @@
import biotite
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.application.blast as blast
import biotite.application.clustalo as clustalo
import biotite.database.rcsb as rcsb
import biotite.database.entrez as entrez


# Get structure and sequence
pdbx_file = pdbx.PDBxFile.read(rcsb.fetch("1GUU", "mmcif"))
pdbx_file = pdbx.CIFFile.read(rcsb.fetch("1GUU", "mmcif"))
sequence = pdbx.get_sequence(pdbx_file)[0]
# 'use_author_fields' is set to false,
# to ensure that values in the 'res_id' annotation point to the sequence
Expand Down Expand Up @@ -117,7 +114,7 @@ def some_func(x, start=start):
def mutual_information_zscore(alignment, n_shuffle=100):
codes = align.get_codes(alignment).T
alph = alignment.sequences[0].alphabet

mi = _mutual_information(codes, alph)
np.random.seed(0)
random_mi = [None] * n_shuffle
Expand Down Expand Up @@ -158,13 +155,13 @@ def _mutual_information(codes, alph):
marginal_probs_i = marginal_counts_i / nrows
marginal_probs_j = marginal_counts_j / nrows
combined_probs = combined_counts / nrows

with warnings.catch_warnings():
warnings.simplefilter("ignore")
mi_before_sum = (
combined_probs * np.log2(
combined_probs / (
marginal_probs_i[:, np.newaxis] *
marginal_probs_i[:, np.newaxis] *
marginal_probs_j[np.newaxis, :]
)
)
Expand Down
10 changes: 5 additions & 5 deletions doc/examples/scripts/structure/biological_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,16 @@
Often the biological assembly (or biological unit) reveals the complete
picture of a protein function, may it be a viral capsid or a
microfilament.
However, the usual records in an *PDB*/*mmCIF*/*MMTF* file usually
However, the usual atom records in a *PDB* or *PDBx* file usually
describe only the asymmetric unit.
For large complexes the asymmetric unit may only display one monomer or
one small subcomplex.
Multiple copies of the asymmetric unit must be geometrically arranged to
build the assembly.

In order to get the entire assembly, the *mmCIF* files provided by the
*RCSB PDB* contain the following fields:
In order to get the entire assembly, the *PDBx* files provided by the
*RCSB PDB* (either in *CIF* or *BinaryCIF* format) contain the following
fields:

- ``pdbx_struct_assembly`` - General information about the
assemblies
Expand All @@ -37,14 +38,13 @@
# License: BSD 3 clause

from tempfile import NamedTemporaryFile
import numpy as np
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
import biotite.structure.io as strucio
import biotite.database.rcsb as rcsb


pdbx_file = pdbx.PDBxFile.read(rcsb.fetch("3J31", "mmcif"))
pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch("3J31", "bcif"))

assemblies = pdbx.list_assemblies(pdbx_file)
print("ID name")
Expand Down
10 changes: 5 additions & 5 deletions doc/examples/scripts/structure/ku_superimposition.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
ku_file_name = ku_file.name

# Download and parse structure files
ku_dna = pdbx.get_structure(pdbx.PDBxFile.read(rcsb.fetch("1JEY", "cif")))[0]
ku = pdbx.get_structure(pdbx.PDBxFile.read(rcsb.fetch("1JEQ", "cif")))[0]
ku_dna = pdbx.get_structure(pdbx.CIFFile.read(rcsb.fetch("1JEY", "cif")))[0]
ku = pdbx.get_structure(pdbx.CIFFile.read(rcsb.fetch("1JEQ", "cif")))[0]
# Remove DNA and water
ku_dna = ku_dna[(ku_dna.chain_id == "A") | (ku_dna.chain_id == "B")]
ku_dna = ku_dna[~struc.filter_solvent(ku_dna)]
Expand All @@ -48,11 +48,11 @@
# We do not want the cropped structures
# -> apply superimposition on original structures
ku_superimposed = transformation.apply(ku)
# Write PDBx files as input for PyMOL
cif_file = pdbx.PDBxFile()
# Write mmCIF files as input for PyMOL
cif_file = pdbx.CIFFile()
pdbx.set_structure(cif_file, ku_dna, data_block="ku_dna")
cif_file.write(ku_dna_file_name)
cif_file = pdbx.PDBxFile()
cif_file = pdbx.CIFFile()
pdbx.set_structure(cif_file, ku_superimposed, data_block="ku")
cif_file.write(ku_file_name)
# Visualization with PyMOL...
Expand Down
51 changes: 26 additions & 25 deletions doc/examples/scripts/structure/sheet_arrangement.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
This script plots the arrangements of strands in selected β-sheets of a
protein structure.
The information is entirely taken from the ``struct_sheet_order`` and
``struct_sheet_range`` categories of the structure's *PDBx/mmCIF* file.
``struct_sheet_range`` categories of the corresponding *PDBx* file
in *BinaryCIF* format.

In this case the β-barrel of a split fluorescent protein is shown,
but the script can be customized to show the β-sheets of any protein
Expand Down Expand Up @@ -42,10 +43,10 @@
biotite.colors["lightgreen"],
biotite.colors["brightorange"],
]
CONNECTION_COLOR = "black" # Color of the connection lines
CONNECTION_LINE_WIDTH = 1.5 # Width of the connection lines
CONNECTION_HEIGHT = 0.1 # Minimum height of the connection lines
CONNECTION_SEPARATION = 0.1 # Minimum vertical distance between the connection lines
CONNECTION_COLOR = "black" # Color of the connection lines
CONNECTION_LINE_WIDTH = 1.5 # Width of the connection lines
CONNECTION_HEIGHT = 0.1 # Minimum height of the connection lines
CONNECTION_SEPARATION = 0.1 # Minimum vertical distance between the connection lines
RES_ID_HEIGHT = -0.2 # The vertical distance of the residue ID labels from the arrow ends
RES_ID_FONT_SIZE = 8 # The font size of the residue ID labels
RES_ID_FONT_WEIGHT = "bold" # The font weight of the residue ID labels
Expand All @@ -55,34 +56,34 @@
##### SNOITPO #####

########################################################################
# The ``struct_sheet_order`` category of the *mmCIF* file gives us the
# information about the existing sheets, the strands these sheets
# The ``struct_sheet_order`` category of the *BinaryCIF* file gives us
# the information about the existing sheets, the strands these sheets
# contain and which of these strands are connected with one another
# in either parallel or anti-parallel orientation.
#
# We can use this to select only the strands that belong to the sheets
# we are interested in.
# The strand adjacency and relative orientation is also saved for later.

pdbx_file = pdbx.PDBxFile.read(rcsb.fetch(PDB_ID, "pdbx"))
sheet_order_dict = pdbx_file["struct_sheet_order"]
bcif_file = pdbx.BinaryCIFFile.read(rcsb.fetch(PDB_ID, "bcif"))
sheet_order = bcif_file.block["struct_sheet_order"]

# Create a boolean mask that covers the selected sheets
# or all sheets if none is given
if SHEETS is None:
sele = np.full(len(sheet_order_dict["sheet_id"]), True)
sele = np.full(sheet_order.row_count, True)
else:
sele = np.array([
sheet in SHEETS for sheet in sheet_order_dict["sheet_id"]
sheet in SHEETS for sheet in sheet_order["sheet_id"].as_array()
])
sheet_ids = sheet_order_dict["sheet_id"][sele]
sheet_ids = sheet_order["sheet_id"].as_array()[sele]

is_parallel_list = sheet_order_dict["sense"][sele] == "parallel"
is_parallel_list = sheet_order["sense"].as_array()[sele] == "parallel"

adjacent_strands = np.array([
(strand_i, strand_j) for strand_i, strand_j in zip(
sheet_order_dict["range_id_1"][sele],
sheet_order_dict["range_id_2"][sele]
sheet_order["range_id_1"].as_array()[sele],
sheet_order["range_id_2"].as_array()[sele]
)
])

Expand All @@ -94,30 +95,30 @@
# The ``struct_sheet_range`` category of the *BinaryCIF* file tells us
# which residues compose each strand in terms of chain and
# residue IDs.
#
#
# Later the plot shall display connections between consecutive strands
# in a protein chain.
# Although this category does not provide this connection information
# directly, we can sort the strands by their beginning chain and residue
# IDs and then simply connect successive entries.

sheet_range_dict = pdbx_file["struct_sheet_range"]
sheet_range = bcif_file.block["struct_sheet_range"]

# Again, create a boolean mask that covers the selected sheets
sele = np.array([
sheet in sheet_ids for sheet in sheet_range_dict["sheet_id"]
sheet in sheet_ids for sheet in sheet_range["sheet_id"].as_array()
])
strand_chain_ids = sheet_range_dict["beg_auth_asym_id"][sele]
strand_res_id_begs = sheet_range_dict["beg_auth_seq_id"].astype(int)[sele]
strand_res_id_ends = sheet_range_dict["end_auth_seq_id"].astype(int)[sele]
strand_chain_ids = sheet_range["beg_auth_asym_id"].as_array()[sele]
strand_res_id_begs = sheet_range["beg_auth_seq_id"].as_array(int)[sele]
strand_res_id_ends = sheet_range["end_auth_seq_id"].as_array(int)[sele]

# Secondarily sort by residue ID
order = np.argsort(strand_res_id_begs, kind="stable")
# Primarily sort by chain ID
order = order[np.argsort(strand_chain_ids[order], kind="stable")]

sorted_strand_ids = sheet_range_dict["id"][sele][order]
sorted_sheet_ids = sheet_range_dict["sheet_id"][sele][order]
sorted_strand_ids = sheet_range["id"].as_array()[sele][order]
sorted_sheet_ids = sheet_range["sheet_id"].as_array()[sele][order]
sorted_chain_ids = strand_chain_ids[order]
sorted_res_id_begs = strand_res_id_begs[order]
sorted_res_id_ends = strand_res_id_ends[order]
Expand Down Expand Up @@ -297,7 +298,7 @@
# separable
# Plot the short connections at low height
# to decrease line intersections
# -> sort connections by length of connection
# -> sort connections by length of connection
order = np.argsort([
np.abs(coord_dict[strand_i][0][0] - coord_dict[strand_j][0][0])
for strand_i, strand_j in connections
Expand All @@ -307,7 +308,7 @@
horizontal_line_height = 1 + CONNECTION_HEIGHT + i * CONNECTION_SEPARATION
coord_i_beg, coord_i_end = coord_dict[strand_i]
coord_j_beg, coord_j_end = coord_dict[strand_j]

if np.sign(coord_i_end[1]) == np.sign(coord_j_beg[1]):
# Start and end are on the same side of the arrows
x = (
Expand Down
16 changes: 8 additions & 8 deletions doc/tutorial/src/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
The base class for all interfaces is the :class:`Application` class.
Each :class:`Application` instance has a life cycle, starting with its
creation and ending with the result extraction.
Each state in this life cycle is described by the value of the
Each state in this life cycle is described by the value of the
*enum* :class:`AppState`, that each :class:`Application` contains:
Directly after its instantiation the app is in the ``CREATED`` state.
In this state further parameters can be set for the application run.
Expand Down Expand Up @@ -66,7 +66,7 @@
class MyApplication(Application):
def __init__(self, param): super().__init__()
def run(self): pass
def is_finished(self): return True
def is_finished(self): return True
def wait_interval(self): return 0.1
def evaluate(self): pass
def clean_up(self): pass
Expand All @@ -85,7 +85,7 @@ def get_some_data(self): return "some data"
########################################################################
# The following subsections will dive into the available
# :class:`Application` classes in depth.
#
#
# Finding homologous sequences with BLAST
# ---------------------------------------
#
Expand Down Expand Up @@ -126,8 +126,8 @@ def get_some_data(self): return "some data"
# :class:`biotite.sequence.align.Alignment`.
# It contains some additional information as shown above.
# The hit UID can be used to obtain the complete hit sequence via
# :mod:`biotite.database.entrez`.
#
# :mod:`biotite.database.entrez`.
#
# The next alignment should be a bit more challenging.
# We take a random part of the *E. coli* BL21 genome and distort it a
# little bit.
Expand Down Expand Up @@ -164,7 +164,7 @@ def get_some_data(self): return "some data"
# of conduct and prevents you from submitting two queries within one
# minute. If you want to be rude to the NCBI server, create the
# instance with :obj:`obey_rules` set to false.
#
#
# Multiple sequence alignments
# ----------------------------
#
Expand Down Expand Up @@ -263,7 +263,7 @@ def get_some_data(self): return "some data"
########################################################################
# Secondary structure annotation
# ------------------------------
#
#
# .. currentmodule:: biotite.application.dssp
#
# Although :mod:`biotite.structure` offers the function
Expand All @@ -279,7 +279,7 @@ def get_some_data(self): return "some data"
import biotite.application.dssp as dssp
import biotite.structure.io as strucio

file_path = rcsb.fetch("1l2y", "mmtf", gettempdir())
file_path = rcsb.fetch("1l2y", "bcif", gettempdir())
stack = strucio.load_structure(file_path)
array = stack[0]
app = dssp.DsspApp(array)
Expand Down
16 changes: 8 additions & 8 deletions doc/tutorial/src/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@
# If you want to download files irrespectively, set :obj:`overwrite` to
# true.

# Download file in the fast and small binary MMTF format
file_path = rcsb.fetch("1l2y", "mmtf", gettempdir(), overwrite=True)
# Download file in the fast and small BinaryCIF format
file_path = rcsb.fetch("1l2y", "bcif", gettempdir(), overwrite=True)

########################################################################
# If you omit the file path or set it to ``None``, the downloaded data
Expand All @@ -72,11 +72,11 @@
pdb_ids = rcsb.search(query)
print(pdb_ids)
print(rcsb.count(query))
files = rcsb.fetch(pdb_ids, "mmtf", gettempdir())
files = rcsb.fetch(pdb_ids, "cif", gettempdir())

########################################################################
# This was a simple search for the occurrence of the search term in any
# field.
# field.
# You can also search for a value in a specific field with a
# :class:`FieldQuery`.
# A complete list of the available fields and their supported operators
Expand Down Expand Up @@ -125,15 +125,15 @@
########################################################################
# Note that grouping may omit PDB IDs in search results, if such PDB IDs
# cannot be grouped.
# In the example shown above, not all structures
# In the example shown above, not all structures
# could be grouped: only a few PDB entries were
# uploaded as a collection and hence are part of the search results.
#
# Fetching files from the NCBI Entrez database
# --------------------------------------------
#
#
# .. currentmodule:: biotite.database.entrez
#
#
# Another important source of biological information is the
# *NCBI Entrez* database, which is commonly known as *the NCBI*.
# It provides a myriad of information, ranging from sequences and
Expand All @@ -153,7 +153,7 @@
db_name="nuccore", ret_type="fasta"
)
print(file_path)
# ... or multiple UIDs
# ... or multiple UIDs
file_paths = entrez.fetch(
["1L2Y_A","1AKI_A"], gettempdir(), suffix="fa",
db_name="protein", ret_type="fasta"
Expand Down
Loading
Loading