biotite-dev · padix-key · Mar 1, 2024 · Jan 25, 2024 · Jan 25, 2024 · Jan 25, 2024
diff --git a/doc/apidoc.json b/doc/apidoc.json
@@ -342,5 +342,43 @@
             "dot_bracket_from_structure",
             "base_pairs_from_dot_bracket"
         ]
+    },
+
+    "biotite.structure.io.pdbx" : {
+        "High-level functionality": [
+            "get_sequence",
+            "get_model_count",
+            "get_structure",
+            "set_structure",
+            "get_component",
+            "set_component",
+            "list_assemblies",
+            "get_assembly"
+
+        ],
+        "CIF format" : [
+            "CIFFile",
+            "CIFBlock",
+            "CIFCategory",
+            "CIFColumn",
+            "CIFData"
+        ],
+        "BinaryCIF format" : [
+            "BinaryCIFFile",
+            "BinaryCIFBlock",
+            "BinaryCIFCategory",
+            "BinaryCIFColumn",
+            "BinaryCIFData"
+        ],
+        "BinaryCIF encodings" : [
+            "ByteArrayEncoding",
+            "FixedPointEncoding",
+            "IntervalQuantizationEncoding",
+            "RunLengthEncoding",
+            "DeltaEncoding",
+            "IntegerPackingEncoding",
+            "StringArrayEncoding",
+            "TypeCode"
+        ]
     }
 }
diff --git a/doc/examples/scripts/sequence/residue_coevolution.py b/doc/examples/scripts/sequence/residue_coevolution.py
@@ -49,18 +49,15 @@
 import biotite
 import biotite.structure as struc
 import biotite.structure.io.pdbx as pdbx
-import biotite.sequence as seq
-import biotite.sequence.io.fasta as fasta
 import biotite.sequence.align as align
 import biotite.sequence.graphics as graphics
 import biotite.application.blast as blast
 import biotite.application.clustalo as clustalo
 import biotite.database.rcsb as rcsb
-import biotite.database.entrez as entrez
 
 
 # Get structure and sequence
-pdbx_file = pdbx.PDBxFile.read(rcsb.fetch("1GUU", "mmcif"))
+pdbx_file = pdbx.CIFFile.read(rcsb.fetch("1GUU", "mmcif"))
 sequence = pdbx.get_sequence(pdbx_file)[0]
 # 'use_author_fields' is set to false,
 # to ensure that values in the 'res_id' annotation point to the sequence
@@ -117,7 +114,7 @@ def some_func(x, start=start):
 def mutual_information_zscore(alignment, n_shuffle=100):
     codes = align.get_codes(alignment).T
     alph = alignment.sequences[0].alphabet
-    
+
     mi = _mutual_information(codes, alph)
     np.random.seed(0)
     random_mi = [None] * n_shuffle
@@ -158,13 +155,13 @@ def _mutual_information(codes, alph):
             marginal_probs_i = marginal_counts_i / nrows
             marginal_probs_j = marginal_counts_j / nrows
             combined_probs = combined_counts / nrows
-            
+
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 mi_before_sum = (
                     combined_probs * np.log2(
                         combined_probs / (
-                            marginal_probs_i[:, np.newaxis] * 
+                            marginal_probs_i[:, np.newaxis] *
                             marginal_probs_j[np.newaxis, :]
                         )
                     )

diff --git a/doc/examples/scripts/structure/biological_assembly.py b/doc/examples/scripts/structure/biological_assembly.py
@@ -5,15 +5,16 @@
 Often the biological assembly (or biological unit) reveals the complete
 picture of a protein function, may it be a viral capsid or a
 microfilament.
-However, the usual records in an *PDB*/*mmCIF*/*MMTF* file usually
+However, the usual atom records in a *PDB* or *PDBx* file usually
 describe only the asymmetric unit.
 For large complexes the asymmetric unit may only display one monomer or
 one small subcomplex.
 Multiple copies of the asymmetric unit must be geometrically arranged to
 build the assembly.
 
-In order to get the entire assembly, the *mmCIF* files provided by the
-*RCSB PDB* contain the following fields:
+In order to get the entire assembly, the *PDBx* files provided by the
+*RCSB PDB* (either in *CIF* or *BinaryCIF* format) contain the following
+fields:
 
     - ``pdbx_struct_assembly`` - General information about the
       assemblies
@@ -37,14 +38,13 @@
 # License: BSD 3 clause
 
 from tempfile import NamedTemporaryFile
-import numpy as np
 import biotite.structure as struc
 import biotite.structure.io.pdbx as pdbx
 import biotite.structure.io as strucio
 import biotite.database.rcsb as rcsb
 
 
-pdbx_file = pdbx.PDBxFile.read(rcsb.fetch("3J31", "mmcif"))
+pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch("3J31", "bcif"))
 
 assemblies = pdbx.list_assemblies(pdbx_file)
 print("ID    name")

diff --git a/doc/examples/scripts/structure/ku_superimposition.py b/doc/examples/scripts/structure/ku_superimposition.py
@@ -30,8 +30,8 @@
 ku_file_name = ku_file.name
 
 # Download and parse structure files
-ku_dna = pdbx.get_structure(pdbx.PDBxFile.read(rcsb.fetch("1JEY", "cif")))[0]
-ku     = pdbx.get_structure(pdbx.PDBxFile.read(rcsb.fetch("1JEQ", "cif")))[0]
+ku_dna = pdbx.get_structure(pdbx.CIFFile.read(rcsb.fetch("1JEY", "cif")))[0]
+ku     = pdbx.get_structure(pdbx.CIFFile.read(rcsb.fetch("1JEQ", "cif")))[0]
 # Remove DNA and water
 ku_dna = ku_dna[(ku_dna.chain_id == "A") | (ku_dna.chain_id == "B")]
 ku_dna = ku_dna[~struc.filter_solvent(ku_dna)]
@@ -48,11 +48,11 @@
 # We do not want the cropped structures
 # -> apply superimposition on original structures
 ku_superimposed = transformation.apply(ku)
-# Write PDBx files as input for PyMOL
-cif_file = pdbx.PDBxFile()
+# Write mmCIF files as input for PyMOL
+cif_file = pdbx.CIFFile()
 pdbx.set_structure(cif_file, ku_dna, data_block="ku_dna")
 cif_file.write(ku_dna_file_name)
-cif_file = pdbx.PDBxFile()
+cif_file = pdbx.CIFFile()
 pdbx.set_structure(cif_file, ku_superimposed, data_block="ku")
 cif_file.write(ku_file_name)
 # Visualization with PyMOL...

diff --git a/doc/examples/scripts/structure/sheet_arrangement.py b/doc/examples/scripts/structure/sheet_arrangement.py
@@ -5,7 +5,8 @@
 This script plots the arrangements of strands in selected β-sheets of a
 protein structure.
 The information is entirely taken from the ``struct_sheet_order`` and
-``struct_sheet_range`` categories of the structure's *PDBx/mmCIF* file.
+``struct_sheet_range`` categories of the corresponding *PDBx* file
+in *BinaryCIF* format.
 
 In this case the β-barrel of a split fluorescent protein is shown,
 but the script can be customized to show the β-sheets of any protein
@@ -42,10 +43,10 @@
     biotite.colors["lightgreen"],
     biotite.colors["brightorange"],
 ]
-CONNECTION_COLOR = "black"      # Color of the connection lines 
-CONNECTION_LINE_WIDTH = 1.5     # Width of the connection lines 
-CONNECTION_HEIGHT = 0.1         # Minimum height of the connection lines 
-CONNECTION_SEPARATION = 0.1     # Minimum vertical distance between the connection lines 
+CONNECTION_COLOR = "black"      # Color of the connection lines
+CONNECTION_LINE_WIDTH = 1.5     # Width of the connection lines
+CONNECTION_HEIGHT = 0.1         # Minimum height of the connection lines
+CONNECTION_SEPARATION = 0.1     # Minimum vertical distance between the connection lines
 RES_ID_HEIGHT = -0.2            # The vertical distance of the residue ID labels from the arrow ends
 RES_ID_FONT_SIZE = 8            # The font size of the residue ID labels
 RES_ID_FONT_WEIGHT = "bold"     # The font weight of the residue ID labels
@@ -55,34 +56,34 @@
 ##### SNOITPO #####
 
 ########################################################################
-# The ``struct_sheet_order`` category of the *mmCIF* file gives us the
-# information about the existing sheets, the strands these sheets
+# The ``struct_sheet_order`` category of the *BinaryCIF* file gives us
+# the information about the existing sheets, the strands these sheets
 # contain and which of these strands are connected with one another
 # in either parallel or anti-parallel orientation.
 #
 # We can use this to select only strands that belong to those sheets,
 # we are interested in.
 # The strand adjacency and relative orientation is also saved for later.
 
-pdbx_file = pdbx.PDBxFile.read(rcsb.fetch(PDB_ID, "pdbx"))
-sheet_order_dict = pdbx_file["struct_sheet_order"]
+bcif_file = pdbx.BinaryCIFFile.read(rcsb.fetch(PDB_ID, "bcif"))
+sheet_order = bcif_file.block["struct_sheet_order"]
 
 # Create a boolean mask that covers the selected sheets
 # or all sheets if none is given
 if SHEETS is None:
-    sele = np.full(len(sheet_order_dict["sheet_id"]), True)
+    sele = np.full(sheet_order.row_count, True)
 else:
     sele = np.array([
-        sheet in SHEETS for sheet in sheet_order_dict["sheet_id"]
+        sheet in SHEETS for sheet in sheet_order["sheet_id"].as_array()
     ])
-sheet_ids = sheet_order_dict["sheet_id"][sele]
+sheet_ids = sheet_order["sheet_id"].as_array()[sele]
 
-is_parallel_list = sheet_order_dict["sense"][sele] == "parallel"
+is_parallel_list = sheet_order["sense"].as_array()[sele] == "parallel"
 
 adjacent_strands = np.array([
     (strand_i, strand_j) for strand_i, strand_j in zip(
-        sheet_order_dict["range_id_1"][sele],
-        sheet_order_dict["range_id_2"][sele]
+        sheet_order["range_id_1"].as_array()[sele],
+        sheet_order["range_id_2"].as_array()[sele]
     )
 ])
 
@@ -94,30 +95,30 @@
 # The ``struct_sheet_range`` category of the *BinaryCIF* file tells us
 # which residues compose each strand in terms of chain and
 # residue IDs.
-# 
+#
 # Later the plot shall display connections between consecutive strands
 # in a protein chain.
 # Although, this category does not provide this connection information
 # directly, we can sort the strands by their beginning chain and residue
 # IDs and then simply connect successive entries.
 
-sheet_range_dict = pdbx_file["struct_sheet_range"]
+sheet_range = bcif_file.block["struct_sheet_range"]
 
 # Again, create a boolean mask that covers the selected sheets
 sele = np.array([
-    sheet in sheet_ids for sheet in sheet_range_dict["sheet_id"]
+    sheet in sheet_ids for sheet in sheet_range["sheet_id"].as_array()
 ])
-strand_chain_ids = sheet_range_dict["beg_auth_asym_id"][sele]
-strand_res_id_begs = sheet_range_dict["beg_auth_seq_id"].astype(int)[sele]
-strand_res_id_ends = sheet_range_dict["end_auth_seq_id"].astype(int)[sele]
+strand_chain_ids = sheet_range["beg_auth_asym_id"].as_array()[sele]
+strand_res_id_begs = sheet_range["beg_auth_seq_id"].as_array(int)[sele]
+strand_res_id_ends = sheet_range["end_auth_seq_id"].as_array(int)[sele]
 
 # Secondarily sort by residue ID
 order = np.argsort(strand_res_id_begs, kind="stable")
 # Primarily sort by chain ID
 order = order[np.argsort(strand_chain_ids[order], kind="stable")]
 
-sorted_strand_ids = sheet_range_dict["id"][sele][order]
-sorted_sheet_ids = sheet_range_dict["sheet_id"][sele][order]
+sorted_strand_ids = sheet_range["id"].as_array()[sele][order]
+sorted_sheet_ids = sheet_range["sheet_id"].as_array()[sele][order]
 sorted_chain_ids = strand_chain_ids[order]
 sorted_res_id_begs = strand_res_id_begs[order]
 sorted_res_id_ends = strand_res_id_ends[order]
@@ -297,7 +298,7 @@
 # separable
 # Plot the short connections at low height
 # to decrease line intersections
-# -> sort connections by length of connection 
+# -> sort connections by length of connection
 order = np.argsort([
     np.abs(coord_dict[strand_i][0][0] - coord_dict[strand_j][0][0])
     for strand_i, strand_j in connections
@@ -307,7 +308,7 @@
     horizontal_line_height = 1 + CONNECTION_HEIGHT + i * CONNECTION_SEPARATION
     coord_i_beg, coord_i_end = coord_dict[strand_i]
     coord_j_beg, coord_j_end = coord_dict[strand_j]
-    
+
     if np.sign(coord_i_end[1]) == np.sign(coord_j_beg[1]):
         # Start and end are on the same side of the arrows
         x = (

diff --git a/doc/tutorial/src/application.py b/doc/tutorial/src/application.py
@@ -24,7 +24,7 @@
 The base class for all interfaces is the :class:`Application` class.
 Each :class:`Application` instance has a life cycle, starting with its
 creation and ending with the result extraction.
-Each state in this life cycle is described by the value of the 
+Each state in this life cycle is described by the value of the
 *enum* :class:`AppState`, that each :class:`Application` contains:
 Directly after its instantiation the app is in the ``CREATED`` state.
 In this state further parameters can be set for the application run.
@@ -66,7 +66,7 @@
 class MyApplication(Application):
     def __init__(self, param): super().__init__()
     def run(self): pass
-    def is_finished(self): return True 
+    def is_finished(self): return True
     def wait_interval(self): return 0.1
     def evaluate(self): pass
     def clean_up(self): pass
@@ -85,7 +85,7 @@ def get_some_data(self): return "some data"
 ########################################################################
 # The following subsections will dive into the available
 # :class:`Application` classes in depth.
-# 
+#
 # Finding homologous sequences with BLAST
 # ---------------------------------------
 #
@@ -126,8 +126,8 @@ def get_some_data(self): return "some data"
 # :class:`biotite.sequence.align.Alignment`.
 # It contains some additional information as shown above.
 # The hit UID can be used to obtain the complete hit sequence via
-# :mod:`biotite.database.entrez`. 
-# 
+# :mod:`biotite.database.entrez`.
+#
 # The next alignment should be a bit more challenging.
 # We take a random part of the *E. coli* BL21 genome and distort it a
 # little bit.
@@ -164,7 +164,7 @@ def get_some_data(self): return "some data"
 # of conduct and prevents you from submitting two queries within one
 # minute. If you want to be rude to the NCBI server, create the
 # instance with :obj:`obey_rules` set to false.
-# 
+#
 # Multiple sequence alignments
 # ----------------------------
 #
@@ -263,7 +263,7 @@ def get_some_data(self): return "some data"
 ########################################################################
 # Secondary structure annotation
 # ------------------------------
-# 
+#
 # .. currentmodule:: biotite.application.dssp
 #
 # Although :mod:`biotite.structure` offers the function
@@ -279,7 +279,7 @@ def get_some_data(self): return "some data"
 import biotite.application.dssp as dssp
 import biotite.structure.io as strucio
 
-file_path = rcsb.fetch("1l2y", "mmtf", gettempdir())
+file_path = rcsb.fetch("1l2y", "bcif", gettempdir())
 stack = strucio.load_structure(file_path)
 array = stack[0]
 app = dssp.DsspApp(array)

diff --git a/doc/tutorial/src/database.py b/doc/tutorial/src/database.py
@@ -44,8 +44,8 @@
 # If you want to download files irrespectively, set :obj:`overwrite` to
 # true.
 
-# Download file in the fast and small binary MMTF format
-file_path = rcsb.fetch("1l2y", "mmtf", gettempdir(), overwrite=True)
+# Download file in the fast and small BinaryCIF format
+file_path = rcsb.fetch("1l2y", "bcif", gettempdir(), overwrite=True)
 
 ########################################################################
 # If you omit the file path or set it to ``None``, the downloaded data
@@ -72,11 +72,11 @@
 pdb_ids = rcsb.search(query)
 print(pdb_ids)
 print(rcsb.count(query))
-files = rcsb.fetch(pdb_ids, "mmtf", gettempdir())
+files = rcsb.fetch(pdb_ids, "cif", gettempdir())
 
 ########################################################################
 # This was a simple search for the occurrence of the search term in any
-# field. 
+# field.
 # You can also search for a value in a specific field with a
 # :class:`FieldQuery`.
 # A complete list of the available fields and their supported operators
@@ -125,15 +125,15 @@
 ########################################################################
 # Note that grouping may omit PDB IDs in search results, if such PDB IDs
 # cannot be grouped.
-# In the example shown above, not all structures 
+# In the example shown above, not all structures could be grouped:
 # only a few PDB entries were uploaded as a collection and hence are
 # part of the search results.
 #
 # Fetching files from the NCBI Entrez database
 # --------------------------------------------
-# 
+#
 # .. currentmodule:: biotite.database.entrez
-# 
+#
 # Another important source of biological information is the
 # *NCBI Entrez* database, which is commonly known as *the NCBI*.
 # It provides a myriad of information, ranging from sequences and
@@ -153,7 +153,7 @@
     db_name="nuccore", ret_type="fasta"
 )
 print(file_path)
-# ... or multiple UIDs 
+# ... or multiple UIDs
 file_paths = entrez.fetch(
     ["1L2Y_A","1AKI_A"], gettempdir(), suffix="fa",
     db_name="protein", ret_type="fasta"