feat: add support for current RNTuple files (#962)

* Adds changes, but ignores some to keep the old tests passing for now. * style: pre-commit fixes * Adds changes to footer and skips old file tests with the exception of 0662. * style: pre-commit fixes * Adds new version of file for test 0662 and renames new test file to match pr nr. * Swaps rntuple file for new one from RNTuple in test 0662, the footer reading part should not complain anymore. * Applies changes to const file. * Adds new format test file. * style: pre-commit fixes * Tests only the new file for now. * style: pre-commit fixes * Adds new RNTuple schema for the footer and the split cases without the split functions. * style: pre-commit fixes * Updates testing / adds more files. * style: pre-commit fixes * Adds changes to split functionality. * style: pre-commit fixes * Fixes split for uint16 case. * style: pre-commit fixes * Fixes split for 32 and 64. * style: pre-commit fixes * Fixes spelling errors. * style: pre-commit fixes * Fixes spelling errors. * Changes zigzag function. * style: pre-commit fixes * Uses test files from skhep_testdata and removes local ones. * style: pre-commit fixes * Reverts changes to test_0662-rntuple-stl-containers.py * Vectorized split-decoding for 32 and 64-bits. --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jim Pivarski <[email protected]> Co-authored-by: Jim Pivarski <[email protected]>
scikit-hep · Oct 19, 2023 · 6b1952e · 6b1952e
1 parent fff5ebe
commit 6b1952e
Show file tree

Hide file tree

Showing 5 changed files with 160 additions and 19 deletions.
diff --git a/src/uproot/const.py b/src/uproot/const.py
@@ -129,18 +129,25 @@
     7: "float64",
     8: "float32",
     9: "float16",
-    10: "int64",
-    11: "int32",
-    12: "int16",
-    13: "int8",
-    14: "uint32",  # SplitIndex64 delta encoding
-    15: "uint64",  # SplitIndex32 delta encoding
+    10: "uint64",
+    11: "uint32",
+    12: "uint16",
+    13: "uint8",
+    14: "uint64",  # SplitIndex64 delta encoding
+    15: "uint32",  # SplitIndex32 delta encoding
     16: "float64",  # split
     17: "float32",  # split
     18: "float16",  # split
-    19: "int64",  # split
-    20: "int32",  # split
-    21: "int16",  # split
+    19: "uint64",  # split
+    20: "uint32",  # split
+    21: "uint16",  # split
+    22: "int64",
+    23: "int32",
+    24: "int16",
+    25: "int8",
+    26: "int64",  # split + zigzag encoding
+    27: "int32",  # split + zigzag encoding
+    28: "int16",  # split + zigzag encoding
 }
 rntuple_col_num_to_size_dict = {
     1: 64,
@@ -156,14 +163,21 @@
     11: 32,
     12: 16,
     13: 8,
-    14: 32,  # SplitIndex64 delta encoding
-    15: 64,  # SplitIndex32 delta encoding
+    14: 64,  # SplitIndex64 delta encoding
+    15: 32,  # SplitIndex32 delta encoding
     16: 64,  # split
     17: 32,  # split
     18: 16,  # split
     19: 64,  # split
     20: 32,  # split
     21: 16,  # split
+    22: 64,
+    23: 32,
+    24: 16,
+    25: 8,
+    26: 64,  # split + zigzag encoding
+    27: 32,  # split + zigzag encoding
+    28: 16,  # split + zigzag encoding
 }
 
 rntuple_col_type_to_num_dict = {
@@ -176,10 +190,10 @@
     "real64": 7,
     "real32": 8,
     "real16": 9,
-    "int64": 10,
-    "int32": 11,
-    "int16": 12,
-    "int8": 13,
+    "uint64": 10,
+    "uint32": 11,
+    "uint16": 12,
+    "uint8": 13,
     "splitindex64": 14,
     "splitindex32": 15,
     "splitreal64": 16,
@@ -188,6 +202,13 @@
     "splitin64": 19,
     "splitint32": 20,
     "splitint16": 21,
+    "int64": 22,
+    "int32": 23,
+    "int16": 24,
+    "int8": 25,
+    "splitzigzagint64": 26,
+    "splitzigzagint32": 27,
+    "splitzigzagint16": 28,
 }
 
 rntuple_role_leaf = 0

diff --git a/src/uproot/models/RNTuple.py b/src/uproot/models/RNTuple.py
@@ -28,6 +28,10 @@
 _rntuple_cluster_summary_format = struct.Struct("<QQ")
 
 
+def from_zigzag(n):
+    """Decode zigzag-encoded integers back to signed values.
+
+    Zigzag encoding interleaves non-negative and negative numbers, so the
+    encoded words 0, 1, 2, 3, ... decode to 0, -1, 1, -2, ...
+
+    ``n`` may be a Python int or a NumPy integer array. When it is a signed
+    NumPy array, the halving step is done on an unsigned view of the same
+    bytes: ``>>`` on a signed array is an arithmetic shift, which would
+    smear the top bit and give wrong results for encoded words that have
+    their highest bit set.
+    """
+    if isinstance(n, numpy.ndarray) and n.dtype.kind == "i":
+        # Same byte order and width, unsigned kind (e.g. "<i2" -> "<u2").
+        unsigned = n.view(n.dtype.str.replace("i", "u", 1))
+        # Logical shift; the top bit of the result is clear, so viewing it
+        # as signed again does not change its value.
+        magnitude = (unsigned >> 1).view(n.dtype)
+        return magnitude ^ -(n & 1)
+    return (n >> 1) ^ -(n & 1)
+
+
 def _envelop_header(chunk, cursor, context):
     env_version, min_version = cursor.fields(
         chunk, uproot.const._rntuple_frame_format, context
@@ -326,7 +330,7 @@ def to_akform(self):
         form = ak.forms.RecordForm(recordlist, topnames, form_key="toplevel")
         return form
 
-    def read_pagedesc(self, destination, desc, dtype_str, dtype):
+    def read_pagedesc(self, destination, desc, dtype_str, dtype, nbits, split):
         loc = desc.locator
         context = {}
         # bool in RNTuple is always stored as bits
@@ -339,6 +343,44 @@ def read_pagedesc(self, destination, desc, dtype_str, dtype):
         content = cursor.array(
             decomp_chunk, num_elements_toread, dtype, context, move=False
         )
+
+        if split:
+            content = content.view(numpy.uint8)
+
+            if nbits == 16:
+                # AAAAABBBBB needs to become
+                # ABABABABAB
+                res = numpy.empty(len(content), numpy.uint8)
+                res[0::2] = content[len(res) * 0 // 2 : len(res) * 1 // 2]
+                res[1::2] = content[len(res) * 1 // 2 : len(res) * 2 // 2]
+                res = res.view(numpy.uint16)
+
+            elif nbits == 32:
+                # AAAAABBBBBCCCCCDDDDD needs to become
+                # ABCDABCDABCDABCDABCD
+                res = numpy.empty(len(content), numpy.uint8)
+                res[0::4] = content[len(res) * 0 // 4 : len(res) * 1 // 4]
+                res[1::4] = content[len(res) * 1 // 4 : len(res) * 2 // 4]
+                res[2::4] = content[len(res) * 2 // 4 : len(res) * 3 // 4]
+                res[3::4] = content[len(res) * 3 // 4 : len(res) * 4 // 4]
+                res = res.view(numpy.uint32)
+
+            elif nbits == 64:
+                # AAAAABBBBBCCCCCDDDDDEEEEEFFFFFGGGGGHHHHH needs to become
+                # ABCDEFGHABCDEFGHABCDEFGHABCDEFGHABCDEFGH
+                res = numpy.empty(len(content), numpy.uint8)
+                res[0::8] = content[len(res) * 0 // 8 : len(res) * 1 // 8]
+                res[1::8] = content[len(res) * 1 // 8 : len(res) * 2 // 8]
+                res[2::8] = content[len(res) * 2 // 8 : len(res) * 3 // 8]
+                res[3::8] = content[len(res) * 3 // 8 : len(res) * 4 // 8]
+                res[4::8] = content[len(res) * 4 // 8 : len(res) * 5 // 8]
+                res[5::8] = content[len(res) * 5 // 8 : len(res) * 6 // 8]
+                res[6::8] = content[len(res) * 6 // 8 : len(res) * 7 // 8]
+                res[7::8] = content[len(res) * 7 // 8 : len(res) * 8 // 8]
+                res = res.view(numpy.uint64)
+
+            content = res
+
         if isbit:
             content = (
                 numpy.unpackbits(content.view(dtype=numpy.uint8))
@@ -368,14 +410,24 @@ def read_col_page(self, ncol, cluster_i):
         total_len = numpy.sum([desc.num_elements for desc in pagelist])
         res = numpy.empty(total_len, dtype)
         tracker = 0
+        # Column types 14-21 and 26-28 are the byte-split encodings
+        # (see uproot.const.rntuple_col_num_to_dtype_dict).
+        split = 14 <= dtype_byte <= 21 or 26 <= dtype_byte <= 28
+        nbits = uproot.const.rntuple_col_num_to_size_dict[dtype_byte]
         for page_desc in pagelist:
             n_elements = page_desc.num_elements
             tracker_end = tracker + n_elements
-            self.read_pagedesc(res[tracker:tracker_end], page_desc, dtype_str, dtype)
+            self.read_pagedesc(
+                res[tracker:tracker_end], page_desc, dtype_str, dtype, nbits, split
+            )
             tracker = tracker_end
 
         if dtype_byte <= uproot.const.rntuple_col_type_to_num_dict["index32"]:
             res = numpy.insert(res, 0, 0)  # for offsets
+        # 26-28 are the "split + zigzag" types; 14-15 are the delta-encoded
+        # split index types.
+        zigzag = 26 <= dtype_byte <= 28
+        delta = 14 <= dtype_byte <= 15
+        if zigzag:
+            res = from_zigzag(res)
+        elif delta:
+            # numpy.cumsum returns a new array; it must be assigned back,
+            # otherwise the delta-encoded values are returned undecoded.
+            res = numpy.cumsum(res)
         return res
 
     def arrays(
@@ -645,6 +697,15 @@ def read(self, chunk, cursor, context):
 
         return out
 
+    def read_extension_header(self, out, chunk, cursor, context):
+        """Fill ``out`` with four record lists read in sequence and return it.
+
+        The lists are read with the matching ``self.list_*`` readers in this
+        order: field records, column records, alias columns, extra type
+        infos. Every reader receives the same ``chunk``, ``cursor`` and
+        ``context``.
+        """
+        args = (chunk, cursor, context)
+        out.field_records = self.list_field_record_frames.read(*args)
+        out.column_records = self.list_column_record_frames.read(*args)
+        out.alias_columns = self.list_alias_column_frames.read(*args)
+        out.extra_type_infos = self.list_extra_type_info_reader.read(*args)
+        return out
+
 
 class ColumnGroupRecordReader:
     def read(self, chunk, cursor, context):
@@ -672,9 +733,29 @@ def read(self, chunk, cursor, context):
         return out
 
 
+class RNTupleSchemaExtension:
+    """Reader for a schema-extension record.
+
+    ``read`` returns a ``MetaData`` object carrying a ``size`` field followed
+    by four lists of record frames.
+    """
+
+    def read(self, chunk, cursor, context):
+        """Read the size field, then the four record-frame lists, in order."""
+        out = MetaData(type(self).__name__)
+        # Little-endian unsigned 32-bit size field.
+        out.size = cursor.field(chunk, struct.Struct("<I"), context)
+        # The cursor advances with every read, so this order is significant.
+        for attr, record_reader in (
+            ("field_records", FieldRecordReader),
+            ("column_records", ColumnRecordReader),
+            ("alias_records", AliasColumnReader),
+            ("extra_type_info", ExtraTypeInfoReader),
+        ):
+            frames = ListFrameReader(RecordFrameReader(record_reader()))
+            setattr(out, attr, frames.read(chunk, cursor, context))
+        return out
+
+
 class FooterReader:
     def __init__(self):
-        self.extension_header_links = ListFrameReader(EnvLinkReader())
+        self.extension_header_links = RNTupleSchemaExtension()
+        # self.extension_header_links = ListFrameReader(EnvLinkReader())
         self.column_group_record_frames = ListFrameReader(
             RecordFrameReader(ColumnGroupRecordReader())
         )
@@ -691,8 +772,8 @@ def read(self, chunk, cursor, context):
         out.env_header = _envelop_header(chunk, cursor, context)
         out.feature_flag = cursor.field(chunk, _rntuple_feature_flag_format, context)
         out.header_crc32 = cursor.field(chunk, struct.Struct("<I"), context)
-
         out.extension_links = self.extension_header_links.read(chunk, cursor, context)
+
         out.col_group_records = self.column_group_record_frames.read(
             chunk, cursor, context
         )

diff --git a/tests/test_0630-rntuple-basics.py b/tests/test_0630-rntuple-basics.py
@@ -13,6 +13,7 @@
 pytest.importorskip("awkward")
 
 
+@pytest.mark.skip(reason="RNTUPLE UPDATE: ignore test with previous file for now.")
 def test_flat():
     filename = skhep_testdata.data_path("test_ntuple_int_float.root")
     with uproot.open(filename) as f:

diff --git a/tests/test_0705-rntuple-writing-metadata.py b/tests/test_0705-rntuple-writing-metadata.py
@@ -14,6 +14,7 @@
 ak = pytest.importorskip("awkward")
 
 
+@pytest.mark.skip(reason="RNTUPLE UPDATE: ignore test with previous file for now.")
 def test_header(tmp_path):
     filepath = os.path.join(tmp_path, "test.root")
 

diff --git a/tests/test_0962-RNTuple-update.py b/tests/test_0962-RNTuple-update.py
@@ -0,0 +1,37 @@
+# BSD 3-Clause License; see https://github.com/scikit-hep/uproot5/blob/main/LICENSE
+
+import pytest
+import uproot
+import skhep_testdata
+import numpy as np
+
+
+def test_new_support_RNTuple_split_int32_reading():
+    """Read the 5e4-entry ntuple and check ``one_integers`` counts down 5e4..1."""
+    path = skhep_testdata.data_path("uproot_ntuple_int_5e4_629_01.root")
+    n_entries = 5e4
+    with uproot.open(path) as f:
+        arrays = f["ntuple"].arrays()
+        assert len(arrays) == n_entries
+        assert len(arrays.one_integers) == n_entries
+        # arange(n + 1) reversed, minus its trailing 0, is n, n - 1, ..., 1.
+        expected = np.arange(n_entries + 1)[::-1][:-1]
+        assert np.all(arrays.one_integers == expected)
+
+
+def test_new_support_RNTuple_bit_bool_reading():
+    """Read the bit ntuple and compare ``one_bit`` with the expected pattern."""
+    path = skhep_testdata.data_path("uproot_ntuple_bit_629_01.root")
+    with uproot.open(path) as f:
+        arrays = f["ntuple"].arrays()
+        expected = np.asarray([1, 0, 0, 1, 0, 0, 1, 0, 0, 1])
+        assert np.all(arrays.one_bit == expected)
+
+
+def test_new_support_RNTuple_split_int16_reading():
+    """Read the multi-cluster ntuple and check both halves of ``one_integers``.
+
+    The column is expected to hold 1e8 entries: only the value 2 before the
+    midpoint and only the value 1 after it.
+    """
+    with uproot.open(
+        skhep_testdata.data_path("uproot_ntuple_int_multicluster_629_01.root")
+    ) as f:
+        obj = f["ntuple"]
+        df = obj.arrays()
+        assert len(df.one_integers) == 1e8
+        assert df.one_integers[0] == 2
+        assert df.one_integers[-1] == 1
+        # Slice bounds must be integers: use floor division for the midpoint
+        # (true division would produce a float bound).
+        half = len(df.one_integers) // 2
+        assert np.all(np.unique(df.one_integers[:half]) == [2])
+        assert np.all(np.unique(df.one_integers[half + 1 :]) == [1])