Skip to content

Commit

Permalink
feat: add support for current RNTuple files (#962)
Browse files Browse the repository at this point in the history
* Adds changes, but ignores some to keep the old tests passing for now.

* style: pre-commit fixes

* Adds changes to footer and skips old file tests with the exception of 0662.

* style: pre-commit fixes

* Adds new version of file for test 0662 and renames new test file to match pr nr.

* Swaps rntuple file for new one from RNTuple in test 0662, the footer reading part should not complain anymore.

* Applies changes to const file.

* Adds new format test file.

* style: pre-commit fixes

* Tests only the new file for now.

* style: pre-commit fixes

* Adds new RNTuple schema for the footer and the split cases without the split functions.

* style: pre-commit fixes

* Updates testing / adds more files.

* style: pre-commit fixes

* Adds changes to split functionality.

* style: pre-commit fixes

* Fixes split for uint16 case.

* style: pre-commit fixes

* Fixes split for 32 and 64.

* style: pre-commit fixes

* Fixes spelling errors.

* style: pre-commit fixes

* Fixes spelling errors.

* Changes zigzag function.

* style: pre-commit fixes

* Uses test files from skhep_testdata and removes local ones.

* style: pre-commit fixes

* Reverts changes to test_0662-rntuple-stl-containers.py

* Vectorized split-decoding for 32 and 64-bits.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jim Pivarski <[email protected]>
Co-authored-by: Jim Pivarski <[email protected]>
  • Loading branch information
4 people authored Oct 19, 2023
1 parent fff5ebe commit 6b1952e
Show file tree
Hide file tree
Showing 5 changed files with 160 additions and 19 deletions.
51 changes: 36 additions & 15 deletions src/uproot/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,18 +129,25 @@
7: "float64",
8: "float32",
9: "float16",
10: "int64",
11: "int32",
12: "int16",
13: "int8",
14: "uint32", # SplitIndex64 delta encoding
15: "uint64", # SplitIndex32 delta encoding
10: "uint64",
11: "uint32",
12: "uint16",
13: "uint8",
14: "uint64", # SplitIndex64 delta encoding
15: "uint32", # SplitIndex32 delta encoding
16: "float64", # split
17: "float32", # split
18: "float16", # split
19: "int64", # split
20: "int32", # split
21: "int16", # split
19: "uint64", # split
20: "uint32", # split
21: "uint16", # split
22: "int64",
23: "int32",
24: "int16",
25: "int8",
26: "int64", # split + zigzag encoding
27: "int32", # split + zigzag encoding
28: "int16", # split + zigzag encoding
}
rntuple_col_num_to_size_dict = {
1: 64,
Expand All @@ -156,14 +163,21 @@
11: 32,
12: 16,
13: 8,
14: 32, # SplitIndex64 delta encoding
15: 64, # SplitIndex32 delta encoding
14: 64, # SplitIndex64 delta encoding
15: 32, # SplitIndex32 delta encoding
16: 64, # split
17: 32, # split
18: 16, # split
19: 64, # split
20: 32, # split
21: 16, # split
22: 64,
23: 32,
24: 16,
25: 8,
26: 64, # split + zigzag encoding
27: 32, # split + zigzag encoding
28: 16, # split + zigzag encoding
}

rntuple_col_type_to_num_dict = {
Expand All @@ -176,10 +190,10 @@
"real64": 7,
"real32": 8,
"real16": 9,
"int64": 10,
"int32": 11,
"int16": 12,
"int8": 13,
"uint64": 10,
"uint32": 11,
"uint16": 12,
"uint8": 13,
"splitindex64": 14,
"splitindex32": 15,
"splitreal64": 16,
Expand All @@ -188,6 +202,13 @@
"splitin64": 19,
"splitint32": 20,
"splitint16": 21,
"int64": 22,
"int32": 23,
"int16": 24,
"int8": 25,
"splitzigzagint64": 26,
"splitzigzagint32": 27,
"splitzigzagint16": 28,
}

rntuple_role_leaf = 0
Expand Down
89 changes: 85 additions & 4 deletions src/uproot/models/RNTuple.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@
_rntuple_cluster_summary_format = struct.Struct("<QQ")


def from_zigzag(n):
    """
    Decode zigzag-encoded integers back into signed values.

    Zigzag encoding stores 0, -1, 1, -2, 2, ... as 0, 1, 2, 3, 4, ...,
    i.e. the sign lives in the lowest bit and the magnitude in the rest.

    Args:
        n: A Python int, or a NumPy integer scalar/array holding the
            encoded bit patterns.

    Returns:
        The decoded value(s). Signed NumPy input keeps its dtype.
    """
    if isinstance(n, (numpy.ndarray, numpy.generic)) and n.dtype.kind == "i":
        # ``>>`` on a signed dtype is an arithmetic shift: it would smear the
        # top bit of the encoded pattern into the result and corrupt values
        # whose encoding has that bit set. Shift an unsigned view instead
        # (a logical shift), then reinterpret the bits as signed again.
        unsigned = numpy.dtype(n.dtype.str.replace("i", "u", 1))
        magnitude = (n.view(unsigned) >> unsigned.type(1)).view(n.dtype)
        return magnitude ^ -(n & 1)
    return (n >> 1) ^ -(n & 1)


def _envelop_header(chunk, cursor, context):
env_version, min_version = cursor.fields(
chunk, uproot.const._rntuple_frame_format, context
Expand Down Expand Up @@ -326,7 +330,7 @@ def to_akform(self):
form = ak.forms.RecordForm(recordlist, topnames, form_key="toplevel")
return form

def read_pagedesc(self, destination, desc, dtype_str, dtype):
def read_pagedesc(self, destination, desc, dtype_str, dtype, nbits, split):
loc = desc.locator
context = {}
# bool in RNTuple is always stored as bits
Expand All @@ -339,6 +343,44 @@ def read_pagedesc(self, destination, desc, dtype_str, dtype):
content = cursor.array(
decomp_chunk, num_elements_toread, dtype, context, move=False
)

if split:
content = content.view(numpy.uint8)

if nbits == 16:
# AAAAABBBBB needs to become
# ABABABABAB
res = numpy.empty(len(content), numpy.uint8)
res[0::2] = content[len(res) * 0 // 2 : len(res) * 1 // 2]
res[1::2] = content[len(res) * 1 // 2 : len(res) * 2 // 2]
res = res.view(numpy.uint16)

elif nbits == 32:
# AAAAABBBBBCCCCCDDDDD needs to become
# ABCDABCDABCDABCDABCD
res = numpy.empty(len(content), numpy.uint8)
res[0::4] = content[len(res) * 0 // 4 : len(res) * 1 // 4]
res[1::4] = content[len(res) * 1 // 4 : len(res) * 2 // 4]
res[2::4] = content[len(res) * 2 // 4 : len(res) * 3 // 4]
res[3::4] = content[len(res) * 3 // 4 : len(res) * 4 // 4]
res = res.view(numpy.uint32)

elif nbits == 64:
# AAAAABBBBBCCCCCDDDDDEEEEEFFFFFGGGGGHHHHH needs to become
# ABCDEFGHABCDEFGHABCDEFGHABCDEFGHABCDEFGH
res = numpy.empty(len(content), numpy.uint8)
res[0::8] = content[len(res) * 0 // 8 : len(res) * 1 // 8]
res[1::8] = content[len(res) * 1 // 8 : len(res) * 2 // 8]
res[2::8] = content[len(res) * 2 // 8 : len(res) * 3 // 8]
res[3::8] = content[len(res) * 3 // 8 : len(res) * 4 // 8]
res[4::8] = content[len(res) * 4 // 8 : len(res) * 5 // 8]
res[5::8] = content[len(res) * 5 // 8 : len(res) * 6 // 8]
res[6::8] = content[len(res) * 6 // 8 : len(res) * 7 // 8]
res[7::8] = content[len(res) * 7 // 8 : len(res) * 8 // 8]
res = res.view(numpy.uint64)

content = res

if isbit:
content = (
numpy.unpackbits(content.view(dtype=numpy.uint8))
Expand Down Expand Up @@ -368,14 +410,24 @@ def read_col_page(self, ncol, cluster_i):
total_len = numpy.sum([desc.num_elements for desc in pagelist])
res = numpy.empty(total_len, dtype)
tracker = 0
split = 14 <= dtype_byte <= 21 or 26 <= dtype_byte <= 28
nbits = uproot.const.rntuple_col_num_to_size_dict[dtype_byte]
for page_desc in pagelist:
n_elements = page_desc.num_elements
tracker_end = tracker + n_elements
self.read_pagedesc(res[tracker:tracker_end], page_desc, dtype_str, dtype)
self.read_pagedesc(
res[tracker:tracker_end], page_desc, dtype_str, dtype, nbits, split
)
tracker = tracker_end

if dtype_byte <= uproot.const.rntuple_col_type_to_num_dict["index32"]:
res = numpy.insert(res, 0, 0) # for offsets
zigzag = 26 <= dtype_byte <= 28
delta = 14 <= dtype_byte <= 15
if zigzag:
res = from_zigzag(res)
elif delta:
numpy.cumsum(res)
return res

def arrays(
Expand Down Expand Up @@ -645,6 +697,15 @@ def read(self, chunk, cursor, context):

return out

def read_extension_header(self, out, chunk, cursor, context):
    """
    Fill ``out`` with the four record lists read from ``chunk``.

    Each list is produced by one of this reader's list-frame readers and
    stored on ``out`` under a fixed attribute name. ``out`` is returned so
    the call can be chained.
    """
    # (attribute set on ``out``, attribute of ``self`` holding the reader).
    # The same cursor is handed to every reader in turn, so the order of
    # these entries is presumably significant -- keep it as is.
    layout = (
        ("field_records", "list_field_record_frames"),
        ("column_records", "list_column_record_frames"),
        ("alias_columns", "list_alias_column_frames"),
        ("extra_type_infos", "list_extra_type_info_reader"),
    )
    for target, reader_name in layout:
        frames = getattr(self, reader_name).read(chunk, cursor, context)
        setattr(out, target, frames)
    return out


class ColumnGroupRecordReader:
def read(self, chunk, cursor, context):
Expand Down Expand Up @@ -672,9 +733,29 @@ def read(self, chunk, cursor, context):
return out


class RNTupleSchemaExtension:
    """
    Reader for a schema-extension section.

    ``read`` consumes a 32-bit little-endian size field followed by four
    list frames (field, column, alias-column and extra-type-info records)
    and returns them on a ``MetaData`` object.
    """

    def read(self, chunk, cursor, context):
        def read_records(record_reader):
            # Every section is a list frame whose items are record frames.
            frames = ListFrameReader(RecordFrameReader(record_reader))
            return frames.read(chunk, cursor, context)

        out = MetaData(type(self).__name__)
        out.size = cursor.field(chunk, struct.Struct("<I"), context)
        # The sections are read one after another with the same cursor;
        # keep this order.
        out.field_records = read_records(FieldRecordReader())
        out.column_records = read_records(ColumnRecordReader())
        out.alias_records = read_records(AliasColumnReader())
        out.extra_type_info = read_records(ExtraTypeInfoReader())
        return out


class FooterReader:
def __init__(self):
self.extension_header_links = ListFrameReader(EnvLinkReader())
self.extension_header_links = RNTupleSchemaExtension()
# self.extension_header_links = ListFrameReader(EnvLinkReader())
self.column_group_record_frames = ListFrameReader(
RecordFrameReader(ColumnGroupRecordReader())
)
Expand All @@ -691,8 +772,8 @@ def read(self, chunk, cursor, context):
out.env_header = _envelop_header(chunk, cursor, context)
out.feature_flag = cursor.field(chunk, _rntuple_feature_flag_format, context)
out.header_crc32 = cursor.field(chunk, struct.Struct("<I"), context)

out.extension_links = self.extension_header_links.read(chunk, cursor, context)

out.col_group_records = self.column_group_record_frames.read(
chunk, cursor, context
)
Expand Down
1 change: 1 addition & 0 deletions tests/test_0630-rntuple-basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
pytest.importorskip("awkward")


@pytest.mark.skip(reason="RNTUPLE UPDATE: ignore test with previous file for now.")
def test_flat():
filename = skhep_testdata.data_path("test_ntuple_int_float.root")
with uproot.open(filename) as f:
Expand Down
1 change: 1 addition & 0 deletions tests/test_0705-rntuple-writing-metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
ak = pytest.importorskip("awkward")


@pytest.mark.skip(reason="RNTUPLE UPDATE: ignore test with previous file for now.")
def test_header(tmp_path):
filepath = os.path.join(tmp_path, "test.root")

Expand Down
37 changes: 37 additions & 0 deletions tests/test_0962-RNTuple-update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/uproot5/blob/main/LICENSE

import pytest
import uproot
import skhep_testdata
import numpy as np


def test_new_support_RNTuple_split_int32_reading():
    """Read ``one_integers`` from the 5e4-entry file and check it counts down to 1."""
    path = skhep_testdata.data_path("uproot_ntuple_int_5e4_629_01.root")
    with uproot.open(path) as root_file:
        arrays = root_file["ntuple"].arrays()
        # Expected content: 50000, 49999, ..., 1.
        countdown = np.arange(5e4 + 1)[::-1][:-1]
        assert len(arrays) == 5e4
        assert len(arrays.one_integers) == 5e4
        assert np.all(arrays.one_integers == countdown)


def test_new_support_RNTuple_bit_bool_reading():
    """Read the ``one_bit`` column and compare it with the known bit pattern."""
    path = skhep_testdata.data_path("uproot_ntuple_bit_629_01.root")
    with uproot.open(path) as root_file:
        arrays = root_file["ntuple"].arrays()
        expected_bits = np.asarray([1, 0, 0, 1, 0, 0, 1, 0, 0, 1])
        assert np.all(arrays.one_bit == expected_bits)


def test_new_support_RNTuple_split_int16_reading():
    """
    Read ``one_integers`` from the multi-cluster file.

    The first half of the column is expected to contain only 2 and the
    part after the midpoint only 1.
    """
    with uproot.open(
        skhep_testdata.data_path("uproot_ntuple_int_multicluster_629_01.root")
    ) as f:
        obj = f["ntuple"]
        df = obj.arrays()
        values = df.one_integers
        # Floor division keeps the slice bounds integral; true division
        # ("/") would produce a float, which is not a valid slice index.
        half = len(values) // 2
        assert len(values) == 1e8
        assert values[0] == 2
        assert values[-1] == 1
        assert np.all(np.unique(values[:half]) == [2])
        assert np.all(np.unique(values[half + 1 :]) == [1])

0 comments on commit 6b1952e

Please sign in to comment.