Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add support for current RNTuple files #962

Merged
merged 33 commits into from
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
93f1f48
Adds changes, but ignores some to keep the old tests passing for now.
ioanaif Sep 20, 2023
bd25ab1
style: pre-commit fixes
pre-commit-ci[bot] Sep 20, 2023
e16bbd9
Adds changes to footer and skips old file tests with the exception of…
ioanaif Sep 20, 2023
4952dba
style: pre-commit fixes
pre-commit-ci[bot] Sep 20, 2023
4e9aa3e
Adds new version of file for test 0662 and renames new test file to m…
ioanaif Sep 20, 2023
ffc249a
Swaps rntuple file for new one from RNTuple in test 0662, the footer …
ioanaif Sep 20, 2023
559741d
Applies changes to const file.
ioanaif Oct 11, 2023
31ee04d
Adds new format test file.
ioanaif Oct 11, 2023
d673b77
style: pre-commit fixes
pre-commit-ci[bot] Oct 11, 2023
1f53cd7
Tests only the new file for now.
ioanaif Oct 11, 2023
2bed24a
style: pre-commit fixes
pre-commit-ci[bot] Oct 11, 2023
0fa4e15
Adds new RNTuple schema for the footer and the split cases without th…
ioanaif Oct 11, 2023
0e1eed9
style: pre-commit fixes
pre-commit-ci[bot] Oct 11, 2023
6dbc12d
Merge branch 'main' into ioanaif/rntuple-update-2
ioanaif Oct 11, 2023
7193e41
Updates testing / adds more files.
ioanaif Oct 12, 2023
57fc8b1
style: pre-commit fixes
pre-commit-ci[bot] Oct 12, 2023
b5a3d93
Adds changes to split functionality.
ioanaif Oct 12, 2023
d7e908a
style: pre-commit fixes
pre-commit-ci[bot] Oct 12, 2023
80eded4
Fixes split for uint16 case.
ioanaif Oct 17, 2023
a5150a6
style: pre-commit fixes
pre-commit-ci[bot] Oct 17, 2023
badb2d6
Merge branch 'main' into ioanaif/rntuple-update-2
ioanaif Oct 17, 2023
74f3f61
Fixes split for 32 and 64.
ioanaif Oct 17, 2023
e4a0bbe
style: pre-commit fixes
pre-commit-ci[bot] Oct 17, 2023
0f3cb6e
Fixes spelling errors.
ioanaif Oct 17, 2023
ebdd0a0
style: pre-commit fixes
pre-commit-ci[bot] Oct 17, 2023
af11fc6
Fixes spelling errors.
ioanaif Oct 17, 2023
c5b6b4a
Changes zigzag function.
ioanaif Oct 17, 2023
5b3889c
style: pre-commit fixes
pre-commit-ci[bot] Oct 17, 2023
7beb8c5
Uses test files from skhep_testdata and removes local ones.
ioanaif Oct 17, 2023
1b632cd
style: pre-commit fixes
pre-commit-ci[bot] Oct 17, 2023
332e3c2
Merge branch 'main' into ioanaif/rntuple-update-2
jpivarski Oct 19, 2023
0ad879b
Reverts changes to test_0662-rntuple-stl-containers.py
ioanaif Oct 19, 2023
be9c17c
Vectorized split-decoding for 32 and 64-bits.
jpivarski Oct 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 36 additions & 15 deletions src/uproot/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,18 +129,25 @@
7: "float64",
8: "float32",
9: "float16",
10: "int64",
11: "int32",
12: "int16",
13: "int8",
14: "uint32", # SplitIndex64 delta encoding
15: "uint64", # SplitIndex32 delta encoding
10: "uint64",
11: "uint32",
12: "uint16",
13: "uint8",
14: "uint64", # SplitIndex64 delta encoding
15: "uint32", # SplitIndex32 delta encoding
16: "float64", # split
17: "float32", # split
18: "float16", # split
19: "int64", # split
20: "int32", # split
21: "int16", # split
19: "uint64", # split
20: "uint32", # split
21: "uint16", # split
22: "int64",
23: "int32",
24: "int16",
25: "int8",
26: "int64", # split + zigzag encoding
27: "int32", # split + zigzag encoding
28: "int16", # split + zigzag encoding
}
rntuple_col_num_to_size_dict = {
1: 64,
Expand All @@ -156,14 +163,21 @@
11: 32,
12: 16,
13: 8,
14: 32, # SplitIndex64 delta encoding
15: 64, # SplitIndex32 delta encoding
14: 64, # SplitIndex64 delta encoding
15: 32, # SplitIndex32 delta encoding
16: 64, # split
17: 32, # split
18: 16, # split
19: 64, # split
20: 32, # split
21: 16, # split
22: 64,
23: 32,
24: 16,
25: 8,
26: 64, # split + zigzag encoding
27: 32, # split + zigzag encoding
28: 16, # split + zigzag encoding
}

rntuple_col_type_to_num_dict = {
Expand All @@ -176,10 +190,10 @@
"real64": 7,
"real32": 8,
"real16": 9,
"int64": 10,
"int32": 11,
"int16": 12,
"int8": 13,
"uint64": 10,
"uint32": 11,
"uint16": 12,
"uint8": 13,
"splitindex64": 14,
"splitindex32": 15,
"splitreal64": 16,
Expand All @@ -188,6 +202,13 @@
"splitin64": 19,
"splitint32": 20,
"splitint16": 21,
"int64": 22,
"int32": 23,
"int16": 24,
"int8": 25,
"splitzigzagint64": 26,
"splitzigzagint32": 27,
"splitzigzagint16": 28,
}

rntuple_role_leaf = 0
Expand Down
89 changes: 85 additions & 4 deletions src/uproot/models/RNTuple.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@
_rntuple_cluster_summary_format = struct.Struct("<QQ")


def from_zigzag(n):
    """
    Decode zigzag-encoded integers back into signed values.

    Zigzag encoding stores the signed sequence 0, -1, 1, -2, 2, ... as the
    non-negative sequence 0, 1, 2, 3, 4, ...; this function inverts that
    mapping.

    Args:
        n (int or numpy.ndarray): Encoded value(s). A signed-integer array is
            treated as raw encoded bits of the same width.

    Returns:
        The decoded value(s). A signed-integer array is returned with its
        original dtype; any other input is decoded with plain ``>>``, ``&``
        and ``^`` arithmetic on the value as given.
    """
    if isinstance(n, numpy.ndarray) and n.dtype.kind == "i":
        # ``>>`` on a signed array is an arithmetic shift, which copies the
        # top bit and corrupts encoded values that have it set. Decode on an
        # unsigned view (logical shift), then reinterpret as signed again.
        raw = n.view(n.dtype.str.replace("i", "u"))
        return ((raw >> 1) ^ -(raw & 1)).view(n.dtype)
    # Explicit parentheses: shift first, then XOR with 0 or all-ones.
    return (n >> 1) ^ -(n & 1)


def _envelop_header(chunk, cursor, context):
env_version, min_version = cursor.fields(
chunk, uproot.const._rntuple_frame_format, context
Expand Down Expand Up @@ -326,7 +330,7 @@ def to_akform(self):
form = ak.forms.RecordForm(recordlist, topnames, form_key="toplevel")
return form

def read_pagedesc(self, destination, desc, dtype_str, dtype):
def read_pagedesc(self, destination, desc, dtype_str, dtype, nbits, split):
loc = desc.locator
context = {}
# bool in RNTuple is always stored as bits
Expand All @@ -339,6 +343,44 @@ def read_pagedesc(self, destination, desc, dtype_str, dtype):
content = cursor.array(
decomp_chunk, num_elements_toread, dtype, context, move=False
)

if split:
content = content.view(numpy.uint8)

if nbits == 16:
# AAAAABBBBB needs to become
# ABABABABAB
res = numpy.empty(len(content), numpy.uint8)
res[0::2] = content[len(res) * 0 // 2 : len(res) * 1 // 2]
res[1::2] = content[len(res) * 1 // 2 : len(res) * 2 // 2]
res = res.view(numpy.uint16)

elif nbits == 32:
# AAAAABBBBBCCCCCDDDDD needs to become
# ABCDABCDABCDABCDABCD
res = numpy.empty(len(content), numpy.uint8)
res[0::4] = content[len(res) * 0 // 4 : len(res) * 1 // 4]
res[1::4] = content[len(res) * 1 // 4 : len(res) * 2 // 4]
res[2::4] = content[len(res) * 2 // 4 : len(res) * 3 // 4]
res[3::4] = content[len(res) * 3 // 4 : len(res) * 4 // 4]
res = res.view(numpy.uint32)

elif nbits == 64:
# AAAAABBBBBCCCCCDDDDDEEEEEFFFFFGGGGGHHHHH needs to become
# ABCDEFGHABCDEFGHABCDEFGHABCDEFGHABCDEFGH
res = numpy.empty(len(content), numpy.uint8)
res[0::8] = content[len(res) * 0 // 8 : len(res) * 1 // 8]
res[1::8] = content[len(res) * 1 // 8 : len(res) * 2 // 8]
res[2::8] = content[len(res) * 2 // 8 : len(res) * 3 // 8]
res[3::8] = content[len(res) * 3 // 8 : len(res) * 4 // 8]
res[4::8] = content[len(res) * 4 // 8 : len(res) * 5 // 8]
res[5::8] = content[len(res) * 5 // 8 : len(res) * 6 // 8]
res[6::8] = content[len(res) * 6 // 8 : len(res) * 7 // 8]
res[7::8] = content[len(res) * 7 // 8 : len(res) * 8 // 8]
res = res.view(numpy.uint64)

content = res

if isbit:
content = (
numpy.unpackbits(content.view(dtype=numpy.uint8))
Expand Down Expand Up @@ -368,14 +410,24 @@ def read_col_page(self, ncol, cluster_i):
total_len = numpy.sum([desc.num_elements for desc in pagelist])
res = numpy.empty(total_len, dtype)
tracker = 0
split = 14 <= dtype_byte <= 21 or 26 <= dtype_byte <= 28
nbits = uproot.const.rntuple_col_num_to_size_dict[dtype_byte]
for page_desc in pagelist:
n_elements = page_desc.num_elements
tracker_end = tracker + n_elements
self.read_pagedesc(res[tracker:tracker_end], page_desc, dtype_str, dtype)
self.read_pagedesc(
res[tracker:tracker_end], page_desc, dtype_str, dtype, nbits, split
)
tracker = tracker_end

if dtype_byte <= uproot.const.rntuple_col_type_to_num_dict["index32"]:
res = numpy.insert(res, 0, 0) # for offsets
zigzag = 26 <= dtype_byte <= 28
delta = 14 <= dtype_byte <= 15
if zigzag:
res = from_zigzag(res)
elif delta:
numpy.cumsum(res)
return res

def arrays(
Expand Down Expand Up @@ -645,6 +697,15 @@ def read(self, chunk, cursor, context):

return out

def read_extension_header(self, out, chunk, cursor, context):
    """
    Populate ``out`` with four record lists read in sequence.

    The readers stored on ``self`` are invoked in a fixed order (field
    records, column records, alias columns, extra type infos), each with the
    same ``chunk``, ``cursor`` and ``context``.

    Args:
        out: Object that receives the ``field_records``, ``column_records``,
            ``alias_columns`` and ``extra_type_infos`` attributes.
        chunk: Passed through unchanged to every reader.
        cursor: Passed through unchanged to every reader.
        context: Passed through unchanged to every reader.

    Returns:
        The same ``out`` object, after the four attributes have been set.
    """
    source = (chunk, cursor, context)

    fields = self.list_field_record_frames.read(*source)
    out.field_records = fields

    columns = self.list_column_record_frames.read(*source)
    out.column_records = columns

    aliases = self.list_alias_column_frames.read(*source)
    out.alias_columns = aliases

    type_infos = self.list_extra_type_info_reader.read(*source)
    out.extra_type_infos = type_infos

    return out


class ColumnGroupRecordReader:
def read(self, chunk, cursor, context):
Expand Down Expand Up @@ -672,9 +733,29 @@ def read(self, chunk, cursor, context):
return out


class RNTupleSchemaExtension:
    """
    Reader that produces a ``MetaData`` object holding a size field followed
    by four lists of record frames.
    """

    def read(self, chunk, cursor, context):
        """
        Read one schema-extension record at the cursor position.

        Args:
            chunk: Data source handed to ``cursor.field`` and to every
                nested reader.
            cursor: Cursor used for the size field and passed to every
                nested reader.
            context: Passed through unchanged to all reads.

        Returns:
            A ``MetaData`` named after this class with the attributes
            ``size``, ``field_records``, ``column_records``,
            ``alias_records`` and ``extra_type_info``.
        """

        def read_frames(record_reader):
            # Every section is a list frame of record frames; only the
            # innermost record reader differs.
            frames = ListFrameReader(RecordFrameReader(record_reader))
            return frames.read(chunk, cursor, context)

        result = MetaData(type(self).__name__)
        # Little-endian unsigned 32-bit integer precedes the four lists.
        result.size = cursor.field(chunk, struct.Struct("<I"), context)
        result.field_records = read_frames(FieldRecordReader())
        result.column_records = read_frames(ColumnRecordReader())
        result.alias_records = read_frames(AliasColumnReader())
        result.extra_type_info = read_frames(ExtraTypeInfoReader())
        return result


class FooterReader:
def __init__(self):
self.extension_header_links = ListFrameReader(EnvLinkReader())
self.extension_header_links = RNTupleSchemaExtension()
# self.extension_header_links = ListFrameReader(EnvLinkReader())
self.column_group_record_frames = ListFrameReader(
RecordFrameReader(ColumnGroupRecordReader())
)
Expand All @@ -691,8 +772,8 @@ def read(self, chunk, cursor, context):
out.env_header = _envelop_header(chunk, cursor, context)
out.feature_flag = cursor.field(chunk, _rntuple_feature_flag_format, context)
out.header_crc32 = cursor.field(chunk, struct.Struct("<I"), context)

out.extension_links = self.extension_header_links.read(chunk, cursor, context)

out.col_group_records = self.column_group_record_frames.read(
chunk, cursor, context
)
Expand Down
1 change: 1 addition & 0 deletions tests/test_0630-rntuple-basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
pytest.importorskip("awkward")


@pytest.mark.skip(reason="RNTUPLE UPDATE: ignore test with previous file for now.")
def test_flat():
filename = skhep_testdata.data_path("test_ntuple_int_float.root")
with uproot.open(filename) as f:
Expand Down
1 change: 1 addition & 0 deletions tests/test_0705-rntuple-writing-metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
ak = pytest.importorskip("awkward")


@pytest.mark.skip(reason="RNTUPLE UPDATE: ignore test with previous file for now.")
def test_header(tmp_path):
filepath = os.path.join(tmp_path, "test.root")

Expand Down
37 changes: 37 additions & 0 deletions tests/test_0962-RNTuple-update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/uproot5/blob/main/LICENSE

import pytest
import uproot
import skhep_testdata
import numpy as np


def test_new_support_RNTuple_split_int32_reading():
    """
    Read ``one_integers`` from the 5e4-entry test file and check that it
    counts down from 5e4 to 1.
    """
    path = skhep_testdata.data_path("uproot_ntuple_int_5e4_629_01.root")
    with uproot.open(path) as root_file:
        arrays = root_file["ntuple"].arrays()
        assert len(arrays) == 5e4
        assert len(arrays.one_integers) == 5e4
        # arange(5e4 + 1) is 0..5e4; reversed and with the trailing 0
        # dropped it becomes 5e4, ..., 1.
        countdown = np.arange(5e4 + 1)[::-1][:-1]
        assert np.all(arrays.one_integers == countdown)


def test_new_support_RNTuple_bit_bool_reading():
    """
    Read ``one_bit`` from the bit test file and compare it with the expected
    ten-element 1/0 pattern.
    """
    path = skhep_testdata.data_path("uproot_ntuple_bit_629_01.root")
    with uproot.open(path) as root_file:
        arrays = root_file["ntuple"].arrays()
        expected_bits = np.asarray([1, 0, 0, 1, 0, 0, 1, 0, 0, 1])
        assert np.all(arrays.one_bit == expected_bits)


def test_new_support_RNTuple_split_int16_reading():
    """
    Read ``one_integers`` from the multi-cluster test file and check its
    length, its first and last entries, and that the first half holds only
    2 while the entries after the midpoint hold only 1.
    """
    with uproot.open(
        skhep_testdata.data_path("uproot_ntuple_int_multicluster_629_01.root")
    ) as f:
        obj = f["ntuple"]
        df = obj.arrays()
        assert len(df.one_integers) == 1e8
        assert df.one_integers[0] == 2
        assert df.one_integers[-1] == 1
        # Slice bounds must be integers: use floor division for the midpoint
        # on both sides (true division would yield a float bound).
        midpoint = len(df.one_integers) // 2
        assert np.all(np.unique(df.one_integers[:midpoint]) == [2])
        assert np.all(np.unique(df.one_integers[midpoint + 1 :]) == [1])